/*
 * NET3	Protocol independent device support routines.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 * Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 * Additional Authors:
 *	Florian la Roche <rzsfl@rz.uni-sb.de>
 *	Alan Cox <gw4pts@gw4pts.ampr.org>
 *	David Hinds <dahinds@users.sourceforge.net>
 *	Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *	Adam Sulmicki <adam@cfar.umd.edu>
 *	Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 * Changes:
 *	D.J. Barrow	: Fixed bug where dev->refcnt gets set
 *			  to 2 if register_netdev gets called
 *			  before net_dev_init & also removed a
 *			  few lines of code in the process.
 *	Alan Cox	: device private ioctl copies fields back.
 *	Alan Cox	: Transmit queue code does relevant
 *			  stunts to keep the queue safe.
 *	Alan Cox	: Fixed double lock.
 *	Alan Cox	: Fixed promisc NULL pointer trap
 *	????????	: Support the full private ioctl range
 *	Alan Cox	: Moved ioctl permission check into
 *			  drivers
 *	Tim Kordas	: SIOCADDMULTI/SIOCDELMULTI
 *	Alan Cox	: 100 backlog just doesn't cut it when
 *			  you start doing multicast video 8)
 *	Alan Cox	: Rewrote net_bh and list manager.
 *	Alan Cox	: Fix ETH_P_ALL echoback lengths.
 *	Alan Cox	: Took out transmit every packet pass
 *			  Saved a few bytes in the ioctl handler
 *	Alan Cox	: Network driver sets packet type before
 *			  calling netif_rx. Saves a function
 *			  call a packet.
 *	Alan Cox	: Hashed net_bh()
 *	Richard Kooijman: Timestamp fixes.
 *	Alan Cox	: Wrong field in SIOCGIFDSTADDR
 *	Alan Cox	: Device lock protection.
 *	Alan Cox	: Fixed nasty side effect of device close
 *			  changes.
 *	Rudi Cilibrasi	: Pass the right thing to
 *			  set_mac_address()
 *	Dave Miller	: 32bit quantity for the device lock to
 *			  make it work out on a Sparc.
 *	Bjorn Ekwall	: Added KERNELD hack.
 *	Alan Cox	: Cleaned up the backlog initialise.
 *	Craig Metz	: SIOCGIFCONF fix if space for under
 *			  1 device.
 *	Thomas Bogendoerfer : Return ENODEV for dev_open, if there
 *			  is no device open function.
 *	Andi Kleen	: Fix error reporting for SIOCGIFCONF
 *	Michael Chastain: Fix signed/unsigned for SIOCGIFCONF
 *	Cyrus Durgin	: Cleaned for KMOD
 *	Adam Sulmicki	: Bug Fix : Network Device Unload
 *			  A network device unload needs to purge
 *			  the backlog queue.
 *	Paul Rusty Russell : SIOCSIFNAME
 *	Pekka Riikonen	: Netdev boot-time settings code
 *	Andrew Morton	: Make unregister_netdevice wait
 *			  indefinitely on dev->refcnt
 *	J Hadi Salim	: - Backlog queue sampling
 *			  - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>
#include <linux/crash_dump.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates. This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 * Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 * Device drivers call our routines to queue packets here. We empty the
 * queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 * Add a protocol ID to the list. Now that the input handler is
 * smarter we can dispense with all the messy stuff that used to be
 * here.
 *
 * BEWARE!!! Protocol handlers, mangling input packets,
 * MUST BE last in hash buckets and checking protocol handlers
 * MUST start from promiscuous ptype_all chain in net_bh.
 * It is true now, do not change it.
 * Explanation follows: if a protocol handler that mangles packets were
 * first on the list, it would not be able to sense that the packet
 * is cloned and should be copied-on-write, so it would change it and
 * subsequent readers would get a broken packet.
 * --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
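
/* Illustrative sketch (not part of this file): a protocol module would
 * typically register a receive handler with a static packet_type, e.g.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev);
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);
 *
 * and undo it with dev_remove_pack(&my_packet_type) on module unload.
 * The names above are hypothetical; see ip_rcv()/ip_packet_type for the
 * real IPv4 registration.
 */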

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds a new setup entry to the dev_boot_setup list. The function
 *	returns 0 on error and 1 on success. This is a generic routine
 *	for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If the device is already registered, then return a base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves the settings configured at boot time for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under the RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
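
/* Typical caller pattern (illustrative sketch): the reference taken here
 * must be balanced with dev_put() once the device is no longer needed:
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 */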

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device has not had its reference
 *	counter increased so the caller must be careful about locking. The
 *	caller must hold either the RTNL semaphore or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device has not had its reference
 *	counter increased so the caller must be careful about locking.
 *	The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns a pointer to the device,
 *	or %NULL if it is not found. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns a pointer to the
 *	device, or %NULL if it is not found.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
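
/* Illustrative sketch of the locking required for the _rcu lookup above:
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
 *	if (dev)
 *		... use dev; do not sleep, do not cache the pointer ...
 *	rcu_read_unlock();
 *
 * If the device must outlive the RCU section, dev_hold() it before
 * rcu_read_unlock(), as dev_getfirstbyhwtype() below does.
 */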

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns a pointer to
 *	the first matching device, or %NULL if no device is found. Must be
 *	called inside rtnl_lock(), and the result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work. We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" - it will try and find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (i.e. 32K on most
 *	platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user. There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" - it will try and find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (i.e. 32K on most
 *	platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
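
/* Illustrative sketch: a driver that registers devices as "foo0", "foo1",
 * ... would call (under RTNL):
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success err holds the unit number chosen and dev->name is filled in.
 * "foo%d" is a hypothetical format; real examples are "eth%d" and "tun%d".
 */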

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

int dev_get_valid_name(struct net *net, struct net_device *dev,
		       const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
EXPORT_SYMBOL(dev_get_valid_name);

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device; a format string such as "eth%d"
 *	can be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;
	memcpy(dev->ifalias, alias, len);
	dev->ifalias[len] = 0;

	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
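
/* Illustrative caller sketch: __dev_open() asserts RTNL, so an
 * out-of-line caller would bring a device up like this:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */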

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to the scheduled poll. We cannot touch the
		 * poll list; it can even be on a different cpu. So just
		 * clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 * Call the device-specific close. This cannot fail.
		 * Only done if the device is UP.
		 *
		 * We allow it to be called even after a DETACH hot-plug
		 * event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered, all registration and up events are replayed
 *	to the new notifier to give it a race-free view
 *	of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
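
/* Illustrative sketch of a notifier user (the names are hypothetical):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_notifier = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_notifier);
 */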

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering, unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_inc(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented beforehand
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1827
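/* Usage sketch (illustrative, assumed names): a veth-style driver whose
 * transmit handler hands the frame to its peer's receive path. The
 * "example_" identifiers and the priv layout are hypothetical; the
 * pattern of calling dev_forward_skb() from ndo_start_xmit matches
 * the kernel-doc above.
 */
#if 0
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	/* NET_RX_SUCCESS or NET_RX_DROP; the skb is consumed either way */
	if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif
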
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this, verify that the tc mappings remain valid and
 * reset any that are not. If no priorities map to a given offset/count
 * pair, that pair is simply no longer used. In the worst case, when TC0
 * itself is invalid, nothing can be done, so priority mappings are
 * disabled altogether. It is expected that drivers will fix this
 * mapping if they can before calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}

static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}

static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}

int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}

	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);

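/* Usage sketch (illustrative, assumed names): a multiqueue driver
 * pinning each TX queue to one CPU, so a transmitting CPU selects its
 * own queue via XPS. "example_setup_xps" is hypothetical; the call
 * pattern follows netif_set_xps_queue() above.
 */
#if 0
static void example_setup_xps(struct net_device *dev)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu >= dev->real_num_tx_queues)
			break;
		/* map TX queue "cpu" to exactly one CPU */
		netif_set_xps_queue(dev, cpumask_of(cpu), cpu);
	}
}
#endif
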
#endif
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	bool disabling;
	int rc;

	disabling = txq < dev->real_num_tx_queues;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		dev->real_num_tx_queues = txq;

		if (disabling) {
			synchronize_net();
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	} else {
		dev->real_num_tx_queues = txq;
	}

	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

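/* Usage sketch (illustrative): a driver changing its active TX queue
 * count at runtime, e.g. on an ethtool channel change. Once the device
 * is registered this must run under rtnl_lock(), as the function
 * asserts. "example_set_channels" is a hypothetical helper.
 */
#if 0
static int example_set_channels(struct net_device *dev, unsigned int txq)
{
	int err;

	ASSERT_RTNL();
	err = netif_set_real_num_tx_queues(dev, txq);
	if (err)
		return err;	/* the old queue count stays in effect */

	/* reprogram hardware for the new queue count here */
	return 0;
}
#endif
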
#ifdef CONFIG_SYSFS
/**
 * netif_set_real_num_rx_queues - set actual number of RX queues used
 * @dev: Network device
 * @rxq: Actual number of RX queues
 *
 * This must be called either with the rtnl_lock held or before
 * registration of the net device. Returns 0 on success, or a
 * negative error code. If called before registration, it always
 * succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return is_kdump_kernel() ?
		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

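/* Usage sketch (illustrative): at probe time a driver caps its RSS
 * queue count both by what its hardware supports and by the default
 * limit above. EXAMPLE_HW_MAX_QUEUES is a hypothetical constant.
 */
#if 0
static unsigned int example_num_rss_queues(void)
{
	return min_t(unsigned int, EXAMPLE_HW_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}
#endif
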
static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}

void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/**
 * netif_wake_subqueue - allow sending packets on subqueue
 * @dev: network device
 * @queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(txq->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);

void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (unlikely(!skb))
		return;

	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
	if (in_irq() || irqs_disabled())
		__dev_kfree_skb_irq(skb, reason);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * Returns a Tx hash based on the given packet descriptor and a Tx queue
 * count to be used as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);

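/* Usage sketch (illustrative): callers normally reach this through the
 * skb_tx_hash() wrapper, which supplies dev->real_num_tx_queues as the
 * distribution range; a custom queue-selection hook could do the same.
 * "example_select_queue" is hypothetical.
 */
#if 0
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
	/* hash the flow into [0, dev->real_num_tx_queues) */
	return skb_tx_hash(dev, skb);
}
#endif
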
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

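/* Usage sketch (illustrative): a driver whose hardware cannot checksum
 * a given protocol falls back to software completion before handing
 * the frame to the NIC, mirroring the validate_xmit path later in this
 * file. Both "example_" names are hypothetical.
 */
#if 0
static int example_tx_prep(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !example_hw_can_csum(skb))		/* hypothetical capability test */
		return skb_checksum_help(skb);	/* 0 or -errno */
	return 0;
}
#endif
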
/* skb_csum_offload_check - Driver helper function to determine if a device
 * with limited checksum offload capabilities is able to offload the checksum
 * for a given packet.
 *
 * Arguments:
 *   skb - sk_buff for the packet in question
 *   spec - contains the description of what device can offload
 *   csum_encapped - returns true if the checksum being offloaded is
 *	      encapsulated. That is, it is the checksum for the transport
 *	      header in the inner headers.
 *   checksum_help - when set indicates that helper function should
 *	      call skb_checksum_help if offload checks fail
 *
 * Returns:
 *   true: Packet has passed the checksum checks and should be offloadable to
 *	   the device (a driver may still need to check for additional
 *	   restrictions of its device)
 *   false: Checksum is not offloadable. If checksum_help was set then
 *	   skb_checksum_help was called to resolve checksum for non-GSO
 *	   packets and when IP protocol is not SCTP
 */
bool __skb_csum_offload_chk(struct sk_buff *skb,
			    const struct skb_csum_offl_spec *spec,
			    bool *csum_encapped,
			    bool csum_help)
{
	struct iphdr *iph;
	struct ipv6hdr *ipv6;
	void *nhdr;
	int protocol;
	u8 ip_proto;

	if (skb->protocol == htons(ETH_P_8021Q) ||
	    skb->protocol == htons(ETH_P_8021AD)) {
		if (!spec->vlan_okay)
			goto need_help;
	}

	/* We check whether the checksum refers to a transport layer checksum in
	 * the outermost header or an encapsulated transport layer checksum that
	 * corresponds to the inner headers of the skb. If the checksum is for
	 * something else in the packet we need help.
	 */
	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
		/* Non-encapsulated checksum */
		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
		nhdr = skb_network_header(skb);
		*csum_encapped = false;
		if (spec->no_not_encapped)
			goto need_help;
	} else if (skb->encapsulation && spec->encap_okay &&
		   skb_checksum_start_offset(skb) ==
		   skb_inner_transport_offset(skb)) {
		/* Encapsulated checksum */
		*csum_encapped = true;
		switch (skb->inner_protocol_type) {
		case ENCAP_TYPE_ETHER:
			protocol = eproto_to_ipproto(skb->inner_protocol);
			break;
		case ENCAP_TYPE_IPPROTO:
			protocol = skb->inner_protocol;
			break;
		}
		nhdr = skb_inner_network_header(skb);
	} else {
		goto need_help;
	}

	switch (protocol) {
	case IPPROTO_IP:
		if (!spec->ipv4_okay)
			goto need_help;
		iph = nhdr;
		ip_proto = iph->protocol;
		if (iph->ihl != 5 && !spec->ip_options_okay)
			goto need_help;
		break;
	case IPPROTO_IPV6:
		if (!spec->ipv6_okay)
			goto need_help;
		if (spec->no_encapped_ipv6 && *csum_encapped)
			goto need_help;
		ipv6 = nhdr;
		nhdr += sizeof(*ipv6);
		ip_proto = ipv6->nexthdr;
		break;
	default:
		goto need_help;
	}

ip_proto_again:
	switch (ip_proto) {
	case IPPROTO_TCP:
		if (!spec->tcp_okay ||
		    skb->csum_offset != offsetof(struct tcphdr, check))
			goto need_help;
		break;
	case IPPROTO_UDP:
		if (!spec->udp_okay ||
		    skb->csum_offset != offsetof(struct udphdr, check))
			goto need_help;
		break;
	case IPPROTO_SCTP:
		if (!spec->sctp_okay ||
		    skb->csum_offset != offsetof(struct sctphdr, checksum))
			goto cant_help;
		break;
	case NEXTHDR_HOP:
	case NEXTHDR_ROUTING:
	case NEXTHDR_DEST: {
		u8 *opthdr = nhdr;

		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
			goto need_help;

		ip_proto = opthdr[0];
		nhdr += (opthdr[1] + 1) << 3;

		goto ip_proto_again;
	}
	default:
		goto need_help;
	}

	/* Passed the tests for offloading checksum */
	return true;

need_help:
	if (csum_help && !skb_shinfo(skb)->gso_size)
		skb_checksum_help(skb);
cant_help:
	return false;
}
EXPORT_SYMBOL(__skb_csum_offload_chk);

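/* Usage sketch (illustrative): a driver with limited offload support
 * describes its abilities in a skb_csum_offl_spec and asks the helper
 * whether a frame is safe to hand to hardware. The spec fields set
 * here are the ones consumed above; the surrounding flow is an
 * assumption, not a real driver.
 */
#if 0
static const struct skb_csum_offl_spec example_csum_spec = {
	.ipv4_okay = 1,
	.ipv6_okay = 1,
	.tcp_okay = 1,
	.udp_okay = 1,
	.vlan_okay = 1,
};

static bool example_can_offload_csum(struct sk_buff *skb)
{
	bool encapped;

	/* with csum_help set, non-offloadable frames are resolved in
	 * software by skb_checksum_help()
	 */
	return __skb_csum_offload_chk(skb, &example_csum_spec,
				      &encapped, true);
}
#endif
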
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb->data;
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}

/**
 * skb_mac_gso_segment - mac layer segmentation handler.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);


/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL &&
		       skb->ip_summed != CHECKSUM_UNNECESSARY;

	return skb->ip_summed == CHECKSUM_NONE;
}

/**
 * __skb_gso_segment - Perform segmentation on skb.
 * @skb: buffer to segment
 * @features: features for the output path (see dev->features)
 * @tx_path: whether it is called in TX path
 *
 * This function segments the given skb and returns a list of segments.
 *
 * It may return NULL if the skb requires no segmentation. This is
 * only possible when GSO is used for verifying header integrity.
 *
 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
		struct net_device *dev = skb->dev;

		partial_features |= dev->features & dev->gso_partial_features;
		if (!skb_gso_ok(skb, features | partial_features))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
		skb_warn_bad_offload(skb);

	return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know that:
 * 1. An IOMMU is present and can map all of the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

static netdev_features_t harmonize_features(struct sk_buff *skb,
					    netdev_features_t features)
{
	int tmp;
	__be16 type;

	type = skb_network_protocol(skb, &tmp);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
	}
	if (illegal_highdma(skb->dev, skb))
		features &= ~NETIF_F_SG;

	return features;
}

netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);

static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}

static netdev_features_t gso_features_check(const struct sk_buff *skb,
					    struct net_device *dev,
					    netdev_features_t features)
{
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	if (gso_segs > dev->gso_max_segs)
		return features & ~NETIF_F_GSO_MASK;

	/* Support for GSO partial features requires software
	 * intervention before we can actually process the packets
	 * so we need to strip support for any partial features now
	 * and we can pull them back in after we have partially
	 * segmented the frame.
	 */
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
		features &= ~dev->gso_partial_features;

	/* Make sure to clear the IPv4 ID mangling feature if the
	 * IPv4 header has the potential to be fragmented.
	 */
	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
		struct iphdr *iph = skb->encapsulation ?
				    inner_ip_hdr(skb) : ip_hdr(skb);

		if (!(iph->frag_off & htons(IP_DF)))
			features &= ~NETIF_F_TSO_MANGLEID;
	}

	return features;
}

netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;

	if (skb_is_gso(skb))
		features = gso_features_check(skb, dev, features);

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

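/* Usage sketch (illustrative): a software device evaluates per-skb
 * features before deciding whether to segment in software, the same
 * dance validate_xmit_skb() performs below. "example_prepare" is
 * hypothetical.
 */
#if 0
static struct sk_buff *example_prepare(struct sk_buff *skb)
{
	netdev_features_t features = netif_skb_features(skb);

	if (netif_needs_gso(skb, features))
		/* returns a segment list, NULL, or ERR_PTR() */
		return skb_gso_segment(skb, features);

	return skb;
}
#endif
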
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}

static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto))
		skb = __vlan_hwaccel_push_inside(skb);
	return skb;
}

static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_CSUM_MASK) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	atomic_long_inc(&dev->tx_dropped);
	return NULL;
}

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb won't be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
			const struct tcphdr *th;
			struct tcphdr _tcphdr;

			th = skb_header_pointer(skb, skb_transport_offset(skb),
						sizeof(_tcphdr), &_tcphdr);
			if (likely(th))
				hdr_len += __tcp_hdrlen(th);
		} else {
			struct udphdr _udphdr;

			if (skb_header_pointer(skb, skb_transport_offset(skb),
					       sizeof(_udphdr), &_udphdr))
				hdr_len += sizeof(struct udphdr);
		}

		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}

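/* Worked example for the pkt_len adjustment above (assumed numbers):
 * a GSO skb with skb->len = 7266, 66 bytes of Ethernet + IPv4 + TCP
 * headers and gso_size = 1448 carries gso_segs = 5 on-wire segments.
 * Every segment after the first repeats the 66 bytes of headers, so
 * pkt_len = 7266 + (5 - 1) * 66 = 7530 bytes actually sent on wire.
 */
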
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
	const struct netprio_map *map;
	const struct sock *sk;
	unsigned int prioidx;

	if (skb->priority)
		return;
	map = rcu_dereference_bh(skb->dev->priomap);
	if (!map)
		return;
	sk = skb_to_full_sk(skb);
	if (!sk)
		return;

	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);

	if (prioidx < map->priomap_len)
		skb->priority = map->priomap[prioidx];
}
#else
#define skb_update_prio(skb)
#endif

DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

/**
 * dev_loopback_xmit - loop back @skb
 * @net: network namespace this loopback is happening in
 * @sk:  sk needed to be a netfilter okfn
 * @skb: buffer to transmit
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

#ifdef CONFIG_NET_EGRESS
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result cl_res;

	if (!cl)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */

static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[skb->sender_cpu - 1]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);
		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv)
{
	int queue_index = 0;

#ifdef CONFIG_XPS
	u32 sender_cpu = skb->sender_cpu - 1;

	if (sender_cpu >= (u32)NR_CPUS)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;
		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
							    __netdev_pick_tx);
		else
			queue_index = __netdev_pick_tx(dev, skb);

		if (!accel_priv)
			queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

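/* Usage sketch (illustrative): a driver overriding queue selection for
 * one traffic class and deferring everything else to the core fallback
 * it receives, matching the ndo_select_queue() call above. The names
 * "example_select_queue" and EXAMPLE_MGMT_QUEUE are hypothetical.
 */
#if 0
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	/* steer high-priority control frames to a dedicated queue */
	if (skb->priority == TC_PRIO_CONTROL)
		return EXAMPLE_MGMT_QUEUE;

	return fallback(dev, skb);
}
#endif
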
/**
 * __dev_queue_xmit - transmit a buffer
 * @skb: buffer to transmit
 * @accel_priv: private data used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * A negative errno code is returned on a failure. A success does not
 * guarantee the frame will be transmitted as it may be dropped due
 * to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *	I notice this method can also return errors from the queue disciplines,
 *	including NET_XMIT_DROP, which is a positive value. So, errors can also
 *	be positive.
 *
 *	Regardless of the return value, the skb is consumed, so it is currently
 *	difficult to retry a send to this method. (You can bump the ref count
 *	before sending to hold a reference for retry if you are careful.)
 *
 *	When calling this method, interrupts MUST be enabled. This is because
 *	the BH enable code must have IRQs enabled so that it will not deadlock.
 *	    --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
# ifdef CONFIG_NET_EGRESS
	if (static_key_false(&egress_needed)) {
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * it's hot in this CPU's cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...
	 *
	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible, that they rely on protection
	 * made by us here.
	 *
	 * Check this and shoot the lock. It is not prone to deadlocks.
	 * Either shoot the noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {
			if (unlikely(__this_cpu_read(xmit_recursion) >
				     XMIT_RECURSION_LIMIT))
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);

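/* Usage sketch (illustrative): a kernel user building a raw Ethernet
 * frame and handing it to the stack. Per the kernel-doc above, the skb
 * is consumed regardless of the return value, and both negative errno
 * and positive NET_XMIT_* codes can come back. The "example_" names
 * and the 0x88b5 (local experimental) ethertype are assumptions.
 */
#if 0
static int example_send_frame(struct net_device *dev, const u8 *dst,
			      const void *payload, size_t len)
{
	struct sk_buff *skb;

	skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(0x88b5);
	dev_hard_header(skb, dev, 0x88b5, dst, dev->dev_addr, skb->len);

	return dev_queue_xmit(skb);	/* consumes the skb */
}
#endif
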
dev_queue_xmit_accel(struct sk_buff * skb,void * accel_priv)3488 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3489 {
3490 return __dev_queue_xmit(skb, accel_priv);
3491 }
3492 EXPORT_SYMBOL(dev_queue_xmit_accel);
3493
3494
3495 /*=======================================================================
3496 Receiver routines
3497 =======================================================================*/
3498
3499 int netdev_max_backlog __read_mostly = 1000;
3500 EXPORT_SYMBOL(netdev_max_backlog);
3501
3502 int netdev_tstamp_prequeue __read_mostly = 1;
3503 int netdev_budget __read_mostly = 300;
3504 int weight_p __read_mostly = 64; /* old backlog weight */
3505
3506 /* Called with irq disabled */
____napi_schedule(struct softnet_data * sd,struct napi_struct * napi)3507 static inline void ____napi_schedule(struct softnet_data *sd,
3508 struct napi_struct *napi)
3509 {
3510 list_add_tail(&napi->poll_list, &sd->poll_list);
3511 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3512 }
3513
3514 #ifdef CONFIG_RPS
3515
3516 /* One global table that all flow-based protocols share. */
3517 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3518 EXPORT_SYMBOL(rps_sock_flow_table);
3519 u32 rps_cpu_mask __read_mostly;
3520 EXPORT_SYMBOL(rps_cpu_mask);
3521
3522 struct static_key rps_needed __read_mostly;
3523 EXPORT_SYMBOL(rps_needed);
3524
3525 static struct rps_dev_flow *
set_rps_cpu(struct net_device * dev,struct sk_buff * skb,struct rps_dev_flow * rflow,u16 next_cpu)3526 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3527 struct rps_dev_flow *rflow, u16 next_cpu)
3528 {
3529 if (next_cpu < nr_cpu_ids) {
3530 #ifdef CONFIG_RFS_ACCEL
3531 struct netdev_rx_queue *rxqueue;
3532 struct rps_dev_flow_table *flow_table;
3533 struct rps_dev_flow *old_rflow;
3534 u32 flow_id;
3535 u16 rxq_index;
3536 int rc;
3537
3538 /* Should we steer this flow to a different hardware queue? */
3539 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3540 !(dev->features & NETIF_F_NTUPLE))
3541 goto out;
3542 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3543 if (rxq_index == skb_get_rx_queue(skb))
3544 goto out;
3545
3546 rxqueue = dev->_rx + rxq_index;
3547 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3548 if (!flow_table)
3549 goto out;
3550 flow_id = skb_get_hash(skb) & flow_table->mask;
3551 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3552 rxq_index, flow_id);
3553 if (rc < 0)
3554 goto out;
3555 old_rflow = rflow;
3556 rflow = &flow_table->flows[flow_id];
3557 rflow->filter = rc;
3558 if (old_rflow->filter == rflow->filter)
3559 old_rflow->filter = RPS_NO_FILTER;
3560 out:
3561 #endif
3562 rflow->last_qtail =
3563 per_cpu(softnet_data, next_cpu).input_queue_head;
3564 }
3565
3566 rflow->cpu = next_cpu;
3567 return rflow;
3568 }
3569
3570 /*
3571 * get_rps_cpu is called from netif_receive_skb and returns the target
3572 * CPU from the RPS map of the receiving queue for a given skb.
3573 * rcu_read_lock must be held on entry.
3574 */
get_rps_cpu(struct net_device * dev,struct sk_buff * skb,struct rps_dev_flow ** rflowp)3575 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3576 struct rps_dev_flow **rflowp)
3577 {
3578 const struct rps_sock_flow_table *sock_flow_table;
3579 struct netdev_rx_queue *rxqueue = dev->_rx;
3580 struct rps_dev_flow_table *flow_table;
3581 struct rps_map *map;
3582 int cpu = -1;
3583 u32 tcpu;
3584 u32 hash;
3585
3586 if (skb_rx_queue_recorded(skb)) {
3587 u16 index = skb_get_rx_queue(skb);
3588
3589 if (unlikely(index >= dev->real_num_rx_queues)) {
3590 WARN_ONCE(dev->real_num_rx_queues > 1,
3591 "%s received packet on queue %u, but number "
3592 "of RX queues is %u\n",
3593 dev->name, index, dev->real_num_rx_queues);
3594 goto done;
3595 }
3596 rxqueue += index;
3597 }
3598
3599 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3600
3601 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3602 map = rcu_dereference(rxqueue->rps_map);
3603 if (!flow_table && !map)
3604 goto done;
3605
3606 skb_reset_network_header(skb);
3607 hash = skb_get_hash(skb);
3608 if (!hash)
3609 goto done;
3610
3611 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3612 if (flow_table && sock_flow_table) {
3613 struct rps_dev_flow *rflow;
3614 u32 next_cpu;
3615 u32 ident;
3616
3617 /* First check into global flow table if there is a match */
3618 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3619 if ((ident ^ hash) & ~rps_cpu_mask)
3620 goto try_rps;
3621
3622 next_cpu = ident & rps_cpu_mask;
3623
3624 /* OK, now we know there is a match,
3625 * we can look at the local (per receive queue) flow table
3626 */
3627 rflow = &flow_table->flows[hash & flow_table->mask];
3628 tcpu = rflow->cpu;
3629
3630 /*
3631 * If the desired CPU (where last recvmsg was done) is
3632 * different from current CPU (one in the rx-queue flow
3633 * table entry), switch if one of the following holds:
3634 * - Current CPU is unset (>= nr_cpu_ids).
3635 * - Current CPU is offline.
3636 * - The current CPU's queue tail has advanced beyond the
3637 * last packet that was enqueued using this table entry.
3638 * This guarantees that all previous packets for the flow
3639 *   have been dequeued, thus preserving in-order delivery.
3640 */
3641 if (unlikely(tcpu != next_cpu) &&
3642 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3643 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3644 rflow->last_qtail)) >= 0)) {
3645 tcpu = next_cpu;
3646 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3647 }
3648
3649 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3650 *rflowp = rflow;
3651 cpu = tcpu;
3652 goto done;
3653 }
3654 }
3655
3656 try_rps:
3657
3658 if (map) {
3659 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3660 if (cpu_online(tcpu)) {
3661 cpu = tcpu;
3662 goto done;
3663 }
3664 }
3665
3666 done:
3667 return cpu;
3668 }
3669
3670 #ifdef CONFIG_RFS_ACCEL
3671
3672 /**
3673 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3674 * @dev: Device on which the filter was set
3675 * @rxq_index: RX queue index
3676 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3677 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3678 *
3679 * Drivers that implement ndo_rx_flow_steer() should periodically call
3680 * this function for each installed filter and remove the filters for
3681 * which it returns %true.
3682 */
3683 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3684 u32 flow_id, u16 filter_id)
3685 {
3686 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3687 struct rps_dev_flow_table *flow_table;
3688 struct rps_dev_flow *rflow;
3689 bool expire = true;
3690 unsigned int cpu;
3691
3692 rcu_read_lock();
3693 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3694 if (flow_table && flow_id <= flow_table->mask) {
3695 rflow = &flow_table->flows[flow_id];
3696 cpu = ACCESS_ONCE(rflow->cpu);
3697 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3698 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3699 rflow->last_qtail) <
3700 (int)(10 * flow_table->mask)))
3701 expire = false;
3702 }
3703 rcu_read_unlock();
3704 return expire;
3705 }
3706 EXPORT_SYMBOL(rps_may_expire_flow);
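/* A hypothetical usage sketch (not from any in-tree driver): a driver that
 * installs hardware filters from its ndo_rx_flow_steer() callback could
 * expire stale ones from a periodic work item like this. The example_priv
 * layout, EXAMPLE_MAX_FILTERS and example_remove_hw_filter() are assumed
 * names; only rps_may_expire_flow() is the API documented above. It also
 * assumes the driver returned the table index i as the filter id.
 *
 *	static void example_expire_work(struct work_struct *work)
 *	{
 *		struct example_priv *priv =
 *			container_of(work, struct example_priv, expire_work.work);
 *		unsigned int i;
 *
 *		for (i = 0; i < EXAMPLE_MAX_FILTERS; i++) {
 *			if (!priv->filter_active[i])
 *				continue;
 *			if (rps_may_expire_flow(priv->netdev, priv->filter_rxq[i],
 *						priv->filter_flow_id[i], i)) {
 *				example_remove_hw_filter(priv, i);
 *				priv->filter_active[i] = false;
 *			}
 *		}
 *		schedule_delayed_work(&priv->expire_work, HZ);
 *	}
 */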
3707
3708 #endif /* CONFIG_RFS_ACCEL */
3709
3710 /* Called from hardirq (IPI) context */
3711 static void rps_trigger_softirq(void *data)
3712 {
3713 struct softnet_data *sd = data;
3714
3715 ____napi_schedule(sd, &sd->backlog);
3716 sd->received_rps++;
3717 }
3718
3719 #endif /* CONFIG_RPS */
3720
3721 /*
3722  * Check if this softnet_data structure belongs to another CPU.
3723  * If it does, queue it on our IPI list and return 1;
3724  * if not, return 0.
3725 */
3726 static int rps_ipi_queued(struct softnet_data *sd)
3727 {
3728 #ifdef CONFIG_RPS
3729 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3730
3731 if (sd != mysd) {
3732 sd->rps_ipi_next = mysd->rps_ipi_list;
3733 mysd->rps_ipi_list = sd;
3734
3735 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3736 return 1;
3737 }
3738 #endif /* CONFIG_RPS */
3739 return 0;
3740 }
3741
3742 #ifdef CONFIG_NET_FLOW_LIMIT
3743 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3744 #endif
3745
3746 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3747 {
3748 #ifdef CONFIG_NET_FLOW_LIMIT
3749 struct sd_flow_limit *fl;
3750 struct softnet_data *sd;
3751 unsigned int old_flow, new_flow;
3752
3753 if (qlen < (netdev_max_backlog >> 1))
3754 return false;
3755
3756 sd = this_cpu_ptr(&softnet_data);
3757
3758 rcu_read_lock();
3759 fl = rcu_dereference(sd->flow_limit);
3760 if (fl) {
3761 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3762 old_flow = fl->history[fl->history_head];
3763 fl->history[fl->history_head] = new_flow;
3764
3765 fl->history_head++;
3766 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3767
3768 if (likely(fl->buckets[old_flow]))
3769 fl->buckets[old_flow]--;
3770
3771 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3772 fl->count++;
3773 rcu_read_unlock();
3774 return true;
3775 }
3776 }
3777 rcu_read_unlock();
3778 #endif
3779 return false;
3780 }
3781
3782 /*
3783 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3784 * queue (may be a remote CPU queue).
3785 */
3786 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3787 unsigned int *qtail)
3788 {
3789 struct softnet_data *sd;
3790 unsigned long flags;
3791 unsigned int qlen;
3792
3793 sd = &per_cpu(softnet_data, cpu);
3794
3795 local_irq_save(flags);
3796
3797 rps_lock(sd);
3798 if (!netif_running(skb->dev))
3799 goto drop;
3800 qlen = skb_queue_len(&sd->input_pkt_queue);
3801 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3802 if (qlen) {
3803 enqueue:
3804 __skb_queue_tail(&sd->input_pkt_queue, skb);
3805 input_queue_tail_incr_save(sd, qtail);
3806 rps_unlock(sd);
3807 local_irq_restore(flags);
3808 return NET_RX_SUCCESS;
3809 }
3810
3811 /* Schedule NAPI for backlog device
3812 	 * We can use a non-atomic operation since we own the queue lock
3813 */
3814 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3815 if (!rps_ipi_queued(sd))
3816 ____napi_schedule(sd, &sd->backlog);
3817 }
3818 goto enqueue;
3819 }
3820
3821 drop:
3822 sd->dropped++;
3823 rps_unlock(sd);
3824
3825 local_irq_restore(flags);
3826
3827 atomic_long_inc(&skb->dev->rx_dropped);
3828 kfree_skb(skb);
3829 return NET_RX_DROP;
3830 }
3831
3832 static int netif_rx_internal(struct sk_buff *skb)
3833 {
3834 int ret;
3835
3836 net_timestamp_check(netdev_tstamp_prequeue, skb);
3837
3838 trace_netif_rx(skb);
3839 #ifdef CONFIG_RPS
3840 if (static_key_false(&rps_needed)) {
3841 struct rps_dev_flow voidflow, *rflow = &voidflow;
3842 int cpu;
3843
3844 preempt_disable();
3845 rcu_read_lock();
3846
3847 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3848 if (cpu < 0)
3849 cpu = smp_processor_id();
3850
3851 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3852
3853 rcu_read_unlock();
3854 preempt_enable();
3855 } else
3856 #endif
3857 {
3858 unsigned int qtail;
3859 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3860 put_cpu();
3861 }
3862 return ret;
3863 }
3864
3865 /**
3866 * netif_rx - post buffer to the network code
3867 * @skb: buffer to post
3868 *
3869 * This function receives a packet from a device driver and queues it for
3870 * the upper (protocol) levels to process. It always succeeds. The buffer
3871 * may be dropped during processing for congestion control or by the
3872 * protocol layers.
3873 *
3874 * return values:
3875 * NET_RX_SUCCESS (no congestion)
3876 * NET_RX_DROP (packet was dropped)
3877 *
3878 */
3879
3880 int netif_rx(struct sk_buff *skb)
3881 {
3882 trace_netif_rx_entry(skb);
3883
3884 return netif_rx_internal(skb);
3885 }
3886 EXPORT_SYMBOL(netif_rx);
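/* A minimal sketch of the classic non-NAPI receive path that ends in
 * netif_rx(). The example_hw_rx_len()/example_hw_copy_rx() helpers are
 * assumed hardware accessors, not real APIs; the skb handling around them
 * is the standard pattern for calling netif_rx() from an interrupt.
 *
 *	static irqreturn_t example_isr(int irq, void *dev_id)
 *	{
 *		struct net_device *dev = dev_id;
 *		struct sk_buff *skb;
 *		unsigned int len = example_hw_rx_len(dev);
 *
 *		skb = netdev_alloc_skb_ip_align(dev, len);
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return IRQ_HANDLED;
 *		}
 *		example_hw_copy_rx(dev, skb_put(skb, len));
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);		// queue to the per-CPU backlog
 *		return IRQ_HANDLED;
 *	}
 */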
3887
3888 int netif_rx_ni(struct sk_buff *skb)
3889 {
3890 int err;
3891
3892 trace_netif_rx_ni_entry(skb);
3893
3894 preempt_disable();
3895 err = netif_rx_internal(skb);
3896 if (local_softirq_pending())
3897 do_softirq();
3898 preempt_enable();
3899
3900 return err;
3901 }
3902 EXPORT_SYMBOL(netif_rx_ni);
3903
3904 static __latent_entropy void net_tx_action(struct softirq_action *h)
3905 {
3906 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3907
3908 if (sd->completion_queue) {
3909 struct sk_buff *clist;
3910
3911 local_irq_disable();
3912 clist = sd->completion_queue;
3913 sd->completion_queue = NULL;
3914 local_irq_enable();
3915
3916 while (clist) {
3917 struct sk_buff *skb = clist;
3918 clist = clist->next;
3919
3920 WARN_ON(atomic_read(&skb->users));
3921 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3922 trace_consume_skb(skb);
3923 else
3924 trace_kfree_skb(skb, net_tx_action);
3925
3926 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3927 __kfree_skb(skb);
3928 else
3929 __kfree_skb_defer(skb);
3930 }
3931
3932 __kfree_skb_flush();
3933 }
3934
3935 if (sd->output_queue) {
3936 struct Qdisc *head;
3937
3938 local_irq_disable();
3939 head = sd->output_queue;
3940 sd->output_queue = NULL;
3941 sd->output_queue_tailp = &sd->output_queue;
3942 local_irq_enable();
3943
3944 while (head) {
3945 struct Qdisc *q = head;
3946 spinlock_t *root_lock;
3947
3948 head = head->next_sched;
3949
3950 root_lock = qdisc_lock(q);
3951 spin_lock(root_lock);
3952 /* We need to make sure head->next_sched is read
3953 * before clearing __QDISC_STATE_SCHED
3954 */
3955 smp_mb__before_atomic();
3956 clear_bit(__QDISC_STATE_SCHED, &q->state);
3957 qdisc_run(q);
3958 spin_unlock(root_lock);
3959 }
3960 }
3961 }
3962
3963 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3964 /* This hook is defined here for ATM LANE */
3965 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3966 unsigned char *addr) __read_mostly;
3967 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3968 #endif
3969
3970 static inline struct sk_buff *
3971 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3972 struct net_device *orig_dev)
3973 {
3974 #ifdef CONFIG_NET_CLS_ACT
3975 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3976 struct tcf_result cl_res;
3977
3978 /* If there's at least one ingress present somewhere (so
3979 	 * we get here via the enabled static key), remaining devices
3980 * that are not configured with an ingress qdisc will bail
3981 * out here.
3982 */
3983 if (!cl)
3984 return skb;
3985 if (*pt_prev) {
3986 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3987 *pt_prev = NULL;
3988 }
3989
3990 qdisc_skb_cb(skb)->pkt_len = skb->len;
3991 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3992 qdisc_bstats_cpu_update(cl->q, skb);
3993
3994 switch (tc_classify(skb, cl, &cl_res, false)) {
3995 case TC_ACT_OK:
3996 case TC_ACT_RECLASSIFY:
3997 skb->tc_index = TC_H_MIN(cl_res.classid);
3998 break;
3999 case TC_ACT_SHOT:
4000 qdisc_qstats_cpu_drop(cl->q);
4001 kfree_skb(skb);
4002 return NULL;
4003 case TC_ACT_STOLEN:
4004 case TC_ACT_QUEUED:
4005 consume_skb(skb);
4006 return NULL;
4007 case TC_ACT_REDIRECT:
4008 /* skb_mac_header check was done by cls/act_bpf, so
4009 * we can safely push the L2 header back before
4010 * redirecting to another netdev
4011 */
4012 __skb_push(skb, skb->mac_len);
4013 skb_do_redirect(skb);
4014 return NULL;
4015 default:
4016 break;
4017 }
4018 #endif /* CONFIG_NET_CLS_ACT */
4019 return skb;
4020 }
4021
4022 /**
4023 * netdev_is_rx_handler_busy - check if receive handler is registered
4024 * @dev: device to check
4025 *
4026 * Check if a receive handler is already registered for a given device.
4027  *	Return true if there is one.
4028 *
4029 * The caller must hold the rtnl_mutex.
4030 */
4031 bool netdev_is_rx_handler_busy(struct net_device *dev)
4032 {
4033 ASSERT_RTNL();
4034 return dev && rtnl_dereference(dev->rx_handler);
4035 }
4036 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4037
4038 /**
4039 * netdev_rx_handler_register - register receive handler
4040 * @dev: device to register a handler for
4041 * @rx_handler: receive handler to register
4042 * @rx_handler_data: data pointer that is used by rx handler
4043 *
4044 * Register a receive handler for a device. This handler will then be
4045 * called from __netif_receive_skb. A negative errno code is returned
4046 * on a failure.
4047 *
4048 * The caller must hold the rtnl_mutex.
4049 *
4050 * For a general description of rx_handler, see enum rx_handler_result.
4051 */
4052 int netdev_rx_handler_register(struct net_device *dev,
4053 rx_handler_func_t *rx_handler,
4054 void *rx_handler_data)
4055 {
4056 ASSERT_RTNL();
4057
4058 if (dev->rx_handler)
4059 return -EBUSY;
4060
4061 /* Note: rx_handler_data must be set before rx_handler */
4062 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4063 rcu_assign_pointer(dev->rx_handler, rx_handler);
4064
4065 return 0;
4066 }
4067 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
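/* A sketch of how an upper device (in the spirit of bridge or macvlan,
 * but hypothetical here) attaches itself with this API. The
 * example_handle_frame()/example_wants_frame() names and the example_upper
 * struct are assumptions; the return values carry the real
 * enum rx_handler_result semantics used by __netif_receive_skb_core().
 *
 *	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct example_upper *up =
 *			rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (!example_wants_frame(up, skb))
 *			return RX_HANDLER_PASS;		// continue normally
 *
 *		skb->dev = up->updev;			// retarget the skb
 *		return RX_HANDLER_ANOTHER;		// and run another round
 *	}
 *
 * Then, under RTNL (e.g. from an ndo_add_slave-style path):
 *
 *	err = netdev_rx_handler_register(lower_dev, example_handle_frame, up);
 *
 * and netdev_rx_handler_unregister(lower_dev) on teardown.
 */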
4068
4069 /**
4070 * netdev_rx_handler_unregister - unregister receive handler
4071 * @dev: device to unregister a handler from
4072 *
4073 * Unregister a receive handler from a device.
4074 *
4075 * The caller must hold the rtnl_mutex.
4076 */
4077 void netdev_rx_handler_unregister(struct net_device *dev)
4078 {
4079
4080 ASSERT_RTNL();
4081 RCU_INIT_POINTER(dev->rx_handler, NULL);
4082 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4083 	 * section is guaranteed to see a non-NULL rx_handler_data
4084 	 * as well.
4085 */
4086 synchronize_net();
4087 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4088 }
4089 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4090
4091 /*
4092 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4093 * the special handling of PFMEMALLOC skbs.
4094 */
4095 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4096 {
4097 switch (skb->protocol) {
4098 case htons(ETH_P_ARP):
4099 case htons(ETH_P_IP):
4100 case htons(ETH_P_IPV6):
4101 case htons(ETH_P_8021Q):
4102 case htons(ETH_P_8021AD):
4103 return true;
4104 default:
4105 return false;
4106 }
4107 }
4108
4109 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4110 int *ret, struct net_device *orig_dev)
4111 {
4112 #ifdef CONFIG_NETFILTER_INGRESS
4113 if (nf_hook_ingress_active(skb)) {
4114 int ingress_retval;
4115
4116 if (*pt_prev) {
4117 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4118 *pt_prev = NULL;
4119 }
4120
4121 rcu_read_lock();
4122 ingress_retval = nf_hook_ingress(skb);
4123 rcu_read_unlock();
4124 return ingress_retval;
4125 }
4126 #endif /* CONFIG_NETFILTER_INGRESS */
4127 return 0;
4128 }
4129
4130 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4131 {
4132 struct packet_type *ptype, *pt_prev;
4133 rx_handler_func_t *rx_handler;
4134 struct net_device *orig_dev;
4135 bool deliver_exact = false;
4136 int ret = NET_RX_DROP;
4137 __be16 type;
4138
4139 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4140
4141 trace_netif_receive_skb(skb);
4142
4143 orig_dev = skb->dev;
4144
4145 skb_reset_network_header(skb);
4146 if (!skb_transport_header_was_set(skb))
4147 skb_reset_transport_header(skb);
4148 skb_reset_mac_len(skb);
4149
4150 pt_prev = NULL;
4151
4152 another_round:
4153 skb->skb_iif = skb->dev->ifindex;
4154
4155 __this_cpu_inc(softnet_data.processed);
4156
4157 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4158 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4159 skb = skb_vlan_untag(skb);
4160 if (unlikely(!skb))
4161 goto out;
4162 }
4163
4164 #ifdef CONFIG_NET_CLS_ACT
4165 if (skb->tc_verd & TC_NCLS) {
4166 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4167 goto ncls;
4168 }
4169 #endif
4170
4171 if (pfmemalloc)
4172 goto skip_taps;
4173
4174 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4175 if (pt_prev)
4176 ret = deliver_skb(skb, pt_prev, orig_dev);
4177 pt_prev = ptype;
4178 }
4179
4180 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4181 if (pt_prev)
4182 ret = deliver_skb(skb, pt_prev, orig_dev);
4183 pt_prev = ptype;
4184 }
4185
4186 skip_taps:
4187 #ifdef CONFIG_NET_INGRESS
4188 if (static_key_false(&ingress_needed)) {
4189 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4190 if (!skb)
4191 goto out;
4192
4193 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4194 goto out;
4195 }
4196 #endif
4197 #ifdef CONFIG_NET_CLS_ACT
4198 skb->tc_verd = 0;
4199 ncls:
4200 #endif
4201 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4202 goto drop;
4203
4204 if (skb_vlan_tag_present(skb)) {
4205 if (pt_prev) {
4206 ret = deliver_skb(skb, pt_prev, orig_dev);
4207 pt_prev = NULL;
4208 }
4209 if (vlan_do_receive(&skb))
4210 goto another_round;
4211 else if (unlikely(!skb))
4212 goto out;
4213 }
4214
4215 rx_handler = rcu_dereference(skb->dev->rx_handler);
4216 if (rx_handler) {
4217 if (pt_prev) {
4218 ret = deliver_skb(skb, pt_prev, orig_dev);
4219 pt_prev = NULL;
4220 }
4221 switch (rx_handler(&skb)) {
4222 case RX_HANDLER_CONSUMED:
4223 ret = NET_RX_SUCCESS;
4224 goto out;
4225 case RX_HANDLER_ANOTHER:
4226 goto another_round;
4227 case RX_HANDLER_EXACT:
4228 deliver_exact = true;
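			/* fall through */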
4229 case RX_HANDLER_PASS:
4230 break;
4231 default:
4232 BUG();
4233 }
4234 }
4235
4236 if (unlikely(skb_vlan_tag_present(skb))) {
4237 if (skb_vlan_tag_get_id(skb))
4238 skb->pkt_type = PACKET_OTHERHOST;
4239 /* Note: we might in the future use prio bits
4240 		 * and set skb->priority like in vlan_do_receive().
4241 		 * For the time being, just ignore the Priority Code Point.
4242 */
4243 skb->vlan_tci = 0;
4244 }
4245
4246 type = skb->protocol;
4247
4248 /* deliver only exact match when indicated */
4249 if (likely(!deliver_exact)) {
4250 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4251 &ptype_base[ntohs(type) &
4252 PTYPE_HASH_MASK]);
4253 }
4254
4255 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4256 &orig_dev->ptype_specific);
4257
4258 if (unlikely(skb->dev != orig_dev)) {
4259 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4260 &skb->dev->ptype_specific);
4261 }
4262
4263 if (pt_prev) {
4264 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4265 goto drop;
4266 else
4267 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4268 } else {
4269 drop:
4270 if (!deliver_exact)
4271 atomic_long_inc(&skb->dev->rx_dropped);
4272 else
4273 atomic_long_inc(&skb->dev->rx_nohandler);
4274 kfree_skb(skb);
4275 		/* Jamal, now you will not be able to escape explaining
4276 		 * to me how you were going to use this. :-)
4277 */
4278 ret = NET_RX_DROP;
4279 }
4280
4281 out:
4282 return ret;
4283 }
4284
4285 static int __netif_receive_skb(struct sk_buff *skb)
4286 {
4287 int ret;
4288
4289 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4290 unsigned long pflags = current->flags;
4291
4292 /*
4293 * PFMEMALLOC skbs are special, they should
4294 * - be delivered to SOCK_MEMALLOC sockets only
4295 * - stay away from userspace
4296 * - have bounded memory usage
4297 *
4298 * Use PF_MEMALLOC as this saves us from propagating the allocation
4299 * context down to all allocation sites.
4300 */
4301 current->flags |= PF_MEMALLOC;
4302 ret = __netif_receive_skb_core(skb, true);
4303 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4304 } else
4305 ret = __netif_receive_skb_core(skb, false);
4306
4307 return ret;
4308 }
4309
4310 static int netif_receive_skb_internal(struct sk_buff *skb)
4311 {
4312 int ret;
4313
4314 net_timestamp_check(netdev_tstamp_prequeue, skb);
4315
4316 if (skb_defer_rx_timestamp(skb))
4317 return NET_RX_SUCCESS;
4318
4319 rcu_read_lock();
4320
4321 #ifdef CONFIG_RPS
4322 if (static_key_false(&rps_needed)) {
4323 struct rps_dev_flow voidflow, *rflow = &voidflow;
4324 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4325
4326 if (cpu >= 0) {
4327 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4328 rcu_read_unlock();
4329 return ret;
4330 }
4331 }
4332 #endif
4333 ret = __netif_receive_skb(skb);
4334 rcu_read_unlock();
4335 return ret;
4336 }
4337
4338 /**
4339 * netif_receive_skb - process receive buffer from network
4340 * @skb: buffer to process
4341 *
4342 * netif_receive_skb() is the main receive data processing function.
4343 * It always succeeds. The buffer may be dropped during processing
4344 * for congestion control or by the protocol layers.
4345 *
4346 * This function may only be called from softirq context and interrupts
4347 * should be enabled.
4348 *
4349 * Return values (usually ignored):
4350 * NET_RX_SUCCESS: no congestion
4351 * NET_RX_DROP: packet was dropped
4352 */
4353 int netif_receive_skb(struct sk_buff *skb)
4354 {
4355 trace_netif_receive_skb_entry(skb);
4356
4357 return netif_receive_skb_internal(skb);
4358 }
4359 EXPORT_SYMBOL(netif_receive_skb);
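/* A short sketch: drivers that bypass GRO can feed packets one at a time
 * from their NAPI poll callback straight into netif_receive_skb(). The
 * example_fetch_rx() helper and the priv layout are assumptions.
 *
 *	while (work < budget && (skb = example_fetch_rx(priv))) {
 *		skb->protocol = eth_type_trans(skb, priv->netdev);
 *		netif_receive_skb(skb);		// softirq context, irqs on
 *		work++;
 *	}
 */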
4360
4361 DEFINE_PER_CPU(struct work_struct, flush_works);
4362
4363 /* Network device is going away, flush any packets still pending */
4364 static void flush_backlog(struct work_struct *work)
4365 {
4366 struct sk_buff *skb, *tmp;
4367 struct softnet_data *sd;
4368
4369 local_bh_disable();
4370 sd = this_cpu_ptr(&softnet_data);
4371
4372 local_irq_disable();
4373 rps_lock(sd);
4374 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4375 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4376 __skb_unlink(skb, &sd->input_pkt_queue);
4377 kfree_skb(skb);
4378 input_queue_head_incr(sd);
4379 }
4380 }
4381 rps_unlock(sd);
4382 local_irq_enable();
4383
4384 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4385 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4386 __skb_unlink(skb, &sd->process_queue);
4387 kfree_skb(skb);
4388 input_queue_head_incr(sd);
4389 }
4390 }
4391 local_bh_enable();
4392 }
4393
4394 static void flush_all_backlogs(void)
4395 {
4396 unsigned int cpu;
4397
4398 get_online_cpus();
4399
4400 for_each_online_cpu(cpu)
4401 queue_work_on(cpu, system_highpri_wq,
4402 per_cpu_ptr(&flush_works, cpu));
4403
4404 for_each_online_cpu(cpu)
4405 flush_work(per_cpu_ptr(&flush_works, cpu));
4406
4407 put_online_cpus();
4408 }
4409
4410 static int napi_gro_complete(struct sk_buff *skb)
4411 {
4412 struct packet_offload *ptype;
4413 __be16 type = skb->protocol;
4414 struct list_head *head = &offload_base;
4415 int err = -ENOENT;
4416
4417 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4418
4419 if (NAPI_GRO_CB(skb)->count == 1) {
4420 skb_shinfo(skb)->gso_size = 0;
4421 goto out;
4422 }
4423
4424 rcu_read_lock();
4425 list_for_each_entry_rcu(ptype, head, list) {
4426 if (ptype->type != type || !ptype->callbacks.gro_complete)
4427 continue;
4428
4429 err = ptype->callbacks.gro_complete(skb, 0);
4430 break;
4431 }
4432 rcu_read_unlock();
4433
4434 if (err) {
4435 WARN_ON(&ptype->list == head);
4436 kfree_skb(skb);
4437 return NET_RX_SUCCESS;
4438 }
4439
4440 out:
4441 return netif_receive_skb_internal(skb);
4442 }
4443
4444 /* napi->gro_list contains packets ordered by age, with the
4445  * youngest packets at the head of it.
4446 * Complete skbs in reverse order to reduce latencies.
4447 */
4448 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4449 {
4450 struct sk_buff *skb, *prev = NULL;
4451
4452 /* scan list and build reverse chain */
4453 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4454 skb->prev = prev;
4455 prev = skb;
4456 }
4457
4458 for (skb = prev; skb; skb = prev) {
4459 skb->next = NULL;
4460
4461 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4462 return;
4463
4464 prev = skb->prev;
4465 napi_gro_complete(skb);
4466 napi->gro_count--;
4467 }
4468
4469 napi->gro_list = NULL;
4470 }
4471 EXPORT_SYMBOL(napi_gro_flush);
4472
4473 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4474 {
4475 struct sk_buff *p;
4476 unsigned int maclen = skb->dev->hard_header_len;
4477 u32 hash = skb_get_hash_raw(skb);
4478
4479 for (p = napi->gro_list; p; p = p->next) {
4480 unsigned long diffs;
4481
4482 NAPI_GRO_CB(p)->flush = 0;
4483
4484 if (hash != skb_get_hash_raw(p)) {
4485 NAPI_GRO_CB(p)->same_flow = 0;
4486 continue;
4487 }
4488
4489 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4490 diffs |= p->vlan_tci ^ skb->vlan_tci;
4491 diffs |= skb_metadata_dst_cmp(p, skb);
4492 if (maclen == ETH_HLEN)
4493 diffs |= compare_ether_header(skb_mac_header(p),
4494 skb_mac_header(skb));
4495 else if (!diffs)
4496 diffs = memcmp(skb_mac_header(p),
4497 skb_mac_header(skb),
4498 maclen);
4499 NAPI_GRO_CB(p)->same_flow = !diffs;
4500 }
4501 }
4502
4503 static void skb_gro_reset_offset(struct sk_buff *skb)
4504 {
4505 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4506 const skb_frag_t *frag0 = &pinfo->frags[0];
4507
4508 NAPI_GRO_CB(skb)->data_offset = 0;
4509 NAPI_GRO_CB(skb)->frag0 = NULL;
4510 NAPI_GRO_CB(skb)->frag0_len = 0;
4511
4512 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4513 pinfo->nr_frags &&
4514 !PageHighMem(skb_frag_page(frag0))) {
4515 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4516 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4517 skb_frag_size(frag0),
4518 skb->end - skb->tail);
4519 }
4520 }
4521
4522 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4523 {
4524 struct skb_shared_info *pinfo = skb_shinfo(skb);
4525
4526 BUG_ON(skb->end - skb->tail < grow);
4527
4528 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4529
4530 skb->data_len -= grow;
4531 skb->tail += grow;
4532
4533 pinfo->frags[0].page_offset += grow;
4534 skb_frag_size_sub(&pinfo->frags[0], grow);
4535
4536 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4537 skb_frag_unref(skb, 0);
4538 memmove(pinfo->frags, pinfo->frags + 1,
4539 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4540 }
4541 }
4542
4543 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4544 {
4545 struct sk_buff **pp = NULL;
4546 struct packet_offload *ptype;
4547 __be16 type = skb->protocol;
4548 struct list_head *head = &offload_base;
4549 int same_flow;
4550 enum gro_result ret;
4551 int grow;
4552
4553 if (!(skb->dev->features & NETIF_F_GRO))
4554 goto normal;
4555
4556 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4557 goto normal;
4558
4559 gro_list_prepare(napi, skb);
4560
4561 rcu_read_lock();
4562 list_for_each_entry_rcu(ptype, head, list) {
4563 if (ptype->type != type || !ptype->callbacks.gro_receive)
4564 continue;
4565
4566 skb_set_network_header(skb, skb_gro_offset(skb));
4567 skb_reset_mac_len(skb);
4568 NAPI_GRO_CB(skb)->same_flow = 0;
4569 NAPI_GRO_CB(skb)->flush = 0;
4570 NAPI_GRO_CB(skb)->free = 0;
4571 NAPI_GRO_CB(skb)->encap_mark = 0;
4572 NAPI_GRO_CB(skb)->recursion_counter = 0;
4573 NAPI_GRO_CB(skb)->is_fou = 0;
4574 NAPI_GRO_CB(skb)->is_atomic = 1;
4575 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4576
4577 /* Setup for GRO checksum validation */
4578 switch (skb->ip_summed) {
4579 case CHECKSUM_COMPLETE:
4580 NAPI_GRO_CB(skb)->csum = skb->csum;
4581 NAPI_GRO_CB(skb)->csum_valid = 1;
4582 NAPI_GRO_CB(skb)->csum_cnt = 0;
4583 break;
4584 case CHECKSUM_UNNECESSARY:
4585 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4586 NAPI_GRO_CB(skb)->csum_valid = 0;
4587 break;
4588 default:
4589 NAPI_GRO_CB(skb)->csum_cnt = 0;
4590 NAPI_GRO_CB(skb)->csum_valid = 0;
4591 }
4592
4593 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4594 break;
4595 }
4596 rcu_read_unlock();
4597
4598 if (&ptype->list == head)
4599 goto normal;
4600
4601 same_flow = NAPI_GRO_CB(skb)->same_flow;
4602 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4603
4604 if (pp) {
4605 struct sk_buff *nskb = *pp;
4606
4607 *pp = nskb->next;
4608 nskb->next = NULL;
4609 napi_gro_complete(nskb);
4610 napi->gro_count--;
4611 }
4612
4613 if (same_flow)
4614 goto ok;
4615
4616 if (NAPI_GRO_CB(skb)->flush)
4617 goto normal;
4618
4619 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4620 struct sk_buff *nskb = napi->gro_list;
4621
4622 /* locate the end of the list to select the 'oldest' flow */
4623 while (nskb->next) {
4624 pp = &nskb->next;
4625 nskb = *pp;
4626 }
4627 *pp = NULL;
4628 nskb->next = NULL;
4629 napi_gro_complete(nskb);
4630 } else {
4631 napi->gro_count++;
4632 }
4633 NAPI_GRO_CB(skb)->count = 1;
4634 NAPI_GRO_CB(skb)->age = jiffies;
4635 NAPI_GRO_CB(skb)->last = skb;
4636 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4637 skb->next = napi->gro_list;
4638 napi->gro_list = skb;
4639 ret = GRO_HELD;
4640
4641 pull:
4642 grow = skb_gro_offset(skb) - skb_headlen(skb);
4643 if (grow > 0)
4644 gro_pull_from_frag0(skb, grow);
4645 ok:
4646 return ret;
4647
4648 normal:
4649 ret = GRO_NORMAL;
4650 goto pull;
4651 }
4652
4653 struct packet_offload *gro_find_receive_by_type(__be16 type)
4654 {
4655 struct list_head *offload_head = &offload_base;
4656 struct packet_offload *ptype;
4657
4658 list_for_each_entry_rcu(ptype, offload_head, list) {
4659 if (ptype->type != type || !ptype->callbacks.gro_receive)
4660 continue;
4661 return ptype;
4662 }
4663 return NULL;
4664 }
4665 EXPORT_SYMBOL(gro_find_receive_by_type);
4666
4667 struct packet_offload *gro_find_complete_by_type(__be16 type)
4668 {
4669 struct list_head *offload_head = &offload_base;
4670 struct packet_offload *ptype;
4671
4672 list_for_each_entry_rcu(ptype, offload_head, list) {
4673 if (ptype->type != type || !ptype->callbacks.gro_complete)
4674 continue;
4675 return ptype;
4676 }
4677 return NULL;
4678 }
4679 EXPORT_SYMBOL(gro_find_complete_by_type);
4680
4681 static void napi_skb_free_stolen_head(struct sk_buff *skb)
4682 {
4683 skb_dst_drop(skb);
4684 kmem_cache_free(skbuff_head_cache, skb);
4685 }
4686
4687 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4688 {
4689 switch (ret) {
4690 case GRO_NORMAL:
4691 if (netif_receive_skb_internal(skb))
4692 ret = GRO_DROP;
4693 break;
4694
4695 case GRO_DROP:
4696 kfree_skb(skb);
4697 break;
4698
4699 case GRO_MERGED_FREE:
4700 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4701 napi_skb_free_stolen_head(skb);
4702 else
4703 __kfree_skb(skb);
4704 break;
4705
4706 case GRO_HELD:
4707 case GRO_MERGED:
4708 break;
4709 }
4710
4711 return ret;
4712 }
4713
4714 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4715 {
4716 skb_mark_napi_id(skb, napi);
4717 trace_napi_gro_receive_entry(skb);
4718
4719 skb_gro_reset_offset(skb);
4720
4721 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4722 }
4723 EXPORT_SYMBOL(napi_gro_receive);
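/* A hypothetical NAPI poll callback built around napi_gro_receive():
 * descriptors are drained up to the budget, and if the ring empties
 * first, NAPI is completed and the device interrupt re-enabled. The
 * example_rx_skb()/example_irq_enable() helpers are assumed.
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct example_priv *priv =
 *			container_of(napi, struct example_priv, napi);
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = example_rx_skb(priv))) {
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			example_irq_enable(priv);
 *		}
 *		return work;
 *	}
 */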
4724
4725 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4726 {
4727 if (unlikely(skb->pfmemalloc)) {
4728 consume_skb(skb);
4729 return;
4730 }
4731 __skb_pull(skb, skb_headlen(skb));
4732 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4733 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4734 skb->vlan_tci = 0;
4735 skb->dev = napi->dev;
4736 skb->skb_iif = 0;
4737 skb->encapsulation = 0;
4738 skb_shinfo(skb)->gso_type = 0;
4739 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4740
4741 napi->skb = skb;
4742 }
4743
4744 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4745 {
4746 struct sk_buff *skb = napi->skb;
4747
4748 if (!skb) {
4749 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4750 if (skb) {
4751 napi->skb = skb;
4752 skb_mark_napi_id(skb, napi);
4753 }
4754 }
4755 return skb;
4756 }
4757 EXPORT_SYMBOL(napi_get_frags);
4758
4759 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4760 struct sk_buff *skb,
4761 gro_result_t ret)
4762 {
4763 switch (ret) {
4764 case GRO_NORMAL:
4765 case GRO_HELD:
4766 __skb_push(skb, ETH_HLEN);
4767 skb->protocol = eth_type_trans(skb, skb->dev);
4768 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4769 ret = GRO_DROP;
4770 break;
4771
4772 case GRO_DROP:
4773 napi_reuse_skb(napi, skb);
4774 break;
4775
4776 case GRO_MERGED_FREE:
4777 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4778 napi_skb_free_stolen_head(skb);
4779 else
4780 napi_reuse_skb(napi, skb);
4781 break;
4782
4783 case GRO_MERGED:
4784 break;
4785 }
4786
4787 return ret;
4788 }
4789
4790 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4791  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4792  * we copy the ethernet header into skb->data to have a common layout.
4793 */
4794 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4795 {
4796 struct sk_buff *skb = napi->skb;
4797 const struct ethhdr *eth;
4798 unsigned int hlen = sizeof(*eth);
4799
4800 napi->skb = NULL;
4801
4802 skb_reset_mac_header(skb);
4803 skb_gro_reset_offset(skb);
4804
4805 eth = skb_gro_header_fast(skb, 0);
4806 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4807 eth = skb_gro_header_slow(skb, hlen, 0);
4808 if (unlikely(!eth)) {
4809 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4810 __func__, napi->dev->name);
4811 napi_reuse_skb(napi, skb);
4812 return NULL;
4813 }
4814 } else {
4815 gro_pull_from_frag0(skb, hlen);
4816 NAPI_GRO_CB(skb)->frag0 += hlen;
4817 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4818 }
4819 __skb_pull(skb, hlen);
4820
4821 /*
4822 * This works because the only protocols we care about don't require
4823 * special handling.
4824 * We'll fix it up properly in napi_frags_finish()
4825 */
4826 skb->protocol = eth->h_proto;
4827
4828 return skb;
4829 }
4830
4831 gro_result_t napi_gro_frags(struct napi_struct *napi)
4832 {
4833 struct sk_buff *skb = napi_frags_skb(napi);
4834
4835 if (!skb)
4836 return GRO_DROP;
4837
4838 trace_napi_gro_frags_entry(skb);
4839
4840 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4841 }
4842 EXPORT_SYMBOL(napi_gro_frags);
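/* A sketch of the frag-based variant: a page-oriented driver fills the
 * skb returned by napi_get_frags() and lets napi_gro_frags() pull the
 * ethernet header out of frag0. The rx_page/rx_offset/rx_len bookkeeping
 * is assumed driver state.
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;			// out of memory, drop this frame
 *	skb_fill_page_desc(skb, 0, rx_page, rx_offset, rx_len);
 *	skb->len += rx_len;
 *	skb->data_len += rx_len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);		// consumes or recycles napi->skb
 */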
4843
4844 /* Compute the checksum from gro_offset and return the folded value
4845 * after adding in any pseudo checksum.
4846 */
4847 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4848 {
4849 __wsum wsum;
4850 __sum16 sum;
4851
4852 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4853
4854 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4855 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4856 if (likely(!sum)) {
4857 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4858 !skb->csum_complete_sw)
4859 netdev_rx_csum_fault(skb->dev);
4860 }
4861
4862 NAPI_GRO_CB(skb)->csum = wsum;
4863 NAPI_GRO_CB(skb)->csum_valid = 1;
4864
4865 return sum;
4866 }
4867 EXPORT_SYMBOL(__skb_gro_checksum_complete);
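/* Protocol gro_receive callbacks usually reach this function through the
 * skb_gro_checksum_validate() helpers in netdevice.h, e.g. as
 * tcp4_gro_receive() does:
 *
 *	if (skb_gro_checksum_validate(skb, IPPROTO_TCP,
 *				      inet_gro_compute_pseudo))
 *		goto flush;
 */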
4868
4869 /*
4870  * net_rps_action_and_irq_enable sends any pending IPIs for rps.
4871 * Note: called with local irq disabled, but exits with local irq enabled.
4872 */
4873 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4874 {
4875 #ifdef CONFIG_RPS
4876 struct softnet_data *remsd = sd->rps_ipi_list;
4877
4878 if (remsd) {
4879 sd->rps_ipi_list = NULL;
4880
4881 local_irq_enable();
4882
4883 		/* Send pending IPIs to kick RPS processing on remote cpus. */
4884 while (remsd) {
4885 struct softnet_data *next = remsd->rps_ipi_next;
4886
4887 if (cpu_online(remsd->cpu))
4888 smp_call_function_single_async(remsd->cpu,
4889 &remsd->csd);
4890 remsd = next;
4891 }
4892 } else
4893 #endif
4894 local_irq_enable();
4895 }
4896
4897 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4898 {
4899 #ifdef CONFIG_RPS
4900 return sd->rps_ipi_list != NULL;
4901 #else
4902 return false;
4903 #endif
4904 }
4905
4906 static int process_backlog(struct napi_struct *napi, int quota)
4907 {
4908 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4909 bool again = true;
4910 int work = 0;
4911
4912 	/* Check if we have pending IPIs; it's better to send them now
4913 	 * than to wait for net_rx_action() to end.
4914 */
4915 if (sd_has_rps_ipi_waiting(sd)) {
4916 local_irq_disable();
4917 net_rps_action_and_irq_enable(sd);
4918 }
4919
4920 napi->weight = weight_p;
4921 while (again) {
4922 struct sk_buff *skb;
4923
4924 while ((skb = __skb_dequeue(&sd->process_queue))) {
4925 rcu_read_lock();
4926 __netif_receive_skb(skb);
4927 rcu_read_unlock();
4928 input_queue_head_incr(sd);
4929 if (++work >= quota)
4930 return work;
4931
4932 }
4933
4934 local_irq_disable();
4935 rps_lock(sd);
4936 if (skb_queue_empty(&sd->input_pkt_queue)) {
4937 /*
4938 * Inline a custom version of __napi_complete().
4939 			 * Only the current cpu owns and manipulates this napi,
4940 * and NAPI_STATE_SCHED is the only possible flag set
4941 * on backlog.
4942 * We can use a plain write instead of clear_bit(),
4943 			 * and we don't need an smp_mb() memory barrier.
4944 */
4945 napi->state = 0;
4946 again = false;
4947 } else {
4948 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4949 &sd->process_queue);
4950 }
4951 rps_unlock(sd);
4952 local_irq_enable();
4953 }
4954
4955 return work;
4956 }
4957
4958 /**
4959 * __napi_schedule - schedule for receive
4960 * @n: entry to schedule
4961 *
4962 * The entry's receive function will be scheduled to run.
4963 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4964 */
4965 void __napi_schedule(struct napi_struct *n)
4966 {
4967 unsigned long flags;
4968
4969 local_irq_save(flags);
4970 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4971 local_irq_restore(flags);
4972 }
4973 EXPORT_SYMBOL(__napi_schedule);
4974
4975 /**
4976 * __napi_schedule_irqoff - schedule for receive
4977 * @n: entry to schedule
4978 *
4979 * Variant of __napi_schedule() assuming hard irqs are masked
4980 */
4981 void __napi_schedule_irqoff(struct napi_struct *n)
4982 {
4983 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4984 }
4985 EXPORT_SYMBOL(__napi_schedule_irqoff);
4986
4987 void __napi_complete(struct napi_struct *n)
4988 {
4989 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4990
4991 list_del_init(&n->poll_list);
4992 smp_mb__before_atomic();
4993 clear_bit(NAPI_STATE_SCHED, &n->state);
4994 }
4995 EXPORT_SYMBOL(__napi_complete);
4996
4997 void napi_complete_done(struct napi_struct *n, int work_done)
4998 {
4999 unsigned long flags;
5000
5001 /*
5002 * don't let napi dequeue from the cpu poll list
5003 	 * just in case it's running on a different cpu
5004 */
5005 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
5006 return;
5007
5008 if (n->gro_list) {
5009 unsigned long timeout = 0;
5010
5011 if (work_done)
5012 timeout = n->dev->gro_flush_timeout;
5013
5014 if (timeout)
5015 hrtimer_start(&n->timer, ns_to_ktime(timeout),
5016 HRTIMER_MODE_REL_PINNED);
5017 else
5018 napi_gro_flush(n, false);
5019 }
5020 if (likely(list_empty(&n->poll_list))) {
5021 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
5022 } else {
5023 /* If n->poll_list is not empty, we need to mask irqs */
5024 local_irq_save(flags);
5025 __napi_complete(n);
5026 local_irq_restore(flags);
5027 }
5028 }
5029 EXPORT_SYMBOL(napi_complete_done);
5030
5031 /* must be called under rcu_read_lock(), as we don't take a reference */
5032 static struct napi_struct *napi_by_id(unsigned int napi_id)
5033 {
5034 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5035 struct napi_struct *napi;
5036
5037 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5038 if (napi->napi_id == napi_id)
5039 return napi;
5040
5041 return NULL;
5042 }
5043
5044 #if defined(CONFIG_NET_RX_BUSY_POLL)
5045 #define BUSY_POLL_BUDGET 8
5046 bool sk_busy_loop(struct sock *sk, int nonblock)
5047 {
5048 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5049 int (*busy_poll)(struct napi_struct *dev);
5050 struct napi_struct *napi;
5051 int rc = false;
5052
5053 rcu_read_lock();
5054
5055 napi = napi_by_id(sk->sk_napi_id);
5056 if (!napi)
5057 goto out;
5058
5059 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5060 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5061
5062 do {
5063 rc = 0;
5064 local_bh_disable();
5065 if (busy_poll) {
5066 rc = busy_poll(napi);
5067 } else if (napi_schedule_prep(napi)) {
5068 void *have = netpoll_poll_lock(napi);
5069
5070 if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
5071 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5072 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5073 if (rc == BUSY_POLL_BUDGET) {
5074 napi_complete_done(napi, rc);
5075 napi_schedule(napi);
5076 }
5077 }
5078 netpoll_poll_unlock(have);
5079 }
5080 if (rc > 0)
5081 __NET_ADD_STATS(sock_net(sk),
5082 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5083 local_bh_enable();
5084
5085 if (rc == LL_FLUSH_FAILED)
5086 break; /* permanent failure */
5087
5088 cpu_relax();
5089 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5090 !need_resched() && !busy_loop_timeout(end_time));
5091
5092 rc = !skb_queue_empty(&sk->sk_receive_queue);
5093 out:
5094 rcu_read_unlock();
5095 return rc;
5096 }
5097 EXPORT_SYMBOL(sk_busy_loop);
5098
5099 #endif /* CONFIG_NET_RX_BUSY_POLL */
5100
5101 void napi_hash_add(struct napi_struct *napi)
5102 {
5103 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5104 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5105 return;
5106
5107 spin_lock(&napi_hash_lock);
5108
5109 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5110 do {
5111 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5112 napi_gen_id = NR_CPUS + 1;
5113 } while (napi_by_id(napi_gen_id));
5114 napi->napi_id = napi_gen_id;
5115
5116 hlist_add_head_rcu(&napi->napi_hash_node,
5117 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5118
5119 spin_unlock(&napi_hash_lock);
5120 }
5121 EXPORT_SYMBOL_GPL(napi_hash_add);
5122
5123 /* Warning: the caller is responsible for making sure an RCU grace period
5124  * has elapsed before freeing the memory containing @napi
5125 */
5126 bool napi_hash_del(struct napi_struct *napi)
5127 {
5128 bool rcu_sync_needed = false;
5129
5130 spin_lock(&napi_hash_lock);
5131
5132 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5133 rcu_sync_needed = true;
5134 hlist_del_rcu(&napi->napi_hash_node);
5135 }
5136 spin_unlock(&napi_hash_lock);
5137 return rcu_sync_needed;
5138 }
5139 EXPORT_SYMBOL_GPL(napi_hash_del);
5140
5141 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5142 {
5143 struct napi_struct *napi;
5144
5145 napi = container_of(timer, struct napi_struct, timer);
5146 if (napi->gro_list)
5147 napi_schedule(napi);
5148
5149 return HRTIMER_NORESTART;
5150 }
5151
5152 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5153 int (*poll)(struct napi_struct *, int), int weight)
5154 {
5155 INIT_LIST_HEAD(&napi->poll_list);
5156 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5157 napi->timer.function = napi_watchdog;
5158 napi->gro_count = 0;
5159 napi->gro_list = NULL;
5160 napi->skb = NULL;
5161 napi->poll = poll;
5162 if (weight > NAPI_POLL_WEIGHT)
5163 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5164 weight, dev->name);
5165 napi->weight = weight;
5166 list_add(&napi->dev_list, &dev->napi_list);
5167 napi->dev = dev;
5168 #ifdef CONFIG_NETPOLL
5169 spin_lock_init(&napi->poll_lock);
5170 napi->poll_owner = -1;
5171 #endif
5172 set_bit(NAPI_STATE_SCHED, &napi->state);
5173 napi_hash_add(napi);
5174 }
5175 EXPORT_SYMBOL(netif_napi_add);
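/* The usual driver lifecycle around netif_napi_add(), as a sketch (the
 * probe/open/stop/remove framing and example_poll are the hypothetical
 * part; the calls and their ordering are what this API expects):
 *
 *	probe:    netif_napi_add(dev, &priv->napi, example_poll,
 *				 NAPI_POLL_WEIGHT);
 *	ndo_open: napi_enable(&priv->napi);
 *	ndo_stop: napi_disable(&priv->napi);
 *	remove:   netif_napi_del(&priv->napi);	// process context
 */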
5176
5177 void napi_disable(struct napi_struct *n)
5178 {
5179 might_sleep();
5180 set_bit(NAPI_STATE_DISABLE, &n->state);
5181
5182 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5183 msleep(1);
5184 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5185 msleep(1);
5186
5187 hrtimer_cancel(&n->timer);
5188
5189 clear_bit(NAPI_STATE_DISABLE, &n->state);
5190 }
5191 EXPORT_SYMBOL(napi_disable);
5192
5193 /* Must be called in process context */
5194 void netif_napi_del(struct napi_struct *napi)
5195 {
5196 might_sleep();
5197 if (napi_hash_del(napi))
5198 synchronize_net();
5199 list_del_init(&napi->dev_list);
5200 napi_free_frags(napi);
5201
5202 kfree_skb_list(napi->gro_list);
5203 napi->gro_list = NULL;
5204 napi->gro_count = 0;
5205 }
5206 EXPORT_SYMBOL(netif_napi_del);
5207
5208 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5209 {
5210 void *have;
5211 int work, weight;
5212
5213 list_del_init(&n->poll_list);
5214
5215 have = netpoll_poll_lock(n);
5216
5217 weight = n->weight;
5218
5219 /* This NAPI_STATE_SCHED test is for avoiding a race
5220 * with netpoll's poll_napi(). Only the entity which
5221 * obtains the lock and sees NAPI_STATE_SCHED set will
5222 * actually make the ->poll() call. Therefore we avoid
5223 * accidentally calling ->poll() when NAPI is not scheduled.
5224 */
5225 work = 0;
5226 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5227 work = n->poll(n, weight);
5228 trace_napi_poll(n, work, weight);
5229 }
5230
5231 WARN_ON_ONCE(work > weight);
5232
5233 if (likely(work < weight))
5234 goto out_unlock;
5235
5236 /* Drivers must not modify the NAPI state if they
5237 * consume the entire weight. In such cases this code
5238 * still "owns" the NAPI instance and therefore can
5239 * move the instance around on the list at-will.
5240 */
5241 if (unlikely(napi_disable_pending(n))) {
5242 napi_complete(n);
5243 goto out_unlock;
5244 }
5245
5246 if (n->gro_list) {
5247 /* flush too old packets
5248 * If HZ < 1000, flush all packets.
5249 */
5250 napi_gro_flush(n, HZ >= 1000);
5251 }
5252
5253 /* Some drivers may have called napi_schedule
5254 * prior to exhausting their budget.
5255 */
5256 if (unlikely(!list_empty(&n->poll_list))) {
5257 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5258 n->dev ? n->dev->name : "backlog");
5259 goto out_unlock;
5260 }
5261
5262 list_add_tail(&n->poll_list, repoll);
5263
5264 out_unlock:
5265 netpoll_poll_unlock(have);
5266
5267 return work;
5268 }
5269
5270 static __latent_entropy void net_rx_action(struct softirq_action *h)
5271 {
5272 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5273 unsigned long time_limit = jiffies + 2;
5274 int budget = netdev_budget;
5275 LIST_HEAD(list);
5276 LIST_HEAD(repoll);
5277
5278 local_irq_disable();
5279 list_splice_init(&sd->poll_list, &list);
5280 local_irq_enable();
5281
5282 for (;;) {
5283 struct napi_struct *n;
5284
5285 if (list_empty(&list)) {
5286 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5287 return;
5288 break;
5289 }
5290
5291 n = list_first_entry(&list, struct napi_struct, poll_list);
5292 budget -= napi_poll(n, &repoll);
5293
5294 		/* If the softirq window is exhausted then punt.
5295 		 * Allow this to run for 2 jiffies, which allows
5296 		 * an average latency of 1.5/HZ.
5297 */
5298 if (unlikely(budget <= 0 ||
5299 time_after_eq(jiffies, time_limit))) {
5300 sd->time_squeeze++;
5301 break;
5302 }
5303 }
5304
5305 __kfree_skb_flush();
5306 local_irq_disable();
5307
5308 list_splice_tail_init(&sd->poll_list, &list);
5309 list_splice_tail(&repoll, &list);
5310 list_splice(&list, &sd->poll_list);
5311 if (!list_empty(&sd->poll_list))
5312 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5313
5314 net_rps_action_and_irq_enable(sd);
5315 }
5316
5317 struct netdev_adjacent {
5318 struct net_device *dev;
5319
5320 /* upper master flag, there can only be one master device per list */
5321 bool master;
5322
5323 /* counter for the number of times this device was added to us */
5324 u16 ref_nr;
5325
5326 /* private field for the users */
5327 void *private;
5328
5329 struct list_head list;
5330 struct rcu_head rcu;
5331 };
5332
5333 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5334 struct list_head *adj_list)
5335 {
5336 struct netdev_adjacent *adj;
5337
5338 list_for_each_entry(adj, adj_list, list) {
5339 if (adj->dev == adj_dev)
5340 return adj;
5341 }
5342 return NULL;
5343 }
5344
5345 /**
5346 * netdev_has_upper_dev - Check if device is linked to an upper device
5347 * @dev: device
5348 * @upper_dev: upper device to check
5349 *
5350 * Find out if a device is linked to specified upper device and return true
5351 * in case it is. Note that this checks only immediate upper device,
5352 * not through a complete stack of devices. The caller must hold the RTNL lock.
5353 */
5354 bool netdev_has_upper_dev(struct net_device *dev,
5355 struct net_device *upper_dev)
5356 {
5357 ASSERT_RTNL();
5358
5359 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5360 }
5361 EXPORT_SYMBOL(netdev_has_upper_dev);
5362
5363 /**
5364 * netdev_has_any_upper_dev - Check if device is linked to some device
5365 * @dev: device
5366 *
5367 * Find out if a device is linked to an upper device and return true in case
5368 * it is. The caller must hold the RTNL lock.
5369 */
5370 bool netdev_has_any_upper_dev(struct net_device *dev)
5371 {
5372 ASSERT_RTNL();
5373
5374 return !list_empty(&dev->all_adj_list.upper);
5375 }
5376 EXPORT_SYMBOL(netdev_has_any_upper_dev);
5377
5378 /**
5379 * netdev_master_upper_dev_get - Get master upper device
5380 * @dev: device
5381 *
5382 * Find a master upper device and return pointer to it or NULL in case
5383 * it's not there. The caller must hold the RTNL lock.
5384 */
5385 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5386 {
5387 struct netdev_adjacent *upper;
5388
5389 ASSERT_RTNL();
5390
5391 if (list_empty(&dev->adj_list.upper))
5392 return NULL;
5393
5394 upper = list_first_entry(&dev->adj_list.upper,
5395 struct netdev_adjacent, list);
5396 if (likely(upper->master))
5397 return upper->dev;
5398 return NULL;
5399 }
5400 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5401
5402 void *netdev_adjacent_get_private(struct list_head *adj_list)
5403 {
5404 struct netdev_adjacent *adj;
5405
5406 adj = list_entry(adj_list, struct netdev_adjacent, list);
5407
5408 return adj->private;
5409 }
5410 EXPORT_SYMBOL(netdev_adjacent_get_private);
5411
5412 /**
5413 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5414 * @dev: device
5415 * @iter: list_head ** of the current position
5416 *
5417 * Gets the next device from the dev's upper list, starting from iter
5418 * position. The caller must hold RCU read lock.
5419 */
5420 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5421 struct list_head **iter)
5422 {
5423 struct netdev_adjacent *upper;
5424
5425 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5426
5427 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5428
5429 if (&upper->list == &dev->adj_list.upper)
5430 return NULL;
5431
5432 *iter = &upper->list;
5433
5434 return upper->dev;
5435 }
5436 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
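/* Walking the immediate upper devices with the iterator above, under RCU;
 * this is the open-coded form of the netdev_for_each_upper_dev_rcu()
 * helper in netdevice.h:
 *
 *	struct list_head *iter = &dev->adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
 *		pr_info("upper of %s: %s\n", dev->name, upper->name);
 *	rcu_read_unlock();
 */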
5437
5438 /**
5439 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5440 * @dev: device
5441 * @iter: list_head ** of the current position
5442 *
5443 * Gets the next device from the dev's upper list, starting from iter
5444 * position. The caller must hold RCU read lock.
5445 */
5446 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5447 struct list_head **iter)
5448 {
5449 struct netdev_adjacent *upper;
5450
5451 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5452
5453 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5454
5455 if (&upper->list == &dev->all_adj_list.upper)
5456 return NULL;
5457
5458 *iter = &upper->list;
5459
5460 return upper->dev;
5461 }
5462 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5463
5464 /**
5465 * netdev_lower_get_next_private - Get the next ->private from the
5466 * lower neighbour list
5467 * @dev: device
5468 * @iter: list_head ** of the current position
5469 *
5470 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5471  * list, starting from iter position. The caller must either hold the
5472 * RTNL lock or its own locking that guarantees that the neighbour lower
5473 * list will remain unchanged.
5474 */
5475 void *netdev_lower_get_next_private(struct net_device *dev,
5476 struct list_head **iter)
5477 {
5478 struct netdev_adjacent *lower;
5479
5480 lower = list_entry(*iter, struct netdev_adjacent, list);
5481
5482 if (&lower->list == &dev->adj_list.lower)
5483 return NULL;
5484
5485 *iter = lower->list.next;
5486
5487 return lower->private;
5488 }
5489 EXPORT_SYMBOL(netdev_lower_get_next_private);
5490
5491 /**
5492 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5493 * lower neighbour list, RCU
5494 * variant
5495 * @dev: device
5496 * @iter: list_head ** of the current position
5497 *
5498 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5499 * list, starting from iter position. The caller must hold RCU read lock.
5500 */
5501 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5502 struct list_head **iter)
5503 {
5504 struct netdev_adjacent *lower;
5505
5506 WARN_ON_ONCE(!rcu_read_lock_held());
5507
5508 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5509
5510 if (&lower->list == &dev->adj_list.lower)
5511 return NULL;
5512
5513 *iter = &lower->list;
5514
5515 return lower->private;
5516 }
5517 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5518
5519 /**
5520 * netdev_lower_get_next - Get the next device from the lower neighbour
5521 * list
5522 * @dev: device
5523 * @iter: list_head ** of the current position
5524 *
5525 * Gets the next netdev_adjacent from the dev's lower neighbour
5526 * list, starting from iter position. The caller must hold RTNL lock or
5527 * its own locking that guarantees that the neighbour lower
5528 * list will remain unchanged.
5529 */
5530 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5531 {
5532 struct netdev_adjacent *lower;
5533
5534 lower = list_entry(*iter, struct netdev_adjacent, list);
5535
5536 if (&lower->list == &dev->adj_list.lower)
5537 return NULL;
5538
5539 *iter = lower->list.next;
5540
5541 return lower->dev;
5542 }
5543 EXPORT_SYMBOL(netdev_lower_get_next);
5544
5545 /**
5546 * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5547 * @dev: device
5548 * @iter: list_head ** of the current position
5549 *
5550 * Gets the next netdev_adjacent from the dev's all lower neighbour
5551 * list, starting from iter position. The caller must hold RTNL lock or
5552 * its own locking that guarantees that the neighbour all lower
5553 * list will remain unchanged.
5554 */
5555 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5556 {
5557 struct netdev_adjacent *lower;
5558
5559 lower = list_entry(*iter, struct netdev_adjacent, list);
5560
5561 if (&lower->list == &dev->all_adj_list.lower)
5562 return NULL;
5563
5564 *iter = lower->list.next;
5565
5566 return lower->dev;
5567 }
5568 EXPORT_SYMBOL(netdev_all_lower_get_next);
5569
5570 /**
5571 * netdev_all_lower_get_next_rcu - Get the next device from all
5572 * lower neighbour list, RCU variant
5573 * @dev: device
5574 * @iter: list_head ** of the current position
5575 *
5576 * Gets the next netdev_adjacent from the dev's all lower neighbour
5577 * list, starting from iter position. The caller must hold RCU read lock.
5578 */
5579 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5580 struct list_head **iter)
5581 {
5582 struct netdev_adjacent *lower;
5583
5584 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5585
5586 if (&lower->list == &dev->all_adj_list.lower)
5587 return NULL;
5588
5589 *iter = &lower->list;
5590
5591 return lower->dev;
5592 }
5593 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5594
5595 /**
5596 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5597 * lower neighbour list, RCU
5598 * variant
5599 * @dev: device
5600 *
5601 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5602 * list. The caller must hold RCU read lock.
5603 */
5604 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5605 {
5606 struct netdev_adjacent *lower;
5607
5608 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5609 struct netdev_adjacent, list);
5610 if (lower)
5611 return lower->private;
5612 return NULL;
5613 }
5614 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5615
5616 /**
5617 * netdev_master_upper_dev_get_rcu - Get master upper device
5618 * @dev: device
5619 *
5620 * Find a master upper device and return pointer to it or NULL in case
5621 * it's not there. The caller must hold the RCU read lock.
5622 */
5623 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5624 {
5625 struct netdev_adjacent *upper;
5626
5627 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5628 struct netdev_adjacent, list);
5629 if (upper && likely(upper->master))
5630 return upper->dev;
5631 return NULL;
5632 }
5633 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
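
/*
 * Example (illustrative only): a sketch of the locking pattern the RCU
 * variant above expects. foo_log_master() is hypothetical; the point is
 * that both the lookup and every dereference of the result must stay
 * inside one RCU read-side critical section.
 *
 *	static void foo_log_master(struct net_device *dev)
 *	{
 *		struct net_device *master;
 *
 *		rcu_read_lock();
 *		master = netdev_master_upper_dev_get_rcu(dev);
 *		if (master)
 *			netdev_dbg(dev, "master is %s\n", master->name);
 *		rcu_read_unlock();
 *	}
 */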
5634
5635 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5636 struct net_device *adj_dev,
5637 struct list_head *dev_list)
5638 {
5639 char linkname[IFNAMSIZ+7];
5640 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5641 "upper_%s" : "lower_%s", adj_dev->name);
5642 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5643 linkname);
5644 }
5645 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5646 char *name,
5647 struct list_head *dev_list)
5648 {
5649 char linkname[IFNAMSIZ+7];
5650 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5651 "upper_%s" : "lower_%s", name);
5652 sysfs_remove_link(&(dev->dev.kobj), linkname);
5653 }
5654
5655 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5656 struct net_device *adj_dev,
5657 struct list_head *dev_list)
5658 {
5659 return (dev_list == &dev->adj_list.upper ||
5660 dev_list == &dev->adj_list.lower) &&
5661 net_eq(dev_net(dev), dev_net(adj_dev));
5662 }
5663
5664 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5665 struct net_device *adj_dev,
5666 u16 ref_nr,
5667 struct list_head *dev_list,
5668 void *private, bool master)
5669 {
5670 struct netdev_adjacent *adj;
5671 int ret;
5672
5673 adj = __netdev_find_adj(adj_dev, dev_list);
5674
5675 if (adj) {
5676 adj->ref_nr += ref_nr;
5677 return 0;
5678 }
5679
5680 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5681 if (!adj)
5682 return -ENOMEM;
5683
5684 adj->dev = adj_dev;
5685 adj->master = master;
5686 adj->ref_nr = ref_nr;
5687 adj->private = private;
5688 dev_hold(adj_dev);
5689
5690 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5691 adj_dev->name, dev->name, adj_dev->name);
5692
5693 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5694 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5695 if (ret)
5696 goto free_adj;
5697 }
5698
5699 /* Ensure that master link is always the first item in list. */
5700 if (master) {
5701 ret = sysfs_create_link(&(dev->dev.kobj),
5702 &(adj_dev->dev.kobj), "master");
5703 if (ret)
5704 goto remove_symlinks;
5705
5706 list_add_rcu(&adj->list, dev_list);
5707 } else {
5708 list_add_tail_rcu(&adj->list, dev_list);
5709 }
5710
5711 return 0;
5712
5713 remove_symlinks:
5714 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5715 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5716 free_adj:
5717 kfree(adj);
5718 dev_put(adj_dev);
5719
5720 return ret;
5721 }
5722
5723 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5724 struct net_device *adj_dev,
5725 u16 ref_nr,
5726 struct list_head *dev_list)
5727 {
5728 struct netdev_adjacent *adj;
5729
5730 adj = __netdev_find_adj(adj_dev, dev_list);
5731
5732 if (!adj) {
5733 pr_err("tried to remove device %s from %s\n",
5734 dev->name, adj_dev->name);
5735 BUG();
5736 }
5737
5738 if (adj->ref_nr > ref_nr) {
5739 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5740 ref_nr, adj->ref_nr-ref_nr);
5741 adj->ref_nr -= ref_nr;
5742 return;
5743 }
5744
5745 if (adj->master)
5746 sysfs_remove_link(&(dev->dev.kobj), "master");
5747
5748 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5749 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5750
5751 list_del_rcu(&adj->list);
5752 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5753 adj_dev->name, dev->name, adj_dev->name);
5754 dev_put(adj_dev);
5755 kfree_rcu(adj, rcu);
5756 }
5757
5758 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5759 struct net_device *upper_dev,
5760 u16 ref_nr,
5761 struct list_head *up_list,
5762 struct list_head *down_list,
5763 void *private, bool master)
5764 {
5765 int ret;
5766
5767 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5768 private, master);
5769 if (ret)
5770 return ret;
5771
5772 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5773 private, false);
5774 if (ret) {
5775 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5776 return ret;
5777 }
5778
5779 return 0;
5780 }
5781
5782 static int __netdev_adjacent_dev_link(struct net_device *dev,
5783 struct net_device *upper_dev,
5784 u16 ref_nr)
5785 {
5786 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5787 &dev->all_adj_list.upper,
5788 &upper_dev->all_adj_list.lower,
5789 NULL, false);
5790 }
5791
5792 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5793 struct net_device *upper_dev,
5794 u16 ref_nr,
5795 struct list_head *up_list,
5796 struct list_head *down_list)
5797 {
5798 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5799 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5800 }
5801
5802 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5803 struct net_device *upper_dev,
5804 u16 ref_nr)
5805 {
5806 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5807 &dev->all_adj_list.upper,
5808 &upper_dev->all_adj_list.lower);
5809 }
5810
5811 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5812 struct net_device *upper_dev,
5813 void *private, bool master)
5814 {
5815 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5816
5817 if (ret)
5818 return ret;
5819
5820 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5821 &dev->adj_list.upper,
5822 &upper_dev->adj_list.lower,
5823 private, master);
5824 if (ret) {
5825 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5826 return ret;
5827 }
5828
5829 return 0;
5830 }
5831
5832 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5833 struct net_device *upper_dev)
5834 {
5835 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5836 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5837 &dev->adj_list.upper,
5838 &upper_dev->adj_list.lower);
5839 }
5840
5841 static int __netdev_upper_dev_link(struct net_device *dev,
5842 struct net_device *upper_dev, bool master,
5843 void *upper_priv, void *upper_info)
5844 {
5845 struct netdev_notifier_changeupper_info changeupper_info;
5846 struct netdev_adjacent *i, *j, *to_i, *to_j;
5847 int ret = 0;
5848
5849 ASSERT_RTNL();
5850
5851 if (dev == upper_dev)
5852 return -EBUSY;
5853
5854 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
5855 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5856 return -EBUSY;
5857
5858 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5859 return -EEXIST;
5860
5861 if (master && netdev_master_upper_dev_get(dev))
5862 return -EBUSY;
5863
5864 changeupper_info.upper_dev = upper_dev;
5865 changeupper_info.master = master;
5866 changeupper_info.linking = true;
5867 changeupper_info.upper_info = upper_info;
5868
5869 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5870 &changeupper_info.info);
5871 ret = notifier_to_errno(ret);
5872 if (ret)
5873 return ret;
5874
5875 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5876 master);
5877 if (ret)
5878 return ret;
5879
5880 /* Now that we linked these devs, make all the upper_dev's
5881 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5882 * vice versa, and don't forget the devices themselves. All of these
5883 * links are non-neighbours.
5884 */
5885 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5886 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5887 pr_debug("Interlinking %s with %s, non-neighbour\n",
5888 i->dev->name, j->dev->name);
5889 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5890 if (ret)
5891 goto rollback_mesh;
5892 }
5893 }
5894
5895 /* add dev to every upper_dev's upper device */
5896 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5897 pr_debug("linking %s's upper device %s with %s\n",
5898 upper_dev->name, i->dev->name, dev->name);
5899 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5900 if (ret)
5901 goto rollback_upper_mesh;
5902 }
5903
5904 /* add upper_dev to every dev's lower device */
5905 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5906 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5907 i->dev->name, upper_dev->name);
5908 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5909 if (ret)
5910 goto rollback_lower_mesh;
5911 }
5912
5913 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5914 &changeupper_info.info);
5915 ret = notifier_to_errno(ret);
5916 if (ret)
5917 goto rollback_lower_mesh;
5918
5919 return 0;
5920
5921 rollback_lower_mesh:
5922 to_i = i;
5923 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5924 if (i == to_i)
5925 break;
5926 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5927 }
5928
5929 i = NULL;
5930
5931 rollback_upper_mesh:
5932 to_i = i;
5933 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5934 if (i == to_i)
5935 break;
5936 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5937 }
5938
5939 i = j = NULL;
5940
5941 rollback_mesh:
5942 to_i = i;
5943 to_j = j;
5944 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5945 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5946 if (i == to_i && j == to_j)
5947 break;
5948 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5949 }
5950 if (i == to_i)
5951 break;
5952 }
5953
5954 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5955
5956 return ret;
5957 }
5958
5959 /**
5960 * netdev_upper_dev_link - Add a link to the upper device
5961 * @dev: device
5962 * @upper_dev: new upper device
5963 *
5964 * Adds a link to device which is upper to this one. The caller must hold
5965 * the RTNL lock. On a failure a negative errno code is returned.
5966 * On success the reference counts are adjusted and the function
5967 * returns zero.
5968 */
5969 int netdev_upper_dev_link(struct net_device *dev,
5970 struct net_device *upper_dev)
5971 {
5972 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5973 }
5974 EXPORT_SYMBOL(netdev_upper_dev_link);
5975
5976 /**
5977 * netdev_master_upper_dev_link - Add a master link to the upper device
5978 * @dev: device
5979 * @upper_dev: new upper device
5980 * @upper_priv: upper device private
5981 * @upper_info: upper info to be passed down via notifier
5982 *
5983 * Adds a link to device which is upper to this one. In this case, only
5984 * one master upper device can be linked, although other non-master devices
5985 * might be linked as well. The caller must hold the RTNL lock.
5986 * On a failure a negative errno code is returned. On success the reference
5987 * counts are adjusted and the function returns zero.
5988 */
5989 int netdev_master_upper_dev_link(struct net_device *dev,
5990 struct net_device *upper_dev,
5991 void *upper_priv, void *upper_info)
5992 {
5993 return __netdev_upper_dev_link(dev, upper_dev, true,
5994 upper_priv, upper_info);
5995 }
5996 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5997
5998 /**
5999 * netdev_upper_dev_unlink - Removes a link to upper device
6000 * @dev: device
6001 * @upper_dev: upper device to unlink
6002 *
6003 * Removes a link to device which is upper to this one. The caller must hold
6004 * the RTNL lock.
6005 */
6006 void netdev_upper_dev_unlink(struct net_device *dev,
6007 struct net_device *upper_dev)
6008 {
6009 struct netdev_notifier_changeupper_info changeupper_info;
6010 struct netdev_adjacent *i, *j;
6011 ASSERT_RTNL();
6012
6013 changeupper_info.upper_dev = upper_dev;
6014 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6015 changeupper_info.linking = false;
6016
6017 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6018 &changeupper_info.info);
6019
6020 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6021
6022 /* Here is the tricky part. We must remove all dev's lower
6023 * devices from all upper_dev's upper devices and vice
6024 * versa, to maintain the graph relationship.
6025 */
6026 list_for_each_entry(i, &dev->all_adj_list.lower, list)
6027 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
6028 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
6029
6030 /* also remove the devices themselves from each other's
6031 * lower/upper device lists
6032 */
6033 list_for_each_entry(i, &dev->all_adj_list.lower, list)
6034 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
6035
6036 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
6037 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
6038
6039 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6040 &changeupper_info.info);
6041 }
6042 EXPORT_SYMBOL(netdev_upper_dev_unlink);
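
/*
 * Example (illustrative only): how a bonding-like driver might attach and
 * detach a slave with the helpers above. foo_enslave(), foo_release() and
 * struct foo_slave are hypothetical; upper_info is left NULL here, while
 * real masters typically pass driver state down via the notifier.
 *
 *	static int foo_enslave(struct net_device *master,
 *			       struct net_device *slave,
 *			       struct foo_slave *priv)
 *	{
 *		ASSERT_RTNL();
 *		return netdev_master_upper_dev_link(slave, master, priv, NULL);
 *	}
 *
 *	static void foo_release(struct net_device *master,
 *				struct net_device *slave)
 *	{
 *		ASSERT_RTNL();
 *		netdev_upper_dev_unlink(slave, master);
 *	}
 */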
6043
6044 /**
6045 * netdev_bonding_info_change - Dispatch event about slave change
6046 * @dev: device
6047 * @bonding_info: info to dispatch
6048 *
6049 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6050 * The caller must hold the RTNL lock.
6051 */
6052 void netdev_bonding_info_change(struct net_device *dev,
6053 struct netdev_bonding_info *bonding_info)
6054 {
6055 struct netdev_notifier_bonding_info info;
6056
6057 memcpy(&info.bonding_info, bonding_info,
6058 sizeof(struct netdev_bonding_info));
6059 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6060 &info.info);
6061 }
6062 EXPORT_SYMBOL(netdev_bonding_info_change);
6063
6064 static void netdev_adjacent_add_links(struct net_device *dev)
6065 {
6066 struct netdev_adjacent *iter;
6067
6068 struct net *net = dev_net(dev);
6069
6070 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6071 if (!net_eq(net, dev_net(iter->dev)))
6072 continue;
6073 netdev_adjacent_sysfs_add(iter->dev, dev,
6074 &iter->dev->adj_list.lower);
6075 netdev_adjacent_sysfs_add(dev, iter->dev,
6076 &dev->adj_list.upper);
6077 }
6078
6079 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6080 if (!net_eq(net, dev_net(iter->dev)))
6081 continue;
6082 netdev_adjacent_sysfs_add(iter->dev, dev,
6083 &iter->dev->adj_list.upper);
6084 netdev_adjacent_sysfs_add(dev, iter->dev,
6085 &dev->adj_list.lower);
6086 }
6087 }
6088
6089 static void netdev_adjacent_del_links(struct net_device *dev)
6090 {
6091 struct netdev_adjacent *iter;
6092
6093 struct net *net = dev_net(dev);
6094
6095 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6096 if (!net_eq(net, dev_net(iter->dev)))
6097 continue;
6098 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6099 &iter->dev->adj_list.lower);
6100 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6101 &dev->adj_list.upper);
6102 }
6103
6104 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6105 if (!net_eq(net, dev_net(iter->dev)))
6106 continue;
6107 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6108 &iter->dev->adj_list.upper);
6109 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6110 &dev->adj_list.lower);
6111 }
6112 }
6113
6114 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6115 {
6116 struct netdev_adjacent *iter;
6117
6118 struct net *net = dev_net(dev);
6119
6120 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6121 if (!net_eq(net, dev_net(iter->dev)))
6122 continue;
6123 netdev_adjacent_sysfs_del(iter->dev, oldname,
6124 &iter->dev->adj_list.lower);
6125 netdev_adjacent_sysfs_add(iter->dev, dev,
6126 &iter->dev->adj_list.lower);
6127 }
6128
6129 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6130 if (!net_eq(net, dev_net(iter->dev)))
6131 continue;
6132 netdev_adjacent_sysfs_del(iter->dev, oldname,
6133 &iter->dev->adj_list.upper);
6134 netdev_adjacent_sysfs_add(iter->dev, dev,
6135 &iter->dev->adj_list.upper);
6136 }
6137 }
6138
6139 void *netdev_lower_dev_get_private(struct net_device *dev,
6140 struct net_device *lower_dev)
6141 {
6142 struct netdev_adjacent *lower;
6143
6144 if (!lower_dev)
6145 return NULL;
6146 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6147 if (!lower)
6148 return NULL;
6149
6150 return lower->private;
6151 }
6152 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6153
6154
6155 int dev_get_nest_level(struct net_device *dev)
6156 {
6157 struct net_device *lower = NULL;
6158 struct list_head *iter;
6159 int max_nest = -1;
6160 int nest;
6161
6162 ASSERT_RTNL();
6163
6164 netdev_for_each_lower_dev(dev, lower, iter) {
6165 nest = dev_get_nest_level(lower);
6166 if (max_nest < nest)
6167 max_nest = nest;
6168 }
6169
6170 return max_nest + 1;
6171 }
6172 EXPORT_SYMBOL(dev_get_nest_level);
6173
6174 /**
6175 * netdev_lower_state_changed - Dispatch event about lower device state change
6176 * @lower_dev: device
6177 * @lower_state_info: state to dispatch
6178 *
6179 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6180 * The caller must hold the RTNL lock.
6181 */
6182 void netdev_lower_state_changed(struct net_device *lower_dev,
6183 void *lower_state_info)
6184 {
6185 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6186
6187 ASSERT_RTNL();
6188 changelowerstate_info.lower_state_info = lower_state_info;
6189 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6190 &changelowerstate_info.info);
6191 }
6192 EXPORT_SYMBOL(netdev_lower_state_changed);
6193
6194 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6195 struct neighbour *n)
6196 {
6197 struct net_device *lower_dev, *stop_dev;
6198 struct list_head *iter;
6199 int err;
6200
6201 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6202 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6203 continue;
6204 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6205 if (err) {
6206 stop_dev = lower_dev;
6207 goto rollback;
6208 }
6209 }
6210 return 0;
6211
6212 rollback:
6213 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6214 if (lower_dev == stop_dev)
6215 break;
6216 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6217 continue;
6218 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6219 }
6220 return err;
6221 }
6222 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6223
6224 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6225 struct neighbour *n)
6226 {
6227 struct net_device *lower_dev;
6228 struct list_head *iter;
6229
6230 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6231 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6232 continue;
6233 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6234 }
6235 }
6236 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6237
6238 static void dev_change_rx_flags(struct net_device *dev, int flags)
6239 {
6240 const struct net_device_ops *ops = dev->netdev_ops;
6241
6242 if (ops->ndo_change_rx_flags)
6243 ops->ndo_change_rx_flags(dev, flags);
6244 }
6245
6246 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6247 {
6248 unsigned int old_flags = dev->flags;
6249 kuid_t uid;
6250 kgid_t gid;
6251
6252 ASSERT_RTNL();
6253
6254 dev->flags |= IFF_PROMISC;
6255 dev->promiscuity += inc;
6256 if (dev->promiscuity == 0) {
6257 /*
6258 * Avoid overflow.
6259 * If inc causes an overflow, leave promiscuity untouched and return an error.
6260 */
6261 if (inc < 0)
6262 dev->flags &= ~IFF_PROMISC;
6263 else {
6264 dev->promiscuity -= inc;
6265 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6266 dev->name);
6267 return -EOVERFLOW;
6268 }
6269 }
6270 if (dev->flags != old_flags) {
6271 pr_info("device %s %s promiscuous mode\n",
6272 dev->name,
6273 dev->flags & IFF_PROMISC ? "entered" : "left");
6274 if (audit_enabled) {
6275 current_uid_gid(&uid, &gid);
6276 audit_log(current->audit_context, GFP_ATOMIC,
6277 AUDIT_ANOM_PROMISCUOUS,
6278 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6279 dev->name, (dev->flags & IFF_PROMISC),
6280 (old_flags & IFF_PROMISC),
6281 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6282 from_kuid(&init_user_ns, uid),
6283 from_kgid(&init_user_ns, gid),
6284 audit_get_sessionid(current));
6285 }
6286
6287 dev_change_rx_flags(dev, IFF_PROMISC);
6288 }
6289 if (notify)
6290 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6291 return 0;
6292 }
6293
6294 /**
6295 * dev_set_promiscuity - update promiscuity count on a device
6296 * @dev: device
6297 * @inc: modifier
6298 *
6299 * Add or remove promiscuity from a device. While the count in the device
6300 * remains above zero the interface remains promiscuous. Once it hits zero
6301 * the device reverts to normal filtering operation. A negative @inc
6302 * value is used to drop promiscuity on the device.
6303 * Return 0 if successful or a negative errno code on error.
6304 */
6305 int dev_set_promiscuity(struct net_device *dev, int inc)
6306 {
6307 unsigned int old_flags = dev->flags;
6308 int err;
6309
6310 err = __dev_set_promiscuity(dev, inc, true);
6311 if (err < 0)
6312 return err;
6313 if (dev->flags != old_flags)
6314 dev_set_rx_mode(dev);
6315 return err;
6316 }
6317 EXPORT_SYMBOL(dev_set_promiscuity);
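
/*
 * Example (illustrative only): the counter semantics above imply every +1
 * must eventually be matched by a -1. A capture-style user might do the
 * following in its hypothetical foo_start()/foo_stop() hooks, with RTNL
 * held.
 *
 *	static int foo_start(struct net_device *lower)
 *	{
 *		return dev_set_promiscuity(lower, 1);
 *	}
 *
 *	static void foo_stop(struct net_device *lower)
 *	{
 *		dev_set_promiscuity(lower, -1);
 *	}
 */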
6318
6319 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6320 {
6321 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6322
6323 ASSERT_RTNL();
6324
6325 dev->flags |= IFF_ALLMULTI;
6326 dev->allmulti += inc;
6327 if (dev->allmulti == 0) {
6328 /*
6329 * Avoid overflow.
6330 * If inc causes an overflow, leave allmulti untouched and return an error.
6331 */
6332 if (inc < 0)
6333 dev->flags &= ~IFF_ALLMULTI;
6334 else {
6335 dev->allmulti -= inc;
6336 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6337 dev->name);
6338 return -EOVERFLOW;
6339 }
6340 }
6341 if (dev->flags ^ old_flags) {
6342 dev_change_rx_flags(dev, IFF_ALLMULTI);
6343 dev_set_rx_mode(dev);
6344 if (notify)
6345 __dev_notify_flags(dev, old_flags,
6346 dev->gflags ^ old_gflags);
6347 }
6348 return 0;
6349 }
6350
6351 /**
6352 * dev_set_allmulti - update allmulti count on a device
6353 * @dev: device
6354 * @inc: modifier
6355 *
6356 * Add or remove reception of all multicast frames to a device. While the
6357 * count in the device remains above zero the interface remains listening
6358 * to all multicast frames. Once it hits zero the device reverts to normal
6359 * filtering operation. A negative @inc value is used to drop the counter
6360 * when releasing a resource needing all multicasts.
6361 * Return 0 if successful or a negative errno code on error.
6362 */
6363
6364 int dev_set_allmulti(struct net_device *dev, int inc)
6365 {
6366 return __dev_set_allmulti(dev, inc, true);
6367 }
6368 EXPORT_SYMBOL(dev_set_allmulti);
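
/*
 * Example (illustrative only): the same balanced pattern for allmulti.
 * foo_mroute_start()/foo_mroute_stop() are hypothetical; note the increment
 * can fail with -EOVERFLOW, so the error must be checked before assuming
 * the counter was taken.
 *
 *	static int foo_mroute_start(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		return dev_set_allmulti(dev, 1);
 *	}
 *
 *	static void foo_mroute_stop(struct net_device *dev)
 *	{
 *		dev_set_allmulti(dev, -1);
 *	}
 */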
6369
6370 /*
6371 * Upload unicast and multicast address lists to device and
6372 * configure RX filtering. When the device doesn't support unicast
6373 * filtering it is put in promiscuous mode while unicast addresses
6374 * are present.
6375 */
6376 void __dev_set_rx_mode(struct net_device *dev)
6377 {
6378 const struct net_device_ops *ops = dev->netdev_ops;
6379
6380 /* dev_open will call this function so the list will stay sane. */
6381 if (!(dev->flags&IFF_UP))
6382 return;
6383
6384 if (!netif_device_present(dev))
6385 return;
6386
6387 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6388 /* Unicast address changes may only happen under the rtnl,
6389 * therefore calling __dev_set_promiscuity here is safe.
6390 */
6391 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6392 __dev_set_promiscuity(dev, 1, false);
6393 dev->uc_promisc = true;
6394 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6395 __dev_set_promiscuity(dev, -1, false);
6396 dev->uc_promisc = false;
6397 }
6398 }
6399
6400 if (ops->ndo_set_rx_mode)
6401 ops->ndo_set_rx_mode(dev);
6402 }
6403
6404 void dev_set_rx_mode(struct net_device *dev)
6405 {
6406 netif_addr_lock_bh(dev);
6407 __dev_set_rx_mode(dev);
6408 netif_addr_unlock_bh(dev);
6409 }
6410
6411 /**
6412 * dev_get_flags - get flags reported to userspace
6413 * @dev: device
6414 *
6415 * Get the combination of flag bits exported through APIs to userspace.
6416 */
6417 unsigned int dev_get_flags(const struct net_device *dev)
6418 {
6419 unsigned int flags;
6420
6421 flags = (dev->flags & ~(IFF_PROMISC |
6422 IFF_ALLMULTI |
6423 IFF_RUNNING |
6424 IFF_LOWER_UP |
6425 IFF_DORMANT)) |
6426 (dev->gflags & (IFF_PROMISC |
6427 IFF_ALLMULTI));
6428
6429 if (netif_running(dev)) {
6430 if (netif_oper_up(dev))
6431 flags |= IFF_RUNNING;
6432 if (netif_carrier_ok(dev))
6433 flags |= IFF_LOWER_UP;
6434 if (netif_dormant(dev))
6435 flags |= IFF_DORMANT;
6436 }
6437
6438 return flags;
6439 }
6440 EXPORT_SYMBOL(dev_get_flags);
6441
6442 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6443 {
6444 unsigned int old_flags = dev->flags;
6445 int ret;
6446
6447 ASSERT_RTNL();
6448
6449 /*
6450 * Set the flags on our device.
6451 */
6452
6453 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6454 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6455 IFF_AUTOMEDIA)) |
6456 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6457 IFF_ALLMULTI));
6458
6459 /*
6460 * Load in the correct multicast list now the flags have changed.
6461 */
6462
6463 if ((old_flags ^ flags) & IFF_MULTICAST)
6464 dev_change_rx_flags(dev, IFF_MULTICAST);
6465
6466 dev_set_rx_mode(dev);
6467
6468 /*
6469 * Have we downed the interface? We handle IFF_UP ourselves
6470 * according to user attempts to set it, rather than blindly
6471 * setting it.
6472 */
6473
6474 ret = 0;
6475 if ((old_flags ^ flags) & IFF_UP)
6476 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6477
6478 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6479 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6480 unsigned int old_flags = dev->flags;
6481
6482 dev->gflags ^= IFF_PROMISC;
6483
6484 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6485 if (dev->flags != old_flags)
6486 dev_set_rx_mode(dev);
6487 }
6488
6489 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6490 is important. Some (broken) drivers set IFF_PROMISC when
6491 IFF_ALLMULTI is requested, without asking us and without reporting it.
6492 */
6493 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6494 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6495
6496 dev->gflags ^= IFF_ALLMULTI;
6497 __dev_set_allmulti(dev, inc, false);
6498 }
6499
6500 return ret;
6501 }
6502
6503 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6504 unsigned int gchanges)
6505 {
6506 unsigned int changes = dev->flags ^ old_flags;
6507
6508 if (gchanges)
6509 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6510
6511 if (changes & IFF_UP) {
6512 if (dev->flags & IFF_UP)
6513 call_netdevice_notifiers(NETDEV_UP, dev);
6514 else
6515 call_netdevice_notifiers(NETDEV_DOWN, dev);
6516 }
6517
6518 if (dev->flags & IFF_UP &&
6519 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6520 struct netdev_notifier_change_info change_info;
6521
6522 change_info.flags_changed = changes;
6523 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6524 &change_info.info);
6525 }
6526 }
6527
6528 /**
6529 * dev_change_flags - change device settings
6530 * @dev: device
6531 * @flags: device state flags
6532 *
6533 * Change settings on a device based on the given state flags. The flags are
6534 * in the userspace exported format.
6535 */
6536 int dev_change_flags(struct net_device *dev, unsigned int flags)
6537 {
6538 int ret;
6539 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6540
6541 ret = __dev_change_flags(dev, flags);
6542 if (ret < 0)
6543 return ret;
6544
6545 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6546 __dev_notify_flags(dev, old_flags, changes);
6547 return ret;
6548 }
6549 EXPORT_SYMBOL(dev_change_flags);
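
/*
 * Example (illustrative only): an ioctl-style caller bringing an interface
 * up through this helper, preserving the other userspace-visible flags.
 * foo_bring_up() is hypothetical; the SIOCSIFFLAGS path is the real user.
 *
 *	static int foo_bring_up(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		return dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *	}
 */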
6550
6551 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6552 {
6553 const struct net_device_ops *ops = dev->netdev_ops;
6554
6555 if (ops->ndo_change_mtu)
6556 return ops->ndo_change_mtu(dev, new_mtu);
6557
6558 dev->mtu = new_mtu;
6559 return 0;
6560 }
6561
6562 /**
6563 * dev_set_mtu - Change maximum transfer unit
6564 * @dev: device
6565 * @new_mtu: new transfer unit
6566 *
6567 * Change the maximum transfer size of the network device.
6568 */
6569 int dev_set_mtu(struct net_device *dev, int new_mtu)
6570 {
6571 int err, orig_mtu;
6572
6573 if (new_mtu == dev->mtu)
6574 return 0;
6575
6576 /* MTU must not be negative. */
6577 if (new_mtu < 0)
6578 return -EINVAL;
6579
6580 if (!netif_device_present(dev))
6581 return -ENODEV;
6582
6583 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6584 err = notifier_to_errno(err);
6585 if (err)
6586 return err;
6587
6588 orig_mtu = dev->mtu;
6589 err = __dev_set_mtu(dev, new_mtu);
6590
6591 if (!err) {
6592 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6593 err = notifier_to_errno(err);
6594 if (err) {
6595 /* setting mtu back and notifying everyone again,
6596 * so that they have a chance to revert changes.
6597 */
6598 __dev_set_mtu(dev, orig_mtu);
6599 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6600 }
6601 }
6602 return err;
6603 }
6604 EXPORT_SYMBOL(dev_set_mtu);
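
/*
 * Example (illustrative only): a notifier exercising the revert path in
 * dev_set_mtu() above. At NETDEV_CHANGEMTU time dev->mtu already holds the
 * new value; returning an errno makes dev_set_mtu() restore orig_mtu and
 * renotify. foo_notifier() and FOO_MAX_MTU are hypothetical.
 *
 *	static int foo_notifier(struct notifier_block *nb,
 *				unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_CHANGEMTU && dev->mtu > FOO_MAX_MTU)
 *			return notifier_from_errno(-ERANGE);
 *		return NOTIFY_DONE;
 *	}
 */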
6605
6606 /**
6607 * dev_set_group - Change group this device belongs to
6608 * @dev: device
6609 * @new_group: group this device should belong to
6610 */
6611 void dev_set_group(struct net_device *dev, int new_group)
6612 {
6613 dev->group = new_group;
6614 }
6615 EXPORT_SYMBOL(dev_set_group);
6616
6617 /**
6618 * dev_set_mac_address - Change Media Access Control Address
6619 * @dev: device
6620 * @sa: new address
6621 *
6622 * Change the hardware (MAC) address of the device
6623 */
6624 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6625 {
6626 const struct net_device_ops *ops = dev->netdev_ops;
6627 int err;
6628
6629 if (!ops->ndo_set_mac_address)
6630 return -EOPNOTSUPP;
6631 if (sa->sa_family != dev->type)
6632 return -EINVAL;
6633 if (!netif_device_present(dev))
6634 return -ENODEV;
6635 err = ops->ndo_set_mac_address(dev, sa);
6636 if (err)
6637 return err;
6638 dev->addr_assign_type = NET_ADDR_SET;
6639 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6640 add_device_randomness(dev->dev_addr, dev->addr_len);
6641 return 0;
6642 }
6643 EXPORT_SYMBOL(dev_set_mac_address);
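
/*
 * Example (illustrative only): building the sockaddr this helper expects.
 * sa_family must match dev->type, as checked above; foo_set_random_mac()
 * is hypothetical and assumes an ARPHRD_ETHER device, with RTNL held.
 *
 *	static int foo_set_random_mac(struct net_device *dev)
 *	{
 *		struct sockaddr sa;
 *
 *		sa.sa_family = dev->type;
 *		eth_random_addr((u8 *)sa.sa_data);
 *		return dev_set_mac_address(dev, &sa);
 *	}
 */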
6644
6645 /**
6646 * dev_change_carrier - Change device carrier
6647 * @dev: device
6648 * @new_carrier: new value
6649 *
6650 * Change device carrier
6651 */
6652 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6653 {
6654 const struct net_device_ops *ops = dev->netdev_ops;
6655
6656 if (!ops->ndo_change_carrier)
6657 return -EOPNOTSUPP;
6658 if (!netif_device_present(dev))
6659 return -ENODEV;
6660 return ops->ndo_change_carrier(dev, new_carrier);
6661 }
6662 EXPORT_SYMBOL(dev_change_carrier);
6663
6664 /**
6665 * dev_get_phys_port_id - Get device physical port ID
6666 * @dev: device
6667 * @ppid: port ID
6668 *
6669 * Get device physical port ID
6670 */
6671 int dev_get_phys_port_id(struct net_device *dev,
6672 struct netdev_phys_item_id *ppid)
6673 {
6674 const struct net_device_ops *ops = dev->netdev_ops;
6675
6676 if (!ops->ndo_get_phys_port_id)
6677 return -EOPNOTSUPP;
6678 return ops->ndo_get_phys_port_id(dev, ppid);
6679 }
6680 EXPORT_SYMBOL(dev_get_phys_port_id);
6681
6682 /**
6683 * dev_get_phys_port_name - Get device physical port name
6684 * @dev: device
6685 * @name: port name
6686 * @len: limit of bytes to copy to name
6687 *
6688 * Get device physical port name
6689 */
6690 int dev_get_phys_port_name(struct net_device *dev,
6691 char *name, size_t len)
6692 {
6693 const struct net_device_ops *ops = dev->netdev_ops;
6694
6695 if (!ops->ndo_get_phys_port_name)
6696 return -EOPNOTSUPP;
6697 return ops->ndo_get_phys_port_name(dev, name, len);
6698 }
6699 EXPORT_SYMBOL(dev_get_phys_port_name);
6700
6701 /**
6702 * dev_change_proto_down - update protocol port state information
6703 * @dev: device
6704 * @proto_down: new value
6705 *
6706 * This info can be used by switch drivers to set the phys state of the
6707 * port.
6708 */
6709 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6710 {
6711 const struct net_device_ops *ops = dev->netdev_ops;
6712
6713 if (!ops->ndo_change_proto_down)
6714 return -EOPNOTSUPP;
6715 if (!netif_device_present(dev))
6716 return -ENODEV;
6717 return ops->ndo_change_proto_down(dev, proto_down);
6718 }
6719 EXPORT_SYMBOL(dev_change_proto_down);
6720
6721 /**
6722 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6723 * @dev: device
6724 * @fd: new program fd or negative value to clear
6725 *
6726 * Set or clear a bpf program for a device
6727 */
6728 int dev_change_xdp_fd(struct net_device *dev, int fd)
6729 {
6730 const struct net_device_ops *ops = dev->netdev_ops;
6731 struct bpf_prog *prog = NULL;
6732 struct netdev_xdp xdp = {};
6733 int err;
6734
6735 if (!ops->ndo_xdp)
6736 return -EOPNOTSUPP;
6737 if (fd >= 0) {
6738 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6739 if (IS_ERR(prog))
6740 return PTR_ERR(prog);
6741 }
6742
6743 xdp.command = XDP_SETUP_PROG;
6744 xdp.prog = prog;
6745 err = ops->ndo_xdp(dev, &xdp);
6746 if (err < 0 && prog)
6747 bpf_prog_put(prog);
6748
6749 return err;
6750 }
6751 EXPORT_SYMBOL(dev_change_xdp_fd);
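
/*
 * Example (illustrative only): the driver half of the contract above, a
 * minimal ndo_xdp() that handles XDP_SETUP_PROG by swapping the program
 * pointer. struct foo_priv and foo_xdp() are hypothetical; a real driver
 * must also make sure its datapath observes the new program safely.
 *
 *	static int foo_xdp(struct net_device *dev, struct netdev_xdp *xdp)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *		struct bpf_prog *old;
 *
 *		switch (xdp->command) {
 *		case XDP_SETUP_PROG:
 *			old = xchg(&priv->xdp_prog, xdp->prog);
 *			if (old)
 *				bpf_prog_put(old);
 *			return 0;
 *		default:
 *			return -EINVAL;
 *		}
 *	}
 */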
6752
6753 /**
6754 * dev_new_index - allocate an ifindex
6755 * @net: the applicable net namespace
6756 *
6757 * Returns a suitable unique value for a new device interface
6758 * number. The caller must hold the rtnl semaphore or the
6759 * dev_base_lock to be sure it remains unique.
6760 */
6761 static int dev_new_index(struct net *net)
6762 {
6763 int ifindex = net->ifindex;
6764 for (;;) {
6765 if (++ifindex <= 0)
6766 ifindex = 1;
6767 if (!__dev_get_by_index(net, ifindex))
6768 return net->ifindex = ifindex;
6769 }
6770 }
6771
6772 /* Delayed registration/unregistration */
6773 static LIST_HEAD(net_todo_list);
6774 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6775
6776 static void net_set_todo(struct net_device *dev)
6777 {
6778 list_add_tail(&dev->todo_list, &net_todo_list);
6779 dev_net(dev)->dev_unreg_count++;
6780 }
6781
6782 static void rollback_registered_many(struct list_head *head)
6783 {
6784 struct net_device *dev, *tmp;
6785 LIST_HEAD(close_head);
6786
6787 BUG_ON(dev_boot_phase);
6788 ASSERT_RTNL();
6789
6790 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6791 /* Some devices call this without ever having been
6792 * registered, as part of initialization unwind. Remove those
6793 * devices and proceed with the remaining.
6794 */
6795 if (dev->reg_state == NETREG_UNINITIALIZED) {
6796 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6797 dev->name, dev);
6798
6799 WARN_ON(1);
6800 list_del(&dev->unreg_list);
6801 continue;
6802 }
6803 dev->dismantle = true;
6804 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6805 }
6806
6807 /* If device is running, close it first. */
6808 list_for_each_entry(dev, head, unreg_list)
6809 list_add_tail(&dev->close_list, &close_head);
6810 dev_close_many(&close_head, true);
6811
6812 list_for_each_entry(dev, head, unreg_list) {
6813 /* And unlink it from device chain. */
6814 unlist_netdevice(dev);
6815
6816 dev->reg_state = NETREG_UNREGISTERING;
6817 }
6818 flush_all_backlogs();
6819
6820 synchronize_net();
6821
6822 list_for_each_entry(dev, head, unreg_list) {
6823 struct sk_buff *skb = NULL;
6824
6825 /* Shutdown queueing discipline. */
6826 dev_shutdown(dev);
6827
6828
6829 /* Notify protocols that we are about to destroy
6830 this device. They should clean up all their state.
6831 */
6832 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6833
6834 if (!dev->rtnl_link_ops ||
6835 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6836 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6837 GFP_KERNEL);
6838
6839 /*
6840 * Flush the unicast and multicast chains
6841 */
6842 dev_uc_flush(dev);
6843 dev_mc_flush(dev);
6844
6845 if (dev->netdev_ops->ndo_uninit)
6846 dev->netdev_ops->ndo_uninit(dev);
6847
6848 if (skb)
6849 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6850
6851 /* The notifier chain MUST detach all of our upper devices. */
6852 WARN_ON(netdev_has_any_upper_dev(dev));
6853
6854 /* Remove entries from kobject tree */
6855 netdev_unregister_kobject(dev);
6856 #ifdef CONFIG_XPS
6857 /* Remove XPS queueing entries */
6858 netif_reset_xps_queues_gt(dev, 0);
6859 #endif
6860 }
6861
6862 synchronize_net();
6863
6864 list_for_each_entry(dev, head, unreg_list)
6865 dev_put(dev);
6866 }
6867
6868 static void rollback_registered(struct net_device *dev)
6869 {
6870 LIST_HEAD(single);
6871
6872 list_add(&dev->unreg_list, &single);
6873 rollback_registered_many(&single);
6874 list_del(&single);
6875 }
6876
6877 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6878 struct net_device *upper, netdev_features_t features)
6879 {
6880 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6881 netdev_features_t feature;
6882 int feature_bit;
6883
6884 for_each_netdev_feature(&upper_disables, feature_bit) {
6885 feature = __NETIF_F_BIT(feature_bit);
6886 if (!(upper->wanted_features & feature)
6887 && (features & feature)) {
6888 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6889 &feature, upper->name);
6890 features &= ~feature;
6891 }
6892 }
6893
6894 return features;
6895 }
6896
6897 static void netdev_sync_lower_features(struct net_device *upper,
6898 struct net_device *lower, netdev_features_t features)
6899 {
6900 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6901 netdev_features_t feature;
6902 int feature_bit;
6903
6904 for_each_netdev_feature(&upper_disables, feature_bit) {
6905 feature = __NETIF_F_BIT(feature_bit);
6906 if (!(features & feature) && (lower->features & feature)) {
6907 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6908 &feature, lower->name);
6909 lower->wanted_features &= ~feature;
6910 netdev_update_features(lower);
6911
6912 if (unlikely(lower->features & feature))
6913 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6914 &feature, lower->name);
6915 }
6916 }
6917 }
6918
6919 static netdev_features_t netdev_fix_features(struct net_device *dev,
6920 netdev_features_t features)
6921 {
6922 /* Fix illegal checksum combinations */
6923 if ((features & NETIF_F_HW_CSUM) &&
6924 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6925 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6926 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6927 }
6928
6929 /* TSO requires that SG is present as well. */
6930 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6931 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6932 features &= ~NETIF_F_ALL_TSO;
6933 }
6934
6935 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6936 !(features & NETIF_F_IP_CSUM)) {
6937 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6938 features &= ~NETIF_F_TSO;
6939 features &= ~NETIF_F_TSO_ECN;
6940 }
6941
6942 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6943 !(features & NETIF_F_IPV6_CSUM)) {
6944 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6945 features &= ~NETIF_F_TSO6;
6946 }
6947
6948 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6949 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6950 features &= ~NETIF_F_TSO_MANGLEID;
6951
6952 /* TSO ECN requires that TSO is present as well. */
6953 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6954 features &= ~NETIF_F_TSO_ECN;
6955
6956 /* Software GSO depends on SG. */
6957 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6958 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6959 features &= ~NETIF_F_GSO;
6960 }
6961
6962 /* UFO needs SG and checksumming */
6963 if (features & NETIF_F_UFO) {
6964 /* maybe split UFO into V4 and V6? */
6965 if (!(features & NETIF_F_HW_CSUM) &&
6966 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6967 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6968 netdev_dbg(dev,
6969 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6970 features &= ~NETIF_F_UFO;
6971 }
6972
6973 if (!(features & NETIF_F_SG)) {
6974 netdev_dbg(dev,
6975 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6976 features &= ~NETIF_F_UFO;
6977 }
6978 }
6979
6980 /* GSO partial features require GSO partial be set */
6981 if ((features & dev->gso_partial_features) &&
6982 !(features & NETIF_F_GSO_PARTIAL)) {
6983 netdev_dbg(dev,
6984 "Dropping partially supported GSO features since no GSO partial.\n");
6985 features &= ~dev->gso_partial_features;
6986 }
6987
6988 #ifdef CONFIG_NET_RX_BUSY_POLL
6989 if (dev->netdev_ops->ndo_busy_poll)
6990 features |= NETIF_F_BUSY_POLL;
6991 else
6992 #endif
6993 features &= ~NETIF_F_BUSY_POLL;
6994
6995 return features;
6996 }
6997
6998 int __netdev_update_features(struct net_device *dev)
6999 {
7000 struct net_device *upper, *lower;
7001 netdev_features_t features;
7002 struct list_head *iter;
7003 int err = -1;
7004
7005 ASSERT_RTNL();
7006
7007 features = netdev_get_wanted_features(dev);
7008
7009 if (dev->netdev_ops->ndo_fix_features)
7010 features = dev->netdev_ops->ndo_fix_features(dev, features);
7011
7012 /* driver might be less strict about feature dependencies */
7013 features = netdev_fix_features(dev, features);
7014
7015 /* some features can't be enabled if they're off on an upper device */
7016 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7017 features = netdev_sync_upper_features(dev, upper, features);
7018
7019 if (dev->features == features)
7020 goto sync_lower;
7021
7022 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7023 &dev->features, &features);
7024
7025 if (dev->netdev_ops->ndo_set_features)
7026 err = dev->netdev_ops->ndo_set_features(dev, features);
7027 else
7028 err = 0;
7029
7030 if (unlikely(err < 0)) {
7031 netdev_err(dev,
7032 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7033 err, &features, &dev->features);
7034 /* return non-0 since some features might have changed and
7035 * it's better to fire a spurious notification than miss it
7036 */
7037 return -1;
7038 }
7039
7040 sync_lower:
7041 /* some features must be disabled on lower devices when disabled
7042 * on an upper device (think: bonding master or bridge)
7043 */
7044 netdev_for_each_lower_dev(dev, lower, iter)
7045 netdev_sync_lower_features(dev, lower, features);
7046
7047 if (!err)
7048 dev->features = features;
7049
7050 return err < 0 ? 0 : 1;
7051 }
7052
7053 /**
7054 * netdev_update_features - recalculate device features
7055 * @dev: the device to check
7056 *
7057 * Recalculate dev->features set and send notifications if it
7058 * has changed. Should be called after driver or hardware dependent
7059 * conditions might have changed that influence the features.
7060 */
7061 void netdev_update_features(struct net_device *dev)
7062 {
7063 if (__netdev_update_features(dev))
7064 netdev_features_change(dev);
7065 }
7066 EXPORT_SYMBOL(netdev_update_features);
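
/*
 * Example (illustrative only): when a driver should call this. A sketch of
 * a hypothetical foo_set_channels() that re-evaluates features after a
 * hardware reconfiguration its ndo_fix_features() depends on; the ethtool
 * paths that would call it already hold RTNL, satisfying ASSERT_RTNL().
 *
 *	static int foo_set_channels(struct net_device *dev, unsigned int n)
 *	{
 *		int err = foo_hw_set_channels(dev, n);
 *
 *		if (err)
 *			return err;
 *		netdev_update_features(dev);
 *		return 0;
 *	}
 */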
7067
7068 /**
7069 * netdev_change_features - recalculate device features
7070 * @dev: the device to check
7071 *
7072 * Recalculate dev->features set and send notifications even
7073 * if they have not changed. Should be called instead of
7074 * netdev_update_features() if also dev->vlan_features might
7075 * have changed to allow the changes to be propagated to stacked
7076 * VLAN devices.
7077 */
7078 void netdev_change_features(struct net_device *dev)
7079 {
7080 __netdev_update_features(dev);
7081 netdev_features_change(dev);
7082 }
7083 EXPORT_SYMBOL(netdev_change_features);
7084
7085 /**
7086 * netif_stacked_transfer_operstate - transfer operstate
7087 * @rootdev: the root or lower level device to transfer state from
7088 * @dev: the device to transfer operstate to
7089 *
7090 * Transfer operational state from root to device. This is normally
7091 * called when a stacking relationship exists between the root
7092 * device and the device (a leaf device).
7093 */
7094 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7095 struct net_device *dev)
7096 {
7097 if (rootdev->operstate == IF_OPER_DORMANT)
7098 netif_dormant_on(dev);
7099 else
7100 netif_dormant_off(dev);
7101
7102 if (netif_carrier_ok(rootdev)) {
7103 if (!netif_carrier_ok(dev))
7104 netif_carrier_on(dev);
7105 } else {
7106 if (netif_carrier_ok(dev))
7107 netif_carrier_off(dev);
7108 }
7109 }
7110 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7111
7112 #ifdef CONFIG_SYSFS
7113 static int netif_alloc_rx_queues(struct net_device *dev)
7114 {
7115 unsigned int i, count = dev->num_rx_queues;
7116 struct netdev_rx_queue *rx;
7117 size_t sz = count * sizeof(*rx);
7118
7119 BUG_ON(count < 1);
7120
7121 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7122 if (!rx) {
7123 rx = vzalloc(sz);
7124 if (!rx)
7125 return -ENOMEM;
7126 }
7127 dev->_rx = rx;
7128
7129 for (i = 0; i < count; i++)
7130 rx[i].dev = dev;
7131 return 0;
7132 }
7133 #endif
7134
7135 static void netdev_init_one_queue(struct net_device *dev,
7136 struct netdev_queue *queue, void *_unused)
7137 {
7138 /* Initialize queue lock */
7139 spin_lock_init(&queue->_xmit_lock);
7140 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7141 queue->xmit_lock_owner = -1;
7142 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7143 queue->dev = dev;
7144 #ifdef CONFIG_BQL
7145 dql_init(&queue->dql, HZ);
7146 #endif
7147 }
7148
7149 static void netif_free_tx_queues(struct net_device *dev)
7150 {
7151 kvfree(dev->_tx);
7152 }
7153
7154 static int netif_alloc_netdev_queues(struct net_device *dev)
7155 {
7156 unsigned int count = dev->num_tx_queues;
7157 struct netdev_queue *tx;
7158 size_t sz = count * sizeof(*tx);
7159
7160 if (count < 1 || count > 0xffff)
7161 return -EINVAL;
7162
7163 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7164 if (!tx) {
7165 tx = vzalloc(sz);
7166 if (!tx)
7167 return -ENOMEM;
7168 }
7169 dev->_tx = tx;
7170
7171 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7172 spin_lock_init(&dev->tx_global_lock);
7173
7174 return 0;
7175 }
7176
7177 void netif_tx_stop_all_queues(struct net_device *dev)
7178 {
7179 unsigned int i;
7180
7181 for (i = 0; i < dev->num_tx_queues; i++) {
7182 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7183 netif_tx_stop_queue(txq);
7184 }
7185 }
7186 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7187
7188 /**
7189 * register_netdevice - register a network device
7190 * @dev: device to register
7191 *
7192 * Take a completed network device structure and add it to the kernel
7193 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7194 * chain. 0 is returned on success. A negative errno code is returned
7195 * on a failure to set up the device, or if the name is a duplicate.
7196 *
7197 * Callers must hold the rtnl semaphore. You may want
7198 * register_netdev() instead of this.
7199 *
7200 * BUGS:
7201 * The locking appears insufficient to guarantee two parallel registers
7202 * will not get the same name.
7203 */
7204
7205 int register_netdevice(struct net_device *dev)
7206 {
7207 int ret;
7208 struct net *net = dev_net(dev);
7209
7210 BUG_ON(dev_boot_phase);
7211 ASSERT_RTNL();
7212
7213 might_sleep();
7214
7215 /* When net_device's are persistent, this will be fatal. */
7216 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7217 BUG_ON(!net);
7218
7219 spin_lock_init(&dev->addr_list_lock);
7220 netdev_set_addr_lockdep_class(dev);
7221
7222 ret = dev_get_valid_name(net, dev, dev->name);
7223 if (ret < 0)
7224 goto out;
7225
7226 /* Init, if this function is available */
7227 if (dev->netdev_ops->ndo_init) {
7228 ret = dev->netdev_ops->ndo_init(dev);
7229 if (ret) {
7230 if (ret > 0)
7231 ret = -EIO;
7232 goto out;
7233 }
7234 }
7235
7236 if (((dev->hw_features | dev->features) &
7237 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7238 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7239 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7240 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7241 ret = -EINVAL;
7242 goto err_uninit;
7243 }
7244
7245 ret = -EBUSY;
7246 if (!dev->ifindex)
7247 dev->ifindex = dev_new_index(net);
7248 else if (__dev_get_by_index(net, dev->ifindex))
7249 goto err_uninit;
7250
7251 /* Transfer changeable features to wanted_features and enable
7252 * software offloads (GSO and GRO).
7253 */
7254 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7255 dev->features |= NETIF_F_SOFT_FEATURES;
7256 dev->wanted_features = dev->features & dev->hw_features;
7257
7258 if (!(dev->flags & IFF_LOOPBACK))
7259 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7260
7261 /* If IPv4 TCP segmentation offload is supported we should also
7262 * allow the device to enable segmenting the frame with the option
7263 * of ignoring a static IP ID value. This doesn't enable the
7264 * feature itself but allows the user to enable it later.
7265 */
7266 if (dev->hw_features & NETIF_F_TSO)
7267 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7268 if (dev->vlan_features & NETIF_F_TSO)
7269 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7270 if (dev->mpls_features & NETIF_F_TSO)
7271 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7272 if (dev->hw_enc_features & NETIF_F_TSO)
7273 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7274
7275 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7276 */
7277 dev->vlan_features |= NETIF_F_HIGHDMA;
7278
7279 /* Make NETIF_F_SG inheritable to tunnel devices.
7280 */
7281 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7282
7283 /* Make NETIF_F_SG inheritable to MPLS.
7284 */
7285 dev->mpls_features |= NETIF_F_SG;
7286
7287 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7288 ret = notifier_to_errno(ret);
7289 if (ret)
7290 goto err_uninit;
7291
7292 ret = netdev_register_kobject(dev);
7293 if (ret)
7294 goto err_uninit;
7295 dev->reg_state = NETREG_REGISTERED;
7296
7297 __netdev_update_features(dev);
7298
7299 /*
7300 * Default initial state at registration is that the
7301 * device is present.
7302 */
7303
7304 set_bit(__LINK_STATE_PRESENT, &dev->state);
7305
7306 linkwatch_init_dev(dev);
7307
7308 dev_init_scheduler(dev);
7309 dev_hold(dev);
7310 list_netdevice(dev);
7311 add_device_randomness(dev->dev_addr, dev->addr_len);
7312
7313 /* If the device has a permanent hardware address, the driver should
7314 * set dev_addr, and addr_assign_type should be left at
7315 * NET_ADDR_PERM (the default value).
7316 */
7317 if (dev->addr_assign_type == NET_ADDR_PERM)
7318 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7319
7320 /* Notify protocols, that a new device appeared. */
7321 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7322 ret = notifier_to_errno(ret);
7323 if (ret) {
7324 rollback_registered(dev);
7325 dev->reg_state = NETREG_UNREGISTERED;
7326 }
7327 /*
7328 * Prevent userspace races by waiting until the network
7329 * device is fully setup before sending notifications.
7330 */
7331 if (!dev->rtnl_link_ops ||
7332 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7333 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7334
7335 out:
7336 return ret;
7337
7338 err_uninit:
7339 if (dev->netdev_ops->ndo_uninit)
7340 dev->netdev_ops->ndo_uninit(dev);
7341 goto out;
7342 }
7343 EXPORT_SYMBOL(register_netdevice);

/**
 * init_dummy_netdev - init a dummy network device for NAPI
 * @dev: device to init
 *
 * This takes a network device structure and initializes the minimum
 * amount of fields so it can be used to schedule NAPI polls without
 * registering a full blown interface. This is to be used by drivers
 * that need to tie several hardware interfaces to a single NAPI
 * poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note: We don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
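
/* Illustrative sketch (not part of this file): a driver with several
 * hardware channels might back them with one dummy netdev so NAPI has a
 * device to hang off. priv, my_poll and the weight of 64 are hypothetical.
 *
 *	static struct net_device dummy_dev;
 *
 *	init_dummy_netdev(&dummy_dev);
 *	netif_napi_add(&dummy_dev, &priv->napi, my_poll, 64);
 *	napi_enable(&priv->napi);
 */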

/**
 * register_netdev - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * This is a wrapper around register_netdevice that takes the rtnl semaphore
 * and expands the device name if you passed a format string to
 * alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
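
/* Illustrative sketch (not part of this file): typical probe-time
 * registration. The "%d" in the name template is expanded during
 * registration; my_setup and struct my_priv are hypothetical driver code.
 *
 *	struct net_device *dev;
 *	int err;
 *
 *	dev = alloc_netdev(sizeof(struct my_priv), "myeth%d",
 *			   NET_NAME_UNKNOWN, my_setup);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */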

int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);

/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and clean up and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
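
/* Illustrative sketch (not part of this file): the pattern the comment
 * above asks of reference holders. A subsystem holding a dev_hold()
 * reference registers a notifier and drops its reference on
 * NETDEV_UNREGISTER so netdev_wait_allrefs() can finish. my_ctx and
 * my_lookup are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *		struct my_ctx *ctx;
 *
 *		if (event == NETDEV_UNREGISTER) {
 *			ctx = my_lookup(dev);
 *			if (ctx) {
 *				dev_put(dev);
 *				kfree(ctx);
 *			}
 *		}
 *		return NOTIFY_DONE;
 *	}
 */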

/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
7561
7562 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7563 * all the same fields in the same order as net_device_stats, with only
7564 * the type differing, but rtnl_link_stats64 may have additional fields
7565 * at the end for newer counters.
7566 */
netdev_stats_to_stats64(struct rtnl_link_stats64 * stats64,const struct net_device_stats * netdev_stats)7567 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7568 const struct net_device_stats *netdev_stats)
7569 {
7570 #if BITS_PER_LONG == 64
7571 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7572 memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
7573 /* zero out counters that only exist in rtnl_link_stats64 */
7574 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7575 sizeof(*stats64) - sizeof(*netdev_stats));
7576 #else
7577 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7578 const unsigned long *src = (const unsigned long *)netdev_stats;
7579 u64 *dst = (u64 *)stats64;
7580
7581 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7582 for (i = 0; i < n; i++)
7583 dst[i] = src[i];
7584 /* zero out counters that only exist in rtnl_link_stats64 */
7585 memset((char *)stats64 + n * sizeof(u64), 0,
7586 sizeof(*stats64) - n * sizeof(u64));
7587 #endif
7588 }
7589 EXPORT_SYMBOL(netdev_stats_to_stats64);
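
/* Illustrative sketch (not part of this file): a driver that still keeps
 * its counters in dev->stats can satisfy .ndo_get_stats64 by converting
 * with the helper above. The function name is hypothetical; the signature
 * matches the void-returning ndo_get_stats64 that dev_get_stats() below
 * calls.
 *
 *	static void my_get_stats64(struct net_device *dev,
 *				   struct rtnl_link_stats64 *storage)
 *	{
 *		netdev_stats_to_stats64(storage, &dev->stats);
 *	}
 */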

/**
 * dev_get_stats - get network device statistics
 * @dev: device to get statistics from
 * @storage: place to store stats
 *
 * Get network statistics from device. Return @storage.
 * The device driver may provide its own method by setting
 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 * otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}

/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization. Also allocates subqueue structs
 * for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
				    unsigned char name_assign_type,
				    void (*setup)(struct net_device *),
				    unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = 1;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
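
/* Illustrative sketch (not part of this file): allocating an Ethernet-style
 * device with 8 TX and 8 RX queues via the alloc_etherdev_mqs() wrapper
 * around this function; netdev_priv() then returns the private area.
 * struct my_priv is hypothetical.
 *
 *	struct net_device *dev;
 *	struct my_priv *priv;
 *
 *	dev = alloc_etherdev_mqs(sizeof(struct my_priv), 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 */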

/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released.
 * If this is the last reference then it will be freed.
 * Must be called in process context.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	might_sleep();
	netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
	kvfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
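
/* Illustrative sketch (not part of this file): the usual pattern around
 * synchronize_net(). Unpublish a pointer the receive path reads under RCU,
 * wait out in-flight receivers, then free. my_state is a hypothetical
 * RCU-protected pointer; the sequencing is the point.
 *
 *	old = rtnl_dereference(my_state);
 *	RCU_INIT_POINTER(my_state, NULL);
 *	synchronize_net();
 *	kfree(old);
 */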

/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If head is not NULL, the device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 *
 * Note: As most callers use a stack allocated list_head,
 * we force a list_del() to make sure the stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
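
/* Illustrative sketch (not part of this file): batching several
 * unregisters under one rtnl_lock() section, as the note above describes.
 * dev1 and dev2 stand for devices the caller already holds.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */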

/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 *
 * This is just a wrapper for unregister_netdevice that takes
 * the rtnl semaphore. In general you want to use this and not
 * unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
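
/* Illustrative sketch (not part of this file): module removal typically
 * pairs unregister_netdev() with free_netdev(), in that order; freeing
 * only becomes safe once the todo list has run and all references are
 * gone. my_remove is a hypothetical driver teardown path.
 *
 *	static void my_remove(struct net_device *dev)
 *	{
 *		unregister_netdev(dev);
 *		free_netdev(dev);
 *	}
 */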

/**
 * dev_change_net_namespace - move device to different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: If not NULL name pattern to try if the current device name
 *       is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, dev, pat) < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice and unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 * Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		dev->ifindex = dev_new_index(net);

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully set up before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
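
/* Illustrative sketch (not part of this file): moving a device into
 * another namespace under RTNL, with "eth%d" as the fallback pattern
 * should the current name already be taken there. target_net stands for
 * a struct net the caller has resolved and holds.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */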

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}


/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all. Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_HW_CSUM)
		mask |= NETIF_F_CSUM_MASK;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
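
/* Illustrative sketch (not part of this file): how an aggregating driver
 * such as bonding might fold each slave's features into the master's set,
 * one slave at a time. The slave list walk and MY_FEATURE_MASK are
 * hypothetical stand-ins for the driver's own bookkeeping and policy mask.
 *
 *	netdev_features_t features = MY_FEATURE_MASK;
 *
 *	list_for_each_entry(slave, &master_priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     MY_FEATURE_MASK);
 */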

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)		\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
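
/* Illustrative sketch (not part of this file): drivers use the level
 * helpers generated above instead of raw printk(), so messages carry the
 * bus and interface name automatically. The queue index q is hypothetical.
 *
 *	netdev_info(dev, "link up\n");
 *	netdev_err(dev, "TX timeout on queue %d\n", q);
 */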

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed
	 * wait here for all pending unregistrations to complete,
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network devices
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device that
	 * appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);