/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *              Pekka Riikonen  :	Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *              			indefinitely on dev->refcnt
 * 		J Hadi Salim	:	- Backlog queue sampling
 *				        - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles the packet
 *	were first on the list, it would not be able to sense that the packet
 *	is cloned and should be copied-on-write; it would modify the clone
 *	and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
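
/*
 * Example (illustrative sketch, hypothetical names my_proto_rcv and
 * my_packet_type): a protocol module typically fills in a &packet_type
 * and registers it once at init time, then unregisters it on exit.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// consume or requeue the skb here
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),	// or htons(ETH_P_ALL) for a tap
 *		.func = my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);		// module init
 *	dev_remove_pack(&my_packet_type);	// module exit (may sleep)
 */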

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	all CPUs that are in the middle of receiving packets will see
 *	the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds a new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine
 *	for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under the RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
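
/*
 * Example: the two lookup flavours above side by side ("eth0" is just a
 * placeholder name; init_net stands in for whatever namespace applies).
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(&init_net, "eth0");	// takes a reference
 *	if (dev) {
 *		// ... use dev from any context ...
 *		dev_put(dev);				// drop the reference
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");	// no reference taken
 *	if (dev) {
 *		// ... dev is only valid inside this RCU read-side section ...
 *	}
 *	rcu_read_unlock();
 */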

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	int ret;

	down_read(&devnet_rename_sem);
	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}

	strcpy(name, dev->name);

	ret = 0;
out:
	rcu_read_unlock();
	up_read(&devnet_rename_sem);
	return ret;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg. "lt%d" - it will try and find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg. "lt%d" - it will try and find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
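
/*
 * Example: a driver that wants the next free unit of a naming pattern
 * before registering (sketch; "myif%d" is a made-up pattern).
 *
 *	err = dev_alloc_name(dev, "myif%d");
 *	if (err < 0)
 *		goto err_free_netdev;
 *	// dev->name now holds e.g. "myif0"; err is the assigned unit number
 */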

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

int dev_get_valid_name(struct net *net, struct net_device *dev,
		       const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
EXPORT_SYMBOL(dev_get_valid_name);

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change the name of a device; a format string such as "eth%d"
 *	can be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	down_write(&devnet_rename_sem);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set the ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;
	memcpy(dev->ifalias, alias, len);
	dev->ifalias[len] = 0;

	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * 	netdev_notify_peers - notify network peers about existence of @dev
 * 	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
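
/*
 * Example: bringing an interface up from kernel code (sketch); dev_open()
 * must be called with the RTNL held, as asserted in __dev_open() above.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */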

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered, all registration and up events are replayed
 *	to the new notifier to give it a race-free view of the network
 *	device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
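
/*
 * Example: a minimal notifier that logs NETDEV_UP/NETDEV_DOWN events
 * (sketch; my_netdev_event and my_netdev_nb are hypothetical names,
 * netdev_notifier_info_to_dev() is the real helper for recovering the
 * device from the notifier payload).
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			pr_info("%s is up\n", dev->name);
 *			break;
 *		case NETDEV_DOWN:
 *			pr_info("%s is down\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */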

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 * 	After unregistering, unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, dev, &info.info);
}

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_inc(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
1754
net_timestamp_set(struct sk_buff * skb)1755 static inline void net_timestamp_set(struct sk_buff *skb)
1756 {
1757 skb->tstamp.tv64 = 0;
1758 if (static_key_false(&netstamp_needed))
1759 __net_timestamp(skb);
1760 }
1761
1762 #define net_timestamp_check(COND, SKB) \
1763 if (static_key_false(&netstamp_needed)) { \
1764 if ((COND) && !(SKB)->tstamp.tv64) \
1765 __net_timestamp(SKB); \
1766 } \
1767
is_skb_forwardable(struct net_device * dev,struct sk_buff * skb)1768 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1769 {
1770 unsigned int len;
1771
1772 if (!(dev->flags & IFF_UP))
1773 return false;
1774
1775 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1776 if (skb->len <= len)
1777 return true;
1778
1779 /* if TSO is enabled, we don't care about the length as the packet
1780 * could be forwarded without being segmented before
1781 */
1782 if (skb_is_gso(skb))
1783 return true;
1784
1785 return false;
1786 }
1787 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1788
__dev_forward_skb(struct net_device * dev,struct sk_buff * skb)1789 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1790 {
1791 if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1792 unlikely(!is_skb_forwardable(dev, skb))) {
1793 atomic_long_inc(&dev->rx_dropped);
1794 kfree_skb(skb);
1795 return NET_RX_DROP;
1796 }
1797
1798 skb_scrub_packet(skb, true);
1799 skb->priority = 0;
1800 skb->protocol = eth_type_trans(skb, dev);
1801 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1802
1803 return 0;
1804 }
1805 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1806
1807 /**
1808 * dev_forward_skb - loopback an skb to another netif
1809 *
1810 * @dev: destination network device
1811 * @skb: buffer to forward
1812 *
1813 * return values:
1814 * NET_RX_SUCCESS (no congestion)
1815 * NET_RX_DROP (packet was dropped, but freed)
1816 *
1817 * dev_forward_skb can be used for injecting an skb from the
1818 * start_xmit function of one device into the receive queue
1819 * of another device.
1820 *
1821 * The receiving device may be in another namespace, so
1822 * we have to clear all information in the skb that could
1823 * impact namespace isolation.
1824 */
1825 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1826 {
1827 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1828 }
1829 EXPORT_SYMBOL_GPL(dev_forward_skb);
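/*
 * Illustrative sketch (not part of this file): a veth-style driver can
 * hand frames from its ndo_start_xmit() to the peer device's receive
 * path with dev_forward_skb().  "my_get_peer" is a hypothetical helper.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// assumption
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;	// skb is consumed either way
 *	}
 */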
1830
1831 static inline int deliver_skb(struct sk_buff *skb,
1832 struct packet_type *pt_prev,
1833 struct net_device *orig_dev)
1834 {
1835 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1836 return -ENOMEM;
1837 atomic_inc(&skb->users);
1838 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1839 }
1840
1841 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1842 struct packet_type **pt,
1843 struct net_device *orig_dev,
1844 __be16 type,
1845 struct list_head *ptype_list)
1846 {
1847 struct packet_type *ptype, *pt_prev = *pt;
1848
1849 list_for_each_entry_rcu(ptype, ptype_list, list) {
1850 if (ptype->type != type)
1851 continue;
1852 if (pt_prev)
1853 deliver_skb(skb, pt_prev, orig_dev);
1854 pt_prev = ptype;
1855 }
1856 *pt = pt_prev;
1857 }
1858
1859 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1860 {
1861 if (!ptype->af_packet_priv || !skb->sk)
1862 return false;
1863
1864 if (ptype->id_match)
1865 return ptype->id_match(ptype, skb->sk);
1866 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1867 return true;
1868
1869 return false;
1870 }
1871
1872 /*
1873 * Support routine. Sends outgoing frames to any network
1874 * taps currently in use.
1875 */
1876
1877 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1878 {
1879 struct packet_type *ptype;
1880 struct sk_buff *skb2 = NULL;
1881 struct packet_type *pt_prev = NULL;
1882 struct list_head *ptype_list = &ptype_all;
1883
1884 rcu_read_lock();
1885 again:
1886 list_for_each_entry_rcu(ptype, ptype_list, list) {
1887 /* Never send packets back to the socket
1888 * they originated from - MvS (miquels@drinkel.ow.org)
1889 */
1890 if (skb_loop_sk(ptype, skb))
1891 continue;
1892
1893 if (pt_prev) {
1894 deliver_skb(skb2, pt_prev, skb->dev);
1895 pt_prev = ptype;
1896 continue;
1897 }
1898
1899 /* need to clone skb, done only once */
1900 skb2 = skb_clone(skb, GFP_ATOMIC);
1901 if (!skb2)
1902 goto out_unlock;
1903
1904 net_timestamp_set(skb2);
1905
1906 /* skb->nh should be correctly
1907 * set by sender, so that the second statement is
1908 * just protection against buggy protocols.
1909 */
1910 skb_reset_mac_header(skb2);
1911
1912 if (skb_network_header(skb2) < skb2->data ||
1913 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1914 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1915 ntohs(skb2->protocol),
1916 dev->name);
1917 skb_reset_network_header(skb2);
1918 }
1919
1920 skb2->transport_header = skb2->network_header;
1921 skb2->pkt_type = PACKET_OUTGOING;
1922 pt_prev = ptype;
1923 }
1924
1925 if (ptype_list == &ptype_all) {
1926 ptype_list = &dev->ptype_all;
1927 goto again;
1928 }
1929 out_unlock:
1930 if (pt_prev)
1931 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1932 rcu_read_unlock();
1933 }
1934
1935 /**
1936 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1937 * @dev: Network device
1938 * @txq: number of queues available
1939 *
1940 * If real_num_tx_queues is changed the tc mappings may no longer be
1941 * valid. To resolve this, verify the tc mapping remains valid and, if
1942 * not, NULL the mapping. With no priorities mapping to this
1943 * offset/count pair it will no longer be used. In the worst case, if TC0
1944 * is invalid nothing can be done, so disable priority mappings. It is
1945 * expected that drivers will fix this mapping if they can before
1946 * calling netif_set_real_num_tx_queues.
1947 */
1948 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1949 {
1950 int i;
1951 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1952
1953 /* If TC0 is invalidated disable TC mapping */
1954 if (tc->offset + tc->count > txq) {
1955 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1956 dev->num_tc = 0;
1957 return;
1958 }
1959
1960 /* Invalidated prio to tc mappings set to TC0 */
1961 for (i = 1; i < TC_BITMASK + 1; i++) {
1962 int q = netdev_get_prio_tc_map(dev, i);
1963
1964 tc = &dev->tc_to_txq[q];
1965 if (tc->offset + tc->count > txq) {
1966 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1967 i, q);
1968 netdev_set_prio_tc_map(dev, i, 0);
1969 }
1970 }
1971 }
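/*
 * Illustrative sketch (not part of this file): a multiqueue driver sets
 * up the prio->tc->txq tables that netif_setup_tc() later revalidates.
 * The offsets, counts and priorities below are made-up example values.
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	// TC0: queues 0-3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	// TC1: queues 4-7
 *	netdev_set_prio_tc_map(dev, 0, 0);	// prio 0 -> TC0
 *	netdev_set_prio_tc_map(dev, 5, 1);	// prio 5 -> TC1
 */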
1972
1973 #ifdef CONFIG_XPS
1974 static DEFINE_MUTEX(xps_map_mutex);
1975 #define xmap_dereference(P) \
1976 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1977
1978 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1979 int cpu, u16 index)
1980 {
1981 struct xps_map *map = NULL;
1982 int pos;
1983
1984 if (dev_maps)
1985 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1986
1987 for (pos = 0; map && pos < map->len; pos++) {
1988 if (map->queues[pos] == index) {
1989 if (map->len > 1) {
1990 map->queues[pos] = map->queues[--map->len];
1991 } else {
1992 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1993 kfree_rcu(map, rcu);
1994 map = NULL;
1995 }
1996 break;
1997 }
1998 }
1999
2000 return map;
2001 }
2002
2003 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2004 {
2005 struct xps_dev_maps *dev_maps;
2006 int cpu, i;
2007 bool active = false;
2008
2009 mutex_lock(&xps_map_mutex);
2010 dev_maps = xmap_dereference(dev->xps_maps);
2011
2012 if (!dev_maps)
2013 goto out_no_maps;
2014
2015 for_each_possible_cpu(cpu) {
2016 for (i = index; i < dev->num_tx_queues; i++) {
2017 if (!remove_xps_queue(dev_maps, cpu, i))
2018 break;
2019 }
2020 if (i == dev->num_tx_queues)
2021 active = true;
2022 }
2023
2024 if (!active) {
2025 RCU_INIT_POINTER(dev->xps_maps, NULL);
2026 kfree_rcu(dev_maps, rcu);
2027 }
2028
2029 for (i = index; i < dev->num_tx_queues; i++)
2030 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2031 NUMA_NO_NODE);
2032
2033 out_no_maps:
2034 mutex_unlock(&xps_map_mutex);
2035 }
2036
2037 static struct xps_map *expand_xps_map(struct xps_map *map,
2038 int cpu, u16 index)
2039 {
2040 struct xps_map *new_map;
2041 int alloc_len = XPS_MIN_MAP_ALLOC;
2042 int i, pos;
2043
2044 for (pos = 0; map && pos < map->len; pos++) {
2045 if (map->queues[pos] != index)
2046 continue;
2047 return map;
2048 }
2049
2050 /* Need to add queue to this CPU's existing map */
2051 if (map) {
2052 if (pos < map->alloc_len)
2053 return map;
2054
2055 alloc_len = map->alloc_len * 2;
2056 }
2057
2058 /* Need to allocate a new map to store the queue for this CPU */
2059 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2060 cpu_to_node(cpu));
2061 if (!new_map)
2062 return NULL;
2063
2064 for (i = 0; i < pos; i++)
2065 new_map->queues[i] = map->queues[i];
2066 new_map->alloc_len = alloc_len;
2067 new_map->len = pos;
2068
2069 return new_map;
2070 }
2071
2072 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2073 u16 index)
2074 {
2075 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2076 struct xps_map *map, *new_map;
2077 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2078 int cpu, numa_node_id = -2;
2079 bool active = false;
2080
2081 mutex_lock(&xps_map_mutex);
2082
2083 dev_maps = xmap_dereference(dev->xps_maps);
2084
2085 /* allocate memory for queue storage */
2086 for_each_online_cpu(cpu) {
2087 if (!cpumask_test_cpu(cpu, mask))
2088 continue;
2089
2090 if (!new_dev_maps)
2091 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2092 if (!new_dev_maps) {
2093 mutex_unlock(&xps_map_mutex);
2094 return -ENOMEM;
2095 }
2096
2097 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2098 NULL;
2099
2100 map = expand_xps_map(map, cpu, index);
2101 if (!map)
2102 goto error;
2103
2104 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2105 }
2106
2107 if (!new_dev_maps)
2108 goto out_no_new_maps;
2109
2110 for_each_possible_cpu(cpu) {
2111 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2112 /* add queue to CPU maps */
2113 int pos = 0;
2114
2115 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2116 while ((pos < map->len) && (map->queues[pos] != index))
2117 pos++;
2118
2119 if (pos == map->len)
2120 map->queues[map->len++] = index;
2121 #ifdef CONFIG_NUMA
2122 if (numa_node_id == -2)
2123 numa_node_id = cpu_to_node(cpu);
2124 else if (numa_node_id != cpu_to_node(cpu))
2125 numa_node_id = -1;
2126 #endif
2127 } else if (dev_maps) {
2128 /* fill in the new device map from the old device map */
2129 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2130 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2131 }
2132
2133 }
2134
2135 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2136
2137 /* Cleanup old maps */
2138 if (dev_maps) {
2139 for_each_possible_cpu(cpu) {
2140 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2141 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2142 if (map && map != new_map)
2143 kfree_rcu(map, rcu);
2144 }
2145
2146 kfree_rcu(dev_maps, rcu);
2147 }
2148
2149 dev_maps = new_dev_maps;
2150 active = true;
2151
2152 out_no_new_maps:
2153 /* update Tx queue numa node */
2154 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2155 (numa_node_id >= 0) ? numa_node_id :
2156 NUMA_NO_NODE);
2157
2158 if (!dev_maps)
2159 goto out_no_maps;
2160
2161 /* removes queue from unused CPUs */
2162 for_each_possible_cpu(cpu) {
2163 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2164 continue;
2165
2166 if (remove_xps_queue(dev_maps, cpu, index))
2167 active = true;
2168 }
2169
2170 /* free map if not active */
2171 if (!active) {
2172 RCU_INIT_POINTER(dev->xps_maps, NULL);
2173 kfree_rcu(dev_maps, rcu);
2174 }
2175
2176 out_no_maps:
2177 mutex_unlock(&xps_map_mutex);
2178
2179 return 0;
2180 error:
2181 /* remove any maps that we added */
2182 for_each_possible_cpu(cpu) {
2183 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2184 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2185 NULL;
2186 if (new_map && new_map != map)
2187 kfree(new_map);
2188 }
2189
2190 mutex_unlock(&xps_map_mutex);
2191
2192 kfree(new_dev_maps);
2193 return -ENOMEM;
2194 }
2195 EXPORT_SYMBOL(netif_set_xps_queue);
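/*
 * Illustrative sketch (not part of this file): a driver that pins each
 * TX queue to one CPU can publish that affinity through XPS.  This
 * assumes queue i is serviced from CPU i; real code should also check
 * the return value.
 *
 *	for (i = 0; i < dev->real_num_tx_queues; i++)
 *		netif_set_xps_queue(dev, cpumask_of(i), i);
 */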
2196
2197 #endif
2198 /*
2199 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2200 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2201 */
2202 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2203 {
2204 bool disabling;
2205 int rc;
2206
2207 disabling = txq < dev->real_num_tx_queues;
2208
2209 if (txq < 1 || txq > dev->num_tx_queues)
2210 return -EINVAL;
2211
2212 if (dev->reg_state == NETREG_REGISTERED ||
2213 dev->reg_state == NETREG_UNREGISTERING) {
2214 ASSERT_RTNL();
2215
2216 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2217 txq);
2218 if (rc)
2219 return rc;
2220
2221 if (dev->num_tc)
2222 netif_setup_tc(dev, txq);
2223
2224 dev->real_num_tx_queues = txq;
2225
2226 if (disabling) {
2227 synchronize_net();
2228 qdisc_reset_all_tx_gt(dev, txq);
2229 #ifdef CONFIG_XPS
2230 netif_reset_xps_queues_gt(dev, txq);
2231 #endif
2232 }
2233 } else {
2234 dev->real_num_tx_queues = txq;
2235 }
2236
2237 return 0;
2238 }
2239 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
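/*
 * Illustrative sketch (not part of this file): changing the number of
 * active TX queues on a registered device must happen under RTNL, e.g.
 * from an ethtool channel-count handler (which already runs with RTNL
 * held) or explicitly as below.
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, new_txq);
 *	rtnl_unlock();
 */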
2240
2241 #ifdef CONFIG_SYSFS
2242 /**
2243 * netif_set_real_num_rx_queues - set actual number of RX queues used
2244 * @dev: Network device
2245 * @rxq: Actual number of RX queues
2246 *
2247 * This must be called either with the rtnl_lock held or before
2248 * registration of the net device. Returns 0 on success, or a
2249 * negative error code. If called before registration, it always
2250 * succeeds.
2251 */
2252 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2253 {
2254 int rc;
2255
2256 if (rxq < 1 || rxq > dev->num_rx_queues)
2257 return -EINVAL;
2258
2259 if (dev->reg_state == NETREG_REGISTERED) {
2260 ASSERT_RTNL();
2261
2262 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2263 rxq);
2264 if (rc)
2265 return rc;
2266 }
2267
2268 dev->real_num_rx_queues = rxq;
2269 return 0;
2270 }
2271 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2272 #endif
2273
2274 /**
2275 * netif_get_num_default_rss_queues - default number of RSS queues
2276 *
2277 * This routine should set an upper limit on the number of RSS queues
2278 * used by default by multiqueue devices.
2279 */
2280 int netif_get_num_default_rss_queues(void)
2281 {
2282 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2283 }
2284 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
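/*
 * Illustrative sketch (not part of this file): drivers typically clamp
 * their RSS queue count by this helper and by their own hardware limit
 * ("MY_HW_MAX_QUEUES" is a made-up constant).
 *
 *	nqueues = min_t(int, netif_get_num_default_rss_queues(),
 *			MY_HW_MAX_QUEUES);
 */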
2285
2286 static inline void __netif_reschedule(struct Qdisc *q)
2287 {
2288 struct softnet_data *sd;
2289 unsigned long flags;
2290
2291 local_irq_save(flags);
2292 sd = this_cpu_ptr(&softnet_data);
2293 q->next_sched = NULL;
2294 *sd->output_queue_tailp = q;
2295 sd->output_queue_tailp = &q->next_sched;
2296 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2297 local_irq_restore(flags);
2298 }
2299
2300 void __netif_schedule(struct Qdisc *q)
2301 {
2302 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2303 __netif_reschedule(q);
2304 }
2305 EXPORT_SYMBOL(__netif_schedule);
2306
2307 struct dev_kfree_skb_cb {
2308 enum skb_free_reason reason;
2309 };
2310
2311 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2312 {
2313 return (struct dev_kfree_skb_cb *)skb->cb;
2314 }
2315
2316 void netif_schedule_queue(struct netdev_queue *txq)
2317 {
2318 rcu_read_lock();
2319 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2320 struct Qdisc *q = rcu_dereference(txq->qdisc);
2321
2322 __netif_schedule(q);
2323 }
2324 rcu_read_unlock();
2325 }
2326 EXPORT_SYMBOL(netif_schedule_queue);
2327
2328 /**
2329 * netif_wake_subqueue - allow sending packets on subqueue
2330 * @dev: network device
2331 * @queue_index: sub queue index
2332 *
2333 * Resume individual transmit queue of a device with multiple transmit queues.
2334 */
2335 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2336 {
2337 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2338
2339 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2340 struct Qdisc *q;
2341
2342 rcu_read_lock();
2343 q = rcu_dereference(txq->qdisc);
2344 __netif_schedule(q);
2345 rcu_read_unlock();
2346 }
2347 }
2348 EXPORT_SYMBOL(netif_wake_subqueue);
2349
2350 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2351 {
2352 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2353 struct Qdisc *q;
2354
2355 rcu_read_lock();
2356 q = rcu_dereference(dev_queue->qdisc);
2357 __netif_schedule(q);
2358 rcu_read_unlock();
2359 }
2360 }
2361 EXPORT_SYMBOL(netif_tx_wake_queue);
2362
2363 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2364 {
2365 unsigned long flags;
2366
2367 if (unlikely(!skb))
2368 return;
2369
2370 if (likely(atomic_read(&skb->users) == 1)) {
2371 smp_rmb();
2372 atomic_set(&skb->users, 0);
2373 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2374 return;
2375 }
2376 get_kfree_skb_cb(skb)->reason = reason;
2377 local_irq_save(flags);
2378 skb->next = __this_cpu_read(softnet_data.completion_queue);
2379 __this_cpu_write(softnet_data.completion_queue, skb);
2380 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2381 local_irq_restore(flags);
2382 }
2383 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2384
2385 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2386 {
2387 if (in_irq() || irqs_disabled())
2388 __dev_kfree_skb_irq(skb, reason);
2389 else
2390 dev_kfree_skb(skb);
2391 }
2392 EXPORT_SYMBOL(__dev_kfree_skb_any);
2393
2394
2395 /**
2396 * netif_device_detach - mark device as removed
2397 * @dev: network device
2398 *
2399 * Mark device as removed from system and therefore no longer available.
2400 */
2401 void netif_device_detach(struct net_device *dev)
2402 {
2403 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2404 netif_running(dev)) {
2405 netif_tx_stop_all_queues(dev);
2406 }
2407 }
2408 EXPORT_SYMBOL(netif_device_detach);
2409
2410 /**
2411 * netif_device_attach - mark device as attached
2412 * @dev: network device
2413 *
2414 * Mark device as attached to the system and restart it if needed.
2415 */
2416 void netif_device_attach(struct net_device *dev)
2417 {
2418 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2419 netif_running(dev)) {
2420 netif_tx_wake_all_queues(dev);
2421 __netdev_watchdog_up(dev);
2422 }
2423 }
2424 EXPORT_SYMBOL(netif_device_attach);
2425
2426 /*
2427 * Returns a Tx hash based on the given packet descriptor and the number of
2428 * Tx queues to be used as a distribution range.
2429 */
2430 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2431 unsigned int num_tx_queues)
2432 {
2433 u32 hash;
2434 u16 qoffset = 0;
2435 u16 qcount = num_tx_queues;
2436
2437 if (skb_rx_queue_recorded(skb)) {
2438 hash = skb_get_rx_queue(skb);
2439 while (unlikely(hash >= num_tx_queues))
2440 hash -= num_tx_queues;
2441 return hash;
2442 }
2443
2444 if (dev->num_tc) {
2445 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2446 qoffset = dev->tc_to_txq[tc].offset;
2447 qcount = dev->tc_to_txq[tc].count;
2448 }
2449
2450 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2451 }
2452 EXPORT_SYMBOL(__skb_tx_hash);
2453
2454 static void skb_warn_bad_offload(const struct sk_buff *skb)
2455 {
2456 static const netdev_features_t null_features = 0;
2457 struct net_device *dev = skb->dev;
2458 const char *name = "";
2459
2460 if (!net_ratelimit())
2461 return;
2462
2463 if (dev) {
2464 if (dev->dev.parent)
2465 name = dev_driver_string(dev->dev.parent);
2466 else
2467 name = netdev_name(dev);
2468 }
2469 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2470 "gso_type=%d ip_summed=%d\n",
2471 name, dev ? &dev->features : &null_features,
2472 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2473 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2474 skb_shinfo(skb)->gso_type, skb->ip_summed);
2475 }
2476
2477 /*
2478 * Invalidate hardware checksum when packet is to be mangled, and
2479 * complete checksum manually on outgoing path.
2480 */
2481 int skb_checksum_help(struct sk_buff *skb)
2482 {
2483 __wsum csum;
2484 int ret = 0, offset;
2485
2486 if (skb->ip_summed == CHECKSUM_COMPLETE)
2487 goto out_set_summed;
2488
2489 if (unlikely(skb_shinfo(skb)->gso_size)) {
2490 skb_warn_bad_offload(skb);
2491 return -EINVAL;
2492 }
2493
2494 /* Before computing a checksum, we should make sure no frag could
2495 * be modified by an external entity : checksum could be wrong.
2496 */
2497 if (skb_has_shared_frag(skb)) {
2498 ret = __skb_linearize(skb);
2499 if (ret)
2500 goto out;
2501 }
2502
2503 offset = skb_checksum_start_offset(skb);
2504 BUG_ON(offset >= skb_headlen(skb));
2505 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2506
2507 offset += skb->csum_offset;
2508 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2509
2510 if (skb_cloned(skb) &&
2511 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2512 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2513 if (ret)
2514 goto out;
2515 }
2516
2517 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2518 out_set_summed:
2519 skb->ip_summed = CHECKSUM_NONE;
2520 out:
2521 return ret;
2522 }
2523 EXPORT_SYMBOL(skb_checksum_help);
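/*
 * Illustrative sketch (not part of this file): a driver whose hardware
 * cannot checksum a given protocol falls back to software before DMA
 * ("my_hw_can_csum" is a hypothetical capability check).
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb) && skb_checksum_help(skb))
 *		goto drop;		// skb_checksum_help() returns 0 on success
 */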
2524
2525 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2526 {
2527 __be16 type = skb->protocol;
2528
2529 /* Tunnel gso handlers can set protocol to ethernet. */
2530 if (type == htons(ETH_P_TEB)) {
2531 struct ethhdr *eth;
2532
2533 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2534 return 0;
2535
2536 eth = (struct ethhdr *)skb->data;
2537 type = eth->h_proto;
2538 }
2539
2540 return __vlan_get_protocol(skb, type, depth);
2541 }
2542
2543 /**
2544 * skb_mac_gso_segment - mac layer segmentation handler.
2545 * @skb: buffer to segment
2546 * @features: features for the output path (see dev->features)
2547 */
2548 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2549 netdev_features_t features)
2550 {
2551 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2552 struct packet_offload *ptype;
2553 int vlan_depth = skb->mac_len;
2554 __be16 type = skb_network_protocol(skb, &vlan_depth);
2555
2556 if (unlikely(!type))
2557 return ERR_PTR(-EINVAL);
2558
2559 __skb_pull(skb, vlan_depth);
2560
2561 rcu_read_lock();
2562 list_for_each_entry_rcu(ptype, &offload_base, list) {
2563 if (ptype->type == type && ptype->callbacks.gso_segment) {
2564 segs = ptype->callbacks.gso_segment(skb, features);
2565 break;
2566 }
2567 }
2568 rcu_read_unlock();
2569
2570 __skb_push(skb, skb->data - skb_mac_header(skb));
2571
2572 return segs;
2573 }
2574 EXPORT_SYMBOL(skb_mac_gso_segment);
2575
2576
2577 /* openvswitch calls this on rx path, so we need a different check.
2578 */
2579 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2580 {
2581 if (tx_path)
2582 return skb->ip_summed != CHECKSUM_PARTIAL &&
2583 skb->ip_summed != CHECKSUM_UNNECESSARY;
2584
2585 return skb->ip_summed == CHECKSUM_NONE;
2586 }
2587
2588 /**
2589 * __skb_gso_segment - Perform segmentation on skb.
2590 * @skb: buffer to segment
2591 * @features: features for the output path (see dev->features)
2592 * @tx_path: whether it is called in TX path
2593 *
2594 * This function segments the given skb and returns a list of segments.
2595 *
2596 * It may return NULL if the skb requires no segmentation. This is
2597 * only possible when GSO is used for verifying header integrity.
2598 *
2599 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2600 */
2601 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2602 netdev_features_t features, bool tx_path)
2603 {
2604 struct sk_buff *segs;
2605
2606 if (unlikely(skb_needs_check(skb, tx_path))) {
2607 int err;
2608
2609 /* We're going to init ->check field in TCP or UDP header */
2610 err = skb_cow_head(skb, 0);
2611 if (err < 0)
2612 return ERR_PTR(err);
2613 }
2614
2615 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2616 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2617
2618 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2619 SKB_GSO_CB(skb)->encap_level = 0;
2620
2621 skb_reset_mac_header(skb);
2622 skb_reset_mac_len(skb);
2623
2624 segs = skb_mac_gso_segment(skb, features);
2625
2626 if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2627 skb_warn_bad_offload(skb);
2628
2629 return segs;
2630 }
2631 EXPORT_SYMBOL(__skb_gso_segment);
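/*
 * Illustrative sketch (not part of this file): callers segment an skb
 * and then walk the resulting singly linked list via skb->next, much
 * like validate_xmit_skb() below ("my_xmit_one" is a hypothetical
 * transmit helper).
 *
 *	struct sk_buff *segs = skb_gso_segment(skb, features);
 *
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);	// original skb replaced by segments
 *		while (segs) {
 *			struct sk_buff *next = segs->next;
 *
 *			segs->next = NULL;
 *			my_xmit_one(segs);
 *			segs = next;
 *		}
 *	}
 */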
2632
2633 /* Take action when hardware reception checksum errors are detected. */
2634 #ifdef CONFIG_BUG
2635 void netdev_rx_csum_fault(struct net_device *dev)
2636 {
2637 if (net_ratelimit()) {
2638 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2639 dump_stack();
2640 }
2641 }
2642 EXPORT_SYMBOL(netdev_rx_csum_fault);
2643 #endif
2644
2645 /* Actually, we should eliminate this check as soon as we know that:
2646 * 1. An IOMMU is present and allows mapping all the memory.
2647 * 2. No high memory really exists on this machine.
2648 */
2649
2650 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2651 {
2652 #ifdef CONFIG_HIGHMEM
2653 int i;
2654 if (!(dev->features & NETIF_F_HIGHDMA)) {
2655 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2656 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2657 if (PageHighMem(skb_frag_page(frag)))
2658 return 1;
2659 }
2660 }
2661
2662 if (PCI_DMA_BUS_IS_PHYS) {
2663 struct device *pdev = dev->dev.parent;
2664
2665 if (!pdev)
2666 return 0;
2667 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2668 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2669 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2670 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2671 return 1;
2672 }
2673 }
2674 #endif
2675 return 0;
2676 }
2677
2678 /* If MPLS offload request, verify we are testing hardware MPLS features
2679 * instead of standard features for the netdev.
2680 */
2681 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2682 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2683 netdev_features_t features,
2684 __be16 type)
2685 {
2686 if (eth_p_mpls(type))
2687 features &= skb->dev->mpls_features;
2688
2689 return features;
2690 }
2691 #else
2692 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2693 netdev_features_t features,
2694 __be16 type)
2695 {
2696 return features;
2697 }
2698 #endif
2699
2700 static netdev_features_t harmonize_features(struct sk_buff *skb,
2701 netdev_features_t features)
2702 {
2703 int tmp;
2704 __be16 type;
2705
2706 type = skb_network_protocol(skb, &tmp);
2707 features = net_mpls_features(skb, features, type);
2708
2709 if (skb->ip_summed != CHECKSUM_NONE &&
2710 !can_checksum_protocol(features, type)) {
2711 features &= ~NETIF_F_ALL_CSUM;
2712 }
2713 if (illegal_highdma(skb->dev, skb))
2714 features &= ~NETIF_F_SG;
2715
2716 return features;
2717 }
2718
2719 netdev_features_t passthru_features_check(struct sk_buff *skb,
2720 struct net_device *dev,
2721 netdev_features_t features)
2722 {
2723 return features;
2724 }
2725 EXPORT_SYMBOL(passthru_features_check);
2726
2727 static netdev_features_t dflt_features_check(struct sk_buff *skb,
2728 struct net_device *dev,
2729 netdev_features_t features)
2730 {
2731 return vlan_features_check(skb, features);
2732 }
2733
2734 netdev_features_t netif_skb_features(struct sk_buff *skb)
2735 {
2736 struct net_device *dev = skb->dev;
2737 netdev_features_t features = dev->features;
2738 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2739
2740 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2741 features &= ~NETIF_F_GSO_MASK;
2742
2743 /* If encapsulation offload request, verify we are testing
2744 * hardware encapsulation features instead of standard
2745 * features for the netdev
2746 */
2747 if (skb->encapsulation)
2748 features &= dev->hw_enc_features;
2749
2750 if (skb_vlan_tagged(skb))
2751 features = netdev_intersect_features(features,
2752 dev->vlan_features |
2753 NETIF_F_HW_VLAN_CTAG_TX |
2754 NETIF_F_HW_VLAN_STAG_TX);
2755
2756 if (dev->netdev_ops->ndo_features_check)
2757 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2758 features);
2759 else
2760 features &= dflt_features_check(skb, dev, features);
2761
2762 return harmonize_features(skb, features);
2763 }
2764 EXPORT_SYMBOL(netif_skb_features);
2765
2766 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2767 struct netdev_queue *txq, bool more)
2768 {
2769 unsigned int len;
2770 int rc;
2771
2772 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2773 dev_queue_xmit_nit(skb, dev);
2774
2775 len = skb->len;
2776 trace_net_dev_start_xmit(skb, dev);
2777 rc = netdev_start_xmit(skb, dev, txq, more);
2778 trace_net_dev_xmit(skb, rc, dev, len);
2779
2780 return rc;
2781 }
2782
2783 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2784 struct netdev_queue *txq, int *ret)
2785 {
2786 struct sk_buff *skb = first;
2787 int rc = NETDEV_TX_OK;
2788
2789 while (skb) {
2790 struct sk_buff *next = skb->next;
2791
2792 skb->next = NULL;
2793 rc = xmit_one(skb, dev, txq, next != NULL);
2794 if (unlikely(!dev_xmit_complete(rc))) {
2795 skb->next = next;
2796 goto out;
2797 }
2798
2799 skb = next;
2800 if (netif_tx_queue_stopped(txq) && skb) {
2801 rc = NETDEV_TX_BUSY;
2802 break;
2803 }
2804 }
2805
2806 out:
2807 *ret = rc;
2808 return skb;
2809 }
2810
2811 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2812 netdev_features_t features)
2813 {
2814 if (skb_vlan_tag_present(skb) &&
2815 !vlan_hw_offload_capable(features, skb->vlan_proto))
2816 skb = __vlan_hwaccel_push_inside(skb);
2817 return skb;
2818 }
2819
2820 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2821 {
2822 netdev_features_t features;
2823
2824 if (skb->next)
2825 return skb;
2826
2827 features = netif_skb_features(skb);
2828 skb = validate_xmit_vlan(skb, features);
2829 if (unlikely(!skb))
2830 goto out_null;
2831
2832 if (netif_needs_gso(skb, features)) {
2833 struct sk_buff *segs;
2834
2835 segs = skb_gso_segment(skb, features);
2836 if (IS_ERR(segs)) {
2837 goto out_kfree_skb;
2838 } else if (segs) {
2839 consume_skb(skb);
2840 skb = segs;
2841 }
2842 } else {
2843 if (skb_needs_linearize(skb, features) &&
2844 __skb_linearize(skb))
2845 goto out_kfree_skb;
2846
2847 /* If packet is not checksummed and device does not
2848 * support checksumming for this protocol, complete
2849 * checksumming here.
2850 */
2851 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2852 if (skb->encapsulation)
2853 skb_set_inner_transport_header(skb,
2854 skb_checksum_start_offset(skb));
2855 else
2856 skb_set_transport_header(skb,
2857 skb_checksum_start_offset(skb));
2858 if (!(features & NETIF_F_ALL_CSUM) &&
2859 skb_checksum_help(skb))
2860 goto out_kfree_skb;
2861 }
2862 }
2863
2864 return skb;
2865
2866 out_kfree_skb:
2867 kfree_skb(skb);
2868 out_null:
2869 return NULL;
2870 }
2871
2872 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2873 {
2874 struct sk_buff *next, *head = NULL, *tail;
2875
2876 for (; skb != NULL; skb = next) {
2877 next = skb->next;
2878 skb->next = NULL;
2879
2880 /* in case skb won't be segmented, point to itself */
2881 skb->prev = skb;
2882
2883 skb = validate_xmit_skb(skb, dev);
2884 if (!skb)
2885 continue;
2886
2887 if (!head)
2888 head = skb;
2889 else
2890 tail->next = skb;
2891 /* If skb was segmented, skb->prev points to
2892 * the last segment. If not, it still contains skb.
2893 */
2894 tail = skb->prev;
2895 }
2896 return head;
2897 }
2898 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2899
2900 static void qdisc_pkt_len_init(struct sk_buff *skb)
2901 {
2902 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2903
2904 qdisc_skb_cb(skb)->pkt_len = skb->len;
2905
2906 /* To get a more precise estimate of the bytes sent on the wire,
2907 * we add the header size of all segments to pkt_len
2908 */
2909 if (shinfo->gso_size) {
2910 unsigned int hdr_len;
2911 u16 gso_segs = shinfo->gso_segs;
2912
2913 /* mac layer + network layer */
2914 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2915
2916 /* + transport layer */
2917 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
2918 const struct tcphdr *th;
2919 struct tcphdr _tcphdr;
2920
2921 th = skb_header_pointer(skb, skb_transport_offset(skb),
2922 sizeof(_tcphdr), &_tcphdr);
2923 if (likely(th))
2924 hdr_len += __tcp_hdrlen(th);
2925 } else {
2926 struct udphdr _udphdr;
2927
2928 if (skb_header_pointer(skb, skb_transport_offset(skb),
2929 sizeof(_udphdr), &_udphdr))
2930 hdr_len += sizeof(struct udphdr);
2931 }
2932
2933 if (shinfo->gso_type & SKB_GSO_DODGY)
2934 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2935 shinfo->gso_size);
2936
2937 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2938 }
2939 }
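/*
 * Worked example (illustrative): a TCP skb with 66 bytes of MAC+IP+TCP
 * headers, gso_size 1448 and gso_segs 3 has skb->len = 66 + 3 * 1448 =
 * 4410.  qdisc_pkt_len_init() adds (3 - 1) * 66 = 132, so pkt_len =
 * 4542 = 3 * 1514, i.e. the bytes that will actually hit the wire.
 */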
2940
2941 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2942 struct net_device *dev,
2943 struct netdev_queue *txq)
2944 {
2945 spinlock_t *root_lock = qdisc_lock(q);
2946 bool contended;
2947 int rc;
2948
2949 qdisc_pkt_len_init(skb);
2950 qdisc_calculate_pkt_len(skb, q);
2951 /*
2952 * Heuristic to force contended enqueues to serialize on a
2953 * separate lock before trying to get qdisc main lock.
2954 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2955 * often and dequeue packets faster.
2956 */
2957 contended = qdisc_is_running(q);
2958 if (unlikely(contended))
2959 spin_lock(&q->busylock);
2960
2961 spin_lock(root_lock);
2962 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2963 kfree_skb(skb);
2964 rc = NET_XMIT_DROP;
2965 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2966 qdisc_run_begin(q)) {
2967 /*
2968 * This is a work-conserving queue; there are no old skbs
2969 * waiting to be sent out; and the qdisc is not running -
2970 * xmit the skb directly.
2971 */
2972
2973 qdisc_bstats_update(q, skb);
2974
2975 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2976 if (unlikely(contended)) {
2977 spin_unlock(&q->busylock);
2978 contended = false;
2979 }
2980 __qdisc_run(q);
2981 } else
2982 qdisc_run_end(q);
2983
2984 rc = NET_XMIT_SUCCESS;
2985 } else {
2986 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2987 if (qdisc_run_begin(q)) {
2988 if (unlikely(contended)) {
2989 spin_unlock(&q->busylock);
2990 contended = false;
2991 }
2992 __qdisc_run(q);
2993 }
2994 }
2995 spin_unlock(root_lock);
2996 if (unlikely(contended))
2997 spin_unlock(&q->busylock);
2998 return rc;
2999 }
3000
3001 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3002 static void skb_update_prio(struct sk_buff *skb)
3003 {
3004 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3005
3006 if (!skb->priority && skb->sk && map) {
3007 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
3008
3009 if (prioidx < map->priomap_len)
3010 skb->priority = map->priomap[prioidx];
3011 }
3012 }
3013 #else
3014 #define skb_update_prio(skb)
3015 #endif
3016
3017 DEFINE_PER_CPU(int, xmit_recursion);
3018 EXPORT_SYMBOL(xmit_recursion);
3019
3020 #define RECURSION_LIMIT 8
3021
3022 /**
3023 * dev_loopback_xmit - loop back @skb
3024 * @net: network namespace this loopback is happening in
3025 * @sk: sk needed to be a netfilter okfn
3026 * @skb: buffer to transmit
3027 */
3028 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3029 {
3030 skb_reset_mac_header(skb);
3031 __skb_pull(skb, skb_network_offset(skb));
3032 skb->pkt_type = PACKET_LOOPBACK;
3033 skb->ip_summed = CHECKSUM_UNNECESSARY;
3034 WARN_ON(!skb_dst(skb));
3035 skb_dst_force(skb);
3036 netif_rx_ni(skb);
3037 return 0;
3038 }
3039 EXPORT_SYMBOL(dev_loopback_xmit);
3040
3041 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3042 {
3043 #ifdef CONFIG_XPS
3044 struct xps_dev_maps *dev_maps;
3045 struct xps_map *map;
3046 int queue_index = -1;
3047
3048 rcu_read_lock();
3049 dev_maps = rcu_dereference(dev->xps_maps);
3050 if (dev_maps) {
3051 map = rcu_dereference(
3052 dev_maps->cpu_map[skb->sender_cpu - 1]);
3053 if (map) {
3054 if (map->len == 1)
3055 queue_index = map->queues[0];
3056 else
3057 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3058 map->len)];
3059 if (unlikely(queue_index >= dev->real_num_tx_queues))
3060 queue_index = -1;
3061 }
3062 }
3063 rcu_read_unlock();
3064
3065 return queue_index;
3066 #else
3067 return -1;
3068 #endif
3069 }
3070
3071 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3072 {
3073 struct sock *sk = skb->sk;
3074 int queue_index = sk_tx_queue_get(sk);
3075
3076 if (queue_index < 0 || skb->ooo_okay ||
3077 queue_index >= dev->real_num_tx_queues) {
3078 int new_index = get_xps_queue(dev, skb);
3079 if (new_index < 0)
3080 new_index = skb_tx_hash(dev, skb);
3081
3082 if (queue_index != new_index && sk &&
3083 sk_fullsock(sk) &&
3084 rcu_access_pointer(sk->sk_dst_cache))
3085 sk_tx_queue_set(sk, new_index);
3086
3087 queue_index = new_index;
3088 }
3089
3090 return queue_index;
3091 }
3092
3093 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3094 struct sk_buff *skb,
3095 void *accel_priv)
3096 {
3097 int queue_index = 0;
3098
3099 #ifdef CONFIG_XPS
3100 u32 sender_cpu = skb->sender_cpu - 1;
3101
3102 if (sender_cpu >= (u32)NR_CPUS)
3103 skb->sender_cpu = raw_smp_processor_id() + 1;
3104 #endif
3105
3106 if (dev->real_num_tx_queues != 1) {
3107 const struct net_device_ops *ops = dev->netdev_ops;
3108 if (ops->ndo_select_queue)
3109 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3110 __netdev_pick_tx);
3111 else
3112 queue_index = __netdev_pick_tx(dev, skb);
3113
3114 if (!accel_priv)
3115 queue_index = netdev_cap_txqueue(dev, queue_index);
3116 }
3117
3118 skb_set_queue_mapping(skb, queue_index);
3119 return netdev_get_tx_queue(dev, queue_index);
3120 }
3121
3122 /**
3123 * __dev_queue_xmit - transmit a buffer
3124 * @skb: buffer to transmit
3125 * @accel_priv: private data used for L2 forwarding offload
3126 *
3127 * Queue a buffer for transmission to a network device. The caller must
3128 * have set the device and priority and built the buffer before calling
3129 * this function. The function can be called from an interrupt.
3130 *
3131 * A negative errno code is returned on a failure. A success does not
3132 * guarantee the frame will be transmitted as it may be dropped due
3133 * to congestion or traffic shaping.
3134 *
3135 * -----------------------------------------------------------------------------------
3136 * I notice this method can also return errors from the queue disciplines,
3137 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3138 * be positive.
3139 *
3140 * Regardless of the return value, the skb is consumed, so it is currently
3141 * difficult to retry a send to this method. (You can bump the ref count
3142 * before sending to hold a reference for retry if you are careful.)
3143 *
3144 * When calling this method, interrupts MUST be enabled. This is because
3145 * the BH enable code must have IRQs enabled so that it will not deadlock.
3146 * --BLG
3147 */
3148 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3149 {
3150 struct net_device *dev = skb->dev;
3151 struct netdev_queue *txq;
3152 struct Qdisc *q;
3153 int rc = -ENOMEM;
3154
3155 skb_reset_mac_header(skb);
3156
3157 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3158 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3159
3160 /* Disable soft irqs for various locks below. Also
3161 * stops preemption for RCU.
3162 */
3163 rcu_read_lock_bh();
3164
3165 skb_update_prio(skb);
3166
3167 /* If device/qdisc don't need skb->dst, release it right now while
3168 * it's hot in this CPU's cache.
3169 */
3170 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3171 skb_dst_drop(skb);
3172 else
3173 skb_dst_force(skb);
3174
3175 #ifdef CONFIG_NET_SWITCHDEV
3176 /* Don't forward if offload device already forwarded */
3177 if (skb->offload_fwd_mark &&
3178 skb->offload_fwd_mark == dev->offload_fwd_mark) {
3179 consume_skb(skb);
3180 rc = NET_XMIT_SUCCESS;
3181 goto out;
3182 }
3183 #endif
3184
3185 txq = netdev_pick_tx(dev, skb, accel_priv);
3186 q = rcu_dereference_bh(txq->qdisc);
3187
3188 #ifdef CONFIG_NET_CLS_ACT
3189 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3190 #endif
3191 trace_net_dev_queue(skb);
3192 if (q->enqueue) {
3193 rc = __dev_xmit_skb(skb, q, dev, txq);
3194 goto out;
3195 }
3196
3197 /* The device has no queue. Common case for software devices:
3198 loopback, all sorts of tunnels...
3199
3200 Really, it is unlikely that netif_tx_lock protection is necessary
3201 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
3202 counters.)
3203 However, it is possible that they rely on the protection
3204 made by us here.
3205
3206 Check this and shoot the lock. It is not prone to deadlocks.
3207 Or shoot the noqueue qdisc - it is even simpler 8)
3208 */
3209 if (dev->flags & IFF_UP) {
3210 int cpu = smp_processor_id(); /* ok because BHs are off */
3211
3212 if (txq->xmit_lock_owner != cpu) {
3213
3214 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3215 goto recursion_alert;
3216
3217 skb = validate_xmit_skb(skb, dev);
3218 if (!skb)
3219 goto drop;
3220
3221 HARD_TX_LOCK(dev, txq, cpu);
3222
3223 if (!netif_xmit_stopped(txq)) {
3224 __this_cpu_inc(xmit_recursion);
3225 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3226 __this_cpu_dec(xmit_recursion);
3227 if (dev_xmit_complete(rc)) {
3228 HARD_TX_UNLOCK(dev, txq);
3229 goto out;
3230 }
3231 }
3232 HARD_TX_UNLOCK(dev, txq);
3233 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3234 dev->name);
3235 } else {
3236 /* Recursion is detected! It is possible,
3237 * unfortunately
3238 */
3239 recursion_alert:
3240 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3241 dev->name);
3242 }
3243 }
3244
3245 rc = -ENETDOWN;
3246 drop:
3247 rcu_read_unlock_bh();
3248
3249 atomic_long_inc(&dev->tx_dropped);
3250 kfree_skb_list(skb);
3251 return rc;
3252 out:
3253 rcu_read_unlock_bh();
3254 return rc;
3255 }
3256
3257 int dev_queue_xmit(struct sk_buff *skb)
3258 {
3259 return __dev_queue_xmit(skb, NULL);
3260 }
3261 EXPORT_SYMBOL(dev_queue_xmit);
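/*
 * Illustrative sketch (not part of this file): a minimal raw transmit,
 * assuming the caller holds "dev" and supplies a complete L2 frame;
 * "frame", "len" and "proto" are assumptions of this example.
 *
 *	struct sk_buff *skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
 *
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, LL_RESERVED_SPACE(dev));
 *	memcpy(skb_put(skb, len), frame, len);
 *	skb->dev = dev;
 *	skb->protocol = proto;
 *	return dev_queue_xmit(skb);	// skb is consumed regardless of outcome
 */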
3262
3263 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3264 {
3265 return __dev_queue_xmit(skb, accel_priv);
3266 }
3267 EXPORT_SYMBOL(dev_queue_xmit_accel);
3268
3269
3270 /*=======================================================================
3271 Receiver routines
3272 =======================================================================*/
3273
3274 int netdev_max_backlog __read_mostly = 1000;
3275 EXPORT_SYMBOL(netdev_max_backlog);
3276
3277 int netdev_tstamp_prequeue __read_mostly = 1;
3278 int netdev_budget __read_mostly = 300;
3279 int weight_p __read_mostly = 64; /* old backlog weight */
3280
3281 /* Called with irq disabled */
3282 static inline void ____napi_schedule(struct softnet_data *sd,
3283 struct napi_struct *napi)
3284 {
3285 list_add_tail(&napi->poll_list, &sd->poll_list);
3286 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3287 }
3288
3289 #ifdef CONFIG_RPS
3290
3291 /* One global table that all flow-based protocols share. */
3292 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3293 EXPORT_SYMBOL(rps_sock_flow_table);
3294 u32 rps_cpu_mask __read_mostly;
3295 EXPORT_SYMBOL(rps_cpu_mask);
3296
3297 struct static_key rps_needed __read_mostly;
3298
3299 static struct rps_dev_flow *
3300 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3301 struct rps_dev_flow *rflow, u16 next_cpu)
3302 {
3303 if (next_cpu < nr_cpu_ids) {
3304 #ifdef CONFIG_RFS_ACCEL
3305 struct netdev_rx_queue *rxqueue;
3306 struct rps_dev_flow_table *flow_table;
3307 struct rps_dev_flow *old_rflow;
3308 u32 flow_id;
3309 u16 rxq_index;
3310 int rc;
3311
3312 /* Should we steer this flow to a different hardware queue? */
3313 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3314 !(dev->features & NETIF_F_NTUPLE))
3315 goto out;
3316 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3317 if (rxq_index == skb_get_rx_queue(skb))
3318 goto out;
3319
3320 rxqueue = dev->_rx + rxq_index;
3321 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3322 if (!flow_table)
3323 goto out;
3324 flow_id = skb_get_hash(skb) & flow_table->mask;
3325 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3326 rxq_index, flow_id);
3327 if (rc < 0)
3328 goto out;
3329 old_rflow = rflow;
3330 rflow = &flow_table->flows[flow_id];
3331 rflow->filter = rc;
3332 if (old_rflow->filter == rflow->filter)
3333 old_rflow->filter = RPS_NO_FILTER;
3334 out:
3335 #endif
3336 rflow->last_qtail =
3337 per_cpu(softnet_data, next_cpu).input_queue_head;
3338 }
3339
3340 rflow->cpu = next_cpu;
3341 return rflow;
3342 }
3343
3344 /*
3345 * get_rps_cpu is called from netif_receive_skb and returns the target
3346 * CPU from the RPS map of the receiving queue for a given skb.
3347 * rcu_read_lock must be held on entry.
3348 */
3349 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3350 struct rps_dev_flow **rflowp)
3351 {
3352 const struct rps_sock_flow_table *sock_flow_table;
3353 struct netdev_rx_queue *rxqueue = dev->_rx;
3354 struct rps_dev_flow_table *flow_table;
3355 struct rps_map *map;
3356 int cpu = -1;
3357 u32 tcpu;
3358 u32 hash;
3359
3360 if (skb_rx_queue_recorded(skb)) {
3361 u16 index = skb_get_rx_queue(skb);
3362
3363 if (unlikely(index >= dev->real_num_rx_queues)) {
3364 WARN_ONCE(dev->real_num_rx_queues > 1,
3365 "%s received packet on queue %u, but number "
3366 "of RX queues is %u\n",
3367 dev->name, index, dev->real_num_rx_queues);
3368 goto done;
3369 }
3370 rxqueue += index;
3371 }
3372
3373 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3374
3375 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3376 map = rcu_dereference(rxqueue->rps_map);
3377 if (!flow_table && !map)
3378 goto done;
3379
3380 skb_reset_network_header(skb);
3381 hash = skb_get_hash(skb);
3382 if (!hash)
3383 goto done;
3384
3385 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3386 if (flow_table && sock_flow_table) {
3387 struct rps_dev_flow *rflow;
3388 u32 next_cpu;
3389 u32 ident;
3390
3391 /* First check into global flow table if there is a match */
3392 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3393 if ((ident ^ hash) & ~rps_cpu_mask)
3394 goto try_rps;
3395
3396 next_cpu = ident & rps_cpu_mask;
3397
3398 /* OK, now we know there is a match,
3399 * we can look at the local (per receive queue) flow table
3400 */
3401 rflow = &flow_table->flows[hash & flow_table->mask];
3402 tcpu = rflow->cpu;
3403
3404 /*
3405 * If the desired CPU (where last recvmsg was done) is
3406 * different from current CPU (one in the rx-queue flow
3407 * table entry), switch if one of the following holds:
3408 * - Current CPU is unset (>= nr_cpu_ids).
3409 * - Current CPU is offline.
3410 * - The current CPU's queue tail has advanced beyond the
3411 * last packet that was enqueued using this table entry.
3412 * This guarantees that all previous packets for the flow
3413 * have been dequeued, thus preserving in order delivery.
3414 */
3415 if (unlikely(tcpu != next_cpu) &&
3416 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3417 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3418 rflow->last_qtail)) >= 0)) {
3419 tcpu = next_cpu;
3420 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3421 }
3422
3423 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3424 *rflowp = rflow;
3425 cpu = tcpu;
3426 goto done;
3427 }
3428 }
3429
3430 try_rps:
3431
3432 if (map) {
3433 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3434 if (cpu_online(tcpu)) {
3435 cpu = tcpu;
3436 goto done;
3437 }
3438 }
3439
3440 done:
3441 return cpu;
3442 }
3443
3444 #ifdef CONFIG_RFS_ACCEL
3445
3446 /**
3447 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3448 * @dev: Device on which the filter was set
3449 * @rxq_index: RX queue index
3450 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3451 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3452 *
3453 * Drivers that implement ndo_rx_flow_steer() should periodically call
3454 * this function for each installed filter and remove the filters for
3455 * which it returns %true.
3456 */
3457 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3458 u32 flow_id, u16 filter_id)
3459 {
3460 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3461 struct rps_dev_flow_table *flow_table;
3462 struct rps_dev_flow *rflow;
3463 bool expire = true;
3464 unsigned int cpu;
3465
3466 rcu_read_lock();
3467 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3468 if (flow_table && flow_id <= flow_table->mask) {
3469 rflow = &flow_table->flows[flow_id];
3470 cpu = ACCESS_ONCE(rflow->cpu);
3471 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3472 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3473 rflow->last_qtail) <
3474 (int)(10 * flow_table->mask)))
3475 expire = false;
3476 }
3477 rcu_read_unlock();
3478 return expire;
3479 }
3480 EXPORT_SYMBOL(rps_may_expire_flow);
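/*
 * Illustrative sketch (not part of this file): a driver with hardware
 * flow steering periodically scans its installed filters and asks RFS
 * whether each one may be removed.  The "priv", "f" and
 * "my_hw_remove_filter" names are assumptions of this example.
 *
 *	list_for_each_entry_safe(f, tmp, &priv->filters, list) {
 *		if (rps_may_expire_flow(priv->dev, f->rxq, f->flow_id, f->id)) {
 *			my_hw_remove_filter(priv, f);
 *			list_del(&f->list);
 *			kfree(f);
 *		}
 *	}
 */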
3481
3482 #endif /* CONFIG_RFS_ACCEL */
3483
3484 /* Called from hardirq (IPI) context */
3485 static void rps_trigger_softirq(void *data)
3486 {
3487 struct softnet_data *sd = data;
3488
3489 ____napi_schedule(sd, &sd->backlog);
3490 sd->received_rps++;
3491 }
3492
3493 #endif /* CONFIG_RPS */
3494
3495 /*
3496 * Check if this softnet_data structure belongs to another CPU.
3497 * If yes, queue it to our IPI list and return 1,
3498 * otherwise return 0.
3499 */
3500 static int rps_ipi_queued(struct softnet_data *sd)
3501 {
3502 #ifdef CONFIG_RPS
3503 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3504
3505 if (sd != mysd) {
3506 sd->rps_ipi_next = mysd->rps_ipi_list;
3507 mysd->rps_ipi_list = sd;
3508
3509 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3510 return 1;
3511 }
3512 #endif /* CONFIG_RPS */
3513 return 0;
3514 }
3515
3516 #ifdef CONFIG_NET_FLOW_LIMIT
3517 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3518 #endif
3519
3520 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3521 {
3522 #ifdef CONFIG_NET_FLOW_LIMIT
3523 struct sd_flow_limit *fl;
3524 struct softnet_data *sd;
3525 unsigned int old_flow, new_flow;
3526
3527 if (qlen < (netdev_max_backlog >> 1))
3528 return false;
3529
3530 sd = this_cpu_ptr(&softnet_data);
3531
3532 rcu_read_lock();
3533 fl = rcu_dereference(sd->flow_limit);
3534 if (fl) {
3535 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3536 old_flow = fl->history[fl->history_head];
3537 fl->history[fl->history_head] = new_flow;
3538
3539 fl->history_head++;
3540 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3541
3542 if (likely(fl->buckets[old_flow]))
3543 fl->buckets[old_flow]--;
3544
3545 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3546 fl->count++;
3547 rcu_read_unlock();
3548 return true;
3549 }
3550 }
3551 rcu_read_unlock();
3552 #endif
3553 return false;
3554 }
3555
3556 /*
3557 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3558 * queue (may be a remote CPU queue).
3559 */
3560 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3561 unsigned int *qtail)
3562 {
3563 struct softnet_data *sd;
3564 unsigned long flags;
3565 unsigned int qlen;
3566
3567 sd = &per_cpu(softnet_data, cpu);
3568
3569 local_irq_save(flags);
3570
3571 rps_lock(sd);
3572 if (!netif_running(skb->dev))
3573 goto drop;
3574 qlen = skb_queue_len(&sd->input_pkt_queue);
3575 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3576 if (qlen) {
3577 enqueue:
3578 __skb_queue_tail(&sd->input_pkt_queue, skb);
3579 input_queue_tail_incr_save(sd, qtail);
3580 rps_unlock(sd);
3581 local_irq_restore(flags);
3582 return NET_RX_SUCCESS;
3583 }
3584
3585 /* Schedule NAPI for backlog device
3586 * We can use a non-atomic operation since we own the queue lock
3587 */
3588 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3589 if (!rps_ipi_queued(sd))
3590 ____napi_schedule(sd, &sd->backlog);
3591 }
3592 goto enqueue;
3593 }
3594
3595 drop:
3596 sd->dropped++;
3597 rps_unlock(sd);
3598
3599 local_irq_restore(flags);
3600
3601 atomic_long_inc(&skb->dev->rx_dropped);
3602 kfree_skb(skb);
3603 return NET_RX_DROP;
3604 }
3605
3606 static int netif_rx_internal(struct sk_buff *skb)
3607 {
3608 int ret;
3609
3610 net_timestamp_check(netdev_tstamp_prequeue, skb);
3611
3612 trace_netif_rx(skb);
3613 #ifdef CONFIG_RPS
3614 if (static_key_false(&rps_needed)) {
3615 struct rps_dev_flow voidflow, *rflow = &voidflow;
3616 int cpu;
3617
3618 preempt_disable();
3619 rcu_read_lock();
3620
3621 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3622 if (cpu < 0)
3623 cpu = smp_processor_id();
3624
3625 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3626
3627 rcu_read_unlock();
3628 preempt_enable();
3629 } else
3630 #endif
3631 {
3632 unsigned int qtail;
3633 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3634 put_cpu();
3635 }
3636 return ret;
3637 }
3638
3639 /**
3640 * netif_rx - post buffer to the network code
3641 * @skb: buffer to post
3642 *
3643 * This function receives a packet from a device driver and queues it for
3644 * the upper (protocol) levels to process. It always succeeds. The buffer
3645 * may be dropped during processing for congestion control or by the
3646 * protocol layers.
3647 *
3648 * return values:
3649 * NET_RX_SUCCESS (no congestion)
3650 * NET_RX_DROP (packet was dropped)
3651 *
3652 */
3653
3654 int netif_rx(struct sk_buff *skb)
3655 {
3656 trace_netif_rx_entry(skb);
3657
3658 return netif_rx_internal(skb);
3659 }
3660 EXPORT_SYMBOL(netif_rx);
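/*
 * Illustrative sketch (not part of this file): a non-NAPI driver's RX
 * interrupt path hands frames to the stack with netif_rx().  The
 * "pkt_len" and "rx_buf" names are assumptions of this example.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb)
 *		return;			// frame dropped; account it in rx_dropped
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */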
3661
3662 int netif_rx_ni(struct sk_buff *skb)
3663 {
3664 int err;
3665
3666 trace_netif_rx_ni_entry(skb);
3667
3668 preempt_disable();
3669 err = netif_rx_internal(skb);
3670 if (local_softirq_pending())
3671 do_softirq();
3672 preempt_enable();
3673
3674 return err;
3675 }
3676 EXPORT_SYMBOL(netif_rx_ni);
3677
3678 static void net_tx_action(struct softirq_action *h)
3679 {
3680 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3681
3682 if (sd->completion_queue) {
3683 struct sk_buff *clist;
3684
3685 local_irq_disable();
3686 clist = sd->completion_queue;
3687 sd->completion_queue = NULL;
3688 local_irq_enable();
3689
3690 while (clist) {
3691 struct sk_buff *skb = clist;
3692 clist = clist->next;
3693
3694 WARN_ON(atomic_read(&skb->users));
3695 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3696 trace_consume_skb(skb);
3697 else
3698 trace_kfree_skb(skb, net_tx_action);
3699 __kfree_skb(skb);
3700 }
3701 }
3702
3703 if (sd->output_queue) {
3704 struct Qdisc *head;
3705
3706 local_irq_disable();
3707 head = sd->output_queue;
3708 sd->output_queue = NULL;
3709 sd->output_queue_tailp = &sd->output_queue;
3710 local_irq_enable();
3711
3712 while (head) {
3713 struct Qdisc *q = head;
3714 spinlock_t *root_lock;
3715
3716 head = head->next_sched;
3717
3718 root_lock = qdisc_lock(q);
3719 if (spin_trylock(root_lock)) {
3720 smp_mb__before_atomic();
3721 clear_bit(__QDISC_STATE_SCHED,
3722 &q->state);
3723 qdisc_run(q);
3724 spin_unlock(root_lock);
3725 } else {
3726 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3727 &q->state)) {
3728 __netif_reschedule(q);
3729 } else {
3730 smp_mb__before_atomic();
3731 clear_bit(__QDISC_STATE_SCHED,
3732 &q->state);
3733 }
3734 }
3735 }
3736 }
3737 }
3738
3739 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3740 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3741 /* This hook is defined here for ATM LANE */
3742 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3743 unsigned char *addr) __read_mostly;
3744 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3745 #endif
3746
3747 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3748 struct packet_type **pt_prev,
3749 int *ret, struct net_device *orig_dev)
3750 {
3751 #ifdef CONFIG_NET_CLS_ACT
3752 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3753 struct tcf_result cl_res;
3754
3755 /* If there's at least one ingress present somewhere (so
3756 * we get here via enabled static key), remaining devices
3757 * that are not configured with an ingress qdisc will bail
3758 * out here.
3759 */
3760 if (!cl)
3761 return skb;
3762 if (*pt_prev) {
3763 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3764 *pt_prev = NULL;
3765 }
3766
3767 qdisc_skb_cb(skb)->pkt_len = skb->len;
3768 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3769 qdisc_bstats_cpu_update(cl->q, skb);
3770
3771 switch (tc_classify(skb, cl, &cl_res, false)) {
3772 case TC_ACT_OK:
3773 case TC_ACT_RECLASSIFY:
3774 skb->tc_index = TC_H_MIN(cl_res.classid);
3775 break;
3776 case TC_ACT_SHOT:
3777 qdisc_qstats_cpu_drop(cl->q);
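		/* fall through */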
3778 case TC_ACT_STOLEN:
3779 case TC_ACT_QUEUED:
3780 kfree_skb(skb);
3781 return NULL;
3782 case TC_ACT_REDIRECT:
3783 /* skb_mac_header check was done by cls/act_bpf, so
3784 * we can safely push the L2 header back before
3785 * redirecting to another netdev
3786 */
3787 __skb_push(skb, skb->mac_len);
3788 skb_do_redirect(skb);
3789 return NULL;
3790 default:
3791 break;
3792 }
3793 #endif /* CONFIG_NET_CLS_ACT */
3794 return skb;
3795 }
3796
3797 /**
3798 * netdev_is_rx_handler_busy - check if receive handler is registered
3799 * @dev: device to check
3800 *
3801 * Check if a receive handler is already registered for a given device.
3802 * Return true if there is one.
3803 *
3804 * The caller must hold the rtnl_mutex.
3805 */
3806 bool netdev_is_rx_handler_busy(struct net_device *dev)
3807 {
3808 ASSERT_RTNL();
3809 return dev && rtnl_dereference(dev->rx_handler);
3810 }
3811 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3812
3813 /**
3814 * netdev_rx_handler_register - register receive handler
3815 * @dev: device to register a handler for
3816 * @rx_handler: receive handler to register
3817 * @rx_handler_data: data pointer that is used by rx handler
3818 *
3819 * Register a receive handler for a device. This handler will then be
3820 * called from __netif_receive_skb. A negative errno code is returned
3821 * on a failure.
3822 *
3823 * The caller must hold the rtnl_mutex.
3824 *
3825 * For a general description of rx_handler, see enum rx_handler_result.
3826 */
3827 int netdev_rx_handler_register(struct net_device *dev,
3828 rx_handler_func_t *rx_handler,
3829 void *rx_handler_data)
3830 {
3831 ASSERT_RTNL();
3832
3833 if (dev->rx_handler)
3834 return -EBUSY;
3835
3836 /* Note: rx_handler_data must be set before rx_handler */
3837 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3838 rcu_assign_pointer(dev->rx_handler, rx_handler);
3839
3840 return 0;
3841 }
3842 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
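/* Illustrative sketch (not part of the original file): an upper device such
 * as a bridge- or bonding-like driver attaches itself to a lower device by
 * registering an rx_handler under RTNL. mydrv_handle_frame(), mydrv_port and
 * upper_dev are hypothetical.
 *
 *	static rx_handler_result_t mydrv_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct mydrv_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		skb->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;	// re-run __netif_receive_skb_core()
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(lower_dev, mydrv_handle_frame, port);
 *	rtnl_unlock();
 */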
3843
3844 /**
3845 * netdev_rx_handler_unregister - unregister receive handler
3846 * @dev: device to unregister a handler from
3847 *
3848 * Unregister a receive handler from a device.
3849 *
3850 * The caller must hold the rtnl_mutex.
3851 */
3852 void netdev_rx_handler_unregister(struct net_device *dev)
3853 {
3854
3855 ASSERT_RTNL();
3856 RCU_INIT_POINTER(dev->rx_handler, NULL);
3857 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3858 * section is guaranteed to see a non-NULL rx_handler_data
3859 * as well.
3860 */
3861 synchronize_net();
3862 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3863 }
3864 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3865
3866 /*
3867 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3868 * the special handling of PFMEMALLOC skbs.
3869 */
3870 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3871 {
3872 switch (skb->protocol) {
3873 case htons(ETH_P_ARP):
3874 case htons(ETH_P_IP):
3875 case htons(ETH_P_IPV6):
3876 case htons(ETH_P_8021Q):
3877 case htons(ETH_P_8021AD):
3878 return true;
3879 default:
3880 return false;
3881 }
3882 }
3883
3884 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3885 int *ret, struct net_device *orig_dev)
3886 {
3887 #ifdef CONFIG_NETFILTER_INGRESS
3888 if (nf_hook_ingress_active(skb)) {
3889 if (*pt_prev) {
3890 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3891 *pt_prev = NULL;
3892 }
3893
3894 return nf_hook_ingress(skb);
3895 }
3896 #endif /* CONFIG_NETFILTER_INGRESS */
3897 return 0;
3898 }
3899
3900 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3901 {
3902 struct packet_type *ptype, *pt_prev;
3903 rx_handler_func_t *rx_handler;
3904 struct net_device *orig_dev;
3905 bool deliver_exact = false;
3906 int ret = NET_RX_DROP;
3907 __be16 type;
3908
3909 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3910
3911 trace_netif_receive_skb(skb);
3912
3913 orig_dev = skb->dev;
3914
3915 skb_reset_network_header(skb);
3916 if (!skb_transport_header_was_set(skb))
3917 skb_reset_transport_header(skb);
3918 skb_reset_mac_len(skb);
3919
3920 pt_prev = NULL;
3921
3922 another_round:
3923 skb->skb_iif = skb->dev->ifindex;
3924
3925 __this_cpu_inc(softnet_data.processed);
3926
3927 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3928 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3929 skb = skb_vlan_untag(skb);
3930 if (unlikely(!skb))
3931 goto out;
3932 }
3933
3934 #ifdef CONFIG_NET_CLS_ACT
3935 if (skb->tc_verd & TC_NCLS) {
3936 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3937 goto ncls;
3938 }
3939 #endif
3940
3941 if (pfmemalloc)
3942 goto skip_taps;
3943
3944 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3945 if (pt_prev)
3946 ret = deliver_skb(skb, pt_prev, orig_dev);
3947 pt_prev = ptype;
3948 }
3949
3950 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3951 if (pt_prev)
3952 ret = deliver_skb(skb, pt_prev, orig_dev);
3953 pt_prev = ptype;
3954 }
3955
3956 skip_taps:
3957 #ifdef CONFIG_NET_INGRESS
3958 if (static_key_false(&ingress_needed)) {
3959 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3960 if (!skb)
3961 goto out;
3962
3963 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3964 goto out;
3965 }
3966 #endif
3967 #ifdef CONFIG_NET_CLS_ACT
3968 skb->tc_verd = 0;
3969 ncls:
3970 #endif
3971 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3972 goto drop;
3973
3974 if (skb_vlan_tag_present(skb)) {
3975 if (pt_prev) {
3976 ret = deliver_skb(skb, pt_prev, orig_dev);
3977 pt_prev = NULL;
3978 }
3979 if (vlan_do_receive(&skb))
3980 goto another_round;
3981 else if (unlikely(!skb))
3982 goto out;
3983 }
3984
3985 rx_handler = rcu_dereference(skb->dev->rx_handler);
3986 if (rx_handler) {
3987 if (pt_prev) {
3988 ret = deliver_skb(skb, pt_prev, orig_dev);
3989 pt_prev = NULL;
3990 }
3991 switch (rx_handler(&skb)) {
3992 case RX_HANDLER_CONSUMED:
3993 ret = NET_RX_SUCCESS;
3994 goto out;
3995 case RX_HANDLER_ANOTHER:
3996 goto another_round;
3997 case RX_HANDLER_EXACT:
3998 deliver_exact = true;
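		/* fall through */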
3999 case RX_HANDLER_PASS:
4000 break;
4001 default:
4002 BUG();
4003 }
4004 }
4005
4006 if (unlikely(skb_vlan_tag_present(skb))) {
4007 if (skb_vlan_tag_get_id(skb))
4008 skb->pkt_type = PACKET_OTHERHOST;
4009 /* Note: we might in the future use prio bits
4010 * and set skb->priority like in vlan_do_receive()
4011 * For the time being, just ignore Priority Code Point
4012 */
4013 skb->vlan_tci = 0;
4014 }
4015
4016 type = skb->protocol;
4017
4018 /* deliver only exact match when indicated */
4019 if (likely(!deliver_exact)) {
4020 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4021 &ptype_base[ntohs(type) &
4022 PTYPE_HASH_MASK]);
4023 }
4024
4025 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4026 &orig_dev->ptype_specific);
4027
4028 if (unlikely(skb->dev != orig_dev)) {
4029 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4030 &skb->dev->ptype_specific);
4031 }
4032
4033 if (pt_prev) {
4034 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4035 goto drop;
4036 else
4037 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4038 } else {
4039 drop:
4040 atomic_long_inc(&skb->dev->rx_dropped);
4041 kfree_skb(skb);
4042 /* Jamal, now you will not be able to escape explaining
4043 * to me how you were going to use this. :-)
4044 */
4045 ret = NET_RX_DROP;
4046 }
4047
4048 out:
4049 return ret;
4050 }
4051
4052 static int __netif_receive_skb(struct sk_buff *skb)
4053 {
4054 int ret;
4055
4056 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4057 unsigned long pflags = current->flags;
4058
4059 /*
4060 * PFMEMALLOC skbs are special, they should
4061 * - be delivered to SOCK_MEMALLOC sockets only
4062 * - stay away from userspace
4063 * - have bounded memory usage
4064 *
4065 * Use PF_MEMALLOC as this saves us from propagating the allocation
4066 * context down to all allocation sites.
4067 */
4068 current->flags |= PF_MEMALLOC;
4069 ret = __netif_receive_skb_core(skb, true);
4070 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4071 } else
4072 ret = __netif_receive_skb_core(skb, false);
4073
4074 return ret;
4075 }
4076
4077 static int netif_receive_skb_internal(struct sk_buff *skb)
4078 {
4079 int ret;
4080
4081 net_timestamp_check(netdev_tstamp_prequeue, skb);
4082
4083 if (skb_defer_rx_timestamp(skb))
4084 return NET_RX_SUCCESS;
4085
4086 rcu_read_lock();
4087
4088 #ifdef CONFIG_RPS
4089 if (static_key_false(&rps_needed)) {
4090 struct rps_dev_flow voidflow, *rflow = &voidflow;
4091 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4092
4093 if (cpu >= 0) {
4094 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4095 rcu_read_unlock();
4096 return ret;
4097 }
4098 }
4099 #endif
4100 ret = __netif_receive_skb(skb);
4101 rcu_read_unlock();
4102 return ret;
4103 }
4104
4105 /**
4106 * netif_receive_skb - process receive buffer from network
4107 * @skb: buffer to process
4108 *
4109 * netif_receive_skb() is the main receive data processing function.
4110 * It always succeeds. The buffer may be dropped during processing
4111 * for congestion control or by the protocol layers.
4112 *
4113 * This function may only be called from softirq context and interrupts
4114 * should be enabled.
4115 *
4116 * Return values (usually ignored):
4117 * NET_RX_SUCCESS: no congestion
4118 * NET_RX_DROP: packet was dropped
4119 */
4120 int netif_receive_skb(struct sk_buff *skb)
4121 {
4122 trace_netif_receive_skb_entry(skb);
4123
4124 return netif_receive_skb_internal(skb);
4125 }
4126 EXPORT_SYMBOL(netif_receive_skb);
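/* Illustrative sketch (not part of the original file): unlike netif_rx(),
 * which defers the packet to the per-CPU backlog, netif_receive_skb()
 * processes it directly and must therefore be called from softirq (NAPI
 * poll) context. A driver poll routine without GRO might look roughly like
 * this; mydrv_next_rx_skb() is hypothetical.
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = mydrv_next_rx_skb(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}
 */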
4127
4128 /* Network device is going away, flush any packets still pending
4129 * Called with irqs disabled.
4130 */
4131 static void flush_backlog(void *arg)
4132 {
4133 struct net_device *dev = arg;
4134 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4135 struct sk_buff *skb, *tmp;
4136
4137 rps_lock(sd);
4138 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4139 if (skb->dev == dev) {
4140 __skb_unlink(skb, &sd->input_pkt_queue);
4141 kfree_skb(skb);
4142 input_queue_head_incr(sd);
4143 }
4144 }
4145 rps_unlock(sd);
4146
4147 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4148 if (skb->dev == dev) {
4149 __skb_unlink(skb, &sd->process_queue);
4150 kfree_skb(skb);
4151 input_queue_head_incr(sd);
4152 }
4153 }
4154 }
4155
4156 static int napi_gro_complete(struct sk_buff *skb)
4157 {
4158 struct packet_offload *ptype;
4159 __be16 type = skb->protocol;
4160 struct list_head *head = &offload_base;
4161 int err = -ENOENT;
4162
4163 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4164
4165 if (NAPI_GRO_CB(skb)->count == 1) {
4166 skb_shinfo(skb)->gso_size = 0;
4167 goto out;
4168 }
4169
4170 rcu_read_lock();
4171 list_for_each_entry_rcu(ptype, head, list) {
4172 if (ptype->type != type || !ptype->callbacks.gro_complete)
4173 continue;
4174
4175 err = ptype->callbacks.gro_complete(skb, 0);
4176 break;
4177 }
4178 rcu_read_unlock();
4179
4180 if (err) {
4181 WARN_ON(&ptype->list == head);
4182 kfree_skb(skb);
4183 return NET_RX_SUCCESS;
4184 }
4185
4186 out:
4187 return netif_receive_skb_internal(skb);
4188 }
4189
4190 /* napi->gro_list contains packets ordered by age;
4191 * the youngest packets are at the head of it.
4192 * Complete skbs in reverse order to reduce latencies.
4193 */
4194 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4195 {
4196 struct sk_buff *skb, *prev = NULL;
4197
4198 /* scan list and build reverse chain */
4199 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4200 skb->prev = prev;
4201 prev = skb;
4202 }
4203
4204 for (skb = prev; skb; skb = prev) {
4205 skb->next = NULL;
4206
4207 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4208 return;
4209
4210 prev = skb->prev;
4211 napi_gro_complete(skb);
4212 napi->gro_count--;
4213 }
4214
4215 napi->gro_list = NULL;
4216 }
4217 EXPORT_SYMBOL(napi_gro_flush);
4218
4219 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4220 {
4221 struct sk_buff *p;
4222 unsigned int maclen = skb->dev->hard_header_len;
4223 u32 hash = skb_get_hash_raw(skb);
4224
4225 for (p = napi->gro_list; p; p = p->next) {
4226 unsigned long diffs;
4227
4228 NAPI_GRO_CB(p)->flush = 0;
4229
4230 if (hash != skb_get_hash_raw(p)) {
4231 NAPI_GRO_CB(p)->same_flow = 0;
4232 continue;
4233 }
4234
4235 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4236 diffs |= p->vlan_tci ^ skb->vlan_tci;
4237 diffs |= skb_metadata_dst_cmp(p, skb);
4238 if (maclen == ETH_HLEN)
4239 diffs |= compare_ether_header(skb_mac_header(p),
4240 skb_mac_header(skb));
4241 else if (!diffs)
4242 diffs = memcmp(skb_mac_header(p),
4243 skb_mac_header(skb),
4244 maclen);
4245 NAPI_GRO_CB(p)->same_flow = !diffs;
4246 }
4247 }
4248
4249 static void skb_gro_reset_offset(struct sk_buff *skb)
4250 {
4251 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4252 const skb_frag_t *frag0 = &pinfo->frags[0];
4253
4254 NAPI_GRO_CB(skb)->data_offset = 0;
4255 NAPI_GRO_CB(skb)->frag0 = NULL;
4256 NAPI_GRO_CB(skb)->frag0_len = 0;
4257
4258 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4259 pinfo->nr_frags &&
4260 !PageHighMem(skb_frag_page(frag0))) {
4261 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4262 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4263 skb_frag_size(frag0),
4264 skb->end - skb->tail);
4265 }
4266 }
4267
4268 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4269 {
4270 struct skb_shared_info *pinfo = skb_shinfo(skb);
4271
4272 BUG_ON(skb->end - skb->tail < grow);
4273
4274 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4275
4276 skb->data_len -= grow;
4277 skb->tail += grow;
4278
4279 pinfo->frags[0].page_offset += grow;
4280 skb_frag_size_sub(&pinfo->frags[0], grow);
4281
4282 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4283 skb_frag_unref(skb, 0);
4284 memmove(pinfo->frags, pinfo->frags + 1,
4285 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4286 }
4287 }
4288
4289 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4290 {
4291 struct sk_buff **pp = NULL;
4292 struct packet_offload *ptype;
4293 __be16 type = skb->protocol;
4294 struct list_head *head = &offload_base;
4295 int same_flow;
4296 enum gro_result ret;
4297 int grow;
4298
4299 if (!(skb->dev->features & NETIF_F_GRO))
4300 goto normal;
4301
4302 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4303 goto normal;
4304
4305 gro_list_prepare(napi, skb);
4306
4307 rcu_read_lock();
4308 list_for_each_entry_rcu(ptype, head, list) {
4309 if (ptype->type != type || !ptype->callbacks.gro_receive)
4310 continue;
4311
4312 skb_set_network_header(skb, skb_gro_offset(skb));
4313 skb_reset_mac_len(skb);
4314 NAPI_GRO_CB(skb)->same_flow = 0;
4315 NAPI_GRO_CB(skb)->flush = 0;
4316 NAPI_GRO_CB(skb)->free = 0;
4317 NAPI_GRO_CB(skb)->encap_mark = 0;
4318 NAPI_GRO_CB(skb)->recursion_counter = 0;
4319 NAPI_GRO_CB(skb)->is_fou = 0;
4320 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4321
4322 /* Setup for GRO checksum validation */
4323 switch (skb->ip_summed) {
4324 case CHECKSUM_COMPLETE:
4325 NAPI_GRO_CB(skb)->csum = skb->csum;
4326 NAPI_GRO_CB(skb)->csum_valid = 1;
4327 NAPI_GRO_CB(skb)->csum_cnt = 0;
4328 break;
4329 case CHECKSUM_UNNECESSARY:
4330 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4331 NAPI_GRO_CB(skb)->csum_valid = 0;
4332 break;
4333 default:
4334 NAPI_GRO_CB(skb)->csum_cnt = 0;
4335 NAPI_GRO_CB(skb)->csum_valid = 0;
4336 }
4337
4338 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4339 break;
4340 }
4341 rcu_read_unlock();
4342
4343 if (&ptype->list == head)
4344 goto normal;
4345
4346 same_flow = NAPI_GRO_CB(skb)->same_flow;
4347 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4348
4349 if (pp) {
4350 struct sk_buff *nskb = *pp;
4351
4352 *pp = nskb->next;
4353 nskb->next = NULL;
4354 napi_gro_complete(nskb);
4355 napi->gro_count--;
4356 }
4357
4358 if (same_flow)
4359 goto ok;
4360
4361 if (NAPI_GRO_CB(skb)->flush)
4362 goto normal;
4363
4364 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4365 struct sk_buff *nskb = napi->gro_list;
4366
4367 /* locate the end of the list to select the 'oldest' flow */
4368 while (nskb->next) {
4369 pp = &nskb->next;
4370 nskb = *pp;
4371 }
4372 *pp = NULL;
4373 nskb->next = NULL;
4374 napi_gro_complete(nskb);
4375 } else {
4376 napi->gro_count++;
4377 }
4378 NAPI_GRO_CB(skb)->count = 1;
4379 NAPI_GRO_CB(skb)->age = jiffies;
4380 NAPI_GRO_CB(skb)->last = skb;
4381 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4382 skb->next = napi->gro_list;
4383 napi->gro_list = skb;
4384 ret = GRO_HELD;
4385
4386 pull:
4387 grow = skb_gro_offset(skb) - skb_headlen(skb);
4388 if (grow > 0)
4389 gro_pull_from_frag0(skb, grow);
4390 ok:
4391 return ret;
4392
4393 normal:
4394 ret = GRO_NORMAL;
4395 goto pull;
4396 }
4397
4398 struct packet_offload *gro_find_receive_by_type(__be16 type)
4399 {
4400 struct list_head *offload_head = &offload_base;
4401 struct packet_offload *ptype;
4402
4403 list_for_each_entry_rcu(ptype, offload_head, list) {
4404 if (ptype->type != type || !ptype->callbacks.gro_receive)
4405 continue;
4406 return ptype;
4407 }
4408 return NULL;
4409 }
4410 EXPORT_SYMBOL(gro_find_receive_by_type);
4411
4412 struct packet_offload *gro_find_complete_by_type(__be16 type)
4413 {
4414 struct list_head *offload_head = &offload_base;
4415 struct packet_offload *ptype;
4416
4417 list_for_each_entry_rcu(ptype, offload_head, list) {
4418 if (ptype->type != type || !ptype->callbacks.gro_complete)
4419 continue;
4420 return ptype;
4421 }
4422 return NULL;
4423 }
4424 EXPORT_SYMBOL(gro_find_complete_by_type);
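/* Illustrative sketch (not part of the original file): tunnel offloads use
 * these lookup helpers to hand the inner packet to the matching offload,
 * roughly as below. The surrounding encapsulation handling is omitted and
 * "type" is assumed to hold the inner protocol in network byte order.
 *
 *	rcu_read_lock();
 *	ptype = gro_find_receive_by_type(type);
 *	if (ptype)
 *		pp = ptype->callbacks.gro_receive(head, skb);
 *	rcu_read_unlock();
 */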
4425
4426 static void napi_skb_free_stolen_head(struct sk_buff *skb)
4427 {
4428 skb_dst_drop(skb);
4429 kmem_cache_free(skbuff_head_cache, skb);
4430 }
4431
4432 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4433 {
4434 switch (ret) {
4435 case GRO_NORMAL:
4436 if (netif_receive_skb_internal(skb))
4437 ret = GRO_DROP;
4438 break;
4439
4440 case GRO_DROP:
4441 kfree_skb(skb);
4442 break;
4443
4444 case GRO_MERGED_FREE:
4445 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4446 napi_skb_free_stolen_head(skb);
4447 else
4448 __kfree_skb(skb);
4449 break;
4450
4451 case GRO_HELD:
4452 case GRO_MERGED:
4453 break;
4454 }
4455
4456 return ret;
4457 }
4458
4459 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4460 {
4461 trace_napi_gro_receive_entry(skb);
4462
4463 skb_gro_reset_offset(skb);
4464
4465 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4466 }
4467 EXPORT_SYMBOL(napi_gro_receive);
4468
4469 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4470 {
4471 if (unlikely(skb->pfmemalloc)) {
4472 consume_skb(skb);
4473 return;
4474 }
4475 __skb_pull(skb, skb_headlen(skb));
4476 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4477 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4478 skb->vlan_tci = 0;
4479 skb->dev = napi->dev;
4480 skb->skb_iif = 0;
4481
4482 /* eth_type_trans() assumes pkt_type is PACKET_HOST */
4483 skb->pkt_type = PACKET_HOST;
4484
4485 skb->encapsulation = 0;
4486 skb_shinfo(skb)->gso_type = 0;
4487 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4488
4489 napi->skb = skb;
4490 }
4491
4492 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4493 {
4494 struct sk_buff *skb = napi->skb;
4495
4496 if (!skb) {
4497 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4498 napi->skb = skb;
4499 }
4500 return skb;
4501 }
4502 EXPORT_SYMBOL(napi_get_frags);
4503
4504 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4505 struct sk_buff *skb,
4506 gro_result_t ret)
4507 {
4508 switch (ret) {
4509 case GRO_NORMAL:
4510 case GRO_HELD:
4511 __skb_push(skb, ETH_HLEN);
4512 skb->protocol = eth_type_trans(skb, skb->dev);
4513 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4514 ret = GRO_DROP;
4515 break;
4516
4517 case GRO_DROP:
4518 napi_reuse_skb(napi, skb);
4519 break;
4520
4521 case GRO_MERGED_FREE:
4522 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4523 napi_skb_free_stolen_head(skb);
4524 else
4525 napi_reuse_skb(napi, skb);
4526 break;
4527
4528 case GRO_MERGED:
4529 break;
4530 }
4531
4532 return ret;
4533 }
4534
4535 /* Upper GRO stack assumes network header starts at gro_offset=0
4536 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4537 * We copy ethernet header into skb->data to have a common layout.
4538 */
4539 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4540 {
4541 struct sk_buff *skb = napi->skb;
4542 const struct ethhdr *eth;
4543 unsigned int hlen = sizeof(*eth);
4544
4545 napi->skb = NULL;
4546
4547 skb_reset_mac_header(skb);
4548 skb_gro_reset_offset(skb);
4549
4550 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4551 eth = skb_gro_header_slow(skb, hlen, 0);
4552 if (unlikely(!eth)) {
4553 napi_reuse_skb(napi, skb);
4554 return NULL;
4555 }
4556 } else {
4557 eth = (const struct ethhdr *)skb->data;
4558 gro_pull_from_frag0(skb, hlen);
4559 NAPI_GRO_CB(skb)->frag0 += hlen;
4560 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4561 }
4562 __skb_pull(skb, hlen);
4563
4564 /*
4565 * This works because the only protocols we care about don't require
4566 * special handling.
4567 * We'll fix it up properly in napi_frags_finish()
4568 */
4569 skb->protocol = eth->h_proto;
4570
4571 return skb;
4572 }
4573
4574 gro_result_t napi_gro_frags(struct napi_struct *napi)
4575 {
4576 struct sk_buff *skb = napi_frags_skb(napi);
4577
4578 if (!skb)
4579 return GRO_DROP;
4580
4581 trace_napi_gro_frags_entry(skb);
4582
4583 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4584 }
4585 EXPORT_SYMBOL(napi_gro_frags);
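/* Illustrative sketch (not part of the original file): a driver that receives
 * into page fragments uses napi_get_frags()/napi_gro_frags() instead of
 * building a linear skb itself. mydrv-specific values "page", "offset", "len"
 * and "truesize" are hypothetical.
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;		// out of memory, drop the frame
 *	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
 *			page, offset, len, truesize);
 *	napi_gro_frags(napi);	// consumes napi->skb; no eth_type_trans() needed
 */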
4586
4587 /* Compute the checksum from gro_offset and return the folded value
4588 * after adding in any pseudo checksum.
4589 */
4590 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4591 {
4592 __wsum wsum;
4593 __sum16 sum;
4594
4595 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4596
4597 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4598 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4599 if (likely(!sum)) {
4600 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4601 !skb->csum_complete_sw)
4602 netdev_rx_csum_fault(skb->dev);
4603 }
4604
4605 NAPI_GRO_CB(skb)->csum = wsum;
4606 NAPI_GRO_CB(skb)->csum_valid = 1;
4607
4608 return sum;
4609 }
4610 EXPORT_SYMBOL(__skb_gro_checksum_complete);
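/* Illustrative sketch (not part of the original file): protocol gro_receive
 * handlers normally do not call this directly but go through the
 * skb_gro_checksum_validate() helpers, which only fall back to a full
 * __skb_gro_checksum_complete() when the checksum cannot be validated from
 * hardware information, e.g. (roughly, for TCP over IPv4):
 *
 *	if (skb_gro_checksum_validate(skb, IPPROTO_TCP,
 *				      inet_gro_compute_pseudo)) {
 *		NAPI_GRO_CB(skb)->flush = 1;
 *		return NULL;
 *	}
 */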
4611
4612 /*
4613 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4614 * Note: called with local irq disabled, but exits with local irq enabled.
4615 */
4616 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4617 {
4618 #ifdef CONFIG_RPS
4619 struct softnet_data *remsd = sd->rps_ipi_list;
4620
4621 if (remsd) {
4622 sd->rps_ipi_list = NULL;
4623
4624 local_irq_enable();
4625
4626 /* Send pending IPI's to kick RPS processing on remote cpus. */
4627 while (remsd) {
4628 struct softnet_data *next = remsd->rps_ipi_next;
4629
4630 if (cpu_online(remsd->cpu))
4631 smp_call_function_single_async(remsd->cpu,
4632 &remsd->csd);
4633 remsd = next;
4634 }
4635 } else
4636 #endif
4637 local_irq_enable();
4638 }
4639
4640 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4641 {
4642 #ifdef CONFIG_RPS
4643 return sd->rps_ipi_list != NULL;
4644 #else
4645 return false;
4646 #endif
4647 }
4648
4649 static int process_backlog(struct napi_struct *napi, int quota)
4650 {
4651 int work = 0;
4652 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4653
4654 /* Check if we have pending IPIs; it's better to send them now
4655 * rather than waiting for net_rx_action() to end.
4656 */
4657 if (sd_has_rps_ipi_waiting(sd)) {
4658 local_irq_disable();
4659 net_rps_action_and_irq_enable(sd);
4660 }
4661
4662 napi->weight = weight_p;
4663 local_irq_disable();
4664 while (1) {
4665 struct sk_buff *skb;
4666
4667 while ((skb = __skb_dequeue(&sd->process_queue))) {
4668 rcu_read_lock();
4669 local_irq_enable();
4670 __netif_receive_skb(skb);
4671 rcu_read_unlock();
4672 local_irq_disable();
4673 input_queue_head_incr(sd);
4674 if (++work >= quota) {
4675 local_irq_enable();
4676 return work;
4677 }
4678 }
4679
4680 rps_lock(sd);
4681 if (skb_queue_empty(&sd->input_pkt_queue)) {
4682 /*
4683 * Inline a custom version of __napi_complete().
4684 * Only the current cpu owns and manipulates this napi,
4685 * and NAPI_STATE_SCHED is the only possible flag set
4686 * on backlog.
4687 * We can use a plain write instead of clear_bit(),
4688 * and we don't need an smp_mb() memory barrier.
4689 */
4690 napi->state = 0;
4691 rps_unlock(sd);
4692
4693 break;
4694 }
4695
4696 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4697 &sd->process_queue);
4698 rps_unlock(sd);
4699 }
4700 local_irq_enable();
4701
4702 return work;
4703 }
4704
4705 /**
4706 * __napi_schedule - schedule for receive
4707 * @n: entry to schedule
4708 *
4709 * The entry's receive function will be scheduled to run.
4710 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4711 */
4712 void __napi_schedule(struct napi_struct *n)
4713 {
4714 unsigned long flags;
4715
4716 local_irq_save(flags);
4717 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4718 local_irq_restore(flags);
4719 }
4720 EXPORT_SYMBOL(__napi_schedule);
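/* Illustrative sketch (not part of the original file): the usual driver
 * pattern is to mask the device's RX interrupt and schedule NAPI from the
 * hard interrupt handler; mydrv_irq(), mydrv_disable_rx_irq() and priv are
 * hypothetical.
 *
 *	static irqreturn_t mydrv_irq(int irq, void *data)
 *	{
 *		struct mydrv_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydrv_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */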
4721
4722 /**
4723 * __napi_schedule_irqoff - schedule for receive
4724 * @n: entry to schedule
4725 *
4726 * Variant of __napi_schedule() assuming hard irqs are masked.
4727 *
4728 * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
4729 * because the interrupt disabled assumption might not be true
4730 * due to force-threaded interrupts and spinlock substitution.
4731 */
4732 void __napi_schedule_irqoff(struct napi_struct *n)
4733 {
4734 if (!IS_ENABLED(CONFIG_PREEMPT_RT))
4735 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4736 else
4737 __napi_schedule(n);
4738 }
4739 EXPORT_SYMBOL(__napi_schedule_irqoff);
4740
4741 void __napi_complete(struct napi_struct *n)
4742 {
4743 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4744
4745 list_del_init(&n->poll_list);
4746 smp_mb__before_atomic();
4747 clear_bit(NAPI_STATE_SCHED, &n->state);
4748 }
4749 EXPORT_SYMBOL(__napi_complete);
4750
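/**
 * napi_complete_done - NAPI processing complete
 * @n: NAPI context
 * @work_done: number of packets processed in this poll
 *
 * Mark the NAPI poll as done: pending GRO packets are flushed, or the flush
 * is deferred via an hrtimer if @work_done is non-zero and the device has a
 * gro_flush_timeout, and NAPI_STATE_SCHED is cleared.
 */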
4751 void napi_complete_done(struct napi_struct *n, int work_done)
4752 {
4753 unsigned long flags;
4754
4755 /*
4756 * don't let napi dequeue from the cpu poll list
4757 * just in case it's running on a different cpu
4758 */
4759 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4760 return;
4761
4762 if (n->gro_list) {
4763 unsigned long timeout = 0;
4764
4765 if (work_done)
4766 timeout = n->dev->gro_flush_timeout;
4767
4768 if (timeout)
4769 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4770 HRTIMER_MODE_REL_PINNED);
4771 else
4772 napi_gro_flush(n, false);
4773 }
4774 if (likely(list_empty(&n->poll_list))) {
4775 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4776 } else {
4777 /* If n->poll_list is not empty, we need to mask irqs */
4778 local_irq_save(flags);
4779 __napi_complete(n);
4780 local_irq_restore(flags);
4781 }
4782 }
4783 EXPORT_SYMBOL(napi_complete_done);
4784
4785 /* must be called under rcu_read_lock(), as we don't take a reference */
4786 struct napi_struct *napi_by_id(unsigned int napi_id)
4787 {
4788 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4789 struct napi_struct *napi;
4790
4791 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4792 if (napi->napi_id == napi_id)
4793 return napi;
4794
4795 return NULL;
4796 }
4797 EXPORT_SYMBOL_GPL(napi_by_id);
4798
4799 void napi_hash_add(struct napi_struct *napi)
4800 {
4801 if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4802 return;
4803
4804 spin_lock(&napi_hash_lock);
4805
4806 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4807 do {
4808 if (unlikely(++napi_gen_id < NR_CPUS + 1))
4809 napi_gen_id = NR_CPUS + 1;
4810 } while (napi_by_id(napi_gen_id));
4811 napi->napi_id = napi_gen_id;
4812
4813 hlist_add_head_rcu(&napi->napi_hash_node,
4814 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4815
4816 spin_unlock(&napi_hash_lock);
4817 }
4818 EXPORT_SYMBOL_GPL(napi_hash_add);
4819
4820 /* Warning: the caller is responsible for making sure an rcu grace period
4821 * is respected before freeing memory containing @napi
4822 */
4823 void napi_hash_del(struct napi_struct *napi)
4824 {
4825 spin_lock(&napi_hash_lock);
4826
4827 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4828 hlist_del_rcu(&napi->napi_hash_node);
4829
4830 spin_unlock(&napi_hash_lock);
4831 }
4832 EXPORT_SYMBOL_GPL(napi_hash_del);
4833
4834 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4835 {
4836 struct napi_struct *napi;
4837
4838 napi = container_of(timer, struct napi_struct, timer);
4839 if (napi->gro_list)
4840 napi_schedule(napi);
4841
4842 return HRTIMER_NORESTART;
4843 }
4844
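/**
 * netif_napi_add - initialize a NAPI context
 * @dev: network device
 * @napi: NAPI context
 * @poll: polling function
 * @weight: default weight
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * any of the other NAPI-related functions.
 */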
4845 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4846 int (*poll)(struct napi_struct *, int), int weight)
4847 {
4848 INIT_LIST_HEAD(&napi->poll_list);
4849 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4850 napi->timer.function = napi_watchdog;
4851 napi->gro_count = 0;
4852 napi->gro_list = NULL;
4853 napi->skb = NULL;
4854 napi->poll = poll;
4855 if (weight > NAPI_POLL_WEIGHT)
4856 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4857 weight, dev->name);
4858 napi->weight = weight;
4859 napi->dev = dev;
4860 #ifdef CONFIG_NETPOLL
4861 spin_lock_init(&napi->poll_lock);
4862 napi->poll_owner = -1;
4863 #endif
4864 set_bit(NAPI_STATE_SCHED, &napi->state);
4865 set_bit(NAPI_STATE_NPSVC, &napi->state);
4866 list_add_rcu(&napi->dev_list, &dev->napi_list);
4867 }
4868 EXPORT_SYMBOL(netif_napi_add);
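/* Illustrative sketch (not part of the original file): a driver registers its
 * poll routine once at probe time and enables/disables NAPI around open and
 * stop; mydrv_poll(), priv and netdev are hypothetical.
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// in ndo_open()
 *	...
 *	napi_disable(&priv->napi);	// in ndo_stop()
 *	netif_napi_del(&priv->napi);	// before free_netdev()
 */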
4869
4870 void napi_disable(struct napi_struct *n)
4871 {
4872 might_sleep();
4873 set_bit(NAPI_STATE_DISABLE, &n->state);
4874
4875 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4876 msleep(1);
4877 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4878 msleep(1);
4879
4880 hrtimer_cancel(&n->timer);
4881
4882 clear_bit(NAPI_STATE_DISABLE, &n->state);
4883 }
4884 EXPORT_SYMBOL(napi_disable);
4885
4886 void netif_napi_del(struct napi_struct *napi)
4887 {
4888 list_del_init(&napi->dev_list);
4889 napi_free_frags(napi);
4890
4891 kfree_skb_list(napi->gro_list);
4892 napi->gro_list = NULL;
4893 napi->gro_count = 0;
4894 }
4895 EXPORT_SYMBOL(netif_napi_del);
4896
4897 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4898 {
4899 void *have;
4900 int work, weight;
4901
4902 list_del_init(&n->poll_list);
4903
4904 have = netpoll_poll_lock(n);
4905
4906 weight = n->weight;
4907
4908 /* This NAPI_STATE_SCHED test is for avoiding a race
4909 * with netpoll's poll_napi(). Only the entity which
4910 * obtains the lock and sees NAPI_STATE_SCHED set will
4911 * actually make the ->poll() call. Therefore we avoid
4912 * accidentally calling ->poll() when NAPI is not scheduled.
4913 */
4914 work = 0;
4915 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4916 work = n->poll(n, weight);
4917 trace_napi_poll(n);
4918 }
4919
4920 WARN_ON_ONCE(work > weight);
4921
4922 if (likely(work < weight))
4923 goto out_unlock;
4924
4925 /* Drivers must not modify the NAPI state if they
4926 * consume the entire weight. In such cases this code
4927 * still "owns" the NAPI instance and therefore can
4928 * move the instance around on the list at-will.
4929 */
4930 if (unlikely(napi_disable_pending(n))) {
4931 napi_complete(n);
4932 goto out_unlock;
4933 }
4934
4935 if (n->gro_list) {
4936 /* flush too old packets
4937 * If HZ < 1000, flush all packets.
4938 */
4939 napi_gro_flush(n, HZ >= 1000);
4940 }
4941
4942 /* Some drivers may have called napi_schedule
4943 * prior to exhausting their budget.
4944 */
4945 if (unlikely(!list_empty(&n->poll_list))) {
4946 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4947 n->dev ? n->dev->name : "backlog");
4948 goto out_unlock;
4949 }
4950
4951 list_add_tail(&n->poll_list, repoll);
4952
4953 out_unlock:
4954 netpoll_poll_unlock(have);
4955
4956 return work;
4957 }
4958
4959 static void net_rx_action(struct softirq_action *h)
4960 {
4961 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4962 unsigned long time_limit = jiffies + 2;
4963 int budget = netdev_budget;
4964 LIST_HEAD(list);
4965 LIST_HEAD(repoll);
4966
4967 local_irq_disable();
4968 list_splice_init(&sd->poll_list, &list);
4969 local_irq_enable();
4970
4971 for (;;) {
4972 struct napi_struct *n;
4973
4974 if (list_empty(&list)) {
4975 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4976 return;
4977 break;
4978 }
4979
4980 n = list_first_entry(&list, struct napi_struct, poll_list);
4981 budget -= napi_poll(n, &repoll);
4982
4983 /* If softirq window is exhausted then punt.
4984 * Allow this to run for 2 jiffies, which allows
4985 * an average latency of 1.5/HZ.
4986 */
4987 if (unlikely(budget <= 0 ||
4988 time_after_eq(jiffies, time_limit))) {
4989 sd->time_squeeze++;
4990 break;
4991 }
4992 }
4993
4994 local_irq_disable();
4995
4996 list_splice_tail_init(&sd->poll_list, &list);
4997 list_splice_tail(&repoll, &list);
4998 list_splice(&list, &sd->poll_list);
4999 if (!list_empty(&sd->poll_list))
5000 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5001
5002 net_rps_action_and_irq_enable(sd);
5003 }
5004
5005 struct netdev_adjacent {
5006 struct net_device *dev;
5007
5008 /* upper master flag, there can only be one master device per list */
5009 bool master;
5010
5011 /* counter for the number of times this device was added to us */
5012 u16 ref_nr;
5013
5014 /* private field for the users */
5015 void *private;
5016
5017 struct list_head list;
5018 struct rcu_head rcu;
5019 };
5020
5021 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5022 struct list_head *adj_list)
5023 {
5024 struct netdev_adjacent *adj;
5025
5026 list_for_each_entry(adj, adj_list, list) {
5027 if (adj->dev == adj_dev)
5028 return adj;
5029 }
5030 return NULL;
5031 }
5032
5033 /**
5034 * netdev_has_upper_dev - Check if device is linked to an upper device
5035 * @dev: device
5036 * @upper_dev: upper device to check
5037 *
5038 * Find out if a device is linked to the specified upper device and return true
5039 * in case it is. Note that this checks only the immediate upper device,
5040 * not through a complete stack of devices. The caller must hold the RTNL lock.
5041 */
5042 bool netdev_has_upper_dev(struct net_device *dev,
5043 struct net_device *upper_dev)
5044 {
5045 ASSERT_RTNL();
5046
5047 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5048 }
5049 EXPORT_SYMBOL(netdev_has_upper_dev);
5050
5051 /**
5052 * netdev_has_any_upper_dev - Check if device is linked to some device
5053 * @dev: device
5054 *
5055 * Find out if a device is linked to an upper device and return true in case
5056 * it is. The caller must hold the RTNL lock.
5057 */
5058 static bool netdev_has_any_upper_dev(struct net_device *dev)
5059 {
5060 ASSERT_RTNL();
5061
5062 return !list_empty(&dev->all_adj_list.upper);
5063 }
5064
5065 /**
5066 * netdev_master_upper_dev_get - Get master upper device
5067 * @dev: device
5068 *
5069 * Find a master upper device and return pointer to it or NULL in case
5070 * it's not there. The caller must hold the RTNL lock.
5071 */
5072 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5073 {
5074 struct netdev_adjacent *upper;
5075
5076 ASSERT_RTNL();
5077
5078 if (list_empty(&dev->adj_list.upper))
5079 return NULL;
5080
5081 upper = list_first_entry(&dev->adj_list.upper,
5082 struct netdev_adjacent, list);
5083 if (likely(upper->master))
5084 return upper->dev;
5085 return NULL;
5086 }
5087 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5088
5089 void *netdev_adjacent_get_private(struct list_head *adj_list)
5090 {
5091 struct netdev_adjacent *adj;
5092
5093 adj = list_entry(adj_list, struct netdev_adjacent, list);
5094
5095 return adj->private;
5096 }
5097 EXPORT_SYMBOL(netdev_adjacent_get_private);
5098
5099 /**
5100 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5101 * @dev: device
5102 * @iter: list_head ** of the current position
5103 *
5104 * Gets the next device from the dev's upper list, starting from iter
5105 * position. The caller must hold RCU read lock.
5106 */
5107 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5108 struct list_head **iter)
5109 {
5110 struct netdev_adjacent *upper;
5111
5112 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5113
5114 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5115
5116 if (&upper->list == &dev->adj_list.upper)
5117 return NULL;
5118
5119 *iter = &upper->list;
5120
5121 return upper->dev;
5122 }
5123 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5124
5125 /**
5126 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5127 * @dev: device
5128 * @iter: list_head ** of the current position
5129 *
5130 * Gets the next device from the dev's upper list, starting from iter
5131 * position. The caller must hold RCU read lock.
5132 */
5133 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5134 struct list_head **iter)
5135 {
5136 struct netdev_adjacent *upper;
5137
5138 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5139
5140 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5141
5142 if (&upper->list == &dev->all_adj_list.upper)
5143 return NULL;
5144
5145 *iter = &upper->list;
5146
5147 return upper->dev;
5148 }
5149 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5150
5151 /**
5152 * netdev_lower_get_next_private - Get the next ->private from the
5153 * lower neighbour list
5154 * @dev: device
5155 * @iter: list_head ** of the current position
5156 *
5157 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5158 * list, starting from iter position. The caller must either hold the
5159 * RTNL lock or its own locking that guarantees that the neighbour lower
5160 * list will remain unchanged.
5161 */
5162 void *netdev_lower_get_next_private(struct net_device *dev,
5163 struct list_head **iter)
5164 {
5165 struct netdev_adjacent *lower;
5166
5167 lower = list_entry(*iter, struct netdev_adjacent, list);
5168
5169 if (&lower->list == &dev->adj_list.lower)
5170 return NULL;
5171
5172 *iter = lower->list.next;
5173
5174 return lower->private;
5175 }
5176 EXPORT_SYMBOL(netdev_lower_get_next_private);
5177
5178 /**
5179 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5180 * lower neighbour list, RCU
5181 * variant
5182 * @dev: device
5183 * @iter: list_head ** of the current position
5184 *
5185 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5186 * list, starting from iter position. The caller must hold RCU read lock.
5187 */
5188 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5189 struct list_head **iter)
5190 {
5191 struct netdev_adjacent *lower;
5192
5193 WARN_ON_ONCE(!rcu_read_lock_held());
5194
5195 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5196
5197 if (&lower->list == &dev->adj_list.lower)
5198 return NULL;
5199
5200 *iter = &lower->list;
5201
5202 return lower->private;
5203 }
5204 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5205
5206 /**
5207 * netdev_lower_get_next - Get the next device from the lower neighbour
5208 * list
5209 * @dev: device
5210 * @iter: list_head ** of the current position
5211 *
5212 * Gets the next netdev_adjacent from the dev's lower neighbour
5213 * list, starting from iter position. The caller must hold RTNL lock or
5214 * its own locking that guarantees that the neighbour lower
5215 * list will remain unchanged.
5216 */
5217 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5218 {
5219 struct netdev_adjacent *lower;
5220
5221 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5222
5223 if (&lower->list == &dev->adj_list.lower)
5224 return NULL;
5225
5226 *iter = &lower->list;
5227
5228 return lower->dev;
5229 }
5230 EXPORT_SYMBOL(netdev_lower_get_next);
5231
5232 /**
5233 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5234 * lower neighbour list, RCU
5235 * variant
5236 * @dev: device
5237 *
5238 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5239 * list. The caller must hold RCU read lock.
5240 */
5241 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5242 {
5243 struct netdev_adjacent *lower;
5244
5245 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5246 struct netdev_adjacent, list);
5247 if (lower)
5248 return lower->private;
5249 return NULL;
5250 }
5251 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5252
5253 /**
5254 * netdev_master_upper_dev_get_rcu - Get master upper device
5255 * @dev: device
5256 *
5257 * Find a master upper device and return pointer to it or NULL in case
5258 * it's not there. The caller must hold the RCU read lock.
5259 */
5260 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5261 {
5262 struct netdev_adjacent *upper;
5263
5264 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5265 struct netdev_adjacent, list);
5266 if (upper && likely(upper->master))
5267 return upper->dev;
5268 return NULL;
5269 }
5270 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5271
5272 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5273 struct net_device *adj_dev,
5274 struct list_head *dev_list)
5275 {
5276 char linkname[IFNAMSIZ+7];
5277 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5278 "upper_%s" : "lower_%s", adj_dev->name);
5279 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5280 linkname);
5281 }
5282 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5283 char *name,
5284 struct list_head *dev_list)
5285 {
5286 char linkname[IFNAMSIZ+7];
5287 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5288 "upper_%s" : "lower_%s", name);
5289 sysfs_remove_link(&(dev->dev.kobj), linkname);
5290 }
5291
5292 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5293 struct net_device *adj_dev,
5294 struct list_head *dev_list)
5295 {
5296 return (dev_list == &dev->adj_list.upper ||
5297 dev_list == &dev->adj_list.lower) &&
5298 net_eq(dev_net(dev), dev_net(adj_dev));
5299 }
5300
5301 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5302 struct net_device *adj_dev,
5303 u16 ref_nr,
5304 struct list_head *dev_list,
5305 void *private, bool master)
5306 {
5307 struct netdev_adjacent *adj;
5308 int ret;
5309
5310 adj = __netdev_find_adj(adj_dev, dev_list);
5311
5312 if (adj) {
5313 adj->ref_nr += ref_nr;
5314 return 0;
5315 }
5316
5317 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5318 if (!adj)
5319 return -ENOMEM;
5320
5321 adj->dev = adj_dev;
5322 adj->master = master;
5323 adj->ref_nr = ref_nr;
5324 adj->private = private;
5325 dev_hold(adj_dev);
5326
5327 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5328 adj_dev->name, dev->name, adj_dev->name);
5329
5330 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5331 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5332 if (ret)
5333 goto free_adj;
5334 }
5335
5336 /* Ensure that master link is always the first item in list. */
5337 if (master) {
5338 ret = sysfs_create_link(&(dev->dev.kobj),
5339 &(adj_dev->dev.kobj), "master");
5340 if (ret)
5341 goto remove_symlinks;
5342
5343 list_add_rcu(&adj->list, dev_list);
5344 } else {
5345 list_add_tail_rcu(&adj->list, dev_list);
5346 }
5347
5348 return 0;
5349
5350 remove_symlinks:
5351 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5352 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5353 free_adj:
5354 kfree(adj);
5355 dev_put(adj_dev);
5356
5357 return ret;
5358 }
5359
5360 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5361 struct net_device *adj_dev,
5362 u16 ref_nr,
5363 struct list_head *dev_list)
5364 {
5365 struct netdev_adjacent *adj;
5366
5367 adj = __netdev_find_adj(adj_dev, dev_list);
5368
5369 if (!adj) {
5370 pr_err("tried to remove device %s from %s\n",
5371 dev->name, adj_dev->name);
5372 BUG();
5373 }
5374
5375 if (adj->ref_nr > ref_nr) {
5376 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5377 ref_nr, adj->ref_nr-ref_nr);
5378 adj->ref_nr -= ref_nr;
5379 return;
5380 }
5381
5382 if (adj->master)
5383 sysfs_remove_link(&(dev->dev.kobj), "master");
5384
5385 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5386 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5387
5388 list_del_rcu(&adj->list);
5389 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5390 adj_dev->name, dev->name, adj_dev->name);
5391 dev_put(adj_dev);
5392 kfree_rcu(adj, rcu);
5393 }
5394
5395 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5396 struct net_device *upper_dev,
5397 u16 ref_nr,
5398 struct list_head *up_list,
5399 struct list_head *down_list,
5400 void *private, bool master)
5401 {
5402 int ret;
5403
5404 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5405 private, master);
5406 if (ret)
5407 return ret;
5408
5409 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5410 private, false);
5411 if (ret) {
5412 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5413 return ret;
5414 }
5415
5416 return 0;
5417 }
5418
5419 static int __netdev_adjacent_dev_link(struct net_device *dev,
5420 struct net_device *upper_dev,
5421 u16 ref_nr)
5422 {
5423 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5424 &dev->all_adj_list.upper,
5425 &upper_dev->all_adj_list.lower,
5426 NULL, false);
5427 }
5428
5429 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5430 struct net_device *upper_dev,
5431 u16 ref_nr,
5432 struct list_head *up_list,
5433 struct list_head *down_list)
5434 {
5435 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5436 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5437 }
5438
5439 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5440 struct net_device *upper_dev,
5441 u16 ref_nr)
5442 {
5443 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5444 &dev->all_adj_list.upper,
5445 &upper_dev->all_adj_list.lower);
5446 }
5447
5448 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5449 struct net_device *upper_dev,
5450 void *private, bool master)
5451 {
5452 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5453
5454 if (ret)
5455 return ret;
5456
5457 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5458 &dev->adj_list.upper,
5459 &upper_dev->adj_list.lower,
5460 private, master);
5461 if (ret) {
5462 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5463 return ret;
5464 }
5465
5466 return 0;
5467 }
5468
5469 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5470 struct net_device *upper_dev)
5471 {
5472 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5473 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5474 &dev->adj_list.upper,
5475 &upper_dev->adj_list.lower);
5476 }
5477
5478 static int __netdev_upper_dev_link(struct net_device *dev,
5479 struct net_device *upper_dev, bool master,
5480 void *private)
5481 {
5482 struct netdev_notifier_changeupper_info changeupper_info;
5483 struct netdev_adjacent *i, *j, *to_i, *to_j;
5484 int ret = 0;
5485
5486 ASSERT_RTNL();
5487
5488 if (dev == upper_dev)
5489 return -EBUSY;
5490
5491 /* To prevent loops, check that dev is not an upper device of upper_dev. */
5492 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5493 return -EBUSY;
5494
5495 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5496 return -EEXIST;
5497
5498 if (master && netdev_master_upper_dev_get(dev))
5499 return -EBUSY;
5500
5501 changeupper_info.upper_dev = upper_dev;
5502 changeupper_info.master = master;
5503 changeupper_info.linking = true;
5504
5505 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5506 &changeupper_info.info);
5507 ret = notifier_to_errno(ret);
5508 if (ret)
5509 return ret;
5510
5511 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5512 master);
5513 if (ret)
5514 return ret;
5515
5516 /* Now that we linked these devs, make all the upper_dev's
5517 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5518 * vice versa, and don't forget the devices themselves. All of these
5519 * links are non-neighbours.
5520 */
5521 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5522 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5523 pr_debug("Interlinking %s with %s, non-neighbour\n",
5524 i->dev->name, j->dev->name);
5525 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5526 if (ret)
5527 goto rollback_mesh;
5528 }
5529 }
5530
5531 /* add dev to every upper_dev's upper device */
5532 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5533 pr_debug("linking %s's upper device %s with %s\n",
5534 upper_dev->name, i->dev->name, dev->name);
5535 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5536 if (ret)
5537 goto rollback_upper_mesh;
5538 }
5539
5540 /* add upper_dev to every dev's lower device */
5541 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5542 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5543 i->dev->name, upper_dev->name);
5544 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5545 if (ret)
5546 goto rollback_lower_mesh;
5547 }
5548
5549 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5550 &changeupper_info.info);
5551 return 0;
5552
5553 rollback_lower_mesh:
5554 to_i = i;
5555 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5556 if (i == to_i)
5557 break;
5558 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5559 }
5560
5561 i = NULL;
5562
5563 rollback_upper_mesh:
5564 to_i = i;
5565 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5566 if (i == to_i)
5567 break;
5568 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5569 }
5570
5571 i = j = NULL;
5572
5573 rollback_mesh:
5574 to_i = i;
5575 to_j = j;
5576 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5577 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5578 if (i == to_i && j == to_j)
5579 break;
5580 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5581 }
5582 if (i == to_i)
5583 break;
5584 }
5585
5586 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5587
5588 return ret;
5589 }
5590
5591 /**
5592 * netdev_upper_dev_link - Add a link to the upper device
5593 * @dev: device
5594 * @upper_dev: new upper device
5595 *
5596 * Adds a link to device which is upper to this one. The caller must hold
5597 * the RTNL lock. On a failure a negative errno code is returned.
5598 * On success the reference counts are adjusted and the function
5599 * returns zero.
5600 */
5601 int netdev_upper_dev_link(struct net_device *dev,
5602 struct net_device *upper_dev)
5603 {
5604 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5605 }
5606 EXPORT_SYMBOL(netdev_upper_dev_link);
5607
5608 /**
5609 * netdev_master_upper_dev_link - Add a master link to the upper device
5610 * @dev: device
5611 * @upper_dev: new upper device
5612 *
5613 * Adds a link to device which is upper to this one. In this case, only
5614 * one master upper device can be linked, although other non-master devices
5615 * might be linked as well. The caller must hold the RTNL lock.
5616 * On a failure a negative errno code is returned. On success the reference
5617 * counts are adjusted and the function returns zero.
5618 */
5619 int netdev_master_upper_dev_link(struct net_device *dev,
5620 struct net_device *upper_dev)
5621 {
5622 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5623 }
5624 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5625
5626 int netdev_master_upper_dev_link_private(struct net_device *dev,
5627 struct net_device *upper_dev,
5628 void *private)
5629 {
5630 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5631 }
5632 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5633
5634 /**
5635 * netdev_upper_dev_unlink - Removes a link to upper device
5636 * @dev: device
5637 * @upper_dev: upper device to remove
5638 *
5639 * Removes a link to device which is upper to this one. The caller must hold
5640 * the RTNL lock.
5641 */
5642 void netdev_upper_dev_unlink(struct net_device *dev,
5643 struct net_device *upper_dev)
5644 {
5645 struct netdev_notifier_changeupper_info changeupper_info;
5646 struct netdev_adjacent *i, *j;
5647 ASSERT_RTNL();
5648
5649 changeupper_info.upper_dev = upper_dev;
5650 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5651 changeupper_info.linking = false;
5652
5653 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5654 &changeupper_info.info);
5655
5656 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5657
5658 /* Here is the tricky part. We must remove all dev's lower
5659 * devices from all upper_dev's upper devices and vice
5660 * versa, to maintain the graph relationship.
5661 */
5662 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5663 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5664 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5665
5666 /* also remove the devices themselves from the lower/upper device
5667 * lists
5668 */
5669 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5670 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5671
5672 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5673 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5674
5675 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5676 &changeupper_info.info);
5677 }
5678 EXPORT_SYMBOL(netdev_upper_dev_unlink);
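/* Illustrative example (editor's sketch, not part of the original code):
 * a stacking driver such as a bonding-like module pairs the link and
 * unlink calls under the RTNL lock. "master" and "slave" below are
 * hypothetical net_device pointers owned by such a driver.
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave, master);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave, master);	(on teardown, rtnl held)
 */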
5679
5680 /**
5681 * netdev_bonding_info_change - Dispatch event about slave change
5682 * @dev: device
5683 * @bonding_info: info to dispatch
5684 *
5685 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5686 * The caller must hold the RTNL lock.
5687 */
5688 void netdev_bonding_info_change(struct net_device *dev,
5689 struct netdev_bonding_info *bonding_info)
5690 {
5691 struct netdev_notifier_bonding_info info;
5692
5693 memcpy(&info.bonding_info, bonding_info,
5694 sizeof(struct netdev_bonding_info));
5695 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5696 &info.info);
5697 }
5698 EXPORT_SYMBOL(netdev_bonding_info_change);
5699
5700 static void netdev_adjacent_add_links(struct net_device *dev)
5701 {
5702 struct netdev_adjacent *iter;
5703
5704 struct net *net = dev_net(dev);
5705
5706 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5707 if (!net_eq(net, dev_net(iter->dev)))
5708 continue;
5709 netdev_adjacent_sysfs_add(iter->dev, dev,
5710 &iter->dev->adj_list.lower);
5711 netdev_adjacent_sysfs_add(dev, iter->dev,
5712 &dev->adj_list.upper);
5713 }
5714
5715 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5716 if (!net_eq(net, dev_net(iter->dev)))
5717 continue;
5718 netdev_adjacent_sysfs_add(iter->dev, dev,
5719 &iter->dev->adj_list.upper);
5720 netdev_adjacent_sysfs_add(dev, iter->dev,
5721 &dev->adj_list.lower);
5722 }
5723 }
5724
5725 static void netdev_adjacent_del_links(struct net_device *dev)
5726 {
5727 struct netdev_adjacent *iter;
5728
5729 struct net *net = dev_net(dev);
5730
5731 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5732 if (!net_eq(net, dev_net(iter->dev)))
5733 continue;
5734 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5735 &iter->dev->adj_list.lower);
5736 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5737 &dev->adj_list.upper);
5738 }
5739
5740 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5741 if (!net_eq(net, dev_net(iter->dev)))
5742 continue;
5743 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5744 &iter->dev->adj_list.upper);
5745 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5746 &dev->adj_list.lower);
5747 }
5748 }
5749
5750 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5751 {
5752 struct netdev_adjacent *iter;
5753
5754 struct net *net = dev_net(dev);
5755
5756 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5757 if (!net_eq(net, dev_net(iter->dev)))
5758 continue;
5759 netdev_adjacent_sysfs_del(iter->dev, oldname,
5760 &iter->dev->adj_list.lower);
5761 netdev_adjacent_sysfs_add(iter->dev, dev,
5762 &iter->dev->adj_list.lower);
5763 }
5764
5765 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5766 if (!net_eq(net, dev_net(iter->dev)))
5767 continue;
5768 netdev_adjacent_sysfs_del(iter->dev, oldname,
5769 &iter->dev->adj_list.upper);
5770 netdev_adjacent_sysfs_add(iter->dev, dev,
5771 &iter->dev->adj_list.upper);
5772 }
5773 }
5774
5775 void *netdev_lower_dev_get_private(struct net_device *dev,
5776 struct net_device *lower_dev)
5777 {
5778 struct netdev_adjacent *lower;
5779
5780 if (!lower_dev)
5781 return NULL;
5782 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5783 if (!lower)
5784 return NULL;
5785
5786 return lower->private;
5787 }
5788 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5789
5790
5791 int dev_get_nest_level(struct net_device *dev,
5792 bool (*type_check)(struct net_device *dev))
5793 {
5794 struct net_device *lower = NULL;
5795 struct list_head *iter;
5796 int max_nest = -1;
5797 int nest;
5798
5799 ASSERT_RTNL();
5800
5801 netdev_for_each_lower_dev(dev, lower, iter) {
5802 nest = dev_get_nest_level(lower, type_check);
5803 if (max_nest < nest)
5804 max_nest = nest;
5805 }
5806
5807 if (type_check(dev))
5808 max_nest++;
5809
5810 return max_nest;
5811 }
5812 EXPORT_SYMBOL(dev_get_nest_level);
5813
5814 static void dev_change_rx_flags(struct net_device *dev, int flags)
5815 {
5816 const struct net_device_ops *ops = dev->netdev_ops;
5817
5818 if (ops->ndo_change_rx_flags)
5819 ops->ndo_change_rx_flags(dev, flags);
5820 }
5821
5822 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5823 {
5824 unsigned int old_flags = dev->flags;
5825 kuid_t uid;
5826 kgid_t gid;
5827
5828 ASSERT_RTNL();
5829
5830 dev->flags |= IFF_PROMISC;
5831 dev->promiscuity += inc;
5832 if (dev->promiscuity == 0) {
5833 /*
5834 * Avoid overflow.
5835 * If inc causes overflow, untouch promisc and return error.
5836 */
5837 if (inc < 0)
5838 dev->flags &= ~IFF_PROMISC;
5839 else {
5840 dev->promiscuity -= inc;
5841 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5842 dev->name);
5843 return -EOVERFLOW;
5844 }
5845 }
5846 if (dev->flags != old_flags) {
5847 pr_info("device %s %s promiscuous mode\n",
5848 dev->name,
5849 dev->flags & IFF_PROMISC ? "entered" : "left");
5850 if (audit_enabled) {
5851 current_uid_gid(&uid, &gid);
5852 audit_log(current->audit_context, GFP_ATOMIC,
5853 AUDIT_ANOM_PROMISCUOUS,
5854 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5855 dev->name, (dev->flags & IFF_PROMISC),
5856 (old_flags & IFF_PROMISC),
5857 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5858 from_kuid(&init_user_ns, uid),
5859 from_kgid(&init_user_ns, gid),
5860 audit_get_sessionid(current));
5861 }
5862
5863 dev_change_rx_flags(dev, IFF_PROMISC);
5864 }
5865 if (notify)
5866 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5867 return 0;
5868 }
5869
5870 /**
5871 * dev_set_promiscuity - update promiscuity count on a device
5872 * @dev: device
5873 * @inc: modifier
5874 *
5875 * Add or remove promiscuity from a device. While the count in the device
5876 * remains above zero the interface remains promiscuous. Once it hits zero
5877 * the device reverts back to normal filtering operation. A negative inc
5878 * value is used to drop promiscuity on the device.
5879 * Return 0 if successful or a negative errno code on error.
5880 */
5881 int dev_set_promiscuity(struct net_device *dev, int inc)
5882 {
5883 unsigned int old_flags = dev->flags;
5884 int err;
5885
5886 err = __dev_set_promiscuity(dev, inc, true);
5887 if (err < 0)
5888 return err;
5889 if (dev->flags != old_flags)
5890 dev_set_rx_mode(dev);
5891 return err;
5892 }
5893 EXPORT_SYMBOL(dev_set_promiscuity);
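/* Illustrative example (editor's sketch, not part of the original code):
 * a feature that needs to see all traffic, e.g. a packet tap, bumps the
 * counter while active and drops it on release, with the RTNL lock held
 * for both calls.
 *
 *	err = dev_set_promiscuity(dev, 1);
 *	if (err < 0)
 *		return err;
 *	...
 *	dev_set_promiscuity(dev, -1);
 */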
5894
5895 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5896 {
5897 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5898
5899 ASSERT_RTNL();
5900
5901 dev->flags |= IFF_ALLMULTI;
5902 dev->allmulti += inc;
5903 if (dev->allmulti == 0) {
5904 /*
5905 * Avoid overflow.
5906 * If inc causes overflow, untouch allmulti and return error.
5907 */
5908 if (inc < 0)
5909 dev->flags &= ~IFF_ALLMULTI;
5910 else {
5911 dev->allmulti -= inc;
5912 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5913 dev->name);
5914 return -EOVERFLOW;
5915 }
5916 }
5917 if (dev->flags ^ old_flags) {
5918 dev_change_rx_flags(dev, IFF_ALLMULTI);
5919 dev_set_rx_mode(dev);
5920 if (notify)
5921 __dev_notify_flags(dev, old_flags,
5922 dev->gflags ^ old_gflags);
5923 }
5924 return 0;
5925 }
5926
5927 /**
5928 * dev_set_allmulti - update allmulti count on a device
5929 * @dev: device
5930 * @inc: modifier
5931 *
5932 * Add or remove reception of all multicast frames to a device. While the
5933 * count in the device remains above zero the interface remains listening
5934 * to all multicast frames. Once it hits zero the device reverts back to normal
5935 * filtering operation. A negative @inc value is used to drop the counter
5936 * when releasing a resource needing all multicasts.
5937 * Return 0 if successful or a negative errno code on error.
5938 */
5939
5940 int dev_set_allmulti(struct net_device *dev, int inc)
5941 {
5942 return __dev_set_allmulti(dev, inc, true);
5943 }
5944 EXPORT_SYMBOL(dev_set_allmulti);
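/* Illustrative example (editor's sketch, not part of the original code):
 * a protocol or driver that must receive every multicast frame uses the
 * same counting pattern as promiscuity, under the RTNL lock.
 *
 *	err = dev_set_allmulti(dev, 1);
 *	if (err < 0)
 *		return err;
 *	...
 *	dev_set_allmulti(dev, -1);
 */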
5945
5946 /*
5947 * Upload unicast and multicast address lists to device and
5948 * configure RX filtering. When the device doesn't support unicast
5949 * filtering it is put in promiscuous mode while unicast addresses
5950 * are present.
5951 */
5952 void __dev_set_rx_mode(struct net_device *dev)
5953 {
5954 const struct net_device_ops *ops = dev->netdev_ops;
5955
5956 /* dev_open will call this function so the list will stay sane. */
5957 if (!(dev->flags&IFF_UP))
5958 return;
5959
5960 if (!netif_device_present(dev))
5961 return;
5962
5963 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5964 /* Unicast addresses changes may only happen under the rtnl,
5965 * therefore calling __dev_set_promiscuity here is safe.
5966 */
5967 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5968 __dev_set_promiscuity(dev, 1, false);
5969 dev->uc_promisc = true;
5970 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5971 __dev_set_promiscuity(dev, -1, false);
5972 dev->uc_promisc = false;
5973 }
5974 }
5975
5976 if (ops->ndo_set_rx_mode)
5977 ops->ndo_set_rx_mode(dev);
5978 }
5979
5980 void dev_set_rx_mode(struct net_device *dev)
5981 {
5982 netif_addr_lock_bh(dev);
5983 __dev_set_rx_mode(dev);
5984 netif_addr_unlock_bh(dev);
5985 }
5986
5987 /**
5988 * dev_get_flags - get flags reported to userspace
5989 * @dev: device
5990 *
5991 * Get the combination of flag bits exported through APIs to userspace.
5992 */
5993 unsigned int dev_get_flags(const struct net_device *dev)
5994 {
5995 unsigned int flags;
5996
5997 flags = (dev->flags & ~(IFF_PROMISC |
5998 IFF_ALLMULTI |
5999 IFF_RUNNING |
6000 IFF_LOWER_UP |
6001 IFF_DORMANT)) |
6002 (dev->gflags & (IFF_PROMISC |
6003 IFF_ALLMULTI));
6004
6005 if (netif_running(dev)) {
6006 if (netif_oper_up(dev))
6007 flags |= IFF_RUNNING;
6008 if (netif_carrier_ok(dev))
6009 flags |= IFF_LOWER_UP;
6010 if (netif_dormant(dev))
6011 flags |= IFF_DORMANT;
6012 }
6013
6014 return flags;
6015 }
6016 EXPORT_SYMBOL(dev_get_flags);
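/* Illustrative example (editor's sketch, not part of the original code):
 * this is the value reported for SIOCGIFFLAGS, so an in-kernel caller
 * that wants "administratively up and operational" could test it like
 * this; handle_link_up() is a hypothetical helper.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if ((flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING))
 *		handle_link_up(dev);
 */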
6017
6018 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6019 {
6020 unsigned int old_flags = dev->flags;
6021 int ret;
6022
6023 ASSERT_RTNL();
6024
6025 /*
6026 * Set the flags on our device.
6027 */
6028
6029 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6030 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6031 IFF_AUTOMEDIA)) |
6032 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6033 IFF_ALLMULTI));
6034
6035 /*
6036 * Load in the correct multicast list now the flags have changed.
6037 */
6038
6039 if ((old_flags ^ flags) & IFF_MULTICAST)
6040 dev_change_rx_flags(dev, IFF_MULTICAST);
6041
6042 dev_set_rx_mode(dev);
6043
6044 /*
6045 * Have we downed the interface? We handle IFF_UP ourselves
6046 * according to user attempts to set it, rather than blindly
6047 * setting it.
6048 */
6049
6050 ret = 0;
6051 if ((old_flags ^ flags) & IFF_UP)
6052 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6053
6054 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6055 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6056 unsigned int old_flags = dev->flags;
6057
6058 dev->gflags ^= IFF_PROMISC;
6059
6060 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6061 if (dev->flags != old_flags)
6062 dev_set_rx_mode(dev);
6063 }
6064
6065 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6066 is important. Some (broken) drivers set IFF_PROMISC when
6067 IFF_ALLMULTI is requested, without asking us and without reporting it.
6068 */
6069 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6070 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6071
6072 dev->gflags ^= IFF_ALLMULTI;
6073 __dev_set_allmulti(dev, inc, false);
6074 }
6075
6076 return ret;
6077 }
6078
6079 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6080 unsigned int gchanges)
6081 {
6082 unsigned int changes = dev->flags ^ old_flags;
6083
6084 if (gchanges)
6085 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6086
6087 if (changes & IFF_UP) {
6088 if (dev->flags & IFF_UP)
6089 call_netdevice_notifiers(NETDEV_UP, dev);
6090 else
6091 call_netdevice_notifiers(NETDEV_DOWN, dev);
6092 }
6093
6094 if (dev->flags & IFF_UP &&
6095 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6096 struct netdev_notifier_change_info change_info;
6097
6098 change_info.flags_changed = changes;
6099 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6100 &change_info.info);
6101 }
6102 }
6103
6104 /**
6105 * dev_change_flags - change device settings
6106 * @dev: device
6107 * @flags: device state flags
6108 *
6109 * Change settings on device based state flags. The flags are
6110 * in the userspace exported format.
6111 */
6112 int dev_change_flags(struct net_device *dev, unsigned int flags)
6113 {
6114 int ret;
6115 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6116
6117 ret = __dev_change_flags(dev, flags);
6118 if (ret < 0)
6119 return ret;
6120
6121 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6122 __dev_notify_flags(dev, old_flags, changes);
6123 return ret;
6124 }
6125 EXPORT_SYMBOL(dev_change_flags);
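/* Illustrative example (editor's sketch, not part of the original code):
 * bringing an interface administratively up from kernel code is just a
 * matter of setting IFF_UP in the userspace-visible flags, with the RTNL
 * lock held.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */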
6126
6127 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6128 {
6129 const struct net_device_ops *ops = dev->netdev_ops;
6130
6131 if (ops->ndo_change_mtu)
6132 return ops->ndo_change_mtu(dev, new_mtu);
6133
6134 /* Pairs with all the lockless reads of dev->mtu in the stack */
6135 WRITE_ONCE(dev->mtu, new_mtu);
6136 return 0;
6137 }
6138
6139 /**
6140 * dev_set_mtu - Change maximum transfer unit
6141 * @dev: device
6142 * @new_mtu: new transfer unit
6143 *
6144 * Change the maximum transfer size of the network device.
6145 */
6146 int dev_set_mtu(struct net_device *dev, int new_mtu)
6147 {
6148 int err, orig_mtu;
6149
6150 if (new_mtu == dev->mtu)
6151 return 0;
6152
6153 /* MTU must not be negative. */
6154 if (new_mtu < 0)
6155 return -EINVAL;
6156
6157 if (!netif_device_present(dev))
6158 return -ENODEV;
6159
6160 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6161 err = notifier_to_errno(err);
6162 if (err)
6163 return err;
6164
6165 orig_mtu = dev->mtu;
6166 err = __dev_set_mtu(dev, new_mtu);
6167
6168 if (!err) {
6169 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6170 orig_mtu);
6171 err = notifier_to_errno(err);
6172 if (err) {
6173 /* setting mtu back and notifying everyone again,
6174 * so that they have a chance to revert changes.
6175 */
6176 __dev_set_mtu(dev, orig_mtu);
6177 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6178 new_mtu);
6179 }
6180 }
6181 return err;
6182 }
6183 EXPORT_SYMBOL(dev_set_mtu);
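/* Illustrative example (editor's sketch, not part of the original code):
 * callers adjust the MTU under the RTNL lock and must be prepared for
 * the notifier chain to veto the change. The value 9000 is only an
 * example (a common jumbo-frame size).
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: could not set MTU: %d\n", dev->name, err);
 */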
6184
6185 /**
6186 * dev_set_group - Change group this device belongs to
6187 * @dev: device
6188 * @new_group: group this device should belong to
6189 */
6190 void dev_set_group(struct net_device *dev, int new_group)
6191 {
6192 dev->group = new_group;
6193 }
6194 EXPORT_SYMBOL(dev_set_group);
6195
6196 /**
6197 * dev_set_mac_address - Change Media Access Control Address
6198 * @dev: device
6199 * @sa: new address
6200 *
6201 * Change the hardware (MAC) address of the device
6202 */
6203 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6204 {
6205 const struct net_device_ops *ops = dev->netdev_ops;
6206 int err;
6207
6208 if (!ops->ndo_set_mac_address)
6209 return -EOPNOTSUPP;
6210 if (sa->sa_family != dev->type)
6211 return -EINVAL;
6212 if (!netif_device_present(dev))
6213 return -ENODEV;
6214 err = ops->ndo_set_mac_address(dev, sa);
6215 if (err)
6216 return err;
6217 dev->addr_assign_type = NET_ADDR_SET;
6218 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6219 add_device_randomness(dev->dev_addr, dev->addr_len);
6220 return 0;
6221 }
6222 EXPORT_SYMBOL(dev_set_mac_address);
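/* Illustrative example (editor's sketch, not part of the original code):
 * the new address is passed as a struct sockaddr whose family must match
 * dev->type; new_mac below is a hypothetical buffer of dev->addr_len
 * bytes, and the call is normally made with the RTNL lock held.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa);
 */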
6223
6224 /**
6225 * dev_change_carrier - Change device carrier
6226 * @dev: device
6227 * @new_carrier: new value
6228 *
6229 * Change device carrier
6230 */
6231 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6232 {
6233 const struct net_device_ops *ops = dev->netdev_ops;
6234
6235 if (!ops->ndo_change_carrier)
6236 return -EOPNOTSUPP;
6237 if (!netif_device_present(dev))
6238 return -ENODEV;
6239 return ops->ndo_change_carrier(dev, new_carrier);
6240 }
6241 EXPORT_SYMBOL(dev_change_carrier);
6242
6243 /**
6244 * dev_get_phys_port_id - Get device physical port ID
6245 * @dev: device
6246 * @ppid: port ID
6247 *
6248 * Get device physical port ID
6249 */
6250 int dev_get_phys_port_id(struct net_device *dev,
6251 struct netdev_phys_item_id *ppid)
6252 {
6253 const struct net_device_ops *ops = dev->netdev_ops;
6254
6255 if (!ops->ndo_get_phys_port_id)
6256 return -EOPNOTSUPP;
6257 return ops->ndo_get_phys_port_id(dev, ppid);
6258 }
6259 EXPORT_SYMBOL(dev_get_phys_port_id);
6260
6261 /**
6262 * dev_get_phys_port_name - Get device physical port name
6263 * @dev: device
6264 * @name: port name
6265 *
6266 * Get device physical port name
6267 */
6268 int dev_get_phys_port_name(struct net_device *dev,
6269 char *name, size_t len)
6270 {
6271 const struct net_device_ops *ops = dev->netdev_ops;
6272
6273 if (!ops->ndo_get_phys_port_name)
6274 return -EOPNOTSUPP;
6275 return ops->ndo_get_phys_port_name(dev, name, len);
6276 }
6277 EXPORT_SYMBOL(dev_get_phys_port_name);
6278
6279 /**
6280 * dev_change_proto_down - update protocol port state information
6281 * @dev: device
6282 * @proto_down: new value
6283 *
6284 * This info can be used by switch drivers to set the phys state of the
6285 * port.
6286 */
6287 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6288 {
6289 const struct net_device_ops *ops = dev->netdev_ops;
6290
6291 if (!ops->ndo_change_proto_down)
6292 return -EOPNOTSUPP;
6293 if (!netif_device_present(dev))
6294 return -ENODEV;
6295 return ops->ndo_change_proto_down(dev, proto_down);
6296 }
6297 EXPORT_SYMBOL(dev_change_proto_down);
6298
6299 /**
6300 * dev_new_index - allocate an ifindex
6301 * @net: the applicable net namespace
6302 *
6303 * Returns a suitable unique value for a new device interface
6304 * number. The caller must hold the rtnl semaphore or the
6305 * dev_base_lock to be sure it remains unique.
6306 */
6307 static int dev_new_index(struct net *net)
6308 {
6309 int ifindex = net->ifindex;
6310 for (;;) {
6311 if (++ifindex <= 0)
6312 ifindex = 1;
6313 if (!__dev_get_by_index(net, ifindex))
6314 return net->ifindex = ifindex;
6315 }
6316 }
6317
6318 /* Delayed registration/unregistration */
6319 static LIST_HEAD(net_todo_list);
6320 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6321
6322 static void net_set_todo(struct net_device *dev)
6323 {
6324 list_add_tail(&dev->todo_list, &net_todo_list);
6325 dev_net(dev)->dev_unreg_count++;
6326 }
6327
6328 static void rollback_registered_many(struct list_head *head)
6329 {
6330 struct net_device *dev, *tmp;
6331 LIST_HEAD(close_head);
6332
6333 BUG_ON(dev_boot_phase);
6334 ASSERT_RTNL();
6335
6336 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6337 /* Some devices call this without ever having been
6338 * registered, to unwind a failed initialization. Remove
6339 * those devices and proceed with the remaining ones.
6340 */
6341 if (dev->reg_state == NETREG_UNINITIALIZED) {
6342 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6343 dev->name, dev);
6344
6345 WARN_ON(1);
6346 list_del(&dev->unreg_list);
6347 continue;
6348 }
6349 dev->dismantle = true;
6350 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6351 }
6352
6353 /* If device is running, close it first. */
6354 list_for_each_entry(dev, head, unreg_list)
6355 list_add_tail(&dev->close_list, &close_head);
6356 dev_close_many(&close_head, true);
6357
6358 list_for_each_entry(dev, head, unreg_list) {
6359 /* And unlink it from device chain. */
6360 unlist_netdevice(dev);
6361
6362 dev->reg_state = NETREG_UNREGISTERING;
6363 on_each_cpu(flush_backlog, dev, 1);
6364 }
6365
6366 synchronize_net();
6367
6368 list_for_each_entry(dev, head, unreg_list) {
6369 struct sk_buff *skb = NULL;
6370
6371 /* Shutdown queueing discipline. */
6372 dev_shutdown(dev);
6373
6374
6375 /* Notify protocols, that we are about to destroy
6376 this device. They should clean all the things.
6377 */
6378 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6379
6380 if (!dev->rtnl_link_ops ||
6381 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6382 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6383 GFP_KERNEL);
6384
6385 /*
6386 * Flush the unicast and multicast chains
6387 */
6388 dev_uc_flush(dev);
6389 dev_mc_flush(dev);
6390
6391 if (dev->netdev_ops->ndo_uninit)
6392 dev->netdev_ops->ndo_uninit(dev);
6393
6394 if (skb)
6395 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6396
6397 /* Notifier chain MUST detach us all upper devices. */
6398 WARN_ON(netdev_has_any_upper_dev(dev));
6399
6400 /* Remove entries from kobject tree */
6401 netdev_unregister_kobject(dev);
6402 #ifdef CONFIG_XPS
6403 /* Remove XPS queueing entries */
6404 netif_reset_xps_queues_gt(dev, 0);
6405 #endif
6406 }
6407
6408 synchronize_net();
6409
6410 list_for_each_entry(dev, head, unreg_list)
6411 dev_put(dev);
6412 }
6413
6414 static void rollback_registered(struct net_device *dev)
6415 {
6416 LIST_HEAD(single);
6417
6418 list_add(&dev->unreg_list, &single);
6419 rollback_registered_many(&single);
6420 list_del(&single);
6421 }
6422
6423 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6424 struct net_device *upper, netdev_features_t features)
6425 {
6426 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6427 netdev_features_t feature;
6428 int feature_bit;
6429
6430 for_each_netdev_feature(upper_disables, feature_bit) {
6431 feature = __NETIF_F_BIT(feature_bit);
6432 if (!(upper->wanted_features & feature)
6433 && (features & feature)) {
6434 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6435 &feature, upper->name);
6436 features &= ~feature;
6437 }
6438 }
6439
6440 return features;
6441 }
6442
6443 static void netdev_sync_lower_features(struct net_device *upper,
6444 struct net_device *lower, netdev_features_t features)
6445 {
6446 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6447 netdev_features_t feature;
6448 int feature_bit;
6449
6450 for_each_netdev_feature(upper_disables, feature_bit) {
6451 feature = __NETIF_F_BIT(feature_bit);
6452 if (!(features & feature) && (lower->features & feature)) {
6453 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6454 &feature, lower->name);
6455 lower->wanted_features &= ~feature;
6456 __netdev_update_features(lower);
6457
6458 if (unlikely(lower->features & feature))
6459 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6460 &feature, lower->name);
6461 else
6462 netdev_features_change(lower);
6463 }
6464 }
6465 }
6466
6467 static netdev_features_t netdev_fix_features(struct net_device *dev,
6468 netdev_features_t features)
6469 {
6470 /* Fix illegal checksum combinations */
6471 if ((features & NETIF_F_HW_CSUM) &&
6472 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6473 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6474 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6475 }
6476
6477 /* TSO requires that SG is present as well. */
6478 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6479 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6480 features &= ~NETIF_F_ALL_TSO;
6481 }
6482
6483 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6484 !(features & NETIF_F_IP_CSUM)) {
6485 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6486 features &= ~NETIF_F_TSO;
6487 features &= ~NETIF_F_TSO_ECN;
6488 }
6489
6490 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6491 !(features & NETIF_F_IPV6_CSUM)) {
6492 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6493 features &= ~NETIF_F_TSO6;
6494 }
6495
6496 /* TSO ECN requires that TSO is present as well. */
6497 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6498 features &= ~NETIF_F_TSO_ECN;
6499
6500 /* Software GSO depends on SG. */
6501 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6502 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6503 features &= ~NETIF_F_GSO;
6504 }
6505
6506 /* UFO needs SG and checksumming */
6507 if (features & NETIF_F_UFO) {
6508 /* maybe split UFO into V4 and V6? */
6509 if (!((features & NETIF_F_GEN_CSUM) ||
6510 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6511 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6512 netdev_dbg(dev,
6513 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6514 features &= ~NETIF_F_UFO;
6515 }
6516
6517 if (!(features & NETIF_F_SG)) {
6518 netdev_dbg(dev,
6519 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6520 features &= ~NETIF_F_UFO;
6521 }
6522 }
6523
6524 #ifdef CONFIG_NET_RX_BUSY_POLL
6525 if (dev->netdev_ops->ndo_busy_poll)
6526 features |= NETIF_F_BUSY_POLL;
6527 else
6528 #endif
6529 features &= ~NETIF_F_BUSY_POLL;
6530
6531 return features;
6532 }
6533
6534 int __netdev_update_features(struct net_device *dev)
6535 {
6536 struct net_device *upper, *lower;
6537 netdev_features_t features;
6538 struct list_head *iter;
6539 int err = -1;
6540
6541 ASSERT_RTNL();
6542
6543 features = netdev_get_wanted_features(dev);
6544
6545 if (dev->netdev_ops->ndo_fix_features)
6546 features = dev->netdev_ops->ndo_fix_features(dev, features);
6547
6548 /* driver might be less strict about feature dependencies */
6549 features = netdev_fix_features(dev, features);
6550
6551 /* some features can't be enabled if they're off on an upper device */
6552 netdev_for_each_upper_dev_rcu(dev, upper, iter)
6553 features = netdev_sync_upper_features(dev, upper, features);
6554
6555 if (dev->features == features)
6556 goto sync_lower;
6557
6558 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6559 &dev->features, &features);
6560
6561 if (dev->netdev_ops->ndo_set_features)
6562 err = dev->netdev_ops->ndo_set_features(dev, features);
6563 else
6564 err = 0;
6565
6566 if (unlikely(err < 0)) {
6567 netdev_err(dev,
6568 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6569 err, &features, &dev->features);
6570 /* return non-0 since some features might have changed and
6571 * it's better to fire a spurious notification than miss it
6572 */
6573 return -1;
6574 }
6575
6576 sync_lower:
6577 /* some features must be disabled on lower devices when disabled
6578 * on an upper device (think: bonding master or bridge)
6579 */
6580 netdev_for_each_lower_dev(dev, lower, iter)
6581 netdev_sync_lower_features(dev, lower, features);
6582
6583 if (!err)
6584 dev->features = features;
6585
6586 return err < 0 ? 0 : 1;
6587 }
6588
6589 /**
6590 * netdev_update_features - recalculate device features
6591 * @dev: the device to check
6592 *
6593 * Recalculate dev->features set and send notifications if it
6594 * has changed. Should be called after driver- or hardware-dependent
6595 * conditions that influence the features might have changed.
6596 */
6597 void netdev_update_features(struct net_device *dev)
6598 {
6599 if (__netdev_update_features(dev))
6600 netdev_features_change(dev);
6601 }
6602 EXPORT_SYMBOL(netdev_update_features);
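/* Illustrative example (editor's sketch, not part of the original code):
 * a driver whose offload constraints change at runtime (say after a
 * firmware reload or MTU change) asks the core to recompute the feature
 * set, with the RTNL lock held.
 *
 *	rtnl_lock();
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 */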
6603
6604 /**
6605 * netdev_change_features - recalculate device features
6606 * @dev: the device to check
6607 *
6608 * Recalculate dev->features set and send notifications even
6609 * if they have not changed. Should be called instead of
6610 * netdev_update_features() if also dev->vlan_features might
6611 * have changed to allow the changes to be propagated to stacked
6612 * VLAN devices.
6613 */
6614 void netdev_change_features(struct net_device *dev)
6615 {
6616 __netdev_update_features(dev);
6617 netdev_features_change(dev);
6618 }
6619 EXPORT_SYMBOL(netdev_change_features);
6620
6621 /**
6622 * netif_stacked_transfer_operstate - transfer operstate
6623 * @rootdev: the root or lower level device to transfer state from
6624 * @dev: the device to transfer operstate to
6625 *
6626 * Transfer operational state from root to device. This is normally
6627 * called when a stacking relationship exists between the root
6628 * device and the device (a leaf device).
6629 */
6630 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6631 struct net_device *dev)
6632 {
6633 if (rootdev->operstate == IF_OPER_DORMANT)
6634 netif_dormant_on(dev);
6635 else
6636 netif_dormant_off(dev);
6637
6638 if (netif_carrier_ok(rootdev)) {
6639 if (!netif_carrier_ok(dev))
6640 netif_carrier_on(dev);
6641 } else {
6642 if (netif_carrier_ok(dev))
6643 netif_carrier_off(dev);
6644 }
6645 }
6646 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6647
6648 #ifdef CONFIG_SYSFS
6649 static int netif_alloc_rx_queues(struct net_device *dev)
6650 {
6651 unsigned int i, count = dev->num_rx_queues;
6652 struct netdev_rx_queue *rx;
6653 size_t sz = count * sizeof(*rx);
6654
6655 BUG_ON(count < 1);
6656
6657 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6658 if (!rx) {
6659 rx = vzalloc(sz);
6660 if (!rx)
6661 return -ENOMEM;
6662 }
6663 dev->_rx = rx;
6664
6665 for (i = 0; i < count; i++)
6666 rx[i].dev = dev;
6667 return 0;
6668 }
6669 #endif
6670
6671 static void netdev_init_one_queue(struct net_device *dev,
6672 struct netdev_queue *queue, void *_unused)
6673 {
6674 /* Initialize queue lock */
6675 spin_lock_init(&queue->_xmit_lock);
6676 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6677 queue->xmit_lock_owner = -1;
6678 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6679 queue->dev = dev;
6680 #ifdef CONFIG_BQL
6681 dql_init(&queue->dql, HZ);
6682 #endif
6683 }
6684
6685 static void netif_free_tx_queues(struct net_device *dev)
6686 {
6687 kvfree(dev->_tx);
6688 }
6689
6690 static int netif_alloc_netdev_queues(struct net_device *dev)
6691 {
6692 unsigned int count = dev->num_tx_queues;
6693 struct netdev_queue *tx;
6694 size_t sz = count * sizeof(*tx);
6695
6696 if (count < 1 || count > 0xffff)
6697 return -EINVAL;
6698
6699 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6700 if (!tx) {
6701 tx = vzalloc(sz);
6702 if (!tx)
6703 return -ENOMEM;
6704 }
6705 dev->_tx = tx;
6706
6707 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6708 spin_lock_init(&dev->tx_global_lock);
6709
6710 return 0;
6711 }
6712
6713 void netif_tx_stop_all_queues(struct net_device *dev)
6714 {
6715 unsigned int i;
6716
6717 for (i = 0; i < dev->num_tx_queues; i++) {
6718 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6719 netif_tx_stop_queue(txq);
6720 }
6721 }
6722 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6723
6724 /**
6725 * register_netdevice - register a network device
6726 * @dev: device to register
6727 *
6728 * Take a completed network device structure and add it to the kernel
6729 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6730 * chain. 0 is returned on success. A negative errno code is returned
6731 * on a failure to set up the device, or if the name is a duplicate.
6732 *
6733 * Callers must hold the rtnl semaphore. You may want
6734 * register_netdev() instead of this.
6735 *
6736 * BUGS:
6737 * The locking appears insufficient to guarantee two parallel registers
6738 * will not get the same name.
6739 */
6740
6741 int register_netdevice(struct net_device *dev)
6742 {
6743 int ret;
6744 struct net *net = dev_net(dev);
6745
6746 BUG_ON(dev_boot_phase);
6747 ASSERT_RTNL();
6748
6749 might_sleep();
6750
6751 /* When net_device's are persistent, this will be fatal. */
6752 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6753 BUG_ON(!net);
6754
6755 spin_lock_init(&dev->addr_list_lock);
6756 netdev_set_addr_lockdep_class(dev);
6757
6758 ret = dev_get_valid_name(net, dev, dev->name);
6759 if (ret < 0)
6760 goto out;
6761
6762 /* Init, if this function is available */
6763 if (dev->netdev_ops->ndo_init) {
6764 ret = dev->netdev_ops->ndo_init(dev);
6765 if (ret) {
6766 if (ret > 0)
6767 ret = -EIO;
6768 goto out;
6769 }
6770 }
6771
6772 if (((dev->hw_features | dev->features) &
6773 NETIF_F_HW_VLAN_CTAG_FILTER) &&
6774 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6775 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6776 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6777 ret = -EINVAL;
6778 goto err_uninit;
6779 }
6780
6781 ret = -EBUSY;
6782 if (!dev->ifindex)
6783 dev->ifindex = dev_new_index(net);
6784 else if (__dev_get_by_index(net, dev->ifindex))
6785 goto err_uninit;
6786
6787 /* Transfer changeable features to wanted_features and enable
6788 * software offloads (GSO and GRO).
6789 */
6790 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6791 dev->features |= NETIF_F_SOFT_FEATURES;
6792 dev->wanted_features = dev->features & dev->hw_features;
6793
6794 if (!(dev->flags & IFF_LOOPBACK)) {
6795 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6796 }
6797
6798 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6799 */
6800 dev->vlan_features |= NETIF_F_HIGHDMA;
6801
6802 /* Make NETIF_F_SG inheritable to tunnel devices.
6803 */
6804 dev->hw_enc_features |= NETIF_F_SG;
6805
6806 /* Make NETIF_F_SG inheritable to MPLS.
6807 */
6808 dev->mpls_features |= NETIF_F_SG;
6809
6810 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6811 ret = notifier_to_errno(ret);
6812 if (ret)
6813 goto err_uninit;
6814
6815 ret = netdev_register_kobject(dev);
6816 if (ret)
6817 goto err_uninit;
6818 dev->reg_state = NETREG_REGISTERED;
6819
6820 __netdev_update_features(dev);
6821
6822 /*
6823 * Default initial state at registration is that the
6824 * device is present.
6825 */
6826
6827 set_bit(__LINK_STATE_PRESENT, &dev->state);
6828
6829 linkwatch_init_dev(dev);
6830
6831 dev_init_scheduler(dev);
6832 dev_hold(dev);
6833 list_netdevice(dev);
6834 add_device_randomness(dev->dev_addr, dev->addr_len);
6835
6836 /* If the device has permanent device address, driver should
6837 * set dev_addr and also addr_assign_type should be set to
6838 * NET_ADDR_PERM (default value).
6839 */
6840 if (dev->addr_assign_type == NET_ADDR_PERM)
6841 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6842
6843 /* Notify protocols, that a new device appeared. */
6844 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6845 ret = notifier_to_errno(ret);
6846 if (ret) {
6847 rollback_registered(dev);
6848 rcu_barrier();
6849
6850 dev->reg_state = NETREG_UNREGISTERED;
6851 /* We should put the kobject reference that
6852 * netdev_unregister_kobject() left held, otherwise
6853 * the net device cannot be freed when the
6854 * driver calls free_netdev(), because the
6855 * kobject is still being held.
6856 */
6857 kobject_put(&dev->dev.kobj);
6858 }
6859 /*
6860 * Prevent userspace races by waiting until the network
6861 * device is fully setup before sending notifications.
6862 */
6863 if (!dev->rtnl_link_ops ||
6864 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6865 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6866
6867 out:
6868 return ret;
6869
6870 err_uninit:
6871 if (dev->netdev_ops->ndo_uninit)
6872 dev->netdev_ops->ndo_uninit(dev);
6873 goto out;
6874 }
6875 EXPORT_SYMBOL(register_netdevice);
6876
6877 /**
6878 * init_dummy_netdev - init a dummy network device for NAPI
6879 * @dev: device to init
6880 *
6881 * This takes a network device structure and initializes the minimum
6882 * number of fields so it can be used to schedule NAPI polls without
6883 * registering a full blown interface. This is to be used by drivers
6884 * that need to tie several hardware interfaces to a single NAPI
6885 * poll scheduler due to HW limitations.
6886 */
6887 int init_dummy_netdev(struct net_device *dev)
6888 {
6889 /* Clear everything. Note we don't initialize spinlocks
6890 * as they aren't supposed to be taken by any of the
6891 * NAPI code and this dummy netdev is supposed to be
6892 * only ever used for NAPI polls
6893 */
6894 memset(dev, 0, sizeof(struct net_device));
6895
6896 /* make sure we BUG if trying to hit standard
6897 * register/unregister code path
6898 */
6899 dev->reg_state = NETREG_DUMMY;
6900
6901 /* NAPI wants this */
6902 INIT_LIST_HEAD(&dev->napi_list);
6903
6904 /* a dummy interface is started by default */
6905 set_bit(__LINK_STATE_PRESENT, &dev->state);
6906 set_bit(__LINK_STATE_START, &dev->state);
6907
6908 /* Note : We don't allocate pcpu_refcnt for dummy devices,
6909 * because users of this 'device' don't need to change
6910 * its refcount.
6911 */
6912
6913 return 0;
6914 }
6915 EXPORT_SYMBOL_GPL(init_dummy_netdev);
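/* Illustrative example (editor's sketch, not part of the original code):
 * a driver multiplexing several hardware channels onto one NAPI context
 * backs the NAPI instance with a dummy netdev. "priv" and "my_poll" are
 * hypothetical members and callbacks of such a driver.
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, my_poll, 64);
 *	napi_enable(&priv->napi);
 */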
6916
6917
6918 /**
6919 * register_netdev - register a network device
6920 * @dev: device to register
6921 *
6922 * Take a completed network device structure and add it to the kernel
6923 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6924 * chain. 0 is returned on success. A negative errno code is returned
6925 * on a failure to set up the device, or if the name is a duplicate.
6926 *
6927 * This is a wrapper around register_netdevice that takes the rtnl semaphore
6928 * and expands the device name if you passed a format string to
6929 * alloc_netdev.
6930 */
6931 int register_netdev(struct net_device *dev)
6932 {
6933 int err;
6934
6935 rtnl_lock();
6936 err = register_netdevice(dev);
6937 rtnl_unlock();
6938 return err;
6939 }
6940 EXPORT_SYMBOL(register_netdev);
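/* Illustrative example (editor's sketch, not part of the original code):
 * the usual probe-time pairing; struct my_priv is hypothetical, and the
 * teardown path is the mirror image (see unregister_netdev() and
 * free_netdev() further down).
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	...
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */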
6941
6942 int netdev_refcnt_read(const struct net_device *dev)
6943 {
6944 int i, refcnt = 0;
6945
6946 for_each_possible_cpu(i)
6947 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6948 return refcnt;
6949 }
6950 EXPORT_SYMBOL(netdev_refcnt_read);
6951
6952 /**
6953 * netdev_wait_allrefs - wait until all references are gone.
6954 * @dev: target net_device
6955 *
6956 * This is called when unregistering network devices.
6957 *
6958 * Any protocol or device that holds a reference should register
6959 * for netdevice notification, and cleanup and put back the
6960 * reference if they receive an UNREGISTER event.
6961 * We can get stuck here if buggy protocols don't correctly
6962 * call dev_put.
6963 */
6964 static void netdev_wait_allrefs(struct net_device *dev)
6965 {
6966 unsigned long rebroadcast_time, warning_time;
6967 int refcnt;
6968
6969 linkwatch_forget_dev(dev);
6970
6971 rebroadcast_time = warning_time = jiffies;
6972 refcnt = netdev_refcnt_read(dev);
6973
6974 while (refcnt != 0) {
6975 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6976 rtnl_lock();
6977
6978 /* Rebroadcast unregister notification */
6979 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6980
6981 __rtnl_unlock();
6982 rcu_barrier();
6983 rtnl_lock();
6984
6985 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6986 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6987 &dev->state)) {
6988 /* We must not have linkwatch events
6989 * pending on unregister. If this
6990 * happens, we simply run the queue
6991 * unscheduled, resulting in a noop
6992 * for this device.
6993 */
6994 linkwatch_run_queue();
6995 }
6996
6997 __rtnl_unlock();
6998
6999 rebroadcast_time = jiffies;
7000 }
7001
7002 msleep(250);
7003
7004 refcnt = netdev_refcnt_read(dev);
7005
7006 if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
7007 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7008 dev->name, refcnt);
7009 warning_time = jiffies;
7010 }
7011 }
7012 }
7013
7014 /* The sequence is:
7015 *
7016 * rtnl_lock();
7017 * ...
7018 * register_netdevice(x1);
7019 * register_netdevice(x2);
7020 * ...
7021 * unregister_netdevice(y1);
7022 * unregister_netdevice(y2);
7023 * ...
7024 * rtnl_unlock();
7025 * free_netdev(y1);
7026 * free_netdev(y2);
7027 *
7028 * We are invoked by rtnl_unlock().
7029 * This allows us to deal with problems:
7030 * 1) We can delete sysfs objects which invoke hotplug
7031 * without deadlocking with linkwatch via keventd.
7032 * 2) Since we run with the RTNL semaphore not held, we can sleep
7033 * safely in order to wait for the netdev refcnt to drop to zero.
7034 *
7035 * We must not return until all unregister events added during
7036 * the interval the lock was held have been completed.
7037 */
7038 void netdev_run_todo(void)
7039 {
7040 struct list_head list;
7041
7042 /* Snapshot list, allow later requests */
7043 list_replace_init(&net_todo_list, &list);
7044
7045 __rtnl_unlock();
7046
7047
7048 /* Wait for rcu callbacks to finish before next phase */
7049 if (!list_empty(&list))
7050 rcu_barrier();
7051
7052 while (!list_empty(&list)) {
7053 struct net_device *dev
7054 = list_first_entry(&list, struct net_device, todo_list);
7055 list_del(&dev->todo_list);
7056
7057 rtnl_lock();
7058 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7059 __rtnl_unlock();
7060
7061 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7062 pr_err("network todo '%s' but state %d\n",
7063 dev->name, dev->reg_state);
7064 dump_stack();
7065 continue;
7066 }
7067
7068 dev->reg_state = NETREG_UNREGISTERED;
7069
7070 netdev_wait_allrefs(dev);
7071
7072 /* paranoia */
7073 BUG_ON(netdev_refcnt_read(dev));
7074 BUG_ON(!list_empty(&dev->ptype_all));
7075 BUG_ON(!list_empty(&dev->ptype_specific));
7076 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7077 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7078 WARN_ON(dev->dn_ptr);
7079
7080 if (dev->destructor)
7081 dev->destructor(dev);
7082
7083 /* Report a network device has been unregistered */
7084 rtnl_lock();
7085 dev_net(dev)->dev_unreg_count--;
7086 __rtnl_unlock();
7087 wake_up(&netdev_unregistering_wq);
7088
7089 /* Free network device */
7090 kobject_put(&dev->dev.kobj);
7091 }
7092 }
7093
7094 /* Convert net_device_stats to rtnl_link_stats64. They have the same
7095 * fields in the same order, with only the type differing.
7096 */
7097 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7098 const struct net_device_stats *netdev_stats)
7099 {
7100 #if BITS_PER_LONG == 64
7101 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
7102 memcpy(stats64, netdev_stats, sizeof(*stats64));
7103 #else
7104 size_t i, n = sizeof(*stats64) / sizeof(u64);
7105 const unsigned long *src = (const unsigned long *)netdev_stats;
7106 u64 *dst = (u64 *)stats64;
7107
7108 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
7109 sizeof(*stats64) / sizeof(u64));
7110 for (i = 0; i < n; i++)
7111 dst[i] = src[i];
7112 #endif
7113 }
7114 EXPORT_SYMBOL(netdev_stats_to_stats64);
7115
7116 /**
7117 * dev_get_stats - get network device statistics
7118 * @dev: device to get statistics from
7119 * @storage: place to store stats
7120 *
7121 * Get network statistics from device. Return @storage.
7122 * The device driver may provide its own method by setting
7123 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7124 * otherwise the internal statistics structure is used.
7125 */
7126 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7127 struct rtnl_link_stats64 *storage)
7128 {
7129 const struct net_device_ops *ops = dev->netdev_ops;
7130
7131 if (ops->ndo_get_stats64) {
7132 memset(storage, 0, sizeof(*storage));
7133 ops->ndo_get_stats64(dev, storage);
7134 } else if (ops->ndo_get_stats) {
7135 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7136 } else {
7137 netdev_stats_to_stats64(storage, &dev->stats);
7138 }
7139 storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
7140 storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
7141 return storage;
7142 }
7143 EXPORT_SYMBOL(dev_get_stats);
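/* Illustrative example (editor's sketch, not part of the original code):
 * callers supply the storage and get a consolidated 64-bit view no
 * matter which driver method is implemented.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets, %llu tx packets\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);
 */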
7144
7145 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7146 {
7147 struct netdev_queue *queue = dev_ingress_queue(dev);
7148
7149 #ifdef CONFIG_NET_CLS_ACT
7150 if (queue)
7151 return queue;
7152 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7153 if (!queue)
7154 return NULL;
7155 netdev_init_one_queue(dev, queue, NULL);
7156 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7157 queue->qdisc_sleeping = &noop_qdisc;
7158 rcu_assign_pointer(dev->ingress_queue, queue);
7159 #endif
7160 return queue;
7161 }
7162
7163 static const struct ethtool_ops default_ethtool_ops;
7164
7165 void netdev_set_default_ethtool_ops(struct net_device *dev,
7166 const struct ethtool_ops *ops)
7167 {
7168 if (dev->ethtool_ops == &default_ethtool_ops)
7169 dev->ethtool_ops = ops;
7170 }
7171 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7172
7173 void netdev_freemem(struct net_device *dev)
7174 {
7175 char *addr = (char *)dev - dev->padded;
7176
7177 kvfree(addr);
7178 }
7179
7180 /**
7181 * alloc_netdev_mqs - allocate network device
7182 * @sizeof_priv: size of private data to allocate space for
7183 * @name: device name format string
7184 * @name_assign_type: origin of device name
7185 * @setup: callback to initialize device
7186 * @txqs: the number of TX subqueues to allocate
7187 * @rxqs: the number of RX subqueues to allocate
7188 *
7189 * Allocates a struct net_device with private data area for driver use
7190 * and performs basic initialization. Also allocates subqueue structs
7191 * for each queue on the device.
7192 */
7193 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7194 unsigned char name_assign_type,
7195 void (*setup)(struct net_device *),
7196 unsigned int txqs, unsigned int rxqs)
7197 {
7198 struct net_device *dev;
7199 size_t alloc_size;
7200 struct net_device *p;
7201
7202 BUG_ON(strlen(name) >= sizeof(dev->name));
7203
7204 if (txqs < 1) {
7205 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7206 return NULL;
7207 }
7208
7209 #ifdef CONFIG_SYSFS
7210 if (rxqs < 1) {
7211 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7212 return NULL;
7213 }
7214 #endif
7215
7216 alloc_size = sizeof(struct net_device);
7217 if (sizeof_priv) {
7218 /* ensure 32-byte alignment of private area */
7219 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7220 alloc_size += sizeof_priv;
7221 }
7222 /* ensure 32-byte alignment of whole construct */
7223 alloc_size += NETDEV_ALIGN - 1;
7224
7225 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7226 if (!p)
7227 p = vzalloc(alloc_size);
7228 if (!p)
7229 return NULL;
7230
7231 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7232 dev->padded = (char *)dev - (char *)p;
7233
7234 dev->pcpu_refcnt = alloc_percpu(int);
7235 if (!dev->pcpu_refcnt)
7236 goto free_dev;
7237
7238 if (dev_addr_init(dev))
7239 goto free_pcpu;
7240
7241 dev_mc_init(dev);
7242 dev_uc_init(dev);
7243
7244 dev_net_set(dev, &init_net);
7245
7246 dev->gso_max_size = GSO_MAX_SIZE;
7247 dev->gso_max_segs = GSO_MAX_SEGS;
7248 dev->gso_min_segs = 0;
7249
7250 INIT_LIST_HEAD(&dev->napi_list);
7251 INIT_LIST_HEAD(&dev->unreg_list);
7252 INIT_LIST_HEAD(&dev->close_list);
7253 INIT_LIST_HEAD(&dev->link_watch_list);
7254 INIT_LIST_HEAD(&dev->adj_list.upper);
7255 INIT_LIST_HEAD(&dev->adj_list.lower);
7256 INIT_LIST_HEAD(&dev->all_adj_list.upper);
7257 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7258 INIT_LIST_HEAD(&dev->ptype_all);
7259 INIT_LIST_HEAD(&dev->ptype_specific);
7260 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7261 setup(dev);
7262
7263 if (!dev->tx_queue_len) {
7264 dev->priv_flags |= IFF_NO_QUEUE;
7265 dev->tx_queue_len = 1;
7266 }
7267
7268 dev->num_tx_queues = txqs;
7269 dev->real_num_tx_queues = txqs;
7270 if (netif_alloc_netdev_queues(dev))
7271 goto free_all;
7272
7273 #ifdef CONFIG_SYSFS
7274 dev->num_rx_queues = rxqs;
7275 dev->real_num_rx_queues = rxqs;
7276 if (netif_alloc_rx_queues(dev))
7277 goto free_all;
7278 #endif
7279
7280 strcpy(dev->name, name);
7281 dev->name_assign_type = name_assign_type;
7282 dev->group = INIT_NETDEV_GROUP;
7283 if (!dev->ethtool_ops)
7284 dev->ethtool_ops = &default_ethtool_ops;
7285
7286 nf_hook_ingress_init(dev);
7287
7288 return dev;
7289
7290 free_all:
7291 free_netdev(dev);
7292 return NULL;
7293
7294 free_pcpu:
7295 free_percpu(dev->pcpu_refcnt);
7296 free_dev:
7297 netdev_freemem(dev);
7298 return NULL;
7299 }
7300 EXPORT_SYMBOL(alloc_netdev_mqs);
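/* Illustrative example (editor's sketch, not part of the original code):
 * most drivers reach this function through one of its wrappers; struct
 * my_priv and my_setup() are hypothetical. A multiqueue Ethernet driver
 * might use:
 *
 *	dev = alloc_etherdev_mq(sizeof(struct my_priv), 8);
 *
 * while a non-Ethernet device would use:
 *
 *	dev = alloc_netdev(sizeof(struct my_priv), "myif%d",
 *			   NET_NAME_UNKNOWN, my_setup);
 */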
7301
7302 /**
7303 * free_netdev - free network device
7304 * @dev: device
7305 *
7306 * This function does the last stage of destroying an allocated device
7307 * interface. The reference to the device object is released.
7308 * If this is the last reference then it will be freed.
7309 */
7310 void free_netdev(struct net_device *dev)
7311 {
7312 struct napi_struct *p, *n;
7313
7314 netif_free_tx_queues(dev);
7315 #ifdef CONFIG_SYSFS
7316 kvfree(dev->_rx);
7317 #endif
7318
7319 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7320
7321 /* Flush device addresses */
7322 dev_addr_flush(dev);
7323
7324 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7325 netif_napi_del(p);
7326
7327 free_percpu(dev->pcpu_refcnt);
7328 dev->pcpu_refcnt = NULL;
7329
7330 /* Compatibility with error handling in drivers */
7331 if (dev->reg_state == NETREG_UNINITIALIZED) {
7332 netdev_freemem(dev);
7333 return;
7334 }
7335
7336 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7337 dev->reg_state = NETREG_RELEASED;
7338
7339 /* will free via device release */
7340 put_device(&dev->dev);
7341 }
7342 EXPORT_SYMBOL(free_netdev);

/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
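
/*
 * Example (illustrative): a caller that has just unpublished a pointer read
 * in the receive path can use synchronize_net() to wait for in-flight
 * receives before freeing the state those receives might still reference.
 * The hook pointer and its private state below are hypothetical.
 *
 *	RCU_INIT_POINTER(example_hook, NULL);	// RX path reads this under RCU
 *	synchronize_net();			// wait for packets in flight
 *	kfree(example_hook_state);		// now safe to free
 */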

/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If @head is not NULL, the device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 *
 * Note: As most callers use a stack-allocated list_head,
 * we force a list_del() to make sure the stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
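
/*
 * Example (illustrative): queueing several devices and unregistering them in
 * one batch amortises the grace periods taken in rollback_registered_many().
 * The driver object, its list and linking member are hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *	struct example_port *port;		// hypothetical driver object
 *
 *	rtnl_lock();
 *	list_for_each_entry(port, &example_ports, node)
 *		unregister_netdevice_queue(port->netdev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */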

/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 *
 * This is just a wrapper for unregister_netdevice that takes
 * the rtnl semaphore. In general you want to use this and not
 * unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
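
/*
 * Example (illustrative, hypothetical driver remove path): the usual pairing
 * is unregister_netdev() followed by free_netdev() once the unregister
 * machinery has dropped the last reference.
 *
 *	static void example_remove(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		unregister_netdev(dev);		// takes and drops RTNL itself
 *		free_netdev(dev);
 *	}
 */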

/**
 * dev_change_net_namespace - move device to a different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: if not NULL, name pattern to try if the current device name
 *       is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 */

int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace-local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice() and
	 * unregister_netdevice().
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy
	 * this device. They should clean all the things.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their slaves up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 * Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		dev->ifindex = dev_new_index(net);

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
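
/*
 * Example (illustrative): moving a device into another namespace under RTNL,
 * falling back to an "eth%d"-style pattern if the current name is already
 * taken in the destination. The net pointer is assumed to come from
 * get_net_ns_by_fd() or a similar lookup.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("could not move %s: %d\n", dev->name, err);
 */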

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by the CPU owning the percpu
	 * backlog. We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}


/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all. Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
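
/*
 * Example (illustrative): an aggregating driver (bonding/team-like) folds
 * each slave's feature set into the master with netdev_increment_features().
 * The private struct, slave list and mask choice below are hypothetical.
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *	netdev_features_t mask = master->features;	// assumed starting mask
 *	struct example_slave *slave;
 *
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	// 'features' would then be applied through the driver's
 *	// ndo_fix_features()/netdev_update_features() path
 */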

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	__netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
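
/*
 * Example (illustrative): drivers use the per-level wrappers generated above
 * instead of raw printk(), so that messages are prefixed with the driver,
 * bus and interface name. The speed and queue variables are hypothetical.
 *
 *	netdev_info(dev, "link up, %u Mb/s full duplex\n", speed);
 *	netdev_err(dev, "TX ring %d stalled, resetting\n", queue);
 */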

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		if (__dev_get_by_name(&init_net, fb_name))
			snprintf(fb_name, IFNAMSIZ, "dev%%d");
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single-threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device that
	 * appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);