1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <linux/bpf.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <net/busy_poll.h>
101 #include <linux/rtnetlink.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/dst_metadata.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/iw_handler.h>
115 #include <asm/current.h>
116 #include <linux/audit.h>
117 #include <linux/dmaengine.h>
118 #include <linux/err.h>
119 #include <linux/ctype.h>
120 #include <linux/if_arp.h>
121 #include <linux/if_vlan.h>
122 #include <linux/ip.h>
123 #include <net/ip.h>
124 #include <net/mpls.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/static_key.h>
136 #include <linux/hashtable.h>
137 #include <linux/vmalloc.h>
138 #include <linux/if_macvlan.h>
139 #include <linux/errqueue.h>
140 #include <linux/hrtimer.h>
141 #include <linux/netfilter_ingress.h>
142 #include <linux/sctp.h>
143 #include <linux/crash_dump.h>
144 
145 #include "net-sysfs.h"
146 
147 /* Instead of increasing this, you should create a hash table. */
148 #define MAX_GRO_SKBS 8
149 
150 /* This should be increased if a protocol with a bigger head is added. */
151 #define GRO_MAX_HEAD (MAX_HEADER + 128)
152 
153 static DEFINE_SPINLOCK(ptype_lock);
154 static DEFINE_SPINLOCK(offload_lock);
155 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
156 struct list_head ptype_all __read_mostly;	/* Taps */
157 static struct list_head offload_base __read_mostly;
158 
159 static int netif_rx_internal(struct sk_buff *skb);
160 static int call_netdevice_notifiers_info(unsigned long val,
161 					 struct net_device *dev,
162 					 struct netdev_notifier_info *info);
163 
164 /*
165  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
166  * semaphore.
167  *
168  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
169  *
170  * Writers must hold the rtnl semaphore while they loop through the
171  * dev_base_head list, and hold dev_base_lock for writing when they do the
172  * actual updates.  This allows pure readers to access the list even
173  * while a writer is preparing to update it.
174  *
175  * To put it another way, dev_base_lock is held for writing only to
176  * protect against pure readers; the rtnl semaphore provides the
177  * protection against other writers.
178  *
179  * See, for example usages, register_netdevice() and
180  * unregister_netdevice(), which must be called with the rtnl
181  * semaphore held.
182  */
183 DEFINE_RWLOCK(dev_base_lock);
184 EXPORT_SYMBOL(dev_base_lock);
185 
186 /* protects napi_hash addition/deletion and napi_gen_id */
187 static DEFINE_SPINLOCK(napi_hash_lock);
188 
189 static unsigned int napi_gen_id = NR_CPUS;
190 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
191 
192 static seqcount_t devnet_rename_seq;
193 
194 static inline void dev_base_seq_inc(struct net *net)
195 {
196 	while (++net->dev_base_seq == 0);
197 }
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
202 
203 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
204 }
205 
206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
207 {
208 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
209 }
210 
211 static inline void rps_lock(struct softnet_data *sd)
212 {
213 #ifdef CONFIG_RPS
214 	spin_lock(&sd->input_pkt_queue.lock);
215 #endif
216 }
217 
218 static inline void rps_unlock(struct softnet_data *sd)
219 {
220 #ifdef CONFIG_RPS
221 	spin_unlock(&sd->input_pkt_queue.lock);
222 #endif
223 }
224 
225 /* Device list insertion */
226 static void list_netdevice(struct net_device *dev)
227 {
228 	struct net *net = dev_net(dev);
229 
230 	ASSERT_RTNL();
231 
232 	write_lock_bh(&dev_base_lock);
233 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
234 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
235 	hlist_add_head_rcu(&dev->index_hlist,
236 			   dev_index_hash(net, dev->ifindex));
237 	write_unlock_bh(&dev_base_lock);
238 
239 	dev_base_seq_inc(net);
240 }
241 
242 /* Device list removal
243  * caller must respect an RCU grace period before freeing/reusing dev
244  */
245 static void unlist_netdevice(struct net_device *dev)
246 {
247 	ASSERT_RTNL();
248 
249 	/* Unlink dev from the device chain */
250 	write_lock_bh(&dev_base_lock);
251 	list_del_rcu(&dev->dev_list);
252 	hlist_del_rcu(&dev->name_hlist);
253 	hlist_del_rcu(&dev->index_hlist);
254 	write_unlock_bh(&dev_base_lock);
255 
256 	dev_base_seq_inc(dev_net(dev));
257 }
258 
259 /*
260  *	Our notifier list
261  */
262 
263 static RAW_NOTIFIER_HEAD(netdev_chain);
264 
265 /*
266  *	Device drivers call our routines to queue packets here. We empty the
267  *	queue in the local softnet handler.
268  */
269 
270 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
271 EXPORT_PER_CPU_SYMBOL(softnet_data);
272 
273 #ifdef CONFIG_LOCKDEP
274 /*
275  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
276  * according to dev->type
277  */
278 static const unsigned short netdev_lock_type[] =
279 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
280 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
281 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
282 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
283 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
284 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
285 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
286 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
287 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
288 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
289 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
290 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
291 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
292 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
293 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
309 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
310 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
311 
312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 
315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
316 {
317 	int i;
318 
319 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
320 		if (netdev_lock_type[i] == dev_type)
321 			return i;
322 	/* the last key is used by default */
323 	return ARRAY_SIZE(netdev_lock_type) - 1;
324 }
325 
326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
327 						 unsigned short dev_type)
328 {
329 	int i;
330 
331 	i = netdev_lock_pos(dev_type);
332 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
333 				   netdev_lock_name[i]);
334 }
335 
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 	int i;
339 
340 	i = netdev_lock_pos(dev->type);
341 	lockdep_set_class_and_name(&dev->addr_list_lock,
342 				   &netdev_addr_lock_key[i],
343 				   netdev_lock_name[i]);
344 }
345 #else
346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
347 						 unsigned short dev_type)
348 {
349 }
350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351 {
352 }
353 #endif
354 
355 /*******************************************************************************
356 
357 		Protocol management and registration routines
358 
359 *******************************************************************************/
360 
361 /*
362  *	Add a protocol ID to the list. Now that the input handler is
363  *	smarter we can dispense with all the messy stuff that used to be
364  *	here.
365  *
366  *	BEWARE!!! Protocol handlers, mangling input packets,
367  *	MUST BE last in hash buckets and checking protocol handlers
368  *	MUST start from promiscuous ptype_all chain in net_bh.
369  *	It is true now, do not change it.
370  *	Explanation follows: if protocol handler, mangling packet, will
371  *	be the first on list, it is not able to sense, that packet
372  *	is cloned and should be copied-on-write, so that it will
373  *	change it and subsequent readers will get broken packet.
374  *							--ANK (980803)
375  */
376 
377 static inline struct list_head *ptype_head(const struct packet_type *pt)
378 {
379 	if (pt->type == htons(ETH_P_ALL))
380 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
381 	else
382 		return pt->dev ? &pt->dev->ptype_specific :
383 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep, therefore it cannot
395  *	guarantee that all CPUs in the middle of receiving packets
396  *	will see the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
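/*
 * Illustrative sketch of a caller (the names my_pkt_rcv and my_ptype are
 * hypothetical, not taken from this file): a module can register a tap for
 * every protocol with dev_add_pack() and remove it with dev_remove_pack(),
 * typically from its init and exit paths.  The handler owns the skb it is
 * handed and must free it when done.
 *
 *	static int my_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= my_pkt_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	dev_remove_pack(&my_ptype);
 */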
408 
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *      The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPUs have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	pr_warn("dev_remove_pack: %p not found\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
461 
462 
463 /**
464  *	dev_add_offload - register offload handlers
465  *	@po: protocol offload declaration
466  *
467  *	Add protocol offload handlers to the networking stack. The passed
468  *	&proto_offload is linked into kernel lists and may not be freed until
469  *	it has been removed from the kernel lists.
470  *
471  *	This call does not sleep, therefore it cannot
472  *	guarantee that all CPUs in the middle of receiving packets
473  *	will see the new offload handlers (until the next received packet).
474  */
475 void dev_add_offload(struct packet_offload *po)
476 {
477 	struct packet_offload *elem;
478 
479 	spin_lock(&offload_lock);
480 	list_for_each_entry(elem, &offload_base, list) {
481 		if (po->priority < elem->priority)
482 			break;
483 	}
484 	list_add_rcu(&po->list, elem->list.prev);
485 	spin_unlock(&offload_lock);
486 }
487 EXPORT_SYMBOL(dev_add_offload);
488 
489 /**
490  *	__dev_remove_offload	 - remove offload handler
491  *	@po: packet offload declaration
492  *
493  *	Remove a protocol offload handler that was previously added to the
494  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
495  *	is removed from the kernel lists and can be freed or reused once this
496  *	function returns.
497  *
498  *      The packet type might still be in use by receivers
499  *	and must not be freed until after all the CPUs have gone
500  *	through a quiescent state.
501  */
502 static void __dev_remove_offload(struct packet_offload *po)
503 {
504 	struct list_head *head = &offload_base;
505 	struct packet_offload *po1;
506 
507 	spin_lock(&offload_lock);
508 
509 	list_for_each_entry(po1, head, list) {
510 		if (po == po1) {
511 			list_del_rcu(&po->list);
512 			goto out;
513 		}
514 	}
515 
516 	pr_warn("dev_remove_offload: %p not found\n", po);
517 out:
518 	spin_unlock(&offload_lock);
519 }
520 
521 /**
522  *	dev_remove_offload	 - remove packet offload handler
523  *	@po: packet offload declaration
524  *
525  *	Remove a packet offload handler that was previously added to the kernel
526  *	offload handlers by dev_add_offload(). The passed &offload_type is
527  *	removed from the kernel lists and can be freed or reused once this
528  *	function returns.
529  *
530  *	This call sleeps to guarantee that no CPU is looking at the packet
531  *	type after return.
532  */
533 void dev_remove_offload(struct packet_offload *po)
534 {
535 	__dev_remove_offload(po);
536 
537 	synchronize_net();
538 }
539 EXPORT_SYMBOL(dev_remove_offload);
540 
541 /******************************************************************************
542 
543 		      Device Boot-time Settings Routines
544 
545 *******************************************************************************/
546 
547 /* Boot time configuration table */
548 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
549 
550 /**
551  *	netdev_boot_setup_add	- add new setup entry
552  *	@name: name of the device
553  *	@map: configured settings for the device
554  *
555  *	Adds a new setup entry to the dev_boot_setup list.  The function
556  *	returns 0 on error and 1 on success.  This is a generic routine for
557  *	all netdevices.
558  */
559 static int netdev_boot_setup_add(char *name, struct ifmap *map)
560 {
561 	struct netdev_boot_setup *s;
562 	int i;
563 
564 	s = dev_boot_setup;
565 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
566 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
567 			memset(s[i].name, 0, sizeof(s[i].name));
568 			strlcpy(s[i].name, name, IFNAMSIZ);
569 			memcpy(&s[i].map, map, sizeof(s[i].map));
570 			break;
571 		}
572 	}
573 
574 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
575 }
576 
577 /**
578  *	netdev_boot_setup_check	- check boot time settings
579  *	@dev: the netdevice
580  *
581  * 	Check boot time settings for the device.
582  *	The found settings are set for the device to be used
583  *	later in the device probing.
584  *	Returns 0 if no settings found, 1 if they are.
585  */
586 int netdev_boot_setup_check(struct net_device *dev)
587 {
588 	struct netdev_boot_setup *s = dev_boot_setup;
589 	int i;
590 
591 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
592 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
593 		    !strcmp(dev->name, s[i].name)) {
594 			dev->irq 	= s[i].map.irq;
595 			dev->base_addr 	= s[i].map.base_addr;
596 			dev->mem_start 	= s[i].map.mem_start;
597 			dev->mem_end 	= s[i].map.mem_end;
598 			return 1;
599 		}
600 	}
601 	return 0;
602 }
603 EXPORT_SYMBOL(netdev_boot_setup_check);
604 
605 
606 /**
607  *	netdev_boot_base	- get address from boot time settings
608  *	@prefix: prefix for network device
609  *	@unit: id for network device
610  *
611  * 	Check boot time settings for the base address of device.
612  *	The found settings are set for the device to be used
613  *	later in the device probing.
614  *	Returns 0 if no settings found.
615  */
616 unsigned long netdev_boot_base(const char *prefix, int unit)
617 {
618 	const struct netdev_boot_setup *s = dev_boot_setup;
619 	char name[IFNAMSIZ];
620 	int i;
621 
622 	sprintf(name, "%s%d", prefix, unit);
623 
624 	/*
625 	 * If device already registered then return base of 1
626 	 * to indicate not to probe for this interface
627 	 */
628 	if (__dev_get_by_name(&init_net, name))
629 		return 1;
630 
631 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
632 		if (!strcmp(name, s[i].name))
633 			return s[i].map.base_addr;
634 	return 0;
635 }
636 
637 /*
638  * Saves at boot time configured settings for any netdevice.
639  */
640 int __init netdev_boot_setup(char *str)
641 {
642 	int ints[5];
643 	struct ifmap map;
644 
645 	str = get_options(str, ARRAY_SIZE(ints), ints);
646 	if (!str || !*str)
647 		return 0;
648 
649 	/* Save settings */
650 	memset(&map, 0, sizeof(map));
651 	if (ints[0] > 0)
652 		map.irq = ints[1];
653 	if (ints[0] > 1)
654 		map.base_addr = ints[2];
655 	if (ints[0] > 2)
656 		map.mem_start = ints[3];
657 	if (ints[0] > 3)
658 		map.mem_end = ints[4];
659 
660 	/* Add new entry to the list */
661 	return netdev_boot_setup_add(str, &map);
662 }
663 
664 __setup("netdev=", netdev_boot_setup);
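/*
 * Illustrative note on the parameter parsed above (format inferred from
 * netdev_boot_setup() itself, not from separate documentation in this file):
 * up to four integers followed by the interface name may be given on the
 * kernel command line, e.g.
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * which records irq=9 and base_addr=0x300 for "eth0" so that
 * netdev_boot_setup_check() can apply them when the driver probes the device.
 */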
665 
666 /*******************************************************************************
667 
668 			    Device Interface Subroutines
669 
670 *******************************************************************************/
671 
672 /**
673  *	dev_get_iflink	- get 'iflink' value of a interface
674  *	@dev: targeted interface
675  *
676  *	Indicates the ifindex the interface is linked to.
677  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
678  */
679 
680 int dev_get_iflink(const struct net_device *dev)
681 {
682 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
683 		return dev->netdev_ops->ndo_get_iflink(dev);
684 
685 	return dev->ifindex;
686 }
687 EXPORT_SYMBOL(dev_get_iflink);
688 
689 /**
690  *	dev_fill_metadata_dst - Retrieve tunnel egress information.
691  *	@dev: targeted interface
692  *	@skb: The packet.
693  *
694  *	For better visibility of tunnel traffic, OVS needs to retrieve
695  *	egress tunnel information for a packet. The following API allows
696  *	the user to get this info.
697  */
698 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
699 {
700 	struct ip_tunnel_info *info;
701 
702 	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
703 		return -EINVAL;
704 
705 	info = skb_tunnel_info_unclone(skb);
706 	if (!info)
707 		return -ENOMEM;
708 	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
709 		return -EINVAL;
710 
711 	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
712 }
713 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
714 
715 /**
716  *	__dev_get_by_name	- find a device by its name
717  *	@net: the applicable net namespace
718  *	@name: name to find
719  *
720  *	Find an interface by name. Must be called under RTNL semaphore
721  *	or @dev_base_lock. If the name is found a pointer to the device
722  *	is returned. If the name is not found then %NULL is returned. The
723  *	reference counters are not incremented so the caller must be
724  *	careful with locks.
725  */
726 
727 struct net_device *__dev_get_by_name(struct net *net, const char *name)
728 {
729 	struct net_device *dev;
730 	struct hlist_head *head = dev_name_hash(net, name);
731 
732 	hlist_for_each_entry(dev, head, name_hlist)
733 		if (!strncmp(dev->name, name, IFNAMSIZ))
734 			return dev;
735 
736 	return NULL;
737 }
738 EXPORT_SYMBOL(__dev_get_by_name);
739 
740 /**
741  *	dev_get_by_name_rcu	- find a device by its name
742  *	@net: the applicable net namespace
743  *	@name: name to find
744  *
745  *	Find an interface by name.
746  *	If the name is found a pointer to the device is returned.
747  * 	If the name is not found then %NULL is returned.
748  *	The reference counters are not incremented so the caller must be
749  *	careful with locks. The caller must hold RCU lock.
750  */
751 
752 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
753 {
754 	struct net_device *dev;
755 	struct hlist_head *head = dev_name_hash(net, name);
756 
757 	hlist_for_each_entry_rcu(dev, head, name_hlist)
758 		if (!strncmp(dev->name, name, IFNAMSIZ))
759 			return dev;
760 
761 	return NULL;
762 }
763 EXPORT_SYMBOL(dev_get_by_name_rcu);
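/*
 * Illustrative sketch (hypothetical caller, not taken from this file): the
 * _rcu lookup must run inside an RCU read-side critical section, and the
 * returned pointer may only be dereferenced there unless a reference is
 * taken with dev_hold().
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(&init_net, "eth0");
 *	if (dev)
 *		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
 *	rcu_read_unlock();
 */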
764 
765 /**
766  *	dev_get_by_name		- find a device by its name
767  *	@net: the applicable net namespace
768  *	@name: name to find
769  *
770  *	Find an interface by name. This can be called from any
771  *	context and does its own locking. The returned handle has
772  *	the usage count incremented and the caller must use dev_put() to
773  *	release it when it is no longer needed. %NULL is returned if no
774  *	matching device is found.
775  */
776 
777 struct net_device *dev_get_by_name(struct net *net, const char *name)
778 {
779 	struct net_device *dev;
780 
781 	rcu_read_lock();
782 	dev = dev_get_by_name_rcu(net, name);
783 	if (dev)
784 		dev_hold(dev);
785 	rcu_read_unlock();
786 	return dev;
787 }
788 EXPORT_SYMBOL(dev_get_by_name);
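/*
 * Illustrative sketch (hypothetical caller, not taken from this file): the
 * refcounted variant does its own locking, so it can be called without RCU
 * or RTNL held; the reference it takes must be released with dev_put().
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		...use dev...
 *		dev_put(dev);
 *	}
 */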
789 
790 /**
791  *	__dev_get_by_index - find a device by its ifindex
792  *	@net: the applicable net namespace
793  *	@ifindex: index of device
794  *
795  *	Search for an interface by index. Returns %NULL if the device
796  *	is not found or a pointer to the device. The device has not
797  *	had its reference counter increased so the caller must be careful
798  *	about locking. The caller must hold either the RTNL semaphore
799  *	or @dev_base_lock.
800  */
801 
802 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
803 {
804 	struct net_device *dev;
805 	struct hlist_head *head = dev_index_hash(net, ifindex);
806 
807 	hlist_for_each_entry(dev, head, index_hlist)
808 		if (dev->ifindex == ifindex)
809 			return dev;
810 
811 	return NULL;
812 }
813 EXPORT_SYMBOL(__dev_get_by_index);
814 
815 /**
816  *	dev_get_by_index_rcu - find a device by its ifindex
817  *	@net: the applicable net namespace
818  *	@ifindex: index of device
819  *
820  *	Search for an interface by index. Returns %NULL if the device
821  *	is not found or a pointer to the device. The device has not
822  *	had its reference counter increased so the caller must be careful
823  *	about locking. The caller must hold RCU lock.
824  */
825 
826 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
827 {
828 	struct net_device *dev;
829 	struct hlist_head *head = dev_index_hash(net, ifindex);
830 
831 	hlist_for_each_entry_rcu(dev, head, index_hlist)
832 		if (dev->ifindex == ifindex)
833 			return dev;
834 
835 	return NULL;
836 }
837 EXPORT_SYMBOL(dev_get_by_index_rcu);
838 
839 
840 /**
841  *	dev_get_by_index - find a device by its ifindex
842  *	@net: the applicable net namespace
843  *	@ifindex: index of device
844  *
845  *	Search for an interface by index. Returns NULL if the device
846  *	is not found or a pointer to the device. The device returned has
847  *	had a reference added and the pointer is safe until the user calls
848  *	dev_put to indicate they have finished with it.
849  */
850 
851 struct net_device *dev_get_by_index(struct net *net, int ifindex)
852 {
853 	struct net_device *dev;
854 
855 	rcu_read_lock();
856 	dev = dev_get_by_index_rcu(net, ifindex);
857 	if (dev)
858 		dev_hold(dev);
859 	rcu_read_unlock();
860 	return dev;
861 }
862 EXPORT_SYMBOL(dev_get_by_index);
863 
864 /**
865  *	netdev_get_name - get a netdevice name, knowing its ifindex.
866  *	@net: network namespace
867  *	@name: a pointer to the buffer where the name will be stored.
868  *	@ifindex: the ifindex of the interface to get the name from.
869  *
870  *	The use of raw_seqcount_begin() and cond_resched() before
871  *	retrying is required as we want to give the writers a chance
872  *	to complete when CONFIG_PREEMPT is not set.
873  */
874 int netdev_get_name(struct net *net, char *name, int ifindex)
875 {
876 	struct net_device *dev;
877 	unsigned int seq;
878 
879 retry:
880 	seq = raw_seqcount_begin(&devnet_rename_seq);
881 	rcu_read_lock();
882 	dev = dev_get_by_index_rcu(net, ifindex);
883 	if (!dev) {
884 		rcu_read_unlock();
885 		return -ENODEV;
886 	}
887 
888 	strcpy(name, dev->name);
889 	rcu_read_unlock();
890 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
891 		cond_resched();
892 		goto retry;
893 	}
894 
895 	return 0;
896 }
897 
898 /**
899  *	dev_getbyhwaddr_rcu - find a device by its hardware address
900  *	@net: the applicable net namespace
901  *	@type: media type of device
902  *	@ha: hardware address
903  *
904  *	Search for an interface by MAC address. Returns NULL if the device
905  *	is not found or a pointer to the device.
906  *	The caller must hold RCU or RTNL.
907  *	The returned device has not had its ref count increased
908  *	and the caller must therefore be careful about locking
909  *
910  */
911 
912 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
913 				       const char *ha)
914 {
915 	struct net_device *dev;
916 
917 	for_each_netdev_rcu(net, dev)
918 		if (dev->type == type &&
919 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
920 			return dev;
921 
922 	return NULL;
923 }
924 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
925 
926 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
927 {
928 	struct net_device *dev;
929 
930 	ASSERT_RTNL();
931 	for_each_netdev(net, dev)
932 		if (dev->type == type)
933 			return dev;
934 
935 	return NULL;
936 }
937 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
938 
939 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
940 {
941 	struct net_device *dev, *ret = NULL;
942 
943 	rcu_read_lock();
944 	for_each_netdev_rcu(net, dev)
945 		if (dev->type == type) {
946 			dev_hold(dev);
947 			ret = dev;
948 			break;
949 		}
950 	rcu_read_unlock();
951 	return ret;
952 }
953 EXPORT_SYMBOL(dev_getfirstbyhwtype);
954 
955 /**
956  *	__dev_get_by_flags - find any device with given flags
957  *	@net: the applicable net namespace
958  *	@if_flags: IFF_* values
959  *	@mask: bitmask of bits in if_flags to check
960  *
961  *	Search for any interface with the given flags. Returns NULL if a device
962  *	is not found or a pointer to the device. Must be called inside
963  *	rtnl_lock(), and result refcount is unchanged.
964  */
965 
966 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
967 				      unsigned short mask)
968 {
969 	struct net_device *dev, *ret;
970 
971 	ASSERT_RTNL();
972 
973 	ret = NULL;
974 	for_each_netdev(net, dev) {
975 		if (((dev->flags ^ if_flags) & mask) == 0) {
976 			ret = dev;
977 			break;
978 		}
979 	}
980 	return ret;
981 }
982 EXPORT_SYMBOL(__dev_get_by_flags);
983 
984 /**
985  *	dev_valid_name - check if name is okay for network device
986  *	@name: name string
987  *
988  *	Network device names need to be valid file names to
989  *	Network device names need to be valid file names
990  *	whitespace.
991  */
992 bool dev_valid_name(const char *name)
993 {
994 	if (*name == '\0')
995 		return false;
996 	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
997 		return false;
998 	if (!strcmp(name, ".") || !strcmp(name, ".."))
999 		return false;
1000 
1001 	while (*name) {
1002 		if (*name == '/' || *name == ':' || isspace(*name))
1003 			return false;
1004 		name++;
1005 	}
1006 	return true;
1007 }
1008 EXPORT_SYMBOL(dev_valid_name);
1009 
1010 /**
1011  *	__dev_alloc_name - allocate a name for a device
1012  *	@net: network namespace to allocate the device name in
1013  *	@name: name format string
1014  *	@buf:  scratch buffer and result name string
1015  *
1016  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1017  *	id. It scans the list of devices to build up a free map, then chooses
1018  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1019  *	while allocating the name and adding the device in order to avoid
1020  *	duplicates.
1021  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022  *	Returns the number of the unit assigned or a negative errno code.
1023  */
1024 
1025 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1026 {
1027 	int i = 0;
1028 	const char *p;
1029 	const int max_netdevices = 8*PAGE_SIZE;
1030 	unsigned long *inuse;
1031 	struct net_device *d;
1032 
1033 	p = strnchr(name, IFNAMSIZ-1, '%');
1034 	if (p) {
1035 		/*
1036 		 * Verify the string as this thing may have come from
1037 		 * the user.  There must be either one "%d" and no other "%"
1038 		 * characters.
1039 		 */
1040 		if (p[1] != 'd' || strchr(p + 2, '%'))
1041 			return -EINVAL;
1042 
1043 		/* Use one page as a bit array of possible slots */
1044 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1045 		if (!inuse)
1046 			return -ENOMEM;
1047 
1048 		for_each_netdev(net, d) {
1049 			if (!sscanf(d->name, name, &i))
1050 				continue;
1051 			if (i < 0 || i >= max_netdevices)
1052 				continue;
1053 
1054 			/*  avoid cases where sscanf is not exact inverse of printf */
1055 			snprintf(buf, IFNAMSIZ, name, i);
1056 			if (!strncmp(buf, d->name, IFNAMSIZ))
1057 				set_bit(i, inuse);
1058 		}
1059 
1060 		i = find_first_zero_bit(inuse, max_netdevices);
1061 		free_page((unsigned long) inuse);
1062 	}
1063 
1064 	if (buf != name)
1065 		snprintf(buf, IFNAMSIZ, name, i);
1066 	if (!__dev_get_by_name(net, buf))
1067 		return i;
1068 
1069 	/* It is possible to run out of possible slots
1070 	 * when the name is long and there isn't enough space left
1071 	 * for the digits, or if all bits are used.
1072 	 */
1073 	return -ENFILE;
1074 }
1075 
1076 /**
1077  *	dev_alloc_name - allocate a name for a device
1078  *	@dev: device
1079  *	@name: name format string
1080  *
1081  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1082  *	id. It scans the list of devices to build up a free map, then chooses
1083  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1084  *	while allocating the name and adding the device in order to avoid
1085  *	duplicates.
1086  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1087  *	Returns the number of the unit assigned or a negative errno code.
1088  */
1089 
1090 int dev_alloc_name(struct net_device *dev, const char *name)
1091 {
1092 	char buf[IFNAMSIZ];
1093 	struct net *net;
1094 	int ret;
1095 
1096 	BUG_ON(!dev_net(dev));
1097 	net = dev_net(dev);
1098 	ret = __dev_alloc_name(net, name, buf);
1099 	if (ret >= 0)
1100 		strlcpy(dev->name, buf, IFNAMSIZ);
1101 	return ret;
1102 }
1103 EXPORT_SYMBOL(dev_alloc_name);
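/*
 * Illustrative sketch (hypothetical caller, not taken from this file): a
 * driver that does not care which unit number it gets can pass a "%d"
 * template, e.g. yielding "dummy0", while holding the rtnl lock as required
 * above; on success dev->name holds the chosen name and the unit number is
 * returned.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto out;
 */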
1104 
1105 static int dev_alloc_name_ns(struct net *net,
1106 			     struct net_device *dev,
1107 			     const char *name)
1108 {
1109 	char buf[IFNAMSIZ];
1110 	int ret;
1111 
1112 	ret = __dev_alloc_name(net, name, buf);
1113 	if (ret >= 0)
1114 		strlcpy(dev->name, buf, IFNAMSIZ);
1115 	return ret;
1116 }
1117 
1118 int dev_get_valid_name(struct net *net, struct net_device *dev,
1119 		       const char *name)
1120 {
1121 	BUG_ON(!net);
1122 
1123 	if (!dev_valid_name(name))
1124 		return -EINVAL;
1125 
1126 	if (strchr(name, '%'))
1127 		return dev_alloc_name_ns(net, dev, name);
1128 	else if (__dev_get_by_name(net, name))
1129 		return -EEXIST;
1130 	else if (dev->name != name)
1131 		strlcpy(dev->name, name, IFNAMSIZ);
1132 
1133 	return 0;
1134 }
1135 EXPORT_SYMBOL(dev_get_valid_name);
1136 
1137 /**
1138  *	dev_change_name - change name of a device
1139  *	@dev: device
1140  *	@newname: name (or format string) must be at least IFNAMSIZ
1141  *
1142  *	Change name of a device, can pass format strings "eth%d"
1143  *	for wildcarding.
1144  */
1145 int dev_change_name(struct net_device *dev, const char *newname)
1146 {
1147 	unsigned char old_assign_type;
1148 	char oldname[IFNAMSIZ];
1149 	int err = 0;
1150 	int ret;
1151 	struct net *net;
1152 
1153 	ASSERT_RTNL();
1154 	BUG_ON(!dev_net(dev));
1155 
1156 	net = dev_net(dev);
1157 	if (dev->flags & IFF_UP)
1158 		return -EBUSY;
1159 
1160 	write_seqcount_begin(&devnet_rename_seq);
1161 
1162 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1163 		write_seqcount_end(&devnet_rename_seq);
1164 		return 0;
1165 	}
1166 
1167 	memcpy(oldname, dev->name, IFNAMSIZ);
1168 
1169 	err = dev_get_valid_name(net, dev, newname);
1170 	if (err < 0) {
1171 		write_seqcount_end(&devnet_rename_seq);
1172 		return err;
1173 	}
1174 
1175 	if (oldname[0] && !strchr(oldname, '%'))
1176 		netdev_info(dev, "renamed from %s\n", oldname);
1177 
1178 	old_assign_type = dev->name_assign_type;
1179 	dev->name_assign_type = NET_NAME_RENAMED;
1180 
1181 rollback:
1182 	ret = device_rename(&dev->dev, dev->name);
1183 	if (ret) {
1184 		memcpy(dev->name, oldname, IFNAMSIZ);
1185 		dev->name_assign_type = old_assign_type;
1186 		write_seqcount_end(&devnet_rename_seq);
1187 		return ret;
1188 	}
1189 
1190 	write_seqcount_end(&devnet_rename_seq);
1191 
1192 	netdev_adjacent_rename_links(dev, oldname);
1193 
1194 	write_lock_bh(&dev_base_lock);
1195 	hlist_del_rcu(&dev->name_hlist);
1196 	write_unlock_bh(&dev_base_lock);
1197 
1198 	synchronize_rcu();
1199 
1200 	write_lock_bh(&dev_base_lock);
1201 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1202 	write_unlock_bh(&dev_base_lock);
1203 
1204 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1205 	ret = notifier_to_errno(ret);
1206 
1207 	if (ret) {
1208 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1209 		if (err >= 0) {
1210 			err = ret;
1211 			write_seqcount_begin(&devnet_rename_seq);
1212 			memcpy(dev->name, oldname, IFNAMSIZ);
1213 			memcpy(oldname, newname, IFNAMSIZ);
1214 			dev->name_assign_type = old_assign_type;
1215 			old_assign_type = NET_NAME_RENAMED;
1216 			goto rollback;
1217 		} else {
1218 			pr_err("%s: name change rollback failed: %d\n",
1219 			       dev->name, ret);
1220 		}
1221 	}
1222 
1223 	return err;
1224 }
1225 
1226 /**
1227  *	dev_set_alias - change ifalias of a device
1228  *	@dev: device
1229  *	@alias: name up to IFALIASZ
1230  *	@len: limit of bytes to copy from info
1231  *
1232  *	Set ifalias for a device.
1233  */
1234 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1235 {
1236 	char *new_ifalias;
1237 
1238 	ASSERT_RTNL();
1239 
1240 	if (len >= IFALIASZ)
1241 		return -EINVAL;
1242 
1243 	if (!len) {
1244 		kfree(dev->ifalias);
1245 		dev->ifalias = NULL;
1246 		return 0;
1247 	}
1248 
1249 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1250 	if (!new_ifalias)
1251 		return -ENOMEM;
1252 	dev->ifalias = new_ifalias;
1253 	memcpy(dev->ifalias, alias, len);
1254 	dev->ifalias[len] = 0;
1255 
1256 	return len;
1257 }
1258 
1259 
1260 /**
1261  *	netdev_features_change - device changes features
1262  *	@dev: device to cause notification
1263  *
1264  *	Called to indicate a device has changed features.
1265  */
1266 void netdev_features_change(struct net_device *dev)
1267 {
1268 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1269 }
1270 EXPORT_SYMBOL(netdev_features_change);
1271 
1272 /**
1273  *	netdev_state_change - device changes state
1274  *	@dev: device to cause notification
1275  *
1276  *	Called to indicate a device has changed state. This function calls
1277  *	the notifier chains for netdev_chain and sends a NEWLINK message
1278  *	to the routing socket.
1279  */
1280 void netdev_state_change(struct net_device *dev)
1281 {
1282 	if (dev->flags & IFF_UP) {
1283 		struct netdev_notifier_change_info change_info;
1284 
1285 		change_info.flags_changed = 0;
1286 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1287 					      &change_info.info);
1288 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1289 	}
1290 }
1291 EXPORT_SYMBOL(netdev_state_change);
1292 
1293 /**
1294  * 	netdev_notify_peers - notify network peers about existence of @dev
1295  * 	@dev: network device
1296  *
1297  * Generate traffic such that interested network peers are aware of
1298  * @dev, such as by generating a gratuitous ARP. This may be used when
1299  * a device wants to inform the rest of the network about some sort of
1300  * reconfiguration such as a failover event or virtual machine
1301  * migration.
1302  */
1303 void netdev_notify_peers(struct net_device *dev)
1304 {
1305 	rtnl_lock();
1306 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1307 	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1308 	rtnl_unlock();
1309 }
1310 EXPORT_SYMBOL(netdev_notify_peers);
1311 
1312 static int __dev_open(struct net_device *dev)
1313 {
1314 	const struct net_device_ops *ops = dev->netdev_ops;
1315 	int ret;
1316 
1317 	ASSERT_RTNL();
1318 
1319 	if (!netif_device_present(dev))
1320 		return -ENODEV;
1321 
1322 	/* Block netpoll from trying to do any rx path servicing.
1323 	 * If we don't do this there is a chance ndo_poll_controller
1324 	 * or ndo_poll may be running while we open the device
1325 	 */
1326 	netpoll_poll_disable(dev);
1327 
1328 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1329 	ret = notifier_to_errno(ret);
1330 	if (ret)
1331 		return ret;
1332 
1333 	set_bit(__LINK_STATE_START, &dev->state);
1334 
1335 	if (ops->ndo_validate_addr)
1336 		ret = ops->ndo_validate_addr(dev);
1337 
1338 	if (!ret && ops->ndo_open)
1339 		ret = ops->ndo_open(dev);
1340 
1341 	netpoll_poll_enable(dev);
1342 
1343 	if (ret)
1344 		clear_bit(__LINK_STATE_START, &dev->state);
1345 	else {
1346 		dev->flags |= IFF_UP;
1347 		dev_set_rx_mode(dev);
1348 		dev_activate(dev);
1349 		add_device_randomness(dev->dev_addr, dev->addr_len);
1350 	}
1351 
1352 	return ret;
1353 }
1354 
1355 /**
1356  *	dev_open	- prepare an interface for use.
1357  *	@dev:	device to open
1358  *
1359  *	Takes a device from down to up state. The device's private open
1360  *	function is invoked and then the multicast lists are loaded. Finally
1361  *	the device is moved into the up state and a %NETDEV_UP message is
1362  *	sent to the netdev notifier chain.
1363  *
1364  *	Calling this function on an active interface is a nop. On a failure
1365  *	a negative errno code is returned.
1366  */
1367 int dev_open(struct net_device *dev)
1368 {
1369 	int ret;
1370 
1371 	if (dev->flags & IFF_UP)
1372 		return 0;
1373 
1374 	ret = __dev_open(dev);
1375 	if (ret < 0)
1376 		return ret;
1377 
1378 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1379 	call_netdevice_notifiers(NETDEV_UP, dev);
1380 
1381 	return ret;
1382 }
1383 EXPORT_SYMBOL(dev_open);
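/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * bringing an interface up and down from kernel code.  Both dev_open() and
 * dev_close() expect the caller to hold the rtnl lock, which serializes the
 * flag changes and notifier calls above.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	if (!err)
 *		dev_close(dev);
 *	rtnl_unlock();
 */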
1384 
1385 static int __dev_close_many(struct list_head *head)
1386 {
1387 	struct net_device *dev;
1388 
1389 	ASSERT_RTNL();
1390 	might_sleep();
1391 
1392 	list_for_each_entry(dev, head, close_list) {
1393 		/* Temporarily disable netpoll until the interface is down */
1394 		netpoll_poll_disable(dev);
1395 
1396 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1397 
1398 		clear_bit(__LINK_STATE_START, &dev->state);
1399 
1400 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1401 		 * can even be on a different cpu. So just clear netif_running().
1402 		 *
1403 		 * dev->stop() will invoke napi_disable() on all of its
1404 		 * napi_struct instances on this device.
1405 		 */
1406 		smp_mb__after_atomic(); /* Commit netif_running(). */
1407 	}
1408 
1409 	dev_deactivate_many(head);
1410 
1411 	list_for_each_entry(dev, head, close_list) {
1412 		const struct net_device_ops *ops = dev->netdev_ops;
1413 
1414 		/*
1415 		 *	Call the device specific close. This cannot fail.
1416 		 *	Only if device is UP
1417 		 *
1418 		 *	We allow it to be called even after a DETACH hot-plug
1419 		 *	event.
1420 		 */
1421 		if (ops->ndo_stop)
1422 			ops->ndo_stop(dev);
1423 
1424 		dev->flags &= ~IFF_UP;
1425 		netpoll_poll_enable(dev);
1426 	}
1427 
1428 	return 0;
1429 }
1430 
1431 static int __dev_close(struct net_device *dev)
1432 {
1433 	int retval;
1434 	LIST_HEAD(single);
1435 
1436 	list_add(&dev->close_list, &single);
1437 	retval = __dev_close_many(&single);
1438 	list_del(&single);
1439 
1440 	return retval;
1441 }
1442 
1443 int dev_close_many(struct list_head *head, bool unlink)
1444 {
1445 	struct net_device *dev, *tmp;
1446 
1447 	/* Remove the devices that don't need to be closed */
1448 	list_for_each_entry_safe(dev, tmp, head, close_list)
1449 		if (!(dev->flags & IFF_UP))
1450 			list_del_init(&dev->close_list);
1451 
1452 	__dev_close_many(head);
1453 
1454 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1455 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1456 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1457 		if (unlink)
1458 			list_del_init(&dev->close_list);
1459 	}
1460 
1461 	return 0;
1462 }
1463 EXPORT_SYMBOL(dev_close_many);
1464 
1465 /**
1466  *	dev_close - shutdown an interface.
1467  *	@dev: device to shutdown
1468  *
1469  *	This function moves an active device into down state. A
1470  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1471  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1472  *	chain.
1473  */
1474 int dev_close(struct net_device *dev)
1475 {
1476 	if (dev->flags & IFF_UP) {
1477 		LIST_HEAD(single);
1478 
1479 		list_add(&dev->close_list, &single);
1480 		dev_close_many(&single, true);
1481 		list_del(&single);
1482 	}
1483 	return 0;
1484 }
1485 EXPORT_SYMBOL(dev_close);
1486 
1487 
1488 /**
1489  *	dev_disable_lro - disable Large Receive Offload on a device
1490  *	@dev: device
1491  *
1492  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1493  *	called under RTNL.  This is needed if received packets may be
1494  *	forwarded to another interface.
1495  */
1496 void dev_disable_lro(struct net_device *dev)
1497 {
1498 	struct net_device *lower_dev;
1499 	struct list_head *iter;
1500 
1501 	dev->wanted_features &= ~NETIF_F_LRO;
1502 	netdev_update_features(dev);
1503 
1504 	if (unlikely(dev->features & NETIF_F_LRO))
1505 		netdev_WARN(dev, "failed to disable LRO!\n");
1506 
1507 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1508 		dev_disable_lro(lower_dev);
1509 }
1510 EXPORT_SYMBOL(dev_disable_lro);
1511 
1512 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1513 				   struct net_device *dev)
1514 {
1515 	struct netdev_notifier_info info;
1516 
1517 	netdev_notifier_info_init(&info, dev);
1518 	return nb->notifier_call(nb, val, &info);
1519 }
1520 
1521 static int dev_boot_phase = 1;
1522 
1523 /**
1524  *	register_netdevice_notifier - register a network notifier block
1525  *	@nb: notifier
1526  *
1527  *	Register a notifier to be called when network device events occur.
1528  *	The notifier passed is linked into the kernel structures and must
1529  *	not be reused until it has been unregistered. A negative errno code
1530  *	is returned on a failure.
1531  *
1532  * 	When registered, all registration and up events are replayed
1533  *	to the new notifier to allow the notifier to have a race-free
1534  *	view of the network device list.
1535  */
1536 
1537 int register_netdevice_notifier(struct notifier_block *nb)
1538 {
1539 	struct net_device *dev;
1540 	struct net_device *last;
1541 	struct net *net;
1542 	int err;
1543 
1544 	rtnl_lock();
1545 	err = raw_notifier_chain_register(&netdev_chain, nb);
1546 	if (err)
1547 		goto unlock;
1548 	if (dev_boot_phase)
1549 		goto unlock;
1550 	for_each_net(net) {
1551 		for_each_netdev(net, dev) {
1552 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1553 			err = notifier_to_errno(err);
1554 			if (err)
1555 				goto rollback;
1556 
1557 			if (!(dev->flags & IFF_UP))
1558 				continue;
1559 
1560 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1561 		}
1562 	}
1563 
1564 unlock:
1565 	rtnl_unlock();
1566 	return err;
1567 
1568 rollback:
1569 	last = dev;
1570 	for_each_net(net) {
1571 		for_each_netdev(net, dev) {
1572 			if (dev == last)
1573 				goto outroll;
1574 
1575 			if (dev->flags & IFF_UP) {
1576 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1577 							dev);
1578 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1579 			}
1580 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1581 		}
1582 	}
1583 
1584 outroll:
1585 	raw_notifier_chain_unregister(&netdev_chain, nb);
1586 	goto unlock;
1587 }
1588 EXPORT_SYMBOL(register_netdevice_notifier);
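/*
 * Illustrative sketch (the names my_netdev_event and my_nb are hypothetical,
 * not taken from this file): a minimal netdev notifier.  The device behind
 * an event is recovered from the info pointer with
 * netdev_notifier_info_to_dev().
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *	...
 *	unregister_netdevice_notifier(&my_nb);
 */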
1589 
1590 /**
1591  *	unregister_netdevice_notifier - unregister a network notifier block
1592  *	@nb: notifier
1593  *
1594  *	Unregister a notifier previously registered by
1595  *	register_netdevice_notifier(). The notifier is unlinked from the
1596  *	kernel structures and may then be reused. A negative errno code
1597  *	is returned on a failure.
1598  *
1599  * 	After unregistering unregister and down device events are synthesized
1600  *	for all devices on the device list to the removed notifier to remove
1601  *	the need for special case cleanup code.
1602  */
1603 
1604 int unregister_netdevice_notifier(struct notifier_block *nb)
1605 {
1606 	struct net_device *dev;
1607 	struct net *net;
1608 	int err;
1609 
1610 	rtnl_lock();
1611 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1612 	if (err)
1613 		goto unlock;
1614 
1615 	for_each_net(net) {
1616 		for_each_netdev(net, dev) {
1617 			if (dev->flags & IFF_UP) {
1618 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1619 							dev);
1620 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1621 			}
1622 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1623 		}
1624 	}
1625 unlock:
1626 	rtnl_unlock();
1627 	return err;
1628 }
1629 EXPORT_SYMBOL(unregister_netdevice_notifier);
1630 
1631 /**
1632  *	call_netdevice_notifiers_info - call all network notifier blocks
1633  *	@val: value passed unmodified to notifier function
1634  *	@dev: net_device pointer passed unmodified to notifier function
1635  *	@info: notifier information data
1636  *
1637  *	Call all network notifier blocks.  Parameters and return value
1638  *	are as for raw_notifier_call_chain().
1639  */
1640 
1641 static int call_netdevice_notifiers_info(unsigned long val,
1642 					 struct net_device *dev,
1643 					 struct netdev_notifier_info *info)
1644 {
1645 	ASSERT_RTNL();
1646 	netdev_notifier_info_init(info, dev);
1647 	return raw_notifier_call_chain(&netdev_chain, val, info);
1648 }
1649 
1650 /**
1651  *	call_netdevice_notifiers - call all network notifier blocks
1652  *      @val: value passed unmodified to notifier function
1653  *      @dev: net_device pointer passed unmodified to notifier function
1654  *
1655  *	Call all network notifier blocks.  Parameters and return value
1656  *	are as for raw_notifier_call_chain().
1657  */
1658 
1659 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1660 {
1661 	struct netdev_notifier_info info;
1662 
1663 	return call_netdevice_notifiers_info(val, dev, &info);
1664 }
1665 EXPORT_SYMBOL(call_netdevice_notifiers);
1666 
1667 #ifdef CONFIG_NET_INGRESS
1668 static struct static_key ingress_needed __read_mostly;
1669 
1670 void net_inc_ingress_queue(void)
1671 {
1672 	static_key_slow_inc(&ingress_needed);
1673 }
1674 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1675 
1676 void net_dec_ingress_queue(void)
1677 {
1678 	static_key_slow_dec(&ingress_needed);
1679 }
1680 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1681 #endif
1682 
1683 #ifdef CONFIG_NET_EGRESS
1684 static struct static_key egress_needed __read_mostly;
1685 
1686 void net_inc_egress_queue(void)
1687 {
1688 	static_key_slow_inc(&egress_needed);
1689 }
1690 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1691 
1692 void net_dec_egress_queue(void)
1693 {
1694 	static_key_slow_dec(&egress_needed);
1695 }
1696 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1697 #endif
1698 
1699 static struct static_key netstamp_needed __read_mostly;
1700 #ifdef HAVE_JUMP_LABEL
1701 static atomic_t netstamp_needed_deferred;
1702 static atomic_t netstamp_wanted;
1703 static void netstamp_clear(struct work_struct *work)
1704 {
1705 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1706 	int wanted;
1707 
1708 	wanted = atomic_add_return(deferred, &netstamp_wanted);
1709 	if (wanted > 0)
1710 		static_key_enable(&netstamp_needed);
1711 	else
1712 		static_key_disable(&netstamp_needed);
1713 }
1714 static DECLARE_WORK(netstamp_work, netstamp_clear);
1715 #endif
1716 
1717 void net_enable_timestamp(void)
1718 {
1719 #ifdef HAVE_JUMP_LABEL
1720 	int wanted;
1721 
1722 	while (1) {
1723 		wanted = atomic_read(&netstamp_wanted);
1724 		if (wanted <= 0)
1725 			break;
1726 		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1727 			return;
1728 	}
1729 	atomic_inc(&netstamp_needed_deferred);
1730 	schedule_work(&netstamp_work);
1731 #else
1732 	static_key_slow_inc(&netstamp_needed);
1733 #endif
1734 }
1735 EXPORT_SYMBOL(net_enable_timestamp);
1736 
1737 void net_disable_timestamp(void)
1738 {
1739 #ifdef HAVE_JUMP_LABEL
1740 	int wanted;
1741 
1742 	while (1) {
1743 		wanted = atomic_read(&netstamp_wanted);
1744 		if (wanted <= 1)
1745 			break;
1746 		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1747 			return;
1748 	}
1749 	atomic_dec(&netstamp_needed_deferred);
1750 	schedule_work(&netstamp_work);
1751 #else
1752 	static_key_slow_dec(&netstamp_needed);
1753 #endif
1754 }
1755 EXPORT_SYMBOL(net_disable_timestamp);
1756 
1757 static inline void net_timestamp_set(struct sk_buff *skb)
1758 {
1759 	skb->tstamp.tv64 = 0;
1760 	if (static_key_false(&netstamp_needed))
1761 		__net_timestamp(skb);
1762 }
1763 
1764 #define net_timestamp_check(COND, SKB)			\
1765 	if (static_key_false(&netstamp_needed)) {		\
1766 		if ((COND) && !(SKB)->tstamp.tv64)	\
1767 			__net_timestamp(SKB);		\
1768 	}						\
1769 
1770 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1771 {
1772 	unsigned int len;
1773 
1774 	if (!(dev->flags & IFF_UP))
1775 		return false;
1776 
1777 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1778 	if (skb->len <= len)
1779 		return true;
1780 
1781 	/* if TSO is enabled, we don't care about the length as the packet
1782 	 * could be forwarded without being segmented before
1783 	 */
1784 	if (skb_is_gso(skb))
1785 		return true;
1786 
1787 	return false;
1788 }
1789 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1790 
1791 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1792 {
1793 	int ret = ____dev_forward_skb(dev, skb);
1794 
1795 	if (likely(!ret)) {
1796 		skb->protocol = eth_type_trans(skb, dev);
1797 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1798 	}
1799 
1800 	return ret;
1801 }
1802 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1803 
1804 /**
1805  * dev_forward_skb - loopback an skb to another netif
1806  *
1807  * @dev: destination network device
1808  * @skb: buffer to forward
1809  *
1810  * return values:
1811  *	NET_RX_SUCCESS	(no congestion)
1812  *	NET_RX_DROP     (packet was dropped, but freed)
1813  *
1814  * dev_forward_skb can be used for injecting an skb from the
1815  * start_xmit function of one device into the receive queue
1816  * of another device.
1817  *
1818  * The receiving device may be in another namespace, so
1819  * we have to clear all information in the skb that could
1820  * impact namespace isolation.
1821  */
1822 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1823 {
1824 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1825 }
1826 EXPORT_SYMBOL_GPL(dev_forward_skb);
1827 
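/*
 * Illustrative sketch (not part of this file, not built): how a virtual
 * driver might use dev_forward_skb() from its ndo_start_xmit to inject
 * a frame into a peer device, in the spirit of veth.  struct foo_priv
 * and its ->peer pointer are hypothetical.  On NET_RX_DROP the skb has
 * already been freed, so no extra kfree_skb() is needed.
 */
#if 0
struct foo_priv {
	struct net_device __rcu *peer;
};

static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer)) {
		kfree_skb(skb);
		dev->stats.tx_dropped++;
	} else if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
		dev->stats.tx_packets++;
	} else {
		dev->stats.tx_dropped++;
	}
	rcu_read_unlock();
	return NETDEV_TX_OK;
}
#endif
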
1828 static inline int deliver_skb(struct sk_buff *skb,
1829 			      struct packet_type *pt_prev,
1830 			      struct net_device *orig_dev)
1831 {
1832 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1833 		return -ENOMEM;
1834 	atomic_inc(&skb->users);
1835 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1836 }
1837 
1838 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1839 					  struct packet_type **pt,
1840 					  struct net_device *orig_dev,
1841 					  __be16 type,
1842 					  struct list_head *ptype_list)
1843 {
1844 	struct packet_type *ptype, *pt_prev = *pt;
1845 
1846 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1847 		if (ptype->type != type)
1848 			continue;
1849 		if (pt_prev)
1850 			deliver_skb(skb, pt_prev, orig_dev);
1851 		pt_prev = ptype;
1852 	}
1853 	*pt = pt_prev;
1854 }
1855 
1856 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1857 {
1858 	if (!ptype->af_packet_priv || !skb->sk)
1859 		return false;
1860 
1861 	if (ptype->id_match)
1862 		return ptype->id_match(ptype, skb->sk);
1863 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1864 		return true;
1865 
1866 	return false;
1867 }
1868 
1869 /*
1870  *	Support routine. Sends outgoing frames to any network
1871  *	taps currently in use.
1872  */
1873 
1874 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1875 {
1876 	struct packet_type *ptype;
1877 	struct sk_buff *skb2 = NULL;
1878 	struct packet_type *pt_prev = NULL;
1879 	struct list_head *ptype_list = &ptype_all;
1880 
1881 	rcu_read_lock();
1882 again:
1883 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1884 		/* Never send packets back to the socket
1885 		 * they originated from - MvS (miquels@drinkel.ow.org)
1886 		 */
1887 		if (skb_loop_sk(ptype, skb))
1888 			continue;
1889 
1890 		if (pt_prev) {
1891 			deliver_skb(skb2, pt_prev, skb->dev);
1892 			pt_prev = ptype;
1893 			continue;
1894 		}
1895 
1896 		/* need to clone skb, done only once */
1897 		skb2 = skb_clone(skb, GFP_ATOMIC);
1898 		if (!skb2)
1899 			goto out_unlock;
1900 
1901 		net_timestamp_set(skb2);
1902 
1903 		/* skb->nh should be correctly
1904 		 * set by sender, so that the second statement is
1905 		 * just protection against buggy protocols.
1906 		 */
1907 		skb_reset_mac_header(skb2);
1908 
1909 		if (skb_network_header(skb2) < skb2->data ||
1910 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1911 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1912 					     ntohs(skb2->protocol),
1913 					     dev->name);
1914 			skb_reset_network_header(skb2);
1915 		}
1916 
1917 		skb2->transport_header = skb2->network_header;
1918 		skb2->pkt_type = PACKET_OUTGOING;
1919 		pt_prev = ptype;
1920 	}
1921 
1922 	if (ptype_list == &ptype_all) {
1923 		ptype_list = &dev->ptype_all;
1924 		goto again;
1925 	}
1926 out_unlock:
1927 	if (pt_prev)
1928 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1929 	rcu_read_unlock();
1930 }
1931 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1932 
1933 /**
1934  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1935  * @dev: Network device
1936  * @txq: number of queues available
1937  *
1938  * If real_num_tx_queues is changed the tc mappings may no longer be
1939  * valid. To resolve this verify the tc mapping remains valid and if
1940  * not, zero the mapping. With no priorities mapping to this
1941  * offset/count pair it will no longer be used. In the worst case, if
1942  * TC0 is invalid nothing can be done, so disable priority mappings
1943  * altogether. It is expected that drivers will fix this mapping if
1944  * they can before calling netif_set_real_num_tx_queues.
1945  */
1946 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1947 {
1948 	int i;
1949 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1950 
1951 	/* If TC0 is invalidated disable TC mapping */
1952 	if (tc->offset + tc->count > txq) {
1953 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1954 		dev->num_tc = 0;
1955 		return;
1956 	}
1957 
1958 	/* Invalidated prio to tc mappings set to TC0 */
1959 	for (i = 1; i < TC_BITMASK + 1; i++) {
1960 		int q = netdev_get_prio_tc_map(dev, i);
1961 
1962 		tc = &dev->tc_to_txq[q];
1963 		if (tc->offset + tc->count > txq) {
1964 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1965 				i, q);
1966 			netdev_set_prio_tc_map(dev, i, 0);
1967 		}
1968 	}
1969 }
1970 
1971 #ifdef CONFIG_XPS
1972 static DEFINE_MUTEX(xps_map_mutex);
1973 #define xmap_dereference(P)		\
1974 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1975 
1976 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1977 					int cpu, u16 index)
1978 {
1979 	struct xps_map *map = NULL;
1980 	int pos;
1981 
1982 	if (dev_maps)
1983 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1984 
1985 	for (pos = 0; map && pos < map->len; pos++) {
1986 		if (map->queues[pos] == index) {
1987 			if (map->len > 1) {
1988 				map->queues[pos] = map->queues[--map->len];
1989 			} else {
1990 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1991 				kfree_rcu(map, rcu);
1992 				map = NULL;
1993 			}
1994 			break;
1995 		}
1996 	}
1997 
1998 	return map;
1999 }
2000 
2001 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2002 {
2003 	struct xps_dev_maps *dev_maps;
2004 	int cpu, i;
2005 	bool active = false;
2006 
2007 	mutex_lock(&xps_map_mutex);
2008 	dev_maps = xmap_dereference(dev->xps_maps);
2009 
2010 	if (!dev_maps)
2011 		goto out_no_maps;
2012 
2013 	for_each_possible_cpu(cpu) {
2014 		for (i = index; i < dev->num_tx_queues; i++) {
2015 			if (!remove_xps_queue(dev_maps, cpu, i))
2016 				break;
2017 		}
2018 		if (i == dev->num_tx_queues)
2019 			active = true;
2020 	}
2021 
2022 	if (!active) {
2023 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2024 		kfree_rcu(dev_maps, rcu);
2025 	}
2026 
2027 	for (i = index; i < dev->num_tx_queues; i++)
2028 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2029 					     NUMA_NO_NODE);
2030 
2031 out_no_maps:
2032 	mutex_unlock(&xps_map_mutex);
2033 }
2034 
2035 static struct xps_map *expand_xps_map(struct xps_map *map,
2036 				      int cpu, u16 index)
2037 {
2038 	struct xps_map *new_map;
2039 	int alloc_len = XPS_MIN_MAP_ALLOC;
2040 	int i, pos;
2041 
2042 	for (pos = 0; map && pos < map->len; pos++) {
2043 		if (map->queues[pos] != index)
2044 			continue;
2045 		return map;
2046 	}
2047 
2048 	/* Need to add queue to this CPU's existing map */
2049 	if (map) {
2050 		if (pos < map->alloc_len)
2051 			return map;
2052 
2053 		alloc_len = map->alloc_len * 2;
2054 	}
2055 
2056 	/* Need to allocate new map to store queue on this CPU's map */
2057 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2058 			       cpu_to_node(cpu));
2059 	if (!new_map)
2060 		return NULL;
2061 
2062 	for (i = 0; i < pos; i++)
2063 		new_map->queues[i] = map->queues[i];
2064 	new_map->alloc_len = alloc_len;
2065 	new_map->len = pos;
2066 
2067 	return new_map;
2068 }
2069 
2070 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2071 			u16 index)
2072 {
2073 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2074 	struct xps_map *map, *new_map;
2075 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2076 	int cpu, numa_node_id = -2;
2077 	bool active = false;
2078 
2079 	mutex_lock(&xps_map_mutex);
2080 
2081 	dev_maps = xmap_dereference(dev->xps_maps);
2082 
2083 	/* allocate memory for queue storage */
2084 	for_each_online_cpu(cpu) {
2085 		if (!cpumask_test_cpu(cpu, mask))
2086 			continue;
2087 
2088 		if (!new_dev_maps)
2089 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2090 		if (!new_dev_maps) {
2091 			mutex_unlock(&xps_map_mutex);
2092 			return -ENOMEM;
2093 		}
2094 
2095 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2096 				 NULL;
2097 
2098 		map = expand_xps_map(map, cpu, index);
2099 		if (!map)
2100 			goto error;
2101 
2102 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2103 	}
2104 
2105 	if (!new_dev_maps)
2106 		goto out_no_new_maps;
2107 
2108 	for_each_possible_cpu(cpu) {
2109 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2110 			/* add queue to CPU maps */
2111 			int pos = 0;
2112 
2113 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2114 			while ((pos < map->len) && (map->queues[pos] != index))
2115 				pos++;
2116 
2117 			if (pos == map->len)
2118 				map->queues[map->len++] = index;
2119 #ifdef CONFIG_NUMA
2120 			if (numa_node_id == -2)
2121 				numa_node_id = cpu_to_node(cpu);
2122 			else if (numa_node_id != cpu_to_node(cpu))
2123 				numa_node_id = -1;
2124 #endif
2125 		} else if (dev_maps) {
2126 			/* fill in the new device map from the old device map */
2127 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2128 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2129 		}
2130 
2131 	}
2132 
2133 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2134 
2135 	/* Cleanup old maps */
2136 	if (dev_maps) {
2137 		for_each_possible_cpu(cpu) {
2138 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2139 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2140 			if (map && map != new_map)
2141 				kfree_rcu(map, rcu);
2142 		}
2143 
2144 		kfree_rcu(dev_maps, rcu);
2145 	}
2146 
2147 	dev_maps = new_dev_maps;
2148 	active = true;
2149 
2150 out_no_new_maps:
2151 	/* update Tx queue numa node */
2152 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2153 				     (numa_node_id >= 0) ? numa_node_id :
2154 				     NUMA_NO_NODE);
2155 
2156 	if (!dev_maps)
2157 		goto out_no_maps;
2158 
2159 	/* removes queue from unused CPUs */
2160 	for_each_possible_cpu(cpu) {
2161 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2162 			continue;
2163 
2164 		if (remove_xps_queue(dev_maps, cpu, index))
2165 			active = true;
2166 	}
2167 
2168 	/* free map if not active */
2169 	if (!active) {
2170 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2171 		kfree_rcu(dev_maps, rcu);
2172 	}
2173 
2174 out_no_maps:
2175 	mutex_unlock(&xps_map_mutex);
2176 
2177 	return 0;
2178 error:
2179 	/* remove any maps that we added */
2180 	for_each_possible_cpu(cpu) {
2181 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2182 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2183 				 NULL;
2184 		if (new_map && new_map != map)
2185 			kfree(new_map);
2186 	}
2187 
2188 	mutex_unlock(&xps_map_mutex);
2189 
2190 	kfree(new_dev_maps);
2191 	return -ENOMEM;
2192 }
2193 EXPORT_SYMBOL(netif_set_xps_queue);
2194 
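/*
 * Illustrative sketch (not part of this file, not built): a multiqueue
 * driver pinning each of its TX queues to one CPU via
 * netif_set_xps_queue().  It assumes (hypothetically) that the driver
 * created one queue per CPU; the return value is ignored for brevity.
 */
#if 0
static void foo_setup_xps(struct net_device *dev)
{
	int i;

	for (i = 0; i < dev->real_num_tx_queues; i++)
		netif_set_xps_queue(dev, cpumask_of(i), i);
}
#endif
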
2195 #endif
2196 /*
2197  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2198  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2199  */
2200 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2201 {
2202 	bool disabling;
2203 	int rc;
2204 
2205 	disabling = txq < dev->real_num_tx_queues;
2206 
2207 	if (txq < 1 || txq > dev->num_tx_queues)
2208 		return -EINVAL;
2209 
2210 	if (dev->reg_state == NETREG_REGISTERED ||
2211 	    dev->reg_state == NETREG_UNREGISTERING) {
2212 		ASSERT_RTNL();
2213 
2214 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2215 						  txq);
2216 		if (rc)
2217 			return rc;
2218 
2219 		if (dev->num_tc)
2220 			netif_setup_tc(dev, txq);
2221 
2222 		dev->real_num_tx_queues = txq;
2223 
2224 		if (disabling) {
2225 			synchronize_net();
2226 			qdisc_reset_all_tx_gt(dev, txq);
2227 #ifdef CONFIG_XPS
2228 			netif_reset_xps_queues_gt(dev, txq);
2229 #endif
2230 		}
2231 	} else {
2232 		dev->real_num_tx_queues = txq;
2233 	}
2234 
2235 	return 0;
2236 }
2237 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2238 
2239 #ifdef CONFIG_SYSFS
2240 /**
2241  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2242  *	@dev: Network device
2243  *	@rxq: Actual number of RX queues
2244  *
2245  *	This must be called either with the rtnl_lock held or before
2246  *	registration of the net device.  Returns 0 on success, or a
2247  *	negative error code.  If called before registration, it always
2248  *	succeeds.
2249  */
2250 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2251 {
2252 	int rc;
2253 
2254 	if (rxq < 1 || rxq > dev->num_rx_queues)
2255 		return -EINVAL;
2256 
2257 	if (dev->reg_state == NETREG_REGISTERED) {
2258 		ASSERT_RTNL();
2259 
2260 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2261 						  rxq);
2262 		if (rc)
2263 			return rc;
2264 	}
2265 
2266 	dev->real_num_rx_queues = rxq;
2267 	return 0;
2268 }
2269 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2270 #endif
2271 
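/*
 * Illustrative sketch (not part of this file, not built): resizing the
 * active queue set from an ethtool ->set_channels handler, which already
 * runs under rtnl_lock as both helpers require.  foo_reconfigure_rings()
 * is hypothetical.
 */
#if 0
static int foo_set_channels(struct net_device *dev,
			    struct ethtool_channels *ch)
{
	int err;

	err = foo_reconfigure_rings(dev, ch->combined_count);
	if (err)
		return err;

	err = netif_set_real_num_tx_queues(dev, ch->combined_count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, ch->combined_count);
}
#endif
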
2272 /**
2273  * netif_get_num_default_rss_queues - default number of RSS queues
2274  *
2275  * This routine should set an upper limit on the number of RSS queues
2276  * used by default by multiqueue devices.
2277  */
2278 int netif_get_num_default_rss_queues(void)
2279 {
2280 	return is_kdump_kernel() ?
2281 		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2282 }
2283 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2284 
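/*
 * Illustrative sketch (not part of this file, not built): sizing a
 * multiqueue netdev at probe time with the helper above.  FOO_MAX_QUEUES
 * and struct foo_priv are hypothetical.
 */
#if 0
static struct net_device *foo_alloc_netdev(void)
{
	int nq = min_t(int, FOO_MAX_QUEUES,
		       netif_get_num_default_rss_queues());

	return alloc_etherdev_mqs(sizeof(struct foo_priv), nq, nq);
}
#endif
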
2285 static void __netif_reschedule(struct Qdisc *q)
2286 {
2287 	struct softnet_data *sd;
2288 	unsigned long flags;
2289 
2290 	local_irq_save(flags);
2291 	sd = this_cpu_ptr(&softnet_data);
2292 	q->next_sched = NULL;
2293 	*sd->output_queue_tailp = q;
2294 	sd->output_queue_tailp = &q->next_sched;
2295 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2296 	local_irq_restore(flags);
2297 }
2298 
2299 void __netif_schedule(struct Qdisc *q)
2300 {
2301 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2302 		__netif_reschedule(q);
2303 }
2304 EXPORT_SYMBOL(__netif_schedule);
2305 
2306 struct dev_kfree_skb_cb {
2307 	enum skb_free_reason reason;
2308 };
2309 
2310 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2311 {
2312 	return (struct dev_kfree_skb_cb *)skb->cb;
2313 }
2314 
2315 void netif_schedule_queue(struct netdev_queue *txq)
2316 {
2317 	rcu_read_lock();
2318 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2319 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2320 
2321 		__netif_schedule(q);
2322 	}
2323 	rcu_read_unlock();
2324 }
2325 EXPORT_SYMBOL(netif_schedule_queue);
2326 
2327 /**
2328  *	netif_wake_subqueue - allow sending packets on subqueue
2329  *	@dev: network device
2330  *	@queue_index: sub queue index
2331  *
2332  * Resume individual transmit queue of a device with multiple transmit queues.
2333  */
2334 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2335 {
2336 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2337 
2338 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2339 		struct Qdisc *q;
2340 
2341 		rcu_read_lock();
2342 		q = rcu_dereference(txq->qdisc);
2343 		__netif_schedule(q);
2344 		rcu_read_unlock();
2345 	}
2346 }
2347 EXPORT_SYMBOL(netif_wake_subqueue);
2348 
2349 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2350 {
2351 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2352 		struct Qdisc *q;
2353 
2354 		rcu_read_lock();
2355 		q = rcu_dereference(dev_queue->qdisc);
2356 		__netif_schedule(q);
2357 		rcu_read_unlock();
2358 	}
2359 }
2360 EXPORT_SYMBOL(netif_tx_wake_queue);
2361 
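/*
 * Illustrative sketch (not part of this file, not built): the classic
 * stop/wake pattern from a driver's TX completion path.  struct foo_ring,
 * foo_ring_unused() and FOO_WAKE_THRESHOLD are hypothetical.
 */
#if 0
struct foo_ring {
	struct net_device *netdev;
	u16 queue_index;
	/* ... descriptor bookkeeping ... */
};

static void foo_tx_complete(struct foo_ring *ring)
{
	struct netdev_queue *txq;

	txq = netdev_get_tx_queue(ring->netdev, ring->queue_index);

	/* reclaim completed descriptors first ... */

	if (netif_tx_queue_stopped(txq) &&
	    foo_ring_unused(ring) >= FOO_WAKE_THRESHOLD)
		netif_tx_wake_queue(txq);
}
#endif
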
2362 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2363 {
2364 	unsigned long flags;
2365 
2366 	if (unlikely(!skb))
2367 		return;
2368 
2369 	if (likely(atomic_read(&skb->users) == 1)) {
2370 		smp_rmb();
2371 		atomic_set(&skb->users, 0);
2372 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2373 		return;
2374 	}
2375 	get_kfree_skb_cb(skb)->reason = reason;
2376 	local_irq_save(flags);
2377 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2378 	__this_cpu_write(softnet_data.completion_queue, skb);
2379 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2380 	local_irq_restore(flags);
2381 }
2382 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2383 
2384 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2385 {
2386 	if (in_irq() || irqs_disabled())
2387 		__dev_kfree_skb_irq(skb, reason);
2388 	else
2389 		dev_kfree_skb(skb);
2390 }
2391 EXPORT_SYMBOL(__dev_kfree_skb_any);
2392 
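/*
 * Illustrative sketch (not part of this file, not built): freeing
 * transmitted skbs from a completion handler that may run in hard-IRQ
 * context, using the context-agnostic wrappers built on
 * __dev_kfree_skb_any().  struct foo_ring/foo_tx_desc and
 * foo_ring_next_done() are hypothetical.
 */
#if 0
static void foo_clean_tx_irq(struct foo_ring *ring)
{
	struct foo_tx_desc *desc;

	while ((desc = foo_ring_next_done(ring)) != NULL) {
		/* consumed, not dropped: keeps drop monitoring quiet */
		dev_consume_skb_any(desc->skb);
		desc->skb = NULL;
	}
}
#endif
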
2393 
2394 /**
2395  * netif_device_detach - mark device as removed
2396  * @dev: network device
2397  *
2398  * Mark device as removed from system and therefore no longer available.
2399  */
2400 void netif_device_detach(struct net_device *dev)
2401 {
2402 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2403 	    netif_running(dev)) {
2404 		netif_tx_stop_all_queues(dev);
2405 	}
2406 }
2407 EXPORT_SYMBOL(netif_device_detach);
2408 
2409 /**
2410  * netif_device_attach - mark device as attached
2411  * @dev: network device
2412  *
2413  * Mark device as attached to the system and restart it if needed.
2414  */
2415 void netif_device_attach(struct net_device *dev)
2416 {
2417 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2418 	    netif_running(dev)) {
2419 		netif_tx_wake_all_queues(dev);
2420 		__netdev_watchdog_up(dev);
2421 	}
2422 }
2423 EXPORT_SYMBOL(netif_device_attach);
2424 
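/*
 * Illustrative sketch (not part of this file, not built): pairing
 * netif_device_detach() and netif_device_attach() in a driver's
 * suspend/resume callbacks.  The foo_hw_* helpers are hypothetical.
 */
#if 0
static int foo_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all TX queues if running */
	foo_hw_power_down(netdev_priv(dev));
	return 0;
}

static int foo_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	foo_hw_power_up(netdev_priv(dev));
	netif_device_attach(dev);	/* restarts queues and watchdog */
	return 0;
}
#endif
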
2425 /*
2426  * Returns a Tx hash based on the given packet descriptor and the number
2427  * of Tx queues to be used as a distribution range.
2428  */
2429 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2430 		  unsigned int num_tx_queues)
2431 {
2432 	u32 hash;
2433 	u16 qoffset = 0;
2434 	u16 qcount = num_tx_queues;
2435 
2436 	if (skb_rx_queue_recorded(skb)) {
2437 		hash = skb_get_rx_queue(skb);
2438 		while (unlikely(hash >= num_tx_queues))
2439 			hash -= num_tx_queues;
2440 		return hash;
2441 	}
2442 
2443 	if (dev->num_tc) {
2444 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2445 		qoffset = dev->tc_to_txq[tc].offset;
2446 		qcount = dev->tc_to_txq[tc].count;
2447 	}
2448 
2449 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2450 }
2451 EXPORT_SYMBOL(__skb_tx_hash);
2452 
2453 static void skb_warn_bad_offload(const struct sk_buff *skb)
2454 {
2455 	static const netdev_features_t null_features;
2456 	struct net_device *dev = skb->dev;
2457 	const char *name = "";
2458 
2459 	if (!net_ratelimit())
2460 		return;
2461 
2462 	if (dev) {
2463 		if (dev->dev.parent)
2464 			name = dev_driver_string(dev->dev.parent);
2465 		else
2466 			name = netdev_name(dev);
2467 	}
2468 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2469 	     "gso_type=%d ip_summed=%d\n",
2470 	     name, dev ? &dev->features : &null_features,
2471 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2472 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2473 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2474 }
2475 
2476 /*
2477  * Invalidate hardware checksum when packet is to be mangled, and
2478  * complete checksum manually on outgoing path.
2479  */
2480 int skb_checksum_help(struct sk_buff *skb)
2481 {
2482 	__wsum csum;
2483 	int ret = 0, offset;
2484 
2485 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2486 		goto out_set_summed;
2487 
2488 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2489 		skb_warn_bad_offload(skb);
2490 		return -EINVAL;
2491 	}
2492 
2493 	/* Before computing a checksum, we should make sure no frag could
2494 	 * be modified by an external entity: the checksum could be wrong.
2495 	 */
2496 	if (skb_has_shared_frag(skb)) {
2497 		ret = __skb_linearize(skb);
2498 		if (ret)
2499 			goto out;
2500 	}
2501 
2502 	offset = skb_checksum_start_offset(skb);
2503 	BUG_ON(offset >= skb_headlen(skb));
2504 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2505 
2506 	offset += skb->csum_offset;
2507 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2508 
2509 	if (skb_cloned(skb) &&
2510 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2511 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2512 		if (ret)
2513 			goto out;
2514 	}
2515 
2516 	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2517 out_set_summed:
2518 	skb->ip_summed = CHECKSUM_NONE;
2519 out:
2520 	return ret;
2521 }
2522 EXPORT_SYMBOL(skb_checksum_help);
2523 
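/*
 * Illustrative sketch (not part of this file, not built): a driver with
 * limited checksum offload falling back to skb_checksum_help() in its
 * xmit path.  foo_hw_can_offload_csum() and foo_hw_queue_skb() are
 * hypothetical.
 */
#if 0
static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !foo_hw_can_offload_csum(skb) &&
	    skb_checksum_help(skb))
		goto drop;

	return foo_hw_queue_skb(skb, dev);

drop:
	dev_kfree_skb_any(skb);
	return NETDEV_TX_OK;
}
#endif
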
2524 /* skb_csum_offload_check - Driver helper function to determine if a device
2525  * with limited checksum offload capabilities is able to offload the checksum
2526  * for a given packet.
2527  *
2528  * Arguments:
2529  *   skb - sk_buff for the packet in question
2530  *   spec - contains the description of what device can offload
2531  *   csum_encapped - returns true if the checksum being offloaded is
2532  *	      encapsulated. That is, it is the checksum for the transport
2533  *	      header in the inner headers.
2534  *   checksum_help - when set indicates that helper function should
2535  *	      call skb_checksum_help if offload checks fail
2536  *
2537  * Returns:
2538  *   true: Packet has passed the checksum checks and should be offloadable to
2539  *	   the device (a driver may still need to check for additional
2540  *	   restrictions of its device)
2541  *   false: Checksum is not offloadable. If checksum_help was set then
2542  *	   skb_checksum_help was called to resolve checksum for non-GSO
2543  *	   packets and when IP protocol is not SCTP
2544  */
2545 bool __skb_csum_offload_chk(struct sk_buff *skb,
2546 			    const struct skb_csum_offl_spec *spec,
2547 			    bool *csum_encapped,
2548 			    bool csum_help)
2549 {
2550 	struct iphdr *iph;
2551 	struct ipv6hdr *ipv6;
2552 	void *nhdr;
2553 	int protocol;
2554 	u8 ip_proto;
2555 
2556 	if (skb->protocol == htons(ETH_P_8021Q) ||
2557 	    skb->protocol == htons(ETH_P_8021AD)) {
2558 		if (!spec->vlan_okay)
2559 			goto need_help;
2560 	}
2561 
2562 	/* We check whether the checksum refers to a transport layer checksum in
2563 	 * the outermost header or an encapsulated transport layer checksum that
2564 	 * corresponds to the inner headers of the skb. If the checksum is for
2565 	 * something else in the packet we need help.
2566 	 */
2567 	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2568 		/* Non-encapsulated checksum */
2569 		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2570 		nhdr = skb_network_header(skb);
2571 		*csum_encapped = false;
2572 		if (spec->no_not_encapped)
2573 			goto need_help;
2574 	} else if (skb->encapsulation && spec->encap_okay &&
2575 		   skb_checksum_start_offset(skb) ==
2576 		   skb_inner_transport_offset(skb)) {
2577 		/* Encapsulated checksum */
2578 		*csum_encapped = true;
2579 		switch (skb->inner_protocol_type) {
2580 		case ENCAP_TYPE_ETHER:
2581 			protocol = eproto_to_ipproto(skb->inner_protocol);
2582 			break;
2583 		case ENCAP_TYPE_IPPROTO:
2584 			protocol = skb->inner_protocol;
2585 			break;
2586 		}
2587 		nhdr = skb_inner_network_header(skb);
2588 	} else {
2589 		goto need_help;
2590 	}
2591 
2592 	switch (protocol) {
2593 	case IPPROTO_IP:
2594 		if (!spec->ipv4_okay)
2595 			goto need_help;
2596 		iph = nhdr;
2597 		ip_proto = iph->protocol;
2598 		if (iph->ihl != 5 && !spec->ip_options_okay)
2599 			goto need_help;
2600 		break;
2601 	case IPPROTO_IPV6:
2602 		if (!spec->ipv6_okay)
2603 			goto need_help;
2604 		if (spec->no_encapped_ipv6 && *csum_encapped)
2605 			goto need_help;
2606 		ipv6 = nhdr;
2607 		nhdr += sizeof(*ipv6);
2608 		ip_proto = ipv6->nexthdr;
2609 		break;
2610 	default:
2611 		goto need_help;
2612 	}
2613 
2614 ip_proto_again:
2615 	switch (ip_proto) {
2616 	case IPPROTO_TCP:
2617 		if (!spec->tcp_okay ||
2618 		    skb->csum_offset != offsetof(struct tcphdr, check))
2619 			goto need_help;
2620 		break;
2621 	case IPPROTO_UDP:
2622 		if (!spec->udp_okay ||
2623 		    skb->csum_offset != offsetof(struct udphdr, check))
2624 			goto need_help;
2625 		break;
2626 	case IPPROTO_SCTP:
2627 		if (!spec->sctp_okay ||
2628 		    skb->csum_offset != offsetof(struct sctphdr, checksum))
2629 			goto cant_help;
2630 		break;
2631 	case NEXTHDR_HOP:
2632 	case NEXTHDR_ROUTING:
2633 	case NEXTHDR_DEST: {
2634 		u8 *opthdr = nhdr;
2635 
2636 		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2637 			goto need_help;
2638 
2639 		ip_proto = opthdr[0];
2640 		nhdr += (opthdr[1] + 1) << 3;
2641 
2642 		goto ip_proto_again;
2643 	}
2644 	default:
2645 		goto need_help;
2646 	}
2647 
2648 	/* Passed the tests for offloading checksum */
2649 	return true;
2650 
2651 need_help:
2652 	if (csum_help && !skb_shinfo(skb)->gso_size)
2653 		skb_checksum_help(skb);
2654 cant_help:
2655 	return false;
2656 }
2657 EXPORT_SYMBOL(__skb_csum_offload_chk);
2658 
2659 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2660 {
2661 	__be16 type = skb->protocol;
2662 
2663 	/* Tunnel gso handlers can set protocol to ethernet. */
2664 	if (type == htons(ETH_P_TEB)) {
2665 		struct ethhdr *eth;
2666 
2667 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2668 			return 0;
2669 
2670 		eth = (struct ethhdr *)skb->data;
2671 		type = eth->h_proto;
2672 	}
2673 
2674 	return __vlan_get_protocol(skb, type, depth);
2675 }
2676 
2677 /**
2678  *	skb_mac_gso_segment - mac layer segmentation handler.
2679  *	@skb: buffer to segment
2680  *	@features: features for the output path (see dev->features)
2681  */
2682 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2683 				    netdev_features_t features)
2684 {
2685 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2686 	struct packet_offload *ptype;
2687 	int vlan_depth = skb->mac_len;
2688 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2689 
2690 	if (unlikely(!type))
2691 		return ERR_PTR(-EINVAL);
2692 
2693 	__skb_pull(skb, vlan_depth);
2694 
2695 	rcu_read_lock();
2696 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2697 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2698 			segs = ptype->callbacks.gso_segment(skb, features);
2699 			break;
2700 		}
2701 	}
2702 	rcu_read_unlock();
2703 
2704 	__skb_push(skb, skb->data - skb_mac_header(skb));
2705 
2706 	return segs;
2707 }
2708 EXPORT_SYMBOL(skb_mac_gso_segment);
2709 
2710 
2711 /* openvswitch calls this on rx path, so we need a different check.
2712  */
2713 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2714 {
2715 	if (tx_path)
2716 		return skb->ip_summed != CHECKSUM_PARTIAL &&
2717 		       skb->ip_summed != CHECKSUM_UNNECESSARY;
2718 
2719 	return skb->ip_summed == CHECKSUM_NONE;
2720 }
2721 
2722 /**
2723  *	__skb_gso_segment - Perform segmentation on skb.
2724  *	@skb: buffer to segment
2725  *	@features: features for the output path (see dev->features)
2726  *	@tx_path: whether it is called in TX path
2727  *
2728  *	This function segments the given skb and returns a list of segments.
2729  *
2730  *	It may return NULL if the skb requires no segmentation.  This is
2731  *	only possible when GSO is used for verifying header integrity.
2732  *
2733  *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2734  */
2735 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2736 				  netdev_features_t features, bool tx_path)
2737 {
2738 	struct sk_buff *segs;
2739 
2740 	if (unlikely(skb_needs_check(skb, tx_path))) {
2741 		int err;
2742 
2743 		/* We're going to init ->check field in TCP or UDP header */
2744 		err = skb_cow_head(skb, 0);
2745 		if (err < 0)
2746 			return ERR_PTR(err);
2747 	}
2748 
2749 	/* Only report GSO partial support if it will enable us to
2750 	 * support segmentation on this frame without needing additional
2751 	 * work.
2752 	 */
2753 	if (features & NETIF_F_GSO_PARTIAL) {
2754 		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2755 		struct net_device *dev = skb->dev;
2756 
2757 		partial_features |= dev->features & dev->gso_partial_features;
2758 		if (!skb_gso_ok(skb, features | partial_features))
2759 			features &= ~NETIF_F_GSO_PARTIAL;
2760 	}
2761 
2762 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2763 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2764 
2765 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2766 	SKB_GSO_CB(skb)->encap_level = 0;
2767 
2768 	skb_reset_mac_header(skb);
2769 	skb_reset_mac_len(skb);
2770 
2771 	segs = skb_mac_gso_segment(skb, features);
2772 
2773 	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2774 		skb_warn_bad_offload(skb);
2775 
2776 	return segs;
2777 }
2778 EXPORT_SYMBOL(__skb_gso_segment);
2779 
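/*
 * Illustrative sketch (not part of this file, not built): software
 * segmentation through the skb_gso_segment() wrapper around
 * __skb_gso_segment(), walking the returned list much as
 * validate_xmit_skb() does below.  foo_xmit_one() is hypothetical.
 */
#if 0
static int foo_sw_gso(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, dev->features & ~NETIF_F_GSO_MASK);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)			/* no segmentation was needed */
		return foo_xmit_one(skb, dev);

	consume_skb(skb);
	do {
		next = segs->next;
		segs->next = NULL;
		foo_xmit_one(segs, dev);
		segs = next;
	} while (segs);

	return 0;
}
#endif
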
2780 /* Take action when hardware reception checksum errors are detected. */
2781 #ifdef CONFIG_BUG
2782 void netdev_rx_csum_fault(struct net_device *dev)
2783 {
2784 	if (net_ratelimit()) {
2785 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2786 		dump_stack();
2787 	}
2788 }
2789 EXPORT_SYMBOL(netdev_rx_csum_fault);
2790 #endif
2791 
2792 /* Actually, we should eliminate this check as soon as we know that:
2793  * 1. An IOMMU is present and can map all the memory.
2794  * 2. No high memory really exists on this machine.
2795  */
2796 
2797 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2798 {
2799 #ifdef CONFIG_HIGHMEM
2800 	int i;
2801 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2802 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2803 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2804 			if (PageHighMem(skb_frag_page(frag)))
2805 				return 1;
2806 		}
2807 	}
2808 
2809 	if (PCI_DMA_BUS_IS_PHYS) {
2810 		struct device *pdev = dev->dev.parent;
2811 
2812 		if (!pdev)
2813 			return 0;
2814 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2815 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2816 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2817 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2818 				return 1;
2819 		}
2820 	}
2821 #endif
2822 	return 0;
2823 }
2824 
2825 /* If MPLS offload request, verify we are testing hardware MPLS features
2826  * instead of standard features for the netdev.
2827  */
2828 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2829 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2830 					   netdev_features_t features,
2831 					   __be16 type)
2832 {
2833 	if (eth_p_mpls(type))
2834 		features &= skb->dev->mpls_features;
2835 
2836 	return features;
2837 }
2838 #else
2839 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2840 					   netdev_features_t features,
2841 					   __be16 type)
2842 {
2843 	return features;
2844 }
2845 #endif
2846 
2847 static netdev_features_t harmonize_features(struct sk_buff *skb,
2848 	netdev_features_t features)
2849 {
2850 	int tmp;
2851 	__be16 type;
2852 
2853 	type = skb_network_protocol(skb, &tmp);
2854 	features = net_mpls_features(skb, features, type);
2855 
2856 	if (skb->ip_summed != CHECKSUM_NONE &&
2857 	    !can_checksum_protocol(features, type)) {
2858 		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2859 	}
2860 	if (illegal_highdma(skb->dev, skb))
2861 		features &= ~NETIF_F_SG;
2862 
2863 	return features;
2864 }
2865 
2866 netdev_features_t passthru_features_check(struct sk_buff *skb,
2867 					  struct net_device *dev,
2868 					  netdev_features_t features)
2869 {
2870 	return features;
2871 }
2872 EXPORT_SYMBOL(passthru_features_check);
2873 
2874 static netdev_features_t dflt_features_check(struct sk_buff *skb,
2875 					     struct net_device *dev,
2876 					     netdev_features_t features)
2877 {
2878 	return vlan_features_check(skb, features);
2879 }
2880 
2881 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2882 					    struct net_device *dev,
2883 					    netdev_features_t features)
2884 {
2885 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2886 
2887 	if (gso_segs > dev->gso_max_segs)
2888 		return features & ~NETIF_F_GSO_MASK;
2889 
2890 	/* Support for GSO partial features requires software
2891 	 * intervention before we can actually process the packets
2892 	 * so we need to strip support for any partial features now
2893 	 * and we can pull them back in after we have partially
2894 	 * segmented the frame.
2895 	 */
2896 	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2897 		features &= ~dev->gso_partial_features;
2898 
2899 	/* Make sure to clear the IPv4 ID mangling feature if the
2900 	 * IPv4 header has the potential to be fragmented.
2901 	 */
2902 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2903 		struct iphdr *iph = skb->encapsulation ?
2904 				    inner_ip_hdr(skb) : ip_hdr(skb);
2905 
2906 		if (!(iph->frag_off & htons(IP_DF)))
2907 			features &= ~NETIF_F_TSO_MANGLEID;
2908 	}
2909 
2910 	return features;
2911 }
2912 
2913 netdev_features_t netif_skb_features(struct sk_buff *skb)
2914 {
2915 	struct net_device *dev = skb->dev;
2916 	netdev_features_t features = dev->features;
2917 
2918 	if (skb_is_gso(skb))
2919 		features = gso_features_check(skb, dev, features);
2920 
2921 	/* If encapsulation offload request, verify we are testing
2922 	 * hardware encapsulation features instead of standard
2923 	 * features for the netdev
2924 	 */
2925 	if (skb->encapsulation)
2926 		features &= dev->hw_enc_features;
2927 
2928 	if (skb_vlan_tagged(skb))
2929 		features = netdev_intersect_features(features,
2930 						     dev->vlan_features |
2931 						     NETIF_F_HW_VLAN_CTAG_TX |
2932 						     NETIF_F_HW_VLAN_STAG_TX);
2933 
2934 	if (dev->netdev_ops->ndo_features_check)
2935 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2936 								features);
2937 	else
2938 		features &= dflt_features_check(skb, dev, features);
2939 
2940 	return harmonize_features(skb, features);
2941 }
2942 EXPORT_SYMBOL(netif_skb_features);
2943 
2944 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2945 		    struct netdev_queue *txq, bool more)
2946 {
2947 	unsigned int len;
2948 	int rc;
2949 
2950 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2951 		dev_queue_xmit_nit(skb, dev);
2952 
2953 	len = skb->len;
2954 	trace_net_dev_start_xmit(skb, dev);
2955 	rc = netdev_start_xmit(skb, dev, txq, more);
2956 	trace_net_dev_xmit(skb, rc, dev, len);
2957 
2958 	return rc;
2959 }
2960 
2961 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2962 				    struct netdev_queue *txq, int *ret)
2963 {
2964 	struct sk_buff *skb = first;
2965 	int rc = NETDEV_TX_OK;
2966 
2967 	while (skb) {
2968 		struct sk_buff *next = skb->next;
2969 
2970 		skb->next = NULL;
2971 		rc = xmit_one(skb, dev, txq, next != NULL);
2972 		if (unlikely(!dev_xmit_complete(rc))) {
2973 			skb->next = next;
2974 			goto out;
2975 		}
2976 
2977 		skb = next;
2978 		if (netif_xmit_stopped(txq) && skb) {
2979 			rc = NETDEV_TX_BUSY;
2980 			break;
2981 		}
2982 	}
2983 
2984 out:
2985 	*ret = rc;
2986 	return skb;
2987 }
2988 
2989 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2990 					  netdev_features_t features)
2991 {
2992 	if (skb_vlan_tag_present(skb) &&
2993 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2994 		skb = __vlan_hwaccel_push_inside(skb);
2995 	return skb;
2996 }
2997 
2998 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2999 {
3000 	netdev_features_t features;
3001 
3002 	features = netif_skb_features(skb);
3003 	skb = validate_xmit_vlan(skb, features);
3004 	if (unlikely(!skb))
3005 		goto out_null;
3006 
3007 	if (netif_needs_gso(skb, features)) {
3008 		struct sk_buff *segs;
3009 
3010 		segs = skb_gso_segment(skb, features);
3011 		if (IS_ERR(segs)) {
3012 			goto out_kfree_skb;
3013 		} else if (segs) {
3014 			consume_skb(skb);
3015 			skb = segs;
3016 		}
3017 	} else {
3018 		if (skb_needs_linearize(skb, features) &&
3019 		    __skb_linearize(skb))
3020 			goto out_kfree_skb;
3021 
3022 		/* If packet is not checksummed and device does not
3023 		 * support checksumming for this protocol, complete
3024 		 * checksumming here.
3025 		 */
3026 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
3027 			if (skb->encapsulation)
3028 				skb_set_inner_transport_header(skb,
3029 							       skb_checksum_start_offset(skb));
3030 			else
3031 				skb_set_transport_header(skb,
3032 							 skb_checksum_start_offset(skb));
3033 			if (!(features & NETIF_F_CSUM_MASK) &&
3034 			    skb_checksum_help(skb))
3035 				goto out_kfree_skb;
3036 		}
3037 	}
3038 
3039 	return skb;
3040 
3041 out_kfree_skb:
3042 	kfree_skb(skb);
3043 out_null:
3044 	atomic_long_inc(&dev->tx_dropped);
3045 	return NULL;
3046 }
3047 
3048 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3049 {
3050 	struct sk_buff *next, *head = NULL, *tail;
3051 
3052 	for (; skb != NULL; skb = next) {
3053 		next = skb->next;
3054 		skb->next = NULL;
3055 
3056 		/* in case skb won't be segmented, point to itself */
3057 		skb->prev = skb;
3058 
3059 		skb = validate_xmit_skb(skb, dev);
3060 		if (!skb)
3061 			continue;
3062 
3063 		if (!head)
3064 			head = skb;
3065 		else
3066 			tail->next = skb;
3067 		/* If skb was segmented, skb->prev points to
3068 		 * the last segment. If not, it still contains skb.
3069 		 */
3070 		tail = skb->prev;
3071 	}
3072 	return head;
3073 }
3074 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3075 
3076 static void qdisc_pkt_len_init(struct sk_buff *skb)
3077 {
3078 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3079 
3080 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3081 
3082 	/* To get a more precise estimate of bytes sent on the wire,
3083 	 * we add to pkt_len the header size of all segments
3084 	 */
3085 	if (shinfo->gso_size)  {
3086 		unsigned int hdr_len;
3087 		u16 gso_segs = shinfo->gso_segs;
3088 
3089 		/* mac layer + network layer */
3090 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3091 
3092 		/* + transport layer */
3093 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3094 			const struct tcphdr *th;
3095 			struct tcphdr _tcphdr;
3096 
3097 			th = skb_header_pointer(skb, skb_transport_offset(skb),
3098 						sizeof(_tcphdr), &_tcphdr);
3099 			if (likely(th))
3100 				hdr_len += __tcp_hdrlen(th);
3101 		} else {
3102 			struct udphdr _udphdr;
3103 
3104 			if (skb_header_pointer(skb, skb_transport_offset(skb),
3105 					       sizeof(_udphdr), &_udphdr))
3106 				hdr_len += sizeof(struct udphdr);
3107 		}
3108 
3109 		if (shinfo->gso_type & SKB_GSO_DODGY)
3110 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3111 						shinfo->gso_size);
3112 
3113 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3114 	}
3115 }
3116 
3117 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3118 				 struct net_device *dev,
3119 				 struct netdev_queue *txq)
3120 {
3121 	spinlock_t *root_lock = qdisc_lock(q);
3122 	struct sk_buff *to_free = NULL;
3123 	bool contended;
3124 	int rc;
3125 
3126 	qdisc_calculate_pkt_len(skb, q);
3127 	/*
3128 	 * Heuristic to force contended enqueues to serialize on a
3129 	 * separate lock before trying to get the qdisc main lock.
3130 	 * This permits the qdisc->running owner to get the lock more
3131 	 * often and dequeue packets faster.
3132 	 */
3133 	contended = qdisc_is_running(q);
3134 	if (unlikely(contended))
3135 		spin_lock(&q->busylock);
3136 
3137 	spin_lock(root_lock);
3138 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3139 		__qdisc_drop(skb, &to_free);
3140 		rc = NET_XMIT_DROP;
3141 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3142 		   qdisc_run_begin(q)) {
3143 		/*
3144 		 * This is a work-conserving queue; there are no old skbs
3145 		 * waiting to be sent out; and the qdisc is not running -
3146 		 * xmit the skb directly.
3147 		 */
3148 
3149 		qdisc_bstats_update(q, skb);
3150 
3151 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3152 			if (unlikely(contended)) {
3153 				spin_unlock(&q->busylock);
3154 				contended = false;
3155 			}
3156 			__qdisc_run(q);
3157 		} else
3158 			qdisc_run_end(q);
3159 
3160 		rc = NET_XMIT_SUCCESS;
3161 	} else {
3162 		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3163 		if (qdisc_run_begin(q)) {
3164 			if (unlikely(contended)) {
3165 				spin_unlock(&q->busylock);
3166 				contended = false;
3167 			}
3168 			__qdisc_run(q);
3169 		}
3170 	}
3171 	spin_unlock(root_lock);
3172 	if (unlikely(to_free))
3173 		kfree_skb_list(to_free);
3174 	if (unlikely(contended))
3175 		spin_unlock(&q->busylock);
3176 	return rc;
3177 }
3178 
3179 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3180 static void skb_update_prio(struct sk_buff *skb)
3181 {
3182 	const struct netprio_map *map;
3183 	const struct sock *sk;
3184 	unsigned int prioidx;
3185 
3186 	if (skb->priority)
3187 		return;
3188 	map = rcu_dereference_bh(skb->dev->priomap);
3189 	if (!map)
3190 		return;
3191 	sk = skb_to_full_sk(skb);
3192 	if (!sk)
3193 		return;
3194 
3195 	prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3196 
3197 	if (prioidx < map->priomap_len)
3198 		skb->priority = map->priomap[prioidx];
3199 }
3200 #else
3201 #define skb_update_prio(skb)
3202 #endif
3203 
3204 DEFINE_PER_CPU(int, xmit_recursion);
3205 EXPORT_SYMBOL(xmit_recursion);
3206 
3207 /**
3208  *	dev_loopback_xmit - loop back @skb
3209  *	@net: network namespace this loopback is happening in
3210  *	@sk:  sk needed to be a netfilter okfn
3211  *	@skb: buffer to transmit
3212  */
3213 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3214 {
3215 	skb_reset_mac_header(skb);
3216 	__skb_pull(skb, skb_network_offset(skb));
3217 	skb->pkt_type = PACKET_LOOPBACK;
3218 	skb->ip_summed = CHECKSUM_UNNECESSARY;
3219 	WARN_ON(!skb_dst(skb));
3220 	skb_dst_force(skb);
3221 	netif_rx_ni(skb);
3222 	return 0;
3223 }
3224 EXPORT_SYMBOL(dev_loopback_xmit);
3225 
3226 #ifdef CONFIG_NET_EGRESS
3227 static struct sk_buff *
3228 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3229 {
3230 	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3231 	struct tcf_result cl_res;
3232 
3233 	if (!cl)
3234 		return skb;
3235 
3236 	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3237 	 * earlier by the caller.
3238 	 */
3239 	qdisc_bstats_cpu_update(cl->q, skb);
3240 
3241 	switch (tc_classify(skb, cl, &cl_res, false)) {
3242 	case TC_ACT_OK:
3243 	case TC_ACT_RECLASSIFY:
3244 		skb->tc_index = TC_H_MIN(cl_res.classid);
3245 		break;
3246 	case TC_ACT_SHOT:
3247 		qdisc_qstats_cpu_drop(cl->q);
3248 		*ret = NET_XMIT_DROP;
3249 		kfree_skb(skb);
3250 		return NULL;
3251 	case TC_ACT_STOLEN:
3252 	case TC_ACT_QUEUED:
3253 		*ret = NET_XMIT_SUCCESS;
3254 		consume_skb(skb);
3255 		return NULL;
3256 	case TC_ACT_REDIRECT:
3257 		/* No need to push/pop skb's mac_header here on egress! */
3258 		skb_do_redirect(skb);
3259 		*ret = NET_XMIT_SUCCESS;
3260 		return NULL;
3261 	default:
3262 		break;
3263 	}
3264 
3265 	return skb;
3266 }
3267 #endif /* CONFIG_NET_EGRESS */
3268 
3269 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3270 {
3271 #ifdef CONFIG_XPS
3272 	struct xps_dev_maps *dev_maps;
3273 	struct xps_map *map;
3274 	int queue_index = -1;
3275 
3276 	rcu_read_lock();
3277 	dev_maps = rcu_dereference(dev->xps_maps);
3278 	if (dev_maps) {
3279 		map = rcu_dereference(
3280 		    dev_maps->cpu_map[skb->sender_cpu - 1]);
3281 		if (map) {
3282 			if (map->len == 1)
3283 				queue_index = map->queues[0];
3284 			else
3285 				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3286 									   map->len)];
3287 			if (unlikely(queue_index >= dev->real_num_tx_queues))
3288 				queue_index = -1;
3289 		}
3290 	}
3291 	rcu_read_unlock();
3292 
3293 	return queue_index;
3294 #else
3295 	return -1;
3296 #endif
3297 }
3298 
3299 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3300 {
3301 	struct sock *sk = skb->sk;
3302 	int queue_index = sk_tx_queue_get(sk);
3303 
3304 	if (queue_index < 0 || skb->ooo_okay ||
3305 	    queue_index >= dev->real_num_tx_queues) {
3306 		int new_index = get_xps_queue(dev, skb);
3307 		if (new_index < 0)
3308 			new_index = skb_tx_hash(dev, skb);
3309 
3310 		if (queue_index != new_index && sk &&
3311 		    sk_fullsock(sk) &&
3312 		    rcu_access_pointer(sk->sk_dst_cache))
3313 			sk_tx_queue_set(sk, new_index);
3314 
3315 		queue_index = new_index;
3316 	}
3317 
3318 	return queue_index;
3319 }
3320 
3321 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3322 				    struct sk_buff *skb,
3323 				    void *accel_priv)
3324 {
3325 	int queue_index = 0;
3326 
3327 #ifdef CONFIG_XPS
3328 	u32 sender_cpu = skb->sender_cpu - 1;
3329 
3330 	if (sender_cpu >= (u32)NR_CPUS)
3331 		skb->sender_cpu = raw_smp_processor_id() + 1;
3332 #endif
3333 
3334 	if (dev->real_num_tx_queues != 1) {
3335 		const struct net_device_ops *ops = dev->netdev_ops;
3336 		if (ops->ndo_select_queue)
3337 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3338 							    __netdev_pick_tx);
3339 		else
3340 			queue_index = __netdev_pick_tx(dev, skb);
3341 
3342 		if (!accel_priv)
3343 			queue_index = netdev_cap_txqueue(dev, queue_index);
3344 	}
3345 
3346 	skb_set_queue_mapping(skb, queue_index);
3347 	return netdev_get_tx_queue(dev, queue_index);
3348 }
3349 
3350 /**
3351  *	__dev_queue_xmit - transmit a buffer
3352  *	@skb: buffer to transmit
3353  *	@accel_priv: private data used for L2 forwarding offload
3354  *
3355  *	Queue a buffer for transmission to a network device. The caller must
3356  *	have set the device and priority and built the buffer before calling
3357  *	this function. The function can be called from an interrupt.
3358  *
3359  *	A negative errno code is returned on a failure. A success does not
3360  *	guarantee the frame will be transmitted as it may be dropped due
3361  *	to congestion or traffic shaping.
3362  *
3363  * -----------------------------------------------------------------------------------
3364  *      I notice this method can also return errors from the queue disciplines,
3365  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3366  *      be positive.
3367  *
3368  *      Regardless of the return value, the skb is consumed, so it is currently
3369  *      difficult to retry a send to this method.  (You can bump the ref count
3370  *      before sending to hold a reference for retry if you are careful.)
3371  *
3372  *      When calling this method, interrupts MUST be enabled.  This is because
3373  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3374  *          --BLG
3375  */
3376 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3377 {
3378 	struct net_device *dev = skb->dev;
3379 	struct netdev_queue *txq;
3380 	struct Qdisc *q;
3381 	int rc = -ENOMEM;
3382 
3383 	skb_reset_mac_header(skb);
3384 
3385 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3386 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3387 
3388 	/* Disable soft irqs for various locks below. Also
3389 	 * stops preemption for RCU.
3390 	 */
3391 	rcu_read_lock_bh();
3392 
3393 	skb_update_prio(skb);
3394 
3395 	qdisc_pkt_len_init(skb);
3396 #ifdef CONFIG_NET_CLS_ACT
3397 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3398 # ifdef CONFIG_NET_EGRESS
3399 	if (static_key_false(&egress_needed)) {
3400 		skb = sch_handle_egress(skb, &rc, dev);
3401 		if (!skb)
3402 			goto out;
3403 	}
3404 # endif
3405 #endif
3406 	/* If the device/qdisc doesn't need skb->dst, release it right now while
3407 	 * it's hot in this cpu's cache.
3408 	 */
3409 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3410 		skb_dst_drop(skb);
3411 	else
3412 		skb_dst_force(skb);
3413 
3414 	txq = netdev_pick_tx(dev, skb, accel_priv);
3415 	q = rcu_dereference_bh(txq->qdisc);
3416 
3417 	trace_net_dev_queue(skb);
3418 	if (q->enqueue) {
3419 		rc = __dev_xmit_skb(skb, q, dev, txq);
3420 		goto out;
3421 	}
3422 
3423 	/* The device has no queue. Common case for software devices:
3424 	   loopback, all sorts of tunnels...
3425 
3426 	   Really, it is unlikely that netif_tx_lock protection is necessary
3427 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3428 	   counters.)
3429 	   However, it is possible that they rely on the protection
3430 	   made by us here.
3431 
3432 	   Check this and take the lock. It is not prone to deadlocks.
3433 	   Or take the noqueue qdisc path; it is even simpler 8)
3434 	 */
3435 	if (dev->flags & IFF_UP) {
3436 		int cpu = smp_processor_id(); /* ok because BHs are off */
3437 
3438 		if (txq->xmit_lock_owner != cpu) {
3439 			if (unlikely(__this_cpu_read(xmit_recursion) >
3440 				     XMIT_RECURSION_LIMIT))
3441 				goto recursion_alert;
3442 
3443 			skb = validate_xmit_skb(skb, dev);
3444 			if (!skb)
3445 				goto out;
3446 
3447 			HARD_TX_LOCK(dev, txq, cpu);
3448 
3449 			if (!netif_xmit_stopped(txq)) {
3450 				__this_cpu_inc(xmit_recursion);
3451 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3452 				__this_cpu_dec(xmit_recursion);
3453 				if (dev_xmit_complete(rc)) {
3454 					HARD_TX_UNLOCK(dev, txq);
3455 					goto out;
3456 				}
3457 			}
3458 			HARD_TX_UNLOCK(dev, txq);
3459 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3460 					     dev->name);
3461 		} else {
3462 			/* Recursion is detected! It is possible,
3463 			 * unfortunately
3464 			 */
3465 recursion_alert:
3466 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3467 					     dev->name);
3468 		}
3469 	}
3470 
3471 	rc = -ENETDOWN;
3472 	rcu_read_unlock_bh();
3473 
3474 	atomic_long_inc(&dev->tx_dropped);
3475 	kfree_skb_list(skb);
3476 	return rc;
3477 out:
3478 	rcu_read_unlock_bh();
3479 	return rc;
3480 }
3481 
3482 int dev_queue_xmit(struct sk_buff *skb)
3483 {
3484 	return __dev_queue_xmit(skb, NULL);
3485 }
3486 EXPORT_SYMBOL(dev_queue_xmit);
3487 
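/*
 * Illustrative sketch (not part of this file, not built): a protocol
 * module building a frame and handing it to dev_queue_xmit(), along the
 * lines described in the comment above __dev_queue_xmit().  ETH_P_FOO
 * and the payload are hypothetical; error handling is kept minimal.
 */
#if 0
#define ETH_P_FOO	0x88B5	/* hypothetical; local experimental ethertype */

static int foo_send(struct net_device *dev, const void *payload, size_t len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), payload, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_FOO);

	if (dev_hard_header(skb, dev, ETH_P_FOO, dev->broadcast,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* may return positive NET_XMIT_* codes, as noted above */
	return dev_queue_xmit(skb);
}
#endif
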
3488 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3489 {
3490 	return __dev_queue_xmit(skb, accel_priv);
3491 }
3492 EXPORT_SYMBOL(dev_queue_xmit_accel);
3493 
3494 
3495 /*=======================================================================
3496 			Receiver routines
3497   =======================================================================*/
3498 
3499 int netdev_max_backlog __read_mostly = 1000;
3500 EXPORT_SYMBOL(netdev_max_backlog);
3501 
3502 int netdev_tstamp_prequeue __read_mostly = 1;
3503 int netdev_budget __read_mostly = 300;
3504 int weight_p __read_mostly = 64;            /* old backlog weight */
3505 
3506 /* Called with irq disabled */
3507 static inline void ____napi_schedule(struct softnet_data *sd,
3508 				     struct napi_struct *napi)
3509 {
3510 	list_add_tail(&napi->poll_list, &sd->poll_list);
3511 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3512 }
3513 
3514 #ifdef CONFIG_RPS
3515 
3516 /* One global table that all flow-based protocols share. */
3517 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3518 EXPORT_SYMBOL(rps_sock_flow_table);
3519 u32 rps_cpu_mask __read_mostly;
3520 EXPORT_SYMBOL(rps_cpu_mask);
3521 
3522 struct static_key rps_needed __read_mostly;
3523 EXPORT_SYMBOL(rps_needed);
3524 
3525 static struct rps_dev_flow *
3526 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3527 	    struct rps_dev_flow *rflow, u16 next_cpu)
3528 {
3529 	if (next_cpu < nr_cpu_ids) {
3530 #ifdef CONFIG_RFS_ACCEL
3531 		struct netdev_rx_queue *rxqueue;
3532 		struct rps_dev_flow_table *flow_table;
3533 		struct rps_dev_flow *old_rflow;
3534 		u32 flow_id;
3535 		u16 rxq_index;
3536 		int rc;
3537 
3538 		/* Should we steer this flow to a different hardware queue? */
3539 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3540 		    !(dev->features & NETIF_F_NTUPLE))
3541 			goto out;
3542 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3543 		if (rxq_index == skb_get_rx_queue(skb))
3544 			goto out;
3545 
3546 		rxqueue = dev->_rx + rxq_index;
3547 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3548 		if (!flow_table)
3549 			goto out;
3550 		flow_id = skb_get_hash(skb) & flow_table->mask;
3551 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3552 							rxq_index, flow_id);
3553 		if (rc < 0)
3554 			goto out;
3555 		old_rflow = rflow;
3556 		rflow = &flow_table->flows[flow_id];
3557 		rflow->filter = rc;
3558 		if (old_rflow->filter == rflow->filter)
3559 			old_rflow->filter = RPS_NO_FILTER;
3560 	out:
3561 #endif
3562 		rflow->last_qtail =
3563 			per_cpu(softnet_data, next_cpu).input_queue_head;
3564 	}
3565 
3566 	rflow->cpu = next_cpu;
3567 	return rflow;
3568 }
3569 
3570 /*
3571  * get_rps_cpu is called from netif_receive_skb and returns the target
3572  * CPU from the RPS map of the receiving queue for a given skb.
3573  * rcu_read_lock must be held on entry.
3574  */
3575 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3576 		       struct rps_dev_flow **rflowp)
3577 {
3578 	const struct rps_sock_flow_table *sock_flow_table;
3579 	struct netdev_rx_queue *rxqueue = dev->_rx;
3580 	struct rps_dev_flow_table *flow_table;
3581 	struct rps_map *map;
3582 	int cpu = -1;
3583 	u32 tcpu;
3584 	u32 hash;
3585 
3586 	if (skb_rx_queue_recorded(skb)) {
3587 		u16 index = skb_get_rx_queue(skb);
3588 
3589 		if (unlikely(index >= dev->real_num_rx_queues)) {
3590 			WARN_ONCE(dev->real_num_rx_queues > 1,
3591 				  "%s received packet on queue %u, but number "
3592 				  "of RX queues is %u\n",
3593 				  dev->name, index, dev->real_num_rx_queues);
3594 			goto done;
3595 		}
3596 		rxqueue += index;
3597 	}
3598 
3599 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3600 
3601 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3602 	map = rcu_dereference(rxqueue->rps_map);
3603 	if (!flow_table && !map)
3604 		goto done;
3605 
3606 	skb_reset_network_header(skb);
3607 	hash = skb_get_hash(skb);
3608 	if (!hash)
3609 		goto done;
3610 
3611 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3612 	if (flow_table && sock_flow_table) {
3613 		struct rps_dev_flow *rflow;
3614 		u32 next_cpu;
3615 		u32 ident;
3616 
3617 		/* First check the global flow table for a match */
3618 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3619 		if ((ident ^ hash) & ~rps_cpu_mask)
3620 			goto try_rps;
3621 
3622 		next_cpu = ident & rps_cpu_mask;
3623 
3624 		/* OK, now we know there is a match,
3625 		 * we can look at the local (per receive queue) flow table
3626 		 */
3627 		rflow = &flow_table->flows[hash & flow_table->mask];
3628 		tcpu = rflow->cpu;
3629 
3630 		/*
3631 		 * If the desired CPU (where last recvmsg was done) is
3632 		 * different from current CPU (one in the rx-queue flow
3633 		 * table entry), switch if one of the following holds:
3634 		 *   - Current CPU is unset (>= nr_cpu_ids).
3635 		 *   - Current CPU is offline.
3636 		 *   - The current CPU's queue tail has advanced beyond the
3637 		 *     last packet that was enqueued using this table entry.
3638 		 *     This guarantees that all previous packets for the flow
3639 		 *     have been dequeued, thus preserving in-order delivery.
3640 		 */
3641 		if (unlikely(tcpu != next_cpu) &&
3642 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3643 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3644 		      rflow->last_qtail)) >= 0)) {
3645 			tcpu = next_cpu;
3646 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3647 		}
3648 
3649 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3650 			*rflowp = rflow;
3651 			cpu = tcpu;
3652 			goto done;
3653 		}
3654 	}
3655 
3656 try_rps:
3657 
3658 	if (map) {
3659 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3660 		if (cpu_online(tcpu)) {
3661 			cpu = tcpu;
3662 			goto done;
3663 		}
3664 	}
3665 
3666 done:
3667 	return cpu;
3668 }
3669 
3670 #ifdef CONFIG_RFS_ACCEL
3671 
3672 /**
3673  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3674  * @dev: Device on which the filter was set
3675  * @rxq_index: RX queue index
3676  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3677  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3678  *
3679  * Drivers that implement ndo_rx_flow_steer() should periodically call
3680  * this function for each installed filter and remove the filters for
3681  * which it returns %true.
3682  */
3683 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3684 			 u32 flow_id, u16 filter_id)
3685 {
3686 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3687 	struct rps_dev_flow_table *flow_table;
3688 	struct rps_dev_flow *rflow;
3689 	bool expire = true;
3690 	unsigned int cpu;
3691 
3692 	rcu_read_lock();
3693 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3694 	if (flow_table && flow_id <= flow_table->mask) {
3695 		rflow = &flow_table->flows[flow_id];
3696 		cpu = ACCESS_ONCE(rflow->cpu);
3697 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3698 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3699 			   rflow->last_qtail) <
3700 		     (int)(10 * flow_table->mask)))
3701 			expire = false;
3702 	}
3703 	rcu_read_unlock();
3704 	return expire;
3705 }
3706 EXPORT_SYMBOL(rps_may_expire_flow);
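
/*
 * Illustrative sketch of the periodic expiry scan described above.  The
 * example_adapter structure, its filter table and example_remove_filter()
 * are hypothetical driver-side names, shown only to indicate how a driver
 * implementing ndo_rx_flow_steer() might use rps_may_expire_flow().
 */
#if 0
static void example_expire_rx_filters(struct example_adapter *adapter)
{
	int i;

	for (i = 0; i < adapter->num_filters; i++) {
		struct example_filter *f = &adapter->filters[i];

		if (!f->installed)
			continue;
		if (rps_may_expire_flow(adapter->netdev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			example_remove_filter(adapter, f);	/* hypothetical */
			f->installed = false;
		}
	}
}
#endif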
3707 
3708 #endif /* CONFIG_RFS_ACCEL */
3709 
3710 /* Called from hardirq (IPI) context */
3711 static void rps_trigger_softirq(void *data)
3712 {
3713 	struct softnet_data *sd = data;
3714 
3715 	____napi_schedule(sd, &sd->backlog);
3716 	sd->received_rps++;
3717 }
3718 
3719 #endif /* CONFIG_RPS */
3720 
3721 /*
3722  * Check if this softnet_data structure belongs to another cpu.
3723  * If yes, queue it to our IPI list and return 1.
3724  * If no, return 0.
3725  */
3726 static int rps_ipi_queued(struct softnet_data *sd)
3727 {
3728 #ifdef CONFIG_RPS
3729 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3730 
3731 	if (sd != mysd) {
3732 		sd->rps_ipi_next = mysd->rps_ipi_list;
3733 		mysd->rps_ipi_list = sd;
3734 
3735 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3736 		return 1;
3737 	}
3738 #endif /* CONFIG_RPS */
3739 	return 0;
3740 }
3741 
3742 #ifdef CONFIG_NET_FLOW_LIMIT
3743 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3744 #endif
3745 
3746 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3747 {
3748 #ifdef CONFIG_NET_FLOW_LIMIT
3749 	struct sd_flow_limit *fl;
3750 	struct softnet_data *sd;
3751 	unsigned int old_flow, new_flow;
3752 
3753 	if (qlen < (netdev_max_backlog >> 1))
3754 		return false;
3755 
3756 	sd = this_cpu_ptr(&softnet_data);
3757 
3758 	rcu_read_lock();
3759 	fl = rcu_dereference(sd->flow_limit);
3760 	if (fl) {
3761 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3762 		old_flow = fl->history[fl->history_head];
3763 		fl->history[fl->history_head] = new_flow;
3764 
3765 		fl->history_head++;
3766 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3767 
3768 		if (likely(fl->buckets[old_flow]))
3769 			fl->buckets[old_flow]--;
3770 
3771 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3772 			fl->count++;
3773 			rcu_read_unlock();
3774 			return true;
3775 		}
3776 	}
3777 	rcu_read_unlock();
3778 #endif
3779 	return false;
3780 }
3781 
3782 /*
3783  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3784  * queue (may be a remote CPU queue).
3785  */
3786 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3787 			      unsigned int *qtail)
3788 {
3789 	struct softnet_data *sd;
3790 	unsigned long flags;
3791 	unsigned int qlen;
3792 
3793 	sd = &per_cpu(softnet_data, cpu);
3794 
3795 	local_irq_save(flags);
3796 
3797 	rps_lock(sd);
3798 	if (!netif_running(skb->dev))
3799 		goto drop;
3800 	qlen = skb_queue_len(&sd->input_pkt_queue);
3801 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3802 		if (qlen) {
3803 enqueue:
3804 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3805 			input_queue_tail_incr_save(sd, qtail);
3806 			rps_unlock(sd);
3807 			local_irq_restore(flags);
3808 			return NET_RX_SUCCESS;
3809 		}
3810 
3811 		/* Schedule NAPI for the backlog device.
3812 		 * We can use a non-atomic operation since we own the queue lock.
3813 		 */
3814 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3815 			if (!rps_ipi_queued(sd))
3816 				____napi_schedule(sd, &sd->backlog);
3817 		}
3818 		goto enqueue;
3819 	}
3820 
3821 drop:
3822 	sd->dropped++;
3823 	rps_unlock(sd);
3824 
3825 	local_irq_restore(flags);
3826 
3827 	atomic_long_inc(&skb->dev->rx_dropped);
3828 	kfree_skb(skb);
3829 	return NET_RX_DROP;
3830 }
3831 
3832 static int netif_rx_internal(struct sk_buff *skb)
3833 {
3834 	int ret;
3835 
3836 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3837 
3838 	trace_netif_rx(skb);
3839 #ifdef CONFIG_RPS
3840 	if (static_key_false(&rps_needed)) {
3841 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3842 		int cpu;
3843 
3844 		preempt_disable();
3845 		rcu_read_lock();
3846 
3847 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3848 		if (cpu < 0)
3849 			cpu = smp_processor_id();
3850 
3851 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3852 
3853 		rcu_read_unlock();
3854 		preempt_enable();
3855 	} else
3856 #endif
3857 	{
3858 		unsigned int qtail;
3859 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3860 		put_cpu();
3861 	}
3862 	return ret;
3863 }
3864 
3865 /**
3866  *	netif_rx	-	post buffer to the network code
3867  *	@skb: buffer to post
3868  *
3869  *	This function receives a packet from a device driver and queues it for
3870  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3871  *	may be dropped during processing for congestion control or by the
3872  *	protocol layers.
3873  *
3874  *	return values:
3875  *	NET_RX_SUCCESS	(no congestion)
3876  *	NET_RX_DROP     (packet was dropped)
3877  *
3878  */
3879 
3880 int netif_rx(struct sk_buff *skb)
3881 {
3882 	trace_netif_rx_entry(skb);
3883 
3884 	return netif_rx_internal(skb);
3885 }
3886 EXPORT_SYMBOL(netif_rx);
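
/*
 * Illustrative sketch of a non-NAPI receive path feeding netif_rx(), per
 * the description above.  How the frame data (buf, len) is obtained from
 * the hardware is hypothetical.
 */
#if 0
static void example_rx_frame(struct net_device *dev, const void *buf,
			     unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), buf, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);		/* queues to the per-cpu backlog */
}
#endif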
3887 
3888 int netif_rx_ni(struct sk_buff *skb)
3889 {
3890 	int err;
3891 
3892 	trace_netif_rx_ni_entry(skb);
3893 
3894 	preempt_disable();
3895 	err = netif_rx_internal(skb);
3896 	if (local_softirq_pending())
3897 		do_softirq();
3898 	preempt_enable();
3899 
3900 	return err;
3901 }
3902 EXPORT_SYMBOL(netif_rx_ni);
3903 
3904 static __latent_entropy void net_tx_action(struct softirq_action *h)
3905 {
3906 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3907 
3908 	if (sd->completion_queue) {
3909 		struct sk_buff *clist;
3910 
3911 		local_irq_disable();
3912 		clist = sd->completion_queue;
3913 		sd->completion_queue = NULL;
3914 		local_irq_enable();
3915 
3916 		while (clist) {
3917 			struct sk_buff *skb = clist;
3918 			clist = clist->next;
3919 
3920 			WARN_ON(atomic_read(&skb->users));
3921 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3922 				trace_consume_skb(skb);
3923 			else
3924 				trace_kfree_skb(skb, net_tx_action);
3925 
3926 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3927 				__kfree_skb(skb);
3928 			else
3929 				__kfree_skb_defer(skb);
3930 		}
3931 
3932 		__kfree_skb_flush();
3933 	}
3934 
3935 	if (sd->output_queue) {
3936 		struct Qdisc *head;
3937 
3938 		local_irq_disable();
3939 		head = sd->output_queue;
3940 		sd->output_queue = NULL;
3941 		sd->output_queue_tailp = &sd->output_queue;
3942 		local_irq_enable();
3943 
3944 		while (head) {
3945 			struct Qdisc *q = head;
3946 			spinlock_t *root_lock;
3947 
3948 			head = head->next_sched;
3949 
3950 			root_lock = qdisc_lock(q);
3951 			spin_lock(root_lock);
3952 			/* We need to make sure head->next_sched is read
3953 			 * before clearing __QDISC_STATE_SCHED
3954 			 */
3955 			smp_mb__before_atomic();
3956 			clear_bit(__QDISC_STATE_SCHED, &q->state);
3957 			qdisc_run(q);
3958 			spin_unlock(root_lock);
3959 		}
3960 	}
3961 }
3962 
3963 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3964 /* This hook is defined here for ATM LANE */
3965 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3966 			     unsigned char *addr) __read_mostly;
3967 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3968 #endif
3969 
3970 static inline struct sk_buff *
3971 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3972 		   struct net_device *orig_dev)
3973 {
3974 #ifdef CONFIG_NET_CLS_ACT
3975 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3976 	struct tcf_result cl_res;
3977 
3978 	/* If there's at least one ingress present somewhere (so
3979 	 * we get here via enabled static key), remaining devices
3980 	 * that are not configured with an ingress qdisc will bail
3981 	 * out here.
3982 	 */
3983 	if (!cl)
3984 		return skb;
3985 	if (*pt_prev) {
3986 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3987 		*pt_prev = NULL;
3988 	}
3989 
3990 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3991 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3992 	qdisc_bstats_cpu_update(cl->q, skb);
3993 
3994 	switch (tc_classify(skb, cl, &cl_res, false)) {
3995 	case TC_ACT_OK:
3996 	case TC_ACT_RECLASSIFY:
3997 		skb->tc_index = TC_H_MIN(cl_res.classid);
3998 		break;
3999 	case TC_ACT_SHOT:
4000 		qdisc_qstats_cpu_drop(cl->q);
4001 		kfree_skb(skb);
4002 		return NULL;
4003 	case TC_ACT_STOLEN:
4004 	case TC_ACT_QUEUED:
4005 		consume_skb(skb);
4006 		return NULL;
4007 	case TC_ACT_REDIRECT:
4008 		/* skb_mac_header check was done by cls/act_bpf, so
4009 		 * we can safely push the L2 header back before
4010 		 * redirecting to another netdev
4011 		 */
4012 		__skb_push(skb, skb->mac_len);
4013 		skb_do_redirect(skb);
4014 		return NULL;
4015 	default:
4016 		break;
4017 	}
4018 #endif /* CONFIG_NET_CLS_ACT */
4019 	return skb;
4020 }
4021 
4022 /**
4023  *	netdev_is_rx_handler_busy - check if receive handler is registered
4024  *	@dev: device to check
4025  *
4026  *	Check if a receive handler is already registered for a given device.
4027  *	Return true if there is one.
4028  *
4029  *	The caller must hold the rtnl_mutex.
4030  */
4031 bool netdev_is_rx_handler_busy(struct net_device *dev)
4032 {
4033 	ASSERT_RTNL();
4034 	return dev && rtnl_dereference(dev->rx_handler);
4035 }
4036 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4037 
4038 /**
4039  *	netdev_rx_handler_register - register receive handler
4040  *	@dev: device to register a handler for
4041  *	@rx_handler: receive handler to register
4042  *	@rx_handler_data: data pointer that is used by rx handler
4043  *
4044  *	Register a receive handler for a device. This handler will then be
4045  *	called from __netif_receive_skb. A negative errno code is returned
4046  *	on a failure.
4047  *
4048  *	The caller must hold the rtnl_mutex.
4049  *
4050  *	For a general description of rx_handler, see enum rx_handler_result.
4051  */
4052 int netdev_rx_handler_register(struct net_device *dev,
4053 			       rx_handler_func_t *rx_handler,
4054 			       void *rx_handler_data)
4055 {
4056 	ASSERT_RTNL();
4057 
4058 	if (dev->rx_handler)
4059 		return -EBUSY;
4060 
4061 	/* Note: rx_handler_data must be set before rx_handler */
4062 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4063 	rcu_assign_pointer(dev->rx_handler, rx_handler);
4064 
4065 	return 0;
4066 }
4067 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
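
/*
 * Illustrative sketch of registering an rx_handler, in the spirit of what
 * bridge/bonding/team style drivers do.  example_handle_frame(),
 * example_attach_port() and example_port_priv are hypothetical names.
 */
#if 0
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	/* Inspect or consume *pskb here; returning RX_HANDLER_PASS lets
	 * normal protocol delivery in __netif_receive_skb continue.
	 */
	return RX_HANDLER_PASS;
}

static int example_attach_port(struct net_device *port_dev,
			       void *example_port_priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port_dev, example_handle_frame,
					 example_port_priv);
	rtnl_unlock();
	return err;
}
#endif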
4068 
4069 /**
4070  *	netdev_rx_handler_unregister - unregister receive handler
4071  *	@dev: device to unregister a handler from
4072  *
4073  *	Unregister a receive handler from a device.
4074  *
4075  *	The caller must hold the rtnl_mutex.
4076  */
4077 void netdev_rx_handler_unregister(struct net_device *dev)
4078 {
4079 
4080 	ASSERT_RTNL();
4081 	RCU_INIT_POINTER(dev->rx_handler, NULL);
4082 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4083 	 * section is guaranteed to see a non-NULL rx_handler_data
4084 	 * as well.
4085 	 */
4086 	synchronize_net();
4087 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4088 }
4089 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4090 
4091 /*
4092  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4093  * the special handling of PFMEMALLOC skbs.
4094  */
4095 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4096 {
4097 	switch (skb->protocol) {
4098 	case htons(ETH_P_ARP):
4099 	case htons(ETH_P_IP):
4100 	case htons(ETH_P_IPV6):
4101 	case htons(ETH_P_8021Q):
4102 	case htons(ETH_P_8021AD):
4103 		return true;
4104 	default:
4105 		return false;
4106 	}
4107 }
4108 
4109 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4110 			     int *ret, struct net_device *orig_dev)
4111 {
4112 #ifdef CONFIG_NETFILTER_INGRESS
4113 	if (nf_hook_ingress_active(skb)) {
4114 		int ingress_retval;
4115 
4116 		if (*pt_prev) {
4117 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
4118 			*pt_prev = NULL;
4119 		}
4120 
4121 		rcu_read_lock();
4122 		ingress_retval = nf_hook_ingress(skb);
4123 		rcu_read_unlock();
4124 		return ingress_retval;
4125 	}
4126 #endif /* CONFIG_NETFILTER_INGRESS */
4127 	return 0;
4128 }
4129 
4130 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4131 {
4132 	struct packet_type *ptype, *pt_prev;
4133 	rx_handler_func_t *rx_handler;
4134 	struct net_device *orig_dev;
4135 	bool deliver_exact = false;
4136 	int ret = NET_RX_DROP;
4137 	__be16 type;
4138 
4139 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4140 
4141 	trace_netif_receive_skb(skb);
4142 
4143 	orig_dev = skb->dev;
4144 
4145 	skb_reset_network_header(skb);
4146 	if (!skb_transport_header_was_set(skb))
4147 		skb_reset_transport_header(skb);
4148 	skb_reset_mac_len(skb);
4149 
4150 	pt_prev = NULL;
4151 
4152 another_round:
4153 	skb->skb_iif = skb->dev->ifindex;
4154 
4155 	__this_cpu_inc(softnet_data.processed);
4156 
4157 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4158 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4159 		skb = skb_vlan_untag(skb);
4160 		if (unlikely(!skb))
4161 			goto out;
4162 	}
4163 
4164 #ifdef CONFIG_NET_CLS_ACT
4165 	if (skb->tc_verd & TC_NCLS) {
4166 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4167 		goto ncls;
4168 	}
4169 #endif
4170 
4171 	if (pfmemalloc)
4172 		goto skip_taps;
4173 
4174 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4175 		if (pt_prev)
4176 			ret = deliver_skb(skb, pt_prev, orig_dev);
4177 		pt_prev = ptype;
4178 	}
4179 
4180 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4181 		if (pt_prev)
4182 			ret = deliver_skb(skb, pt_prev, orig_dev);
4183 		pt_prev = ptype;
4184 	}
4185 
4186 skip_taps:
4187 #ifdef CONFIG_NET_INGRESS
4188 	if (static_key_false(&ingress_needed)) {
4189 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4190 		if (!skb)
4191 			goto out;
4192 
4193 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4194 			goto out;
4195 	}
4196 #endif
4197 #ifdef CONFIG_NET_CLS_ACT
4198 	skb->tc_verd = 0;
4199 ncls:
4200 #endif
4201 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4202 		goto drop;
4203 
4204 	if (skb_vlan_tag_present(skb)) {
4205 		if (pt_prev) {
4206 			ret = deliver_skb(skb, pt_prev, orig_dev);
4207 			pt_prev = NULL;
4208 		}
4209 		if (vlan_do_receive(&skb))
4210 			goto another_round;
4211 		else if (unlikely(!skb))
4212 			goto out;
4213 	}
4214 
4215 	rx_handler = rcu_dereference(skb->dev->rx_handler);
4216 	if (rx_handler) {
4217 		if (pt_prev) {
4218 			ret = deliver_skb(skb, pt_prev, orig_dev);
4219 			pt_prev = NULL;
4220 		}
4221 		switch (rx_handler(&skb)) {
4222 		case RX_HANDLER_CONSUMED:
4223 			ret = NET_RX_SUCCESS;
4224 			goto out;
4225 		case RX_HANDLER_ANOTHER:
4226 			goto another_round;
4227 		case RX_HANDLER_EXACT:
4228 			deliver_exact = true;
4229 		case RX_HANDLER_PASS:
4230 			break;
4231 		default:
4232 			BUG();
4233 		}
4234 	}
4235 
4236 	if (unlikely(skb_vlan_tag_present(skb))) {
4237 		if (skb_vlan_tag_get_id(skb))
4238 			skb->pkt_type = PACKET_OTHERHOST;
4239 		/* Note: we might in the future use prio bits
4240 		 * and set skb->priority like in vlan_do_receive().
4241 		 * For the time being, just ignore the Priority Code Point.
4242 		 */
4243 		skb->vlan_tci = 0;
4244 	}
4245 
4246 	type = skb->protocol;
4247 
4248 	/* deliver only exact match when indicated */
4249 	if (likely(!deliver_exact)) {
4250 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4251 				       &ptype_base[ntohs(type) &
4252 						   PTYPE_HASH_MASK]);
4253 	}
4254 
4255 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4256 			       &orig_dev->ptype_specific);
4257 
4258 	if (unlikely(skb->dev != orig_dev)) {
4259 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4260 				       &skb->dev->ptype_specific);
4261 	}
4262 
4263 	if (pt_prev) {
4264 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4265 			goto drop;
4266 		else
4267 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4268 	} else {
4269 drop:
4270 		if (!deliver_exact)
4271 			atomic_long_inc(&skb->dev->rx_dropped);
4272 		else
4273 			atomic_long_inc(&skb->dev->rx_nohandler);
4274 		kfree_skb(skb);
4275 		/* Jamal, now you will not be able to escape explaining
4276 		 * to me how you were going to use this. :-)
4277 		 */
4278 		ret = NET_RX_DROP;
4279 	}
4280 
4281 out:
4282 	return ret;
4283 }
4284 
4285 static int __netif_receive_skb(struct sk_buff *skb)
4286 {
4287 	int ret;
4288 
4289 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4290 		unsigned long pflags = current->flags;
4291 
4292 		/*
4293 		 * PFMEMALLOC skbs are special, they should
4294 		 * - be delivered to SOCK_MEMALLOC sockets only
4295 		 * - stay away from userspace
4296 		 * - have bounded memory usage
4297 		 *
4298 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4299 		 * context down to all allocation sites.
4300 		 */
4301 		current->flags |= PF_MEMALLOC;
4302 		ret = __netif_receive_skb_core(skb, true);
4303 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4304 	} else
4305 		ret = __netif_receive_skb_core(skb, false);
4306 
4307 	return ret;
4308 }
4309 
4310 static int netif_receive_skb_internal(struct sk_buff *skb)
4311 {
4312 	int ret;
4313 
4314 	net_timestamp_check(netdev_tstamp_prequeue, skb);
4315 
4316 	if (skb_defer_rx_timestamp(skb))
4317 		return NET_RX_SUCCESS;
4318 
4319 	rcu_read_lock();
4320 
4321 #ifdef CONFIG_RPS
4322 	if (static_key_false(&rps_needed)) {
4323 		struct rps_dev_flow voidflow, *rflow = &voidflow;
4324 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4325 
4326 		if (cpu >= 0) {
4327 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4328 			rcu_read_unlock();
4329 			return ret;
4330 		}
4331 	}
4332 #endif
4333 	ret = __netif_receive_skb(skb);
4334 	rcu_read_unlock();
4335 	return ret;
4336 }
4337 
4338 /**
4339  *	netif_receive_skb - process receive buffer from network
4340  *	@skb: buffer to process
4341  *
4342  *	netif_receive_skb() is the main receive data processing function.
4343  *	It always succeeds. The buffer may be dropped during processing
4344  *	for congestion control or by the protocol layers.
4345  *
4346  *	This function may only be called from softirq context and interrupts
4347  *	should be enabled.
4348  *
4349  *	Return values (usually ignored):
4350  *	NET_RX_SUCCESS: no congestion
4351  *	NET_RX_DROP: packet was dropped
4352  */
4353 int netif_receive_skb(struct sk_buff *skb)
4354 {
4355 	trace_netif_receive_skb_entry(skb);
4356 
4357 	return netif_receive_skb_internal(skb);
4358 }
4359 EXPORT_SYMBOL(netif_receive_skb);
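
/*
 * Illustrative sketch of a driver delivering a frame with
 * netif_receive_skb() from its NAPI poll routine, matching the constraints
 * documented above (softirq context, interrupts enabled).  How the skb was
 * built from the RX descriptor is hypothetical.
 */
#if 0
static void example_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	netif_receive_skb(skb);		/* delivered synchronously */
}
#endif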
4360 
4361 DEFINE_PER_CPU(struct work_struct, flush_works);
4362 
4363 /* Network device is going away, flush any packets still pending */
4364 static void flush_backlog(struct work_struct *work)
4365 {
4366 	struct sk_buff *skb, *tmp;
4367 	struct softnet_data *sd;
4368 
4369 	local_bh_disable();
4370 	sd = this_cpu_ptr(&softnet_data);
4371 
4372 	local_irq_disable();
4373 	rps_lock(sd);
4374 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4375 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4376 			__skb_unlink(skb, &sd->input_pkt_queue);
4377 			kfree_skb(skb);
4378 			input_queue_head_incr(sd);
4379 		}
4380 	}
4381 	rps_unlock(sd);
4382 	local_irq_enable();
4383 
4384 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4385 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4386 			__skb_unlink(skb, &sd->process_queue);
4387 			kfree_skb(skb);
4388 			input_queue_head_incr(sd);
4389 		}
4390 	}
4391 	local_bh_enable();
4392 }
4393 
4394 static void flush_all_backlogs(void)
4395 {
4396 	unsigned int cpu;
4397 
4398 	get_online_cpus();
4399 
4400 	for_each_online_cpu(cpu)
4401 		queue_work_on(cpu, system_highpri_wq,
4402 			      per_cpu_ptr(&flush_works, cpu));
4403 
4404 	for_each_online_cpu(cpu)
4405 		flush_work(per_cpu_ptr(&flush_works, cpu));
4406 
4407 	put_online_cpus();
4408 }
4409 
4410 static int napi_gro_complete(struct sk_buff *skb)
4411 {
4412 	struct packet_offload *ptype;
4413 	__be16 type = skb->protocol;
4414 	struct list_head *head = &offload_base;
4415 	int err = -ENOENT;
4416 
4417 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4418 
4419 	if (NAPI_GRO_CB(skb)->count == 1) {
4420 		skb_shinfo(skb)->gso_size = 0;
4421 		goto out;
4422 	}
4423 
4424 	rcu_read_lock();
4425 	list_for_each_entry_rcu(ptype, head, list) {
4426 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4427 			continue;
4428 
4429 		err = ptype->callbacks.gro_complete(skb, 0);
4430 		break;
4431 	}
4432 	rcu_read_unlock();
4433 
4434 	if (err) {
4435 		WARN_ON(&ptype->list == head);
4436 		kfree_skb(skb);
4437 		return NET_RX_SUCCESS;
4438 	}
4439 
4440 out:
4441 	return netif_receive_skb_internal(skb);
4442 }
4443 
4444 /* napi->gro_list contains packets ordered by age, with the
4445  * youngest packets at the head of it.
4446  * Complete skbs in reverse order to reduce latencies.
4447  */
4448 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4449 {
4450 	struct sk_buff *skb, *prev = NULL;
4451 
4452 	/* scan list and build reverse chain */
4453 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4454 		skb->prev = prev;
4455 		prev = skb;
4456 	}
4457 
4458 	for (skb = prev; skb; skb = prev) {
4459 		skb->next = NULL;
4460 
4461 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4462 			return;
4463 
4464 		prev = skb->prev;
4465 		napi_gro_complete(skb);
4466 		napi->gro_count--;
4467 	}
4468 
4469 	napi->gro_list = NULL;
4470 }
4471 EXPORT_SYMBOL(napi_gro_flush);
4472 
4473 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4474 {
4475 	struct sk_buff *p;
4476 	unsigned int maclen = skb->dev->hard_header_len;
4477 	u32 hash = skb_get_hash_raw(skb);
4478 
4479 	for (p = napi->gro_list; p; p = p->next) {
4480 		unsigned long diffs;
4481 
4482 		NAPI_GRO_CB(p)->flush = 0;
4483 
4484 		if (hash != skb_get_hash_raw(p)) {
4485 			NAPI_GRO_CB(p)->same_flow = 0;
4486 			continue;
4487 		}
4488 
4489 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4490 		diffs |= p->vlan_tci ^ skb->vlan_tci;
4491 		diffs |= skb_metadata_dst_cmp(p, skb);
4492 		if (maclen == ETH_HLEN)
4493 			diffs |= compare_ether_header(skb_mac_header(p),
4494 						      skb_mac_header(skb));
4495 		else if (!diffs)
4496 			diffs = memcmp(skb_mac_header(p),
4497 				       skb_mac_header(skb),
4498 				       maclen);
4499 		NAPI_GRO_CB(p)->same_flow = !diffs;
4500 	}
4501 }
4502 
4503 static void skb_gro_reset_offset(struct sk_buff *skb)
4504 {
4505 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4506 	const skb_frag_t *frag0 = &pinfo->frags[0];
4507 
4508 	NAPI_GRO_CB(skb)->data_offset = 0;
4509 	NAPI_GRO_CB(skb)->frag0 = NULL;
4510 	NAPI_GRO_CB(skb)->frag0_len = 0;
4511 
4512 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4513 	    pinfo->nr_frags &&
4514 	    !PageHighMem(skb_frag_page(frag0))) {
4515 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4516 		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4517 						    skb_frag_size(frag0),
4518 						    skb->end - skb->tail);
4519 	}
4520 }
4521 
4522 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4523 {
4524 	struct skb_shared_info *pinfo = skb_shinfo(skb);
4525 
4526 	BUG_ON(skb->end - skb->tail < grow);
4527 
4528 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4529 
4530 	skb->data_len -= grow;
4531 	skb->tail += grow;
4532 
4533 	pinfo->frags[0].page_offset += grow;
4534 	skb_frag_size_sub(&pinfo->frags[0], grow);
4535 
4536 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4537 		skb_frag_unref(skb, 0);
4538 		memmove(pinfo->frags, pinfo->frags + 1,
4539 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4540 	}
4541 }
4542 
4543 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4544 {
4545 	struct sk_buff **pp = NULL;
4546 	struct packet_offload *ptype;
4547 	__be16 type = skb->protocol;
4548 	struct list_head *head = &offload_base;
4549 	int same_flow;
4550 	enum gro_result ret;
4551 	int grow;
4552 
4553 	if (!(skb->dev->features & NETIF_F_GRO))
4554 		goto normal;
4555 
4556 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4557 		goto normal;
4558 
4559 	gro_list_prepare(napi, skb);
4560 
4561 	rcu_read_lock();
4562 	list_for_each_entry_rcu(ptype, head, list) {
4563 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4564 			continue;
4565 
4566 		skb_set_network_header(skb, skb_gro_offset(skb));
4567 		skb_reset_mac_len(skb);
4568 		NAPI_GRO_CB(skb)->same_flow = 0;
4569 		NAPI_GRO_CB(skb)->flush = 0;
4570 		NAPI_GRO_CB(skb)->free = 0;
4571 		NAPI_GRO_CB(skb)->encap_mark = 0;
4572 		NAPI_GRO_CB(skb)->recursion_counter = 0;
4573 		NAPI_GRO_CB(skb)->is_fou = 0;
4574 		NAPI_GRO_CB(skb)->is_atomic = 1;
4575 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4576 
4577 		/* Setup for GRO checksum validation */
4578 		switch (skb->ip_summed) {
4579 		case CHECKSUM_COMPLETE:
4580 			NAPI_GRO_CB(skb)->csum = skb->csum;
4581 			NAPI_GRO_CB(skb)->csum_valid = 1;
4582 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4583 			break;
4584 		case CHECKSUM_UNNECESSARY:
4585 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4586 			NAPI_GRO_CB(skb)->csum_valid = 0;
4587 			break;
4588 		default:
4589 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4590 			NAPI_GRO_CB(skb)->csum_valid = 0;
4591 		}
4592 
4593 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4594 		break;
4595 	}
4596 	rcu_read_unlock();
4597 
4598 	if (&ptype->list == head)
4599 		goto normal;
4600 
4601 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4602 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4603 
4604 	if (pp) {
4605 		struct sk_buff *nskb = *pp;
4606 
4607 		*pp = nskb->next;
4608 		nskb->next = NULL;
4609 		napi_gro_complete(nskb);
4610 		napi->gro_count--;
4611 	}
4612 
4613 	if (same_flow)
4614 		goto ok;
4615 
4616 	if (NAPI_GRO_CB(skb)->flush)
4617 		goto normal;
4618 
4619 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4620 		struct sk_buff *nskb = napi->gro_list;
4621 
4622 		/* locate the end of the list to select the 'oldest' flow */
4623 		while (nskb->next) {
4624 			pp = &nskb->next;
4625 			nskb = *pp;
4626 		}
4627 		*pp = NULL;
4628 		nskb->next = NULL;
4629 		napi_gro_complete(nskb);
4630 	} else {
4631 		napi->gro_count++;
4632 	}
4633 	NAPI_GRO_CB(skb)->count = 1;
4634 	NAPI_GRO_CB(skb)->age = jiffies;
4635 	NAPI_GRO_CB(skb)->last = skb;
4636 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4637 	skb->next = napi->gro_list;
4638 	napi->gro_list = skb;
4639 	ret = GRO_HELD;
4640 
4641 pull:
4642 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4643 	if (grow > 0)
4644 		gro_pull_from_frag0(skb, grow);
4645 ok:
4646 	return ret;
4647 
4648 normal:
4649 	ret = GRO_NORMAL;
4650 	goto pull;
4651 }
4652 
4653 struct packet_offload *gro_find_receive_by_type(__be16 type)
4654 {
4655 	struct list_head *offload_head = &offload_base;
4656 	struct packet_offload *ptype;
4657 
4658 	list_for_each_entry_rcu(ptype, offload_head, list) {
4659 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4660 			continue;
4661 		return ptype;
4662 	}
4663 	return NULL;
4664 }
4665 EXPORT_SYMBOL(gro_find_receive_by_type);
4666 
4667 struct packet_offload *gro_find_complete_by_type(__be16 type)
4668 {
4669 	struct list_head *offload_head = &offload_base;
4670 	struct packet_offload *ptype;
4671 
4672 	list_for_each_entry_rcu(ptype, offload_head, list) {
4673 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4674 			continue;
4675 		return ptype;
4676 	}
4677 	return NULL;
4678 }
4679 EXPORT_SYMBOL(gro_find_complete_by_type);
4680 
4681 static void napi_skb_free_stolen_head(struct sk_buff *skb)
4682 {
4683 	skb_dst_drop(skb);
4684 	kmem_cache_free(skbuff_head_cache, skb);
4685 }
4686 
4687 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4688 {
4689 	switch (ret) {
4690 	case GRO_NORMAL:
4691 		if (netif_receive_skb_internal(skb))
4692 			ret = GRO_DROP;
4693 		break;
4694 
4695 	case GRO_DROP:
4696 		kfree_skb(skb);
4697 		break;
4698 
4699 	case GRO_MERGED_FREE:
4700 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4701 			napi_skb_free_stolen_head(skb);
4702 		else
4703 			__kfree_skb(skb);
4704 		break;
4705 
4706 	case GRO_HELD:
4707 	case GRO_MERGED:
4708 		break;
4709 	}
4710 
4711 	return ret;
4712 }
4713 
4714 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4715 {
4716 	skb_mark_napi_id(skb, napi);
4717 	trace_napi_gro_receive_entry(skb);
4718 
4719 	skb_gro_reset_offset(skb);
4720 
4721 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4722 }
4723 EXPORT_SYMBOL(napi_gro_receive);
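
/*
 * Illustrative sketch of the usual napi_gro_receive() call site inside a
 * driver's NAPI poll loop.  example_ring and example_fetch_rx_skb() are
 * hypothetical driver-side names.
 */
#if 0
static int example_poll_rx(struct example_ring *ring,
			   struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work_done = 0;

	while (work_done < budget &&
	       (skb = example_fetch_rx_skb(ring)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);	/* may merge into gro_list */
		work_done++;
	}
	return work_done;
}
#endif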
4724 
4725 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4726 {
4727 	if (unlikely(skb->pfmemalloc)) {
4728 		consume_skb(skb);
4729 		return;
4730 	}
4731 	__skb_pull(skb, skb_headlen(skb));
4732 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4733 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4734 	skb->vlan_tci = 0;
4735 	skb->dev = napi->dev;
4736 	skb->skb_iif = 0;
4737 	skb->encapsulation = 0;
4738 	skb_shinfo(skb)->gso_type = 0;
4739 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4740 
4741 	napi->skb = skb;
4742 }
4743 
4744 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4745 {
4746 	struct sk_buff *skb = napi->skb;
4747 
4748 	if (!skb) {
4749 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4750 		if (skb) {
4751 			napi->skb = skb;
4752 			skb_mark_napi_id(skb, napi);
4753 		}
4754 	}
4755 	return skb;
4756 }
4757 EXPORT_SYMBOL(napi_get_frags);
4758 
4759 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4760 				      struct sk_buff *skb,
4761 				      gro_result_t ret)
4762 {
4763 	switch (ret) {
4764 	case GRO_NORMAL:
4765 	case GRO_HELD:
4766 		__skb_push(skb, ETH_HLEN);
4767 		skb->protocol = eth_type_trans(skb, skb->dev);
4768 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4769 			ret = GRO_DROP;
4770 		break;
4771 
4772 	case GRO_DROP:
4773 		napi_reuse_skb(napi, skb);
4774 		break;
4775 
4776 	case GRO_MERGED_FREE:
4777 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4778 			napi_skb_free_stolen_head(skb);
4779 		else
4780 			napi_reuse_skb(napi, skb);
4781 		break;
4782 
4783 	case GRO_MERGED:
4784 		break;
4785 	}
4786 
4787 	return ret;
4788 }
4789 
4790 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4791  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4792  * we copy the Ethernet header into skb->data to have a common layout.
4793  */
4794 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4795 {
4796 	struct sk_buff *skb = napi->skb;
4797 	const struct ethhdr *eth;
4798 	unsigned int hlen = sizeof(*eth);
4799 
4800 	napi->skb = NULL;
4801 
4802 	skb_reset_mac_header(skb);
4803 	skb_gro_reset_offset(skb);
4804 
4805 	eth = skb_gro_header_fast(skb, 0);
4806 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4807 		eth = skb_gro_header_slow(skb, hlen, 0);
4808 		if (unlikely(!eth)) {
4809 			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4810 					     __func__, napi->dev->name);
4811 			napi_reuse_skb(napi, skb);
4812 			return NULL;
4813 		}
4814 	} else {
4815 		gro_pull_from_frag0(skb, hlen);
4816 		NAPI_GRO_CB(skb)->frag0 += hlen;
4817 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4818 	}
4819 	__skb_pull(skb, hlen);
4820 
4821 	/*
4822 	 * This works because the only protocols we care about don't require
4823 	 * special handling.
4824 	 * We'll fix it up properly in napi_frags_finish()
4825 	 */
4826 	skb->protocol = eth->h_proto;
4827 
4828 	return skb;
4829 }
4830 
4831 gro_result_t napi_gro_frags(struct napi_struct *napi)
4832 {
4833 	struct sk_buff *skb = napi_frags_skb(napi);
4834 
4835 	if (!skb)
4836 		return GRO_DROP;
4837 
4838 	trace_napi_gro_frags_entry(skb);
4839 
4840 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4841 }
4842 EXPORT_SYMBOL(napi_gro_frags);
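
/*
 * Illustrative sketch of the napi_get_frags()/napi_gro_frags() pairing for
 * drivers that receive directly into pages.  rx_page, offset, len and
 * truesize are hypothetical per-descriptor values.
 */
#if 0
static void example_rx_page(struct napi_struct *napi, struct page *rx_page,
			    unsigned int offset, unsigned int len,
			    unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (!skb) {
		put_page(rx_page);
		return;
	}
	skb_add_rx_frag(skb, 0, rx_page, offset, len, truesize);
	napi_gro_frags(napi);	/* pulls the Ethernet header itself */
}
#endif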
4843 
4844 /* Compute the checksum from gro_offset and return the folded value
4845  * after adding in any pseudo checksum.
4846  */
4847 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4848 {
4849 	__wsum wsum;
4850 	__sum16 sum;
4851 
4852 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4853 
4854 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4855 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4856 	if (likely(!sum)) {
4857 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4858 		    !skb->csum_complete_sw)
4859 			netdev_rx_csum_fault(skb->dev);
4860 	}
4861 
4862 	NAPI_GRO_CB(skb)->csum = wsum;
4863 	NAPI_GRO_CB(skb)->csum_valid = 1;
4864 
4865 	return sum;
4866 }
4867 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4868 
4869 /*
4870  * net_rps_action_and_irq_enable sends any pending IPIs for rps.
4871  * Note: called with local irq disabled, but exits with local irq enabled.
4872  */
4873 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4874 {
4875 #ifdef CONFIG_RPS
4876 	struct softnet_data *remsd = sd->rps_ipi_list;
4877 
4878 	if (remsd) {
4879 		sd->rps_ipi_list = NULL;
4880 
4881 		local_irq_enable();
4882 
4883 		/* Send pending IPIs to kick RPS processing on remote cpus. */
4884 		while (remsd) {
4885 			struct softnet_data *next = remsd->rps_ipi_next;
4886 
4887 			if (cpu_online(remsd->cpu))
4888 				smp_call_function_single_async(remsd->cpu,
4889 							   &remsd->csd);
4890 			remsd = next;
4891 		}
4892 	} else
4893 #endif
4894 		local_irq_enable();
4895 }
4896 
4897 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4898 {
4899 #ifdef CONFIG_RPS
4900 	return sd->rps_ipi_list != NULL;
4901 #else
4902 	return false;
4903 #endif
4904 }
4905 
4906 static int process_backlog(struct napi_struct *napi, int quota)
4907 {
4908 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4909 	bool again = true;
4910 	int work = 0;
4911 
4912 	/* Check if we have pending IPIs; it's better to send them now
4913 	 * than to wait for net_rx_action() to end.
4914 	 */
4915 	if (sd_has_rps_ipi_waiting(sd)) {
4916 		local_irq_disable();
4917 		net_rps_action_and_irq_enable(sd);
4918 	}
4919 
4920 	napi->weight = weight_p;
4921 	while (again) {
4922 		struct sk_buff *skb;
4923 
4924 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4925 			rcu_read_lock();
4926 			__netif_receive_skb(skb);
4927 			rcu_read_unlock();
4928 			input_queue_head_incr(sd);
4929 			if (++work >= quota)
4930 				return work;
4931 
4932 		}
4933 
4934 		local_irq_disable();
4935 		rps_lock(sd);
4936 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4937 			/*
4938 			 * Inline a custom version of __napi_complete().
4939 			 * Only the current cpu owns and manipulates this napi,
4940 			 * and NAPI_STATE_SCHED is the only possible flag set
4941 			 * on backlog.
4942 			 * We can use a plain write instead of clear_bit(),
4943 			 * and we don't need an smp_mb() memory barrier.
4944 			 */
4945 			napi->state = 0;
4946 			again = false;
4947 		} else {
4948 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4949 						   &sd->process_queue);
4950 		}
4951 		rps_unlock(sd);
4952 		local_irq_enable();
4953 	}
4954 
4955 	return work;
4956 }
4957 
4958 /**
4959  * __napi_schedule - schedule for receive
4960  * @n: entry to schedule
4961  *
4962  * The entry's receive function will be scheduled to run.
4963  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4964  */
4965 void __napi_schedule(struct napi_struct *n)
4966 {
4967 	unsigned long flags;
4968 
4969 	local_irq_save(flags);
4970 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4971 	local_irq_restore(flags);
4972 }
4973 EXPORT_SYMBOL(__napi_schedule);
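
/*
 * Illustrative sketch of an interrupt handler kicking NAPI, the usual
 * producer for the poll_list manipulated above.  example_ring and
 * example_disable_rx_irq() are hypothetical.
 */
#if 0
static irqreturn_t example_rx_isr(int irq, void *data)
{
	struct example_ring *ring = data;

	example_disable_rx_irq(ring);		/* hypothetical: mask RX irq */
	if (napi_schedule_prep(&ring->napi))
		__napi_schedule(&ring->napi);	/* poll routine runs soon */
	return IRQ_HANDLED;
}
#endif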
4974 
4975 /**
4976  * __napi_schedule_irqoff - schedule for receive
4977  * @n: entry to schedule
4978  *
4979  * Variant of __napi_schedule() assuming hard irqs are masked
4980  */
4981 void __napi_schedule_irqoff(struct napi_struct *n)
4982 {
4983 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4984 }
4985 EXPORT_SYMBOL(__napi_schedule_irqoff);
4986 
4987 void __napi_complete(struct napi_struct *n)
4988 {
4989 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4990 
4991 	list_del_init(&n->poll_list);
4992 	smp_mb__before_atomic();
4993 	clear_bit(NAPI_STATE_SCHED, &n->state);
4994 }
4995 EXPORT_SYMBOL(__napi_complete);
4996 
4997 void napi_complete_done(struct napi_struct *n, int work_done)
4998 {
4999 	unsigned long flags;
5000 
5001 	/*
5002 	 * Don't let napi dequeue from the cpu poll list
5003 	 * just in case it's running on a different cpu.
5004 	 */
5005 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
5006 		return;
5007 
5008 	if (n->gro_list) {
5009 		unsigned long timeout = 0;
5010 
5011 		if (work_done)
5012 			timeout = n->dev->gro_flush_timeout;
5013 
5014 		if (timeout)
5015 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
5016 				      HRTIMER_MODE_REL_PINNED);
5017 		else
5018 			napi_gro_flush(n, false);
5019 	}
5020 	if (likely(list_empty(&n->poll_list))) {
5021 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
5022 	} else {
5023 		/* If n->poll_list is not empty, we need to mask irqs */
5024 		local_irq_save(flags);
5025 		__napi_complete(n);
5026 		local_irq_restore(flags);
5027 	}
5028 }
5029 EXPORT_SYMBOL(napi_complete_done);
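
/*
 * Illustrative sketch of how a poll routine finishes with
 * napi_complete_done(), so the gro_flush_timeout handling above can arm
 * the flush timer.  example_clean_rx() and example_enable_rx_irq() are
 * hypothetical.
 */
#if 0
static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = example_clean_rx(napi, budget);	/* hypothetical */

	if (work_done < budget) {
		napi_complete_done(napi, work_done);
		example_enable_rx_irq(napi);		/* hypothetical */
	}
	return work_done;
}
#endif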
5030 
5031 /* must be called under rcu_read_lock(), as we don't take a reference */
5032 static struct napi_struct *napi_by_id(unsigned int napi_id)
5033 {
5034 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5035 	struct napi_struct *napi;
5036 
5037 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5038 		if (napi->napi_id == napi_id)
5039 			return napi;
5040 
5041 	return NULL;
5042 }
5043 
5044 #if defined(CONFIG_NET_RX_BUSY_POLL)
5045 #define BUSY_POLL_BUDGET 8
5046 bool sk_busy_loop(struct sock *sk, int nonblock)
5047 {
5048 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5049 	int (*busy_poll)(struct napi_struct *dev);
5050 	struct napi_struct *napi;
5051 	int rc = false;
5052 
5053 	rcu_read_lock();
5054 
5055 	napi = napi_by_id(sk->sk_napi_id);
5056 	if (!napi)
5057 		goto out;
5058 
5059 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
5060 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5061 
5062 	do {
5063 		rc = 0;
5064 		local_bh_disable();
5065 		if (busy_poll) {
5066 			rc = busy_poll(napi);
5067 		} else if (napi_schedule_prep(napi)) {
5068 			void *have = netpoll_poll_lock(napi);
5069 
5070 			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
5071 				rc = napi->poll(napi, BUSY_POLL_BUDGET);
5072 				trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5073 				if (rc == BUSY_POLL_BUDGET) {
5074 					napi_complete_done(napi, rc);
5075 					napi_schedule(napi);
5076 				}
5077 			}
5078 			netpoll_poll_unlock(have);
5079 		}
5080 		if (rc > 0)
5081 			__NET_ADD_STATS(sock_net(sk),
5082 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5083 		local_bh_enable();
5084 
5085 		if (rc == LL_FLUSH_FAILED)
5086 			break; /* permanent failure */
5087 
5088 		cpu_relax();
5089 	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5090 		 !need_resched() && !busy_loop_timeout(end_time));
5091 
5092 	rc = !skb_queue_empty(&sk->sk_receive_queue);
5093 out:
5094 	rcu_read_unlock();
5095 	return rc;
5096 }
5097 EXPORT_SYMBOL(sk_busy_loop);
5098 
5099 #endif /* CONFIG_NET_RX_BUSY_POLL */
5100 
5101 void napi_hash_add(struct napi_struct *napi)
5102 {
5103 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5104 	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5105 		return;
5106 
5107 	spin_lock(&napi_hash_lock);
5108 
5109 	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5110 	do {
5111 		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5112 			napi_gen_id = NR_CPUS + 1;
5113 	} while (napi_by_id(napi_gen_id));
5114 	napi->napi_id = napi_gen_id;
5115 
5116 	hlist_add_head_rcu(&napi->napi_hash_node,
5117 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5118 
5119 	spin_unlock(&napi_hash_lock);
5120 }
5121 EXPORT_SYMBOL_GPL(napi_hash_add);
5122 
5123 /* Warning: the caller is responsible for making sure an rcu grace period
5124  * has elapsed before freeing the memory containing @napi.
5125  */
5126 bool napi_hash_del(struct napi_struct *napi)
5127 {
5128 	bool rcu_sync_needed = false;
5129 
5130 	spin_lock(&napi_hash_lock);
5131 
5132 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5133 		rcu_sync_needed = true;
5134 		hlist_del_rcu(&napi->napi_hash_node);
5135 	}
5136 	spin_unlock(&napi_hash_lock);
5137 	return rcu_sync_needed;
5138 }
5139 EXPORT_SYMBOL_GPL(napi_hash_del);
5140 
5141 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5142 {
5143 	struct napi_struct *napi;
5144 
5145 	napi = container_of(timer, struct napi_struct, timer);
5146 	if (napi->gro_list)
5147 		napi_schedule(napi);
5148 
5149 	return HRTIMER_NORESTART;
5150 }
5151 
5152 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5153 		    int (*poll)(struct napi_struct *, int), int weight)
5154 {
5155 	INIT_LIST_HEAD(&napi->poll_list);
5156 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5157 	napi->timer.function = napi_watchdog;
5158 	napi->gro_count = 0;
5159 	napi->gro_list = NULL;
5160 	napi->skb = NULL;
5161 	napi->poll = poll;
5162 	if (weight > NAPI_POLL_WEIGHT)
5163 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5164 			    weight, dev->name);
5165 	napi->weight = weight;
5166 	list_add(&napi->dev_list, &dev->napi_list);
5167 	napi->dev = dev;
5168 #ifdef CONFIG_NETPOLL
5169 	spin_lock_init(&napi->poll_lock);
5170 	napi->poll_owner = -1;
5171 #endif
5172 	set_bit(NAPI_STATE_SCHED, &napi->state);
5173 	napi_hash_add(napi);
5174 }
5175 EXPORT_SYMBOL(netif_napi_add);
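
/*
 * Illustrative sketch of a driver wiring up NAPI with netif_napi_add() at
 * probe time.  example_priv, example_poll() and the placement of the
 * napi_enable() call are hypothetical.
 */
#if 0
static int example_setup_napi(struct net_device *netdev,
			      struct example_priv *priv)
{
	netif_napi_add(netdev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
	/* typically done later, from ndo_open */
	napi_enable(&priv->napi);
	return 0;
}
#endif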
5176 
5177 void napi_disable(struct napi_struct *n)
5178 {
5179 	might_sleep();
5180 	set_bit(NAPI_STATE_DISABLE, &n->state);
5181 
5182 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5183 		msleep(1);
5184 	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5185 		msleep(1);
5186 
5187 	hrtimer_cancel(&n->timer);
5188 
5189 	clear_bit(NAPI_STATE_DISABLE, &n->state);
5190 }
5191 EXPORT_SYMBOL(napi_disable);
5192 
5193 /* Must be called in process context */
5194 void netif_napi_del(struct napi_struct *napi)
5195 {
5196 	might_sleep();
5197 	if (napi_hash_del(napi))
5198 		synchronize_net();
5199 	list_del_init(&napi->dev_list);
5200 	napi_free_frags(napi);
5201 
5202 	kfree_skb_list(napi->gro_list);
5203 	napi->gro_list = NULL;
5204 	napi->gro_count = 0;
5205 }
5206 EXPORT_SYMBOL(netif_napi_del);
5207 
5208 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5209 {
5210 	void *have;
5211 	int work, weight;
5212 
5213 	list_del_init(&n->poll_list);
5214 
5215 	have = netpoll_poll_lock(n);
5216 
5217 	weight = n->weight;
5218 
5219 	/* This NAPI_STATE_SCHED test is for avoiding a race
5220 	 * with netpoll's poll_napi().  Only the entity which
5221 	 * obtains the lock and sees NAPI_STATE_SCHED set will
5222 	 * actually make the ->poll() call.  Therefore we avoid
5223 	 * accidentally calling ->poll() when NAPI is not scheduled.
5224 	 */
5225 	work = 0;
5226 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5227 		work = n->poll(n, weight);
5228 		trace_napi_poll(n, work, weight);
5229 	}
5230 
5231 	WARN_ON_ONCE(work > weight);
5232 
5233 	if (likely(work < weight))
5234 		goto out_unlock;
5235 
5236 	/* Drivers must not modify the NAPI state if they
5237 	 * consume the entire weight.  In such cases this code
5238 	 * still "owns" the NAPI instance and therefore can
5239 	 * move the instance around on the list at-will.
5240 	 */
5241 	if (unlikely(napi_disable_pending(n))) {
5242 		napi_complete(n);
5243 		goto out_unlock;
5244 	}
5245 
5246 	if (n->gro_list) {
5247 		/* flush too old packets
5248 		 * If HZ < 1000, flush all packets.
5249 		 */
5250 		napi_gro_flush(n, HZ >= 1000);
5251 	}
5252 
5253 	/* Some drivers may have called napi_schedule
5254 	 * prior to exhausting their budget.
5255 	 */
5256 	if (unlikely(!list_empty(&n->poll_list))) {
5257 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5258 			     n->dev ? n->dev->name : "backlog");
5259 		goto out_unlock;
5260 	}
5261 
5262 	list_add_tail(&n->poll_list, repoll);
5263 
5264 out_unlock:
5265 	netpoll_poll_unlock(have);
5266 
5267 	return work;
5268 }
5269 
5270 static __latent_entropy void net_rx_action(struct softirq_action *h)
5271 {
5272 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5273 	unsigned long time_limit = jiffies + 2;
5274 	int budget = netdev_budget;
5275 	LIST_HEAD(list);
5276 	LIST_HEAD(repoll);
5277 
5278 	local_irq_disable();
5279 	list_splice_init(&sd->poll_list, &list);
5280 	local_irq_enable();
5281 
5282 	for (;;) {
5283 		struct napi_struct *n;
5284 
5285 		if (list_empty(&list)) {
5286 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5287 				return;
5288 			break;
5289 		}
5290 
5291 		n = list_first_entry(&list, struct napi_struct, poll_list);
5292 		budget -= napi_poll(n, &repoll);
5293 
5294 		/* If the softirq window is exhausted then punt.
5295 		 * Allow this to run for up to 2 jiffies, which allows
5296 		 * an average latency of 1.5/HZ.
5297 		 */
5298 		if (unlikely(budget <= 0 ||
5299 			     time_after_eq(jiffies, time_limit))) {
5300 			sd->time_squeeze++;
5301 			break;
5302 		}
5303 	}
5304 
5305 	__kfree_skb_flush();
5306 	local_irq_disable();
5307 
5308 	list_splice_tail_init(&sd->poll_list, &list);
5309 	list_splice_tail(&repoll, &list);
5310 	list_splice(&list, &sd->poll_list);
5311 	if (!list_empty(&sd->poll_list))
5312 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5313 
5314 	net_rps_action_and_irq_enable(sd);
5315 }
5316 
5317 struct netdev_adjacent {
5318 	struct net_device *dev;
5319 
5320 	/* upper master flag, there can only be one master device per list */
5321 	bool master;
5322 
5323 	/* counter for the number of times this device was added to us */
5324 	u16 ref_nr;
5325 
5326 	/* private field for the users */
5327 	void *private;
5328 
5329 	struct list_head list;
5330 	struct rcu_head rcu;
5331 };
5332 
5333 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5334 						 struct list_head *adj_list)
5335 {
5336 	struct netdev_adjacent *adj;
5337 
5338 	list_for_each_entry(adj, adj_list, list) {
5339 		if (adj->dev == adj_dev)
5340 			return adj;
5341 	}
5342 	return NULL;
5343 }
5344 
5345 /**
5346  * netdev_has_upper_dev - Check if device is linked to an upper device
5347  * @dev: device
5348  * @upper_dev: upper device to check
5349  *
5350  * Find out if a device is linked to the specified upper device and return
5351  * true in case it is. Note that this checks only the immediate upper device,
5352  * not the complete stack of devices. The caller must hold the RTNL lock.
5353  */
5354 bool netdev_has_upper_dev(struct net_device *dev,
5355 			  struct net_device *upper_dev)
5356 {
5357 	ASSERT_RTNL();
5358 
5359 	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5360 }
5361 EXPORT_SYMBOL(netdev_has_upper_dev);
5362 
5363 /**
5364  * netdev_has_any_upper_dev - Check if device is linked to some device
5365  * @dev: device
5366  *
5367  * Find out if a device is linked to an upper device and return true in case
5368  * it is. The caller must hold the RTNL lock.
5369  */
5370 bool netdev_has_any_upper_dev(struct net_device *dev)
5371 {
5372 	ASSERT_RTNL();
5373 
5374 	return !list_empty(&dev->all_adj_list.upper);
5375 }
5376 EXPORT_SYMBOL(netdev_has_any_upper_dev);
5377 
5378 /**
5379  * netdev_master_upper_dev_get - Get master upper device
5380  * @dev: device
5381  *
5382  * Find a master upper device and return pointer to it or NULL in case
5383  * it's not there. The caller must hold the RTNL lock.
5384  */
5385 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5386 {
5387 	struct netdev_adjacent *upper;
5388 
5389 	ASSERT_RTNL();
5390 
5391 	if (list_empty(&dev->adj_list.upper))
5392 		return NULL;
5393 
5394 	upper = list_first_entry(&dev->adj_list.upper,
5395 				 struct netdev_adjacent, list);
5396 	if (likely(upper->master))
5397 		return upper->dev;
5398 	return NULL;
5399 }
5400 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5401 
5402 void *netdev_adjacent_get_private(struct list_head *adj_list)
5403 {
5404 	struct netdev_adjacent *adj;
5405 
5406 	adj = list_entry(adj_list, struct netdev_adjacent, list);
5407 
5408 	return adj->private;
5409 }
5410 EXPORT_SYMBOL(netdev_adjacent_get_private);
5411 
5412 /**
5413  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5414  * @dev: device
5415  * @iter: list_head ** of the current position
5416  *
5417  * Gets the next device from the dev's upper list, starting from iter
5418  * position. The caller must hold RCU read lock.
5419  */
5420 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5421 						 struct list_head **iter)
5422 {
5423 	struct netdev_adjacent *upper;
5424 
5425 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5426 
5427 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5428 
5429 	if (&upper->list == &dev->adj_list.upper)
5430 		return NULL;
5431 
5432 	*iter = &upper->list;
5433 
5434 	return upper->dev;
5435 }
5436 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5437 
5438 /**
5439  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5440  * @dev: device
5441  * @iter: list_head ** of the current position
5442  *
5443  * Gets the next device from the dev's upper list, starting from iter
5444  * position. The caller must hold RCU read lock.
5445  */
5446 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5447 						     struct list_head **iter)
5448 {
5449 	struct netdev_adjacent *upper;
5450 
5451 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5452 
5453 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5454 
5455 	if (&upper->list == &dev->all_adj_list.upper)
5456 		return NULL;
5457 
5458 	*iter = &upper->list;
5459 
5460 	return upper->dev;
5461 }
5462 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
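
/*
 * Illustrative sketch (not part of the original file): walking a
 * device's immediate upper devices under RCU with the
 * netdev_for_each_upper_dev_rcu() helper from <linux/netdevice.h>,
 * which is built on the iterators above.  "foo_has_bridge_upper" is an
 * assumed placeholder name.
 */
#if 0	/* example only, not built */
static bool foo_has_bridge_upper(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;
	bool found = false;

	rcu_read_lock();
	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
		if (netif_is_bridge_master(upper)) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();

	return found;
}
#endif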
5463 
5464 /**
5465  * netdev_lower_get_next_private - Get the next ->private from the
5466  *				   lower neighbour list
5467  * @dev: device
5468  * @iter: list_head ** of the current position
5469  *
5470  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5471  * list, starting from iter position. The caller must hold either the
5472  * RTNL lock or its own locking that guarantees that the neighbour lower
5473  * list will remain unchanged.
5474  */
5475 void *netdev_lower_get_next_private(struct net_device *dev,
5476 				    struct list_head **iter)
5477 {
5478 	struct netdev_adjacent *lower;
5479 
5480 	lower = list_entry(*iter, struct netdev_adjacent, list);
5481 
5482 	if (&lower->list == &dev->adj_list.lower)
5483 		return NULL;
5484 
5485 	*iter = lower->list.next;
5486 
5487 	return lower->private;
5488 }
5489 EXPORT_SYMBOL(netdev_lower_get_next_private);
5490 
5491 /**
5492  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5493  *				       lower neighbour list, RCU
5494  *				       variant
5495  * @dev: device
5496  * @iter: list_head ** of the current position
5497  *
5498  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5499  * list, starting from iter position. The caller must hold RCU read lock.
5500  */
5501 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5502 					struct list_head **iter)
5503 {
5504 	struct netdev_adjacent *lower;
5505 
5506 	WARN_ON_ONCE(!rcu_read_lock_held());
5507 
5508 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5509 
5510 	if (&lower->list == &dev->adj_list.lower)
5511 		return NULL;
5512 
5513 	*iter = &lower->list;
5514 
5515 	return lower->private;
5516 }
5517 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5518 
5519 /**
5520  * netdev_lower_get_next - Get the next device from the lower neighbour
5521  *                         list
5522  * @dev: device
5523  * @iter: list_head ** of the current position
5524  *
5525  * Gets the next netdev_adjacent from the dev's lower neighbour
5526  * list, starting from iter position. The caller must hold RTNL lock or
5527  * its own locking that guarantees that the neighbour lower
5528  * list will remain unchanged.
5529  */
5530 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5531 {
5532 	struct netdev_adjacent *lower;
5533 
5534 	lower = list_entry(*iter, struct netdev_adjacent, list);
5535 
5536 	if (&lower->list == &dev->adj_list.lower)
5537 		return NULL;
5538 
5539 	*iter = lower->list.next;
5540 
5541 	return lower->dev;
5542 }
5543 EXPORT_SYMBOL(netdev_lower_get_next);
5544 
5545 /**
5546  * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5547  * @dev: device
5548  * @iter: list_head ** of the current position
5549  *
5550  * Gets the next netdev_adjacent from the dev's all lower neighbour
5551  * list, starting from iter position. The caller must hold RTNL lock or
5552  * its own locking that guarantees that the neighbour all lower
5553  * list will remain unchanged.
5554  */
5555 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5556 {
5557 	struct netdev_adjacent *lower;
5558 
5559 	lower = list_entry(*iter, struct netdev_adjacent, list);
5560 
5561 	if (&lower->list == &dev->all_adj_list.lower)
5562 		return NULL;
5563 
5564 	*iter = lower->list.next;
5565 
5566 	return lower->dev;
5567 }
5568 EXPORT_SYMBOL(netdev_all_lower_get_next);
5569 
5570 /**
5571  * netdev_all_lower_get_next_rcu - Get the next device from all
5572  *				   lower neighbour list, RCU variant
5573  * @dev: device
5574  * @iter: list_head ** of the current position
5575  *
5576  * Gets the next netdev_adjacent from the dev's all lower neighbour
5577  * list, starting from iter position. The caller must hold RCU read lock.
5578  */
5579 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5580 						 struct list_head **iter)
5581 {
5582 	struct netdev_adjacent *lower;
5583 
5584 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5585 
5586 	if (&lower->list == &dev->all_adj_list.lower)
5587 		return NULL;
5588 
5589 	*iter = &lower->list;
5590 
5591 	return lower->dev;
5592 }
5593 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5594 
5595 /**
5596  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5597  *				       lower neighbour list, RCU
5598  *				       variant
5599  * @dev: device
5600  *
5601  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5602  * list. The caller must hold RCU read lock.
5603  */
5604 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5605 {
5606 	struct netdev_adjacent *lower;
5607 
5608 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5609 			struct netdev_adjacent, list);
5610 	if (lower)
5611 		return lower->private;
5612 	return NULL;
5613 }
5614 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5615 
5616 /**
5617  * netdev_master_upper_dev_get_rcu - Get master upper device
5618  * @dev: device
5619  *
5620  * Find a master upper device and return pointer to it or NULL in case
5621  * it's not there. The caller must hold the RCU read lock.
5622  */
5623 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5624 {
5625 	struct netdev_adjacent *upper;
5626 
5627 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5628 				       struct netdev_adjacent, list);
5629 	if (upper && likely(upper->master))
5630 		return upper->dev;
5631 	return NULL;
5632 }
5633 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5634 
5635 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5636 			      struct net_device *adj_dev,
5637 			      struct list_head *dev_list)
5638 {
5639 	char linkname[IFNAMSIZ+7];
5640 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5641 		"upper_%s" : "lower_%s", adj_dev->name);
5642 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5643 				 linkname);
5644 }
5645 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5646 			       char *name,
5647 			       struct list_head *dev_list)
5648 {
5649 	char linkname[IFNAMSIZ+7];
5650 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5651 		"upper_%s" : "lower_%s", name);
5652 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5653 }
5654 
5655 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5656 						 struct net_device *adj_dev,
5657 						 struct list_head *dev_list)
5658 {
5659 	return (dev_list == &dev->adj_list.upper ||
5660 		dev_list == &dev->adj_list.lower) &&
5661 		net_eq(dev_net(dev), dev_net(adj_dev));
5662 }
5663 
5664 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5665 					struct net_device *adj_dev,
5666 					u16 ref_nr,
5667 					struct list_head *dev_list,
5668 					void *private, bool master)
5669 {
5670 	struct netdev_adjacent *adj;
5671 	int ret;
5672 
5673 	adj = __netdev_find_adj(adj_dev, dev_list);
5674 
5675 	if (adj) {
5676 		adj->ref_nr += ref_nr;
5677 		return 0;
5678 	}
5679 
5680 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5681 	if (!adj)
5682 		return -ENOMEM;
5683 
5684 	adj->dev = adj_dev;
5685 	adj->master = master;
5686 	adj->ref_nr = ref_nr;
5687 	adj->private = private;
5688 	dev_hold(adj_dev);
5689 
5690 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5691 		 adj_dev->name, dev->name, adj_dev->name);
5692 
5693 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5694 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5695 		if (ret)
5696 			goto free_adj;
5697 	}
5698 
5699 	/* Ensure that master link is always the first item in list. */
5700 	if (master) {
5701 		ret = sysfs_create_link(&(dev->dev.kobj),
5702 					&(adj_dev->dev.kobj), "master");
5703 		if (ret)
5704 			goto remove_symlinks;
5705 
5706 		list_add_rcu(&adj->list, dev_list);
5707 	} else {
5708 		list_add_tail_rcu(&adj->list, dev_list);
5709 	}
5710 
5711 	return 0;
5712 
5713 remove_symlinks:
5714 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5715 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5716 free_adj:
5717 	kfree(adj);
5718 	dev_put(adj_dev);
5719 
5720 	return ret;
5721 }
5722 
5723 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5724 					 struct net_device *adj_dev,
5725 					 u16 ref_nr,
5726 					 struct list_head *dev_list)
5727 {
5728 	struct netdev_adjacent *adj;
5729 
5730 	adj = __netdev_find_adj(adj_dev, dev_list);
5731 
5732 	if (!adj) {
5733 		pr_err("tried to remove device %s from %s\n",
5734 		       dev->name, adj_dev->name);
5735 		BUG();
5736 	}
5737 
5738 	if (adj->ref_nr > ref_nr) {
5739 		pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5740 			 ref_nr, adj->ref_nr-ref_nr);
5741 		adj->ref_nr -= ref_nr;
5742 		return;
5743 	}
5744 
5745 	if (adj->master)
5746 		sysfs_remove_link(&(dev->dev.kobj), "master");
5747 
5748 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5749 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5750 
5751 	list_del_rcu(&adj->list);
5752 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5753 		 adj_dev->name, dev->name, adj_dev->name);
5754 	dev_put(adj_dev);
5755 	kfree_rcu(adj, rcu);
5756 }
5757 
5758 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5759 					    struct net_device *upper_dev,
5760 					    u16 ref_nr,
5761 					    struct list_head *up_list,
5762 					    struct list_head *down_list,
5763 					    void *private, bool master)
5764 {
5765 	int ret;
5766 
5767 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5768 					   private, master);
5769 	if (ret)
5770 		return ret;
5771 
5772 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5773 					   private, false);
5774 	if (ret) {
5775 		__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5776 		return ret;
5777 	}
5778 
5779 	return 0;
5780 }
5781 
5782 static int __netdev_adjacent_dev_link(struct net_device *dev,
5783 				      struct net_device *upper_dev,
5784 				      u16 ref_nr)
5785 {
5786 	return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5787 						&dev->all_adj_list.upper,
5788 						&upper_dev->all_adj_list.lower,
5789 						NULL, false);
5790 }
5791 
5792 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5793 					       struct net_device *upper_dev,
5794 					       u16 ref_nr,
5795 					       struct list_head *up_list,
5796 					       struct list_head *down_list)
5797 {
5798 	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5799 	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5800 }
5801 
5802 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5803 					 struct net_device *upper_dev,
5804 					 u16 ref_nr)
5805 {
5806 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5807 					   &dev->all_adj_list.upper,
5808 					   &upper_dev->all_adj_list.lower);
5809 }
5810 
5811 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5812 						struct net_device *upper_dev,
5813 						void *private, bool master)
5814 {
5815 	int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5816 
5817 	if (ret)
5818 		return ret;
5819 
5820 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5821 					       &dev->adj_list.upper,
5822 					       &upper_dev->adj_list.lower,
5823 					       private, master);
5824 	if (ret) {
5825 		__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5826 		return ret;
5827 	}
5828 
5829 	return 0;
5830 }
5831 
5832 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5833 						   struct net_device *upper_dev)
5834 {
5835 	__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5836 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5837 					   &dev->adj_list.upper,
5838 					   &upper_dev->adj_list.lower);
5839 }
5840 
5841 static int __netdev_upper_dev_link(struct net_device *dev,
5842 				   struct net_device *upper_dev, bool master,
5843 				   void *upper_priv, void *upper_info)
5844 {
5845 	struct netdev_notifier_changeupper_info changeupper_info;
5846 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5847 	int ret = 0;
5848 
5849 	ASSERT_RTNL();
5850 
5851 	if (dev == upper_dev)
5852 		return -EBUSY;
5853 
5854 	/* To prevent loops, check if dev is not upper device to upper_dev. */
5855 	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5856 		return -EBUSY;
5857 
5858 	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5859 		return -EEXIST;
5860 
5861 	if (master && netdev_master_upper_dev_get(dev))
5862 		return -EBUSY;
5863 
5864 	changeupper_info.upper_dev = upper_dev;
5865 	changeupper_info.master = master;
5866 	changeupper_info.linking = true;
5867 	changeupper_info.upper_info = upper_info;
5868 
5869 	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5870 					    &changeupper_info.info);
5871 	ret = notifier_to_errno(ret);
5872 	if (ret)
5873 		return ret;
5874 
5875 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5876 						   master);
5877 	if (ret)
5878 		return ret;
5879 
5880 	/* Now that we linked these devs, make all the upper_dev's
5881 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5882 	 * vice versa, and don't forget the devices themselves. All of these
5883 	 * links are non-neighbours.
5884 	 */
5885 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5886 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5887 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5888 				 i->dev->name, j->dev->name);
5889 			ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5890 			if (ret)
5891 				goto rollback_mesh;
5892 		}
5893 	}
5894 
5895 	/* add dev to every upper_dev's upper device */
5896 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5897 		pr_debug("linking %s's upper device %s with %s\n",
5898 			 upper_dev->name, i->dev->name, dev->name);
5899 		ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5900 		if (ret)
5901 			goto rollback_upper_mesh;
5902 	}
5903 
5904 	/* add upper_dev to every dev's lower device */
5905 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5906 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5907 			 i->dev->name, upper_dev->name);
5908 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5909 		if (ret)
5910 			goto rollback_lower_mesh;
5911 	}
5912 
5913 	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5914 					    &changeupper_info.info);
5915 	ret = notifier_to_errno(ret);
5916 	if (ret)
5917 		goto rollback_lower_mesh;
5918 
5919 	return 0;
5920 
5921 rollback_lower_mesh:
5922 	to_i = i;
5923 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5924 		if (i == to_i)
5925 			break;
5926 		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5927 	}
5928 
5929 	i = NULL;
5930 
5931 rollback_upper_mesh:
5932 	to_i = i;
5933 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5934 		if (i == to_i)
5935 			break;
5936 		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5937 	}
5938 
5939 	i = j = NULL;
5940 
5941 rollback_mesh:
5942 	to_i = i;
5943 	to_j = j;
5944 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5945 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5946 			if (i == to_i && j == to_j)
5947 				break;
5948 			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5949 		}
5950 		if (i == to_i)
5951 			break;
5952 	}
5953 
5954 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5955 
5956 	return ret;
5957 }
5958 
5959 /**
5960  * netdev_upper_dev_link - Add a link to the upper device
5961  * @dev: device
5962  * @upper_dev: new upper device
5963  *
5964  * Adds a link to device which is upper to this one. The caller must hold
5965  * the RTNL lock. On a failure a negative errno code is returned.
5966  * On success the reference counts are adjusted and the function
5967  * returns zero.
5968  */
5969 int netdev_upper_dev_link(struct net_device *dev,
5970 			  struct net_device *upper_dev)
5971 {
5972 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5973 }
5974 EXPORT_SYMBOL(netdev_upper_dev_link);
5975 
5976 /**
5977  * netdev_master_upper_dev_link - Add a master link to the upper device
5978  * @dev: device
5979  * @upper_dev: new upper device
5980  * @upper_priv: upper device private
5981  * @upper_info: upper info to be passed down via notifier
5982  *
5983  * Adds a link to device which is upper to this one. In this case, only
5984  * one master upper device can be linked, although other non-master devices
5985  * might be linked as well. The caller must hold the RTNL lock.
5986  * On a failure a negative errno code is returned. On success the reference
5987  * counts are adjusted and the function returns zero.
5988  */
5989 int netdev_master_upper_dev_link(struct net_device *dev,
5990 				 struct net_device *upper_dev,
5991 				 void *upper_priv, void *upper_info)
5992 {
5993 	return __netdev_upper_dev_link(dev, upper_dev, true,
5994 				       upper_priv, upper_info);
5995 }
5996 EXPORT_SYMBOL(netdev_master_upper_dev_link);
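
/*
 * Illustrative sketch (not part of the original file): how a
 * hypothetical aggregating driver might attach and detach a slave
 * device using the link API above.  Both paths must run under the RTNL
 * lock; "foo_slave_info" is an assumed placeholder type.
 */
#if 0	/* example only, not built */
static int foo_enslave(struct net_device *master, struct net_device *slave,
		       struct foo_slave_info *info)
{
	ASSERT_RTNL();

	/* slave gets master as its (single) master upper device */
	return netdev_master_upper_dev_link(slave, master, info, NULL);
}

static void foo_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}
#endif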
5997 
5998 /**
5999  * netdev_upper_dev_unlink - Removes a link to upper device
6000  * @dev: device
6001  * @upper_dev: upper device to remove
6002  *
6003  * Removes a link to device which is upper to this one. The caller must hold
6004  * the RTNL lock.
6005  */
6006 void netdev_upper_dev_unlink(struct net_device *dev,
6007 			     struct net_device *upper_dev)
6008 {
6009 	struct netdev_notifier_changeupper_info changeupper_info;
6010 	struct netdev_adjacent *i, *j;
6011 	ASSERT_RTNL();
6012 
6013 	changeupper_info.upper_dev = upper_dev;
6014 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6015 	changeupper_info.linking = false;
6016 
6017 	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6018 				      &changeupper_info.info);
6019 
6020 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6021 
6022 	/* Here is the tricky part. We must remove all dev's lower
6023 	 * devices from all upper_dev's upper devices and vice
6024 	 * versa, to maintain the graph relationship.
6025 	 */
6026 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
6027 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
6028 			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
6029 
6030 	/* also remove the devices themselves from the lower/upper
6031 	 * device lists
6032 	 */
6033 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
6034 		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
6035 
6036 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
6037 		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
6038 
6039 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6040 				      &changeupper_info.info);
6041 }
6042 EXPORT_SYMBOL(netdev_upper_dev_unlink);
6043 
6044 /**
6045  * netdev_bonding_info_change - Dispatch event about slave change
6046  * @dev: device
6047  * @bonding_info: info to dispatch
6048  *
6049  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6050  * The caller must hold the RTNL lock.
6051  */
6052 void netdev_bonding_info_change(struct net_device *dev,
6053 				struct netdev_bonding_info *bonding_info)
6054 {
6055 	struct netdev_notifier_bonding_info	info;
6056 
6057 	memcpy(&info.bonding_info, bonding_info,
6058 	       sizeof(struct netdev_bonding_info));
6059 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6060 				      &info.info);
6061 }
6062 EXPORT_SYMBOL(netdev_bonding_info_change);
6063 
6064 static void netdev_adjacent_add_links(struct net_device *dev)
6065 {
6066 	struct netdev_adjacent *iter;
6067 
6068 	struct net *net = dev_net(dev);
6069 
6070 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6071 		if (!net_eq(net, dev_net(iter->dev)))
6072 			continue;
6073 		netdev_adjacent_sysfs_add(iter->dev, dev,
6074 					  &iter->dev->adj_list.lower);
6075 		netdev_adjacent_sysfs_add(dev, iter->dev,
6076 					  &dev->adj_list.upper);
6077 	}
6078 
6079 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6080 		if (!net_eq(net, dev_net(iter->dev)))
6081 			continue;
6082 		netdev_adjacent_sysfs_add(iter->dev, dev,
6083 					  &iter->dev->adj_list.upper);
6084 		netdev_adjacent_sysfs_add(dev, iter->dev,
6085 					  &dev->adj_list.lower);
6086 	}
6087 }
6088 
6089 static void netdev_adjacent_del_links(struct net_device *dev)
6090 {
6091 	struct netdev_adjacent *iter;
6092 
6093 	struct net *net = dev_net(dev);
6094 
6095 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6096 		if (!net_eq(net, dev_net(iter->dev)))
6097 			continue;
6098 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6099 					  &iter->dev->adj_list.lower);
6100 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6101 					  &dev->adj_list.upper);
6102 	}
6103 
6104 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6105 		if (!net_eq(net, dev_net(iter->dev)))
6106 			continue;
6107 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
6108 					  &iter->dev->adj_list.upper);
6109 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
6110 					  &dev->adj_list.lower);
6111 	}
6112 }
6113 
6114 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6115 {
6116 	struct netdev_adjacent *iter;
6117 
6118 	struct net *net = dev_net(dev);
6119 
6120 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
6121 		if (!net_eq(net, dev_net(iter->dev)))
6122 			continue;
6123 		netdev_adjacent_sysfs_del(iter->dev, oldname,
6124 					  &iter->dev->adj_list.lower);
6125 		netdev_adjacent_sysfs_add(iter->dev, dev,
6126 					  &iter->dev->adj_list.lower);
6127 	}
6128 
6129 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
6130 		if (!net_eq(net, dev_net(iter->dev)))
6131 			continue;
6132 		netdev_adjacent_sysfs_del(iter->dev, oldname,
6133 					  &iter->dev->adj_list.upper);
6134 		netdev_adjacent_sysfs_add(iter->dev, dev,
6135 					  &iter->dev->adj_list.upper);
6136 	}
6137 }
6138 
6139 void *netdev_lower_dev_get_private(struct net_device *dev,
6140 				   struct net_device *lower_dev)
6141 {
6142 	struct netdev_adjacent *lower;
6143 
6144 	if (!lower_dev)
6145 		return NULL;
6146 	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6147 	if (!lower)
6148 		return NULL;
6149 
6150 	return lower->private;
6151 }
6152 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6153 
6154 
6155 int dev_get_nest_level(struct net_device *dev)
6156 {
6157 	struct net_device *lower = NULL;
6158 	struct list_head *iter;
6159 	int max_nest = -1;
6160 	int nest;
6161 
6162 	ASSERT_RTNL();
6163 
6164 	netdev_for_each_lower_dev(dev, lower, iter) {
6165 		nest = dev_get_nest_level(lower);
6166 		if (max_nest < nest)
6167 			max_nest = nest;
6168 	}
6169 
6170 	return max_nest + 1;
6171 }
6172 EXPORT_SYMBOL(dev_get_nest_level);
6173 
6174 /**
6175  * netdev_lower_state_changed - Dispatch event about lower device state change
6176  * @lower_dev: device
6177  * @lower_state_info: state to dispatch
6178  *
6179  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6180  * The caller must hold the RTNL lock.
6181  */
6182 void netdev_lower_state_changed(struct net_device *lower_dev,
6183 				void *lower_state_info)
6184 {
6185 	struct netdev_notifier_changelowerstate_info changelowerstate_info;
6186 
6187 	ASSERT_RTNL();
6188 	changelowerstate_info.lower_state_info = lower_state_info;
6189 	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6190 				      &changelowerstate_info.info);
6191 }
6192 EXPORT_SYMBOL(netdev_lower_state_changed);
6193 
6194 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6195 					   struct neighbour *n)
6196 {
6197 	struct net_device *lower_dev, *stop_dev;
6198 	struct list_head *iter;
6199 	int err;
6200 
6201 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6202 		if (!lower_dev->netdev_ops->ndo_neigh_construct)
6203 			continue;
6204 		err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6205 		if (err) {
6206 			stop_dev = lower_dev;
6207 			goto rollback;
6208 		}
6209 	}
6210 	return 0;
6211 
6212 rollback:
6213 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6214 		if (lower_dev == stop_dev)
6215 			break;
6216 		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6217 			continue;
6218 		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6219 	}
6220 	return err;
6221 }
6222 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6223 
6224 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6225 					  struct neighbour *n)
6226 {
6227 	struct net_device *lower_dev;
6228 	struct list_head *iter;
6229 
6230 	netdev_for_each_lower_dev(dev, lower_dev, iter) {
6231 		if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6232 			continue;
6233 		lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6234 	}
6235 }
6236 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
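
/*
 * Illustrative sketch (not part of the original file): an L2 upper
 * device driver can simply point its neighbour callbacks at the
 * defaults above, so neighbour construction/destruction is propagated
 * to every lower device.  "foo_netdev_ops" is an assumed placeholder.
 */
#if 0	/* example only, not built */
static const struct net_device_ops foo_netdev_ops = {
	.ndo_neigh_construct	= netdev_default_l2upper_neigh_construct,
	.ndo_neigh_destroy	= netdev_default_l2upper_neigh_destroy,
	/* ... remaining ndo callbacks ... */
};
#endif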
6237 
6238 static void dev_change_rx_flags(struct net_device *dev, int flags)
6239 {
6240 	const struct net_device_ops *ops = dev->netdev_ops;
6241 
6242 	if (ops->ndo_change_rx_flags)
6243 		ops->ndo_change_rx_flags(dev, flags);
6244 }
6245 
6246 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6247 {
6248 	unsigned int old_flags = dev->flags;
6249 	kuid_t uid;
6250 	kgid_t gid;
6251 
6252 	ASSERT_RTNL();
6253 
6254 	dev->flags |= IFF_PROMISC;
6255 	dev->promiscuity += inc;
6256 	if (dev->promiscuity == 0) {
6257 		/*
6258 		 * Avoid overflow.
6259 		 * If inc causes overflow, untouch promisc and return error.
6260 		 */
6261 		if (inc < 0)
6262 			dev->flags &= ~IFF_PROMISC;
6263 		else {
6264 			dev->promiscuity -= inc;
6265 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6266 				dev->name);
6267 			return -EOVERFLOW;
6268 		}
6269 	}
6270 	if (dev->flags != old_flags) {
6271 		pr_info("device %s %s promiscuous mode\n",
6272 			dev->name,
6273 			dev->flags & IFF_PROMISC ? "entered" : "left");
6274 		if (audit_enabled) {
6275 			current_uid_gid(&uid, &gid);
6276 			audit_log(current->audit_context, GFP_ATOMIC,
6277 				AUDIT_ANOM_PROMISCUOUS,
6278 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6279 				dev->name, (dev->flags & IFF_PROMISC),
6280 				(old_flags & IFF_PROMISC),
6281 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6282 				from_kuid(&init_user_ns, uid),
6283 				from_kgid(&init_user_ns, gid),
6284 				audit_get_sessionid(current));
6285 		}
6286 
6287 		dev_change_rx_flags(dev, IFF_PROMISC);
6288 	}
6289 	if (notify)
6290 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6291 	return 0;
6292 }
6293 
6294 /**
6295  *	dev_set_promiscuity	- update promiscuity count on a device
6296  *	@dev: device
6297  *	@inc: modifier
6298  *
6299  *	Add or remove promiscuity from a device. While the count in the device
6300  *	remains above zero the interface remains promiscuous. Once it hits zero
6301  *	the device reverts back to normal filtering operation. A negative inc
6302  *	value is used to drop promiscuity on the device.
6303  *	Return 0 if successful or a negative errno code on error.
6304  */
6305 int dev_set_promiscuity(struct net_device *dev, int inc)
6306 {
6307 	unsigned int old_flags = dev->flags;
6308 	int err;
6309 
6310 	err = __dev_set_promiscuity(dev, inc, true);
6311 	if (err < 0)
6312 		return err;
6313 	if (dev->flags != old_flags)
6314 		dev_set_rx_mode(dev);
6315 	return err;
6316 }
6317 EXPORT_SYMBOL(dev_set_promiscuity);
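
/*
 * Illustrative sketch (not part of the original file): the promiscuity
 * counter is reference-counted, so a hypothetical packet-tap user takes
 * one reference while capturing and drops it afterwards.  Both calls
 * must be made under the RTNL lock.
 */
#if 0	/* example only, not built */
static int foo_tap_start(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);	/* take one reference */
}

static void foo_tap_stop(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
}
#endif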
6318 
6319 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6320 {
6321 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6322 
6323 	ASSERT_RTNL();
6324 
6325 	dev->flags |= IFF_ALLMULTI;
6326 	dev->allmulti += inc;
6327 	if (dev->allmulti == 0) {
6328 		/*
6329 		 * Avoid overflow.
6330 		 * If inc causes overflow, untouch allmulti and return error.
6331 		 */
6332 		if (inc < 0)
6333 			dev->flags &= ~IFF_ALLMULTI;
6334 		else {
6335 			dev->allmulti -= inc;
6336 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6337 				dev->name);
6338 			return -EOVERFLOW;
6339 		}
6340 	}
6341 	if (dev->flags ^ old_flags) {
6342 		dev_change_rx_flags(dev, IFF_ALLMULTI);
6343 		dev_set_rx_mode(dev);
6344 		if (notify)
6345 			__dev_notify_flags(dev, old_flags,
6346 					   dev->gflags ^ old_gflags);
6347 	}
6348 	return 0;
6349 }
6350 
6351 /**
6352  *	dev_set_allmulti	- update allmulti count on a device
6353  *	@dev: device
6354  *	@inc: modifier
6355  *
6356  *	Add or remove reception of all multicast frames to a device. While the
6357  *	count in the device remains above zero the interface keeps listening
6358  *	to all multicast frames. Once it hits zero the device reverts to normal
6359  *	filtering operation. A negative @inc value is used to drop the counter
6360  *	when releasing a resource needing all multicasts.
6361  *	Return 0 if successful or a negative errno code on error.
6362  */
6363 
6364 int dev_set_allmulti(struct net_device *dev, int inc)
6365 {
6366 	return __dev_set_allmulti(dev, inc, true);
6367 }
6368 EXPORT_SYMBOL(dev_set_allmulti);
6369 
6370 /*
6371  *	Upload unicast and multicast address lists to device and
6372  *	configure RX filtering. When the device doesn't support unicast
6373  *	filtering it is put in promiscuous mode while unicast addresses
6374  *	are present.
6375  */
6376 void __dev_set_rx_mode(struct net_device *dev)
6377 {
6378 	const struct net_device_ops *ops = dev->netdev_ops;
6379 
6380 	/* dev_open will call this function so the list will stay sane. */
6381 	if (!(dev->flags&IFF_UP))
6382 		return;
6383 
6384 	if (!netif_device_present(dev))
6385 		return;
6386 
6387 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6388 		/* Unicast address changes may only happen under the rtnl,
6389 		 * therefore calling __dev_set_promiscuity here is safe.
6390 		 */
6391 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6392 			__dev_set_promiscuity(dev, 1, false);
6393 			dev->uc_promisc = true;
6394 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6395 			__dev_set_promiscuity(dev, -1, false);
6396 			dev->uc_promisc = false;
6397 		}
6398 	}
6399 
6400 	if (ops->ndo_set_rx_mode)
6401 		ops->ndo_set_rx_mode(dev);
6402 }
6403 
6404 void dev_set_rx_mode(struct net_device *dev)
6405 {
6406 	netif_addr_lock_bh(dev);
6407 	__dev_set_rx_mode(dev);
6408 	netif_addr_unlock_bh(dev);
6409 }
6410 
6411 /**
6412  *	dev_get_flags - get flags reported to userspace
6413  *	@dev: device
6414  *
6415  *	Get the combination of flag bits exported through APIs to userspace.
6416  */
6417 unsigned int dev_get_flags(const struct net_device *dev)
6418 {
6419 	unsigned int flags;
6420 
6421 	flags = (dev->flags & ~(IFF_PROMISC |
6422 				IFF_ALLMULTI |
6423 				IFF_RUNNING |
6424 				IFF_LOWER_UP |
6425 				IFF_DORMANT)) |
6426 		(dev->gflags & (IFF_PROMISC |
6427 				IFF_ALLMULTI));
6428 
6429 	if (netif_running(dev)) {
6430 		if (netif_oper_up(dev))
6431 			flags |= IFF_RUNNING;
6432 		if (netif_carrier_ok(dev))
6433 			flags |= IFF_LOWER_UP;
6434 		if (netif_dormant(dev))
6435 			flags |= IFF_DORMANT;
6436 	}
6437 
6438 	return flags;
6439 }
6440 EXPORT_SYMBOL(dev_get_flags);
6441 
6442 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6443 {
6444 	unsigned int old_flags = dev->flags;
6445 	int ret;
6446 
6447 	ASSERT_RTNL();
6448 
6449 	/*
6450 	 *	Set the flags on our device.
6451 	 */
6452 
6453 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6454 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6455 			       IFF_AUTOMEDIA)) |
6456 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6457 				    IFF_ALLMULTI));
6458 
6459 	/*
6460 	 *	Load in the correct multicast list now the flags have changed.
6461 	 */
6462 
6463 	if ((old_flags ^ flags) & IFF_MULTICAST)
6464 		dev_change_rx_flags(dev, IFF_MULTICAST);
6465 
6466 	dev_set_rx_mode(dev);
6467 
6468 	/*
6469 	 *	Have we downed the interface? We handle IFF_UP ourselves
6470 	 *	according to user attempts to set it, rather than blindly
6471 	 *	setting it.
6472 	 */
6473 
6474 	ret = 0;
6475 	if ((old_flags ^ flags) & IFF_UP)
6476 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6477 
6478 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6479 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6480 		unsigned int old_flags = dev->flags;
6481 
6482 		dev->gflags ^= IFF_PROMISC;
6483 
6484 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6485 			if (dev->flags != old_flags)
6486 				dev_set_rx_mode(dev);
6487 	}
6488 
6489 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6490 	   is important. Some (broken) drivers set IFF_PROMISC when
6491 	   IFF_ALLMULTI is requested, without asking us and without reporting.
6492 	 */
6493 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6494 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6495 
6496 		dev->gflags ^= IFF_ALLMULTI;
6497 		__dev_set_allmulti(dev, inc, false);
6498 	}
6499 
6500 	return ret;
6501 }
6502 
6503 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6504 			unsigned int gchanges)
6505 {
6506 	unsigned int changes = dev->flags ^ old_flags;
6507 
6508 	if (gchanges)
6509 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6510 
6511 	if (changes & IFF_UP) {
6512 		if (dev->flags & IFF_UP)
6513 			call_netdevice_notifiers(NETDEV_UP, dev);
6514 		else
6515 			call_netdevice_notifiers(NETDEV_DOWN, dev);
6516 	}
6517 
6518 	if (dev->flags & IFF_UP &&
6519 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6520 		struct netdev_notifier_change_info change_info;
6521 
6522 		change_info.flags_changed = changes;
6523 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6524 					      &change_info.info);
6525 	}
6526 }
6527 
6528 /**
6529  *	dev_change_flags - change device settings
6530  *	@dev: device
6531  *	@flags: device state flags
6532  *
6533  *	Change settings on device based state flags. The flags are
6534  *	in the userspace exported format.
6535  */
6536 int dev_change_flags(struct net_device *dev, unsigned int flags)
6537 {
6538 	int ret;
6539 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6540 
6541 	ret = __dev_change_flags(dev, flags);
6542 	if (ret < 0)
6543 		return ret;
6544 
6545 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6546 	__dev_notify_flags(dev, old_flags, changes);
6547 	return ret;
6548 }
6549 EXPORT_SYMBOL(dev_change_flags);
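
/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface administratively up from kernel code by setting IFF_UP in
 * the userspace-visible flags, under the RTNL lock.
 */
#if 0	/* example only, not built */
static int foo_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();

	return err;
}
#endif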
6550 
6551 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6552 {
6553 	const struct net_device_ops *ops = dev->netdev_ops;
6554 
6555 	if (ops->ndo_change_mtu)
6556 		return ops->ndo_change_mtu(dev, new_mtu);
6557 
6558 	dev->mtu = new_mtu;
6559 	return 0;
6560 }
6561 
6562 /**
6563  *	dev_set_mtu - Change maximum transfer unit
6564  *	@dev: device
6565  *	@new_mtu: new transfer unit
6566  *
6567  *	Change the maximum transfer size of the network device.
6568  */
6569 int dev_set_mtu(struct net_device *dev, int new_mtu)
6570 {
6571 	int err, orig_mtu;
6572 
6573 	if (new_mtu == dev->mtu)
6574 		return 0;
6575 
6576 	/*	MTU must be positive.	 */
6577 	if (new_mtu < 0)
6578 		return -EINVAL;
6579 
6580 	if (!netif_device_present(dev))
6581 		return -ENODEV;
6582 
6583 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6584 	err = notifier_to_errno(err);
6585 	if (err)
6586 		return err;
6587 
6588 	orig_mtu = dev->mtu;
6589 	err = __dev_set_mtu(dev, new_mtu);
6590 
6591 	if (!err) {
6592 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6593 		err = notifier_to_errno(err);
6594 		if (err) {
6595 			/* setting mtu back and notifying everyone again,
6596 			 * so that they have a chance to revert changes.
6597 			 */
6598 			__dev_set_mtu(dev, orig_mtu);
6599 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6600 		}
6601 	}
6602 	return err;
6603 }
6604 EXPORT_SYMBOL(dev_set_mtu);
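
/*
 * Illustrative sketch (not part of the original file): changing the MTU
 * from kernel code.  dev_set_mtu() must be called with the RTNL lock
 * held and returns 0 or a negative errno; 9000 is a hypothetical
 * jumbo-frame value.
 */
#if 0	/* example only, not built */
static int foo_set_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();

	return err;
}
#endif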
6605 
6606 /**
6607  *	dev_set_group - Change group this device belongs to
6608  *	@dev: device
6609  *	@new_group: group this device should belong to
6610  */
6611 void dev_set_group(struct net_device *dev, int new_group)
6612 {
6613 	dev->group = new_group;
6614 }
6615 EXPORT_SYMBOL(dev_set_group);
6616 
6617 /**
6618  *	dev_set_mac_address - Change Media Access Control Address
6619  *	@dev: device
6620  *	@sa: new address
6621  *
6622  *	Change the hardware (MAC) address of the device
6623  */
6624 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6625 {
6626 	const struct net_device_ops *ops = dev->netdev_ops;
6627 	int err;
6628 
6629 	if (!ops->ndo_set_mac_address)
6630 		return -EOPNOTSUPP;
6631 	if (sa->sa_family != dev->type)
6632 		return -EINVAL;
6633 	if (!netif_device_present(dev))
6634 		return -ENODEV;
6635 	err = ops->ndo_set_mac_address(dev, sa);
6636 	if (err)
6637 		return err;
6638 	dev->addr_assign_type = NET_ADDR_SET;
6639 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6640 	add_device_randomness(dev->dev_addr, dev->addr_len);
6641 	return 0;
6642 }
6643 EXPORT_SYMBOL(dev_set_mac_address);
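
/*
 * Illustrative sketch (not part of the original file): setting a new
 * hardware address from kernel code.  The sockaddr family must match
 * dev->type (ARPHRD_ETHER here) and the caller must hold the RTNL lock.
 * "foo_set_hwaddr" is an assumed placeholder name.
 */
#if 0	/* example only, not built */
static int foo_set_hwaddr(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;

	if (dev->type != ARPHRD_ETHER)
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, ETH_ALEN);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa);
}
#endif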
6644 
6645 /**
6646  *	dev_change_carrier - Change device carrier
6647  *	@dev: device
6648  *	@new_carrier: new value
6649  *
6650  *	Change device carrier
6651  */
6652 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6653 {
6654 	const struct net_device_ops *ops = dev->netdev_ops;
6655 
6656 	if (!ops->ndo_change_carrier)
6657 		return -EOPNOTSUPP;
6658 	if (!netif_device_present(dev))
6659 		return -ENODEV;
6660 	return ops->ndo_change_carrier(dev, new_carrier);
6661 }
6662 EXPORT_SYMBOL(dev_change_carrier);
6663 
6664 /**
6665  *	dev_get_phys_port_id - Get device physical port ID
6666  *	@dev: device
6667  *	@ppid: port ID
6668  *
6669  *	Get device physical port ID
6670  */
6671 int dev_get_phys_port_id(struct net_device *dev,
6672 			 struct netdev_phys_item_id *ppid)
6673 {
6674 	const struct net_device_ops *ops = dev->netdev_ops;
6675 
6676 	if (!ops->ndo_get_phys_port_id)
6677 		return -EOPNOTSUPP;
6678 	return ops->ndo_get_phys_port_id(dev, ppid);
6679 }
6680 EXPORT_SYMBOL(dev_get_phys_port_id);
6681 
6682 /**
6683  *	dev_get_phys_port_name - Get device physical port name
6684  *	@dev: device
6685  *	@name: port name
6686  *	@len: limit of bytes to copy to name
6687  *
6688  *	Get device physical port name
6689  */
6690 int dev_get_phys_port_name(struct net_device *dev,
6691 			   char *name, size_t len)
6692 {
6693 	const struct net_device_ops *ops = dev->netdev_ops;
6694 
6695 	if (!ops->ndo_get_phys_port_name)
6696 		return -EOPNOTSUPP;
6697 	return ops->ndo_get_phys_port_name(dev, name, len);
6698 }
6699 EXPORT_SYMBOL(dev_get_phys_port_name);
6700 
6701 /**
6702  *	dev_change_proto_down - update protocol port state information
6703  *	@dev: device
6704  *	@proto_down: new value
6705  *
6706  *	This info can be used by switch drivers to set the phys state of the
6707  *	port.
6708  */
6709 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6710 {
6711 	const struct net_device_ops *ops = dev->netdev_ops;
6712 
6713 	if (!ops->ndo_change_proto_down)
6714 		return -EOPNOTSUPP;
6715 	if (!netif_device_present(dev))
6716 		return -ENODEV;
6717 	return ops->ndo_change_proto_down(dev, proto_down);
6718 }
6719 EXPORT_SYMBOL(dev_change_proto_down);
6720 
6721 /**
6722  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
6723  *	@dev: device
6724  *	@fd: new program fd or negative value to clear
6725  *
6726  *	Set or clear a bpf program for a device
6727  */
6728 int dev_change_xdp_fd(struct net_device *dev, int fd)
6729 {
6730 	const struct net_device_ops *ops = dev->netdev_ops;
6731 	struct bpf_prog *prog = NULL;
6732 	struct netdev_xdp xdp = {};
6733 	int err;
6734 
6735 	if (!ops->ndo_xdp)
6736 		return -EOPNOTSUPP;
6737 	if (fd >= 0) {
6738 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6739 		if (IS_ERR(prog))
6740 			return PTR_ERR(prog);
6741 	}
6742 
6743 	xdp.command = XDP_SETUP_PROG;
6744 	xdp.prog = prog;
6745 	err = ops->ndo_xdp(dev, &xdp);
6746 	if (err < 0 && prog)
6747 		bpf_prog_put(prog);
6748 
6749 	return err;
6750 }
6751 EXPORT_SYMBOL(dev_change_xdp_fd);
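
/*
 * Illustrative sketch (not part of the original file): how a control
 * path might attach or detach an XDP program using the fd-based helper
 * above, under the RTNL lock.  A negative fd clears any installed
 * program; "foo_setup_xdp" is an assumed placeholder name.
 */
#if 0	/* example only, not built */
static int foo_setup_xdp(struct net_device *dev, int prog_fd)
{
	ASSERT_RTNL();

	/* prog_fd >= 0 installs the program, prog_fd < 0 removes it */
	return dev_change_xdp_fd(dev, prog_fd);
}
#endif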
6752 
6753 /**
6754  *	dev_new_index	-	allocate an ifindex
6755  *	@net: the applicable net namespace
6756  *
6757  *	Returns a suitable unique value for a new device interface
6758  *	number.  The caller must hold the rtnl semaphore or the
6759  *	dev_base_lock to be sure it remains unique.
6760  */
6761 static int dev_new_index(struct net *net)
6762 {
6763 	int ifindex = net->ifindex;
6764 	for (;;) {
6765 		if (++ifindex <= 0)
6766 			ifindex = 1;
6767 		if (!__dev_get_by_index(net, ifindex))
6768 			return net->ifindex = ifindex;
6769 	}
6770 }
6771 
6772 /* Delayed registration/unregisteration */
6773 static LIST_HEAD(net_todo_list);
6774 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6775 
6776 static void net_set_todo(struct net_device *dev)
6777 {
6778 	list_add_tail(&dev->todo_list, &net_todo_list);
6779 	dev_net(dev)->dev_unreg_count++;
6780 }
6781 
6782 static void rollback_registered_many(struct list_head *head)
6783 {
6784 	struct net_device *dev, *tmp;
6785 	LIST_HEAD(close_head);
6786 
6787 	BUG_ON(dev_boot_phase);
6788 	ASSERT_RTNL();
6789 
6790 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6791 		/* Some devices call unregister without ever having been
6792 		 * registered, as part of initialization unwind. Remove those
6793 		 * devices and proceed with the remaining.
6794 		 */
6795 		if (dev->reg_state == NETREG_UNINITIALIZED) {
6796 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6797 				 dev->name, dev);
6798 
6799 			WARN_ON(1);
6800 			list_del(&dev->unreg_list);
6801 			continue;
6802 		}
6803 		dev->dismantle = true;
6804 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6805 	}
6806 
6807 	/* If device is running, close it first. */
6808 	list_for_each_entry(dev, head, unreg_list)
6809 		list_add_tail(&dev->close_list, &close_head);
6810 	dev_close_many(&close_head, true);
6811 
6812 	list_for_each_entry(dev, head, unreg_list) {
6813 		/* And unlink it from device chain. */
6814 		unlist_netdevice(dev);
6815 
6816 		dev->reg_state = NETREG_UNREGISTERING;
6817 	}
6818 	flush_all_backlogs();
6819 
6820 	synchronize_net();
6821 
6822 	list_for_each_entry(dev, head, unreg_list) {
6823 		struct sk_buff *skb = NULL;
6824 
6825 		/* Shutdown queueing discipline. */
6826 		dev_shutdown(dev);
6827 
6828 
6829 		/* Notify protocols, that we are about to destroy
6830 		   this device. They should clean all the things.
6831 		*/
6832 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6833 
6834 		if (!dev->rtnl_link_ops ||
6835 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6836 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6837 						     GFP_KERNEL);
6838 
6839 		/*
6840 		 *	Flush the unicast and multicast chains
6841 		 */
6842 		dev_uc_flush(dev);
6843 		dev_mc_flush(dev);
6844 
6845 		if (dev->netdev_ops->ndo_uninit)
6846 			dev->netdev_ops->ndo_uninit(dev);
6847 
6848 		if (skb)
6849 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6850 
6851 		/* Notifier chain MUST detach us all upper devices. */
6852 		WARN_ON(netdev_has_any_upper_dev(dev));
6853 
6854 		/* Remove entries from kobject tree */
6855 		netdev_unregister_kobject(dev);
6856 #ifdef CONFIG_XPS
6857 		/* Remove XPS queueing entries */
6858 		netif_reset_xps_queues_gt(dev, 0);
6859 #endif
6860 	}
6861 
6862 	synchronize_net();
6863 
6864 	list_for_each_entry(dev, head, unreg_list)
6865 		dev_put(dev);
6866 }
6867 
6868 static void rollback_registered(struct net_device *dev)
6869 {
6870 	LIST_HEAD(single);
6871 
6872 	list_add(&dev->unreg_list, &single);
6873 	rollback_registered_many(&single);
6874 	list_del(&single);
6875 }
6876 
6877 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6878 	struct net_device *upper, netdev_features_t features)
6879 {
6880 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6881 	netdev_features_t feature;
6882 	int feature_bit;
6883 
6884 	for_each_netdev_feature(&upper_disables, feature_bit) {
6885 		feature = __NETIF_F_BIT(feature_bit);
6886 		if (!(upper->wanted_features & feature)
6887 		    && (features & feature)) {
6888 			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6889 				   &feature, upper->name);
6890 			features &= ~feature;
6891 		}
6892 	}
6893 
6894 	return features;
6895 }
6896 
6897 static void netdev_sync_lower_features(struct net_device *upper,
6898 	struct net_device *lower, netdev_features_t features)
6899 {
6900 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6901 	netdev_features_t feature;
6902 	int feature_bit;
6903 
6904 	for_each_netdev_feature(&upper_disables, feature_bit) {
6905 		feature = __NETIF_F_BIT(feature_bit);
6906 		if (!(features & feature) && (lower->features & feature)) {
6907 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6908 				   &feature, lower->name);
6909 			lower->wanted_features &= ~feature;
6910 			netdev_update_features(lower);
6911 
6912 			if (unlikely(lower->features & feature))
6913 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6914 					    &feature, lower->name);
6915 		}
6916 	}
6917 }
6918 
6919 static netdev_features_t netdev_fix_features(struct net_device *dev,
6920 	netdev_features_t features)
6921 {
6922 	/* Fix illegal checksum combinations */
6923 	if ((features & NETIF_F_HW_CSUM) &&
6924 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6925 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6926 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6927 	}
6928 
6929 	/* TSO requires that SG is present as well. */
6930 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6931 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6932 		features &= ~NETIF_F_ALL_TSO;
6933 	}
6934 
6935 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6936 					!(features & NETIF_F_IP_CSUM)) {
6937 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6938 		features &= ~NETIF_F_TSO;
6939 		features &= ~NETIF_F_TSO_ECN;
6940 	}
6941 
6942 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6943 					 !(features & NETIF_F_IPV6_CSUM)) {
6944 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6945 		features &= ~NETIF_F_TSO6;
6946 	}
6947 
6948 	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6949 	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6950 		features &= ~NETIF_F_TSO_MANGLEID;
6951 
6952 	/* TSO ECN requires that TSO is present as well. */
6953 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6954 		features &= ~NETIF_F_TSO_ECN;
6955 
6956 	/* Software GSO depends on SG. */
6957 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6958 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6959 		features &= ~NETIF_F_GSO;
6960 	}
6961 
6962 	/* UFO needs SG and checksumming */
6963 	if (features & NETIF_F_UFO) {
6964 		/* maybe split UFO into V4 and V6? */
6965 		if (!(features & NETIF_F_HW_CSUM) &&
6966 		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6967 		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6968 			netdev_dbg(dev,
6969 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6970 			features &= ~NETIF_F_UFO;
6971 		}
6972 
6973 		if (!(features & NETIF_F_SG)) {
6974 			netdev_dbg(dev,
6975 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6976 			features &= ~NETIF_F_UFO;
6977 		}
6978 	}
6979 
6980 	/* GSO partial features require GSO partial be set */
6981 	if ((features & dev->gso_partial_features) &&
6982 	    !(features & NETIF_F_GSO_PARTIAL)) {
6983 		netdev_dbg(dev,
6984 			   "Dropping partially supported GSO features since no GSO partial.\n");
6985 		features &= ~dev->gso_partial_features;
6986 	}
6987 
6988 #ifdef CONFIG_NET_RX_BUSY_POLL
6989 	if (dev->netdev_ops->ndo_busy_poll)
6990 		features |= NETIF_F_BUSY_POLL;
6991 	else
6992 #endif
6993 		features &= ~NETIF_F_BUSY_POLL;
6994 
6995 	return features;
6996 }
6997 
6998 int __netdev_update_features(struct net_device *dev)
6999 {
7000 	struct net_device *upper, *lower;
7001 	netdev_features_t features;
7002 	struct list_head *iter;
7003 	int err = -1;
7004 
7005 	ASSERT_RTNL();
7006 
7007 	features = netdev_get_wanted_features(dev);
7008 
7009 	if (dev->netdev_ops->ndo_fix_features)
7010 		features = dev->netdev_ops->ndo_fix_features(dev, features);
7011 
7012 	/* driver might be less strict about feature dependencies */
7013 	features = netdev_fix_features(dev, features);
7014 
7015 	/* some features can't be enabled if they're off on an upper device */
7016 	netdev_for_each_upper_dev_rcu(dev, upper, iter)
7017 		features = netdev_sync_upper_features(dev, upper, features);
7018 
7019 	if (dev->features == features)
7020 		goto sync_lower;
7021 
7022 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7023 		&dev->features, &features);
7024 
7025 	if (dev->netdev_ops->ndo_set_features)
7026 		err = dev->netdev_ops->ndo_set_features(dev, features);
7027 	else
7028 		err = 0;
7029 
7030 	if (unlikely(err < 0)) {
7031 		netdev_err(dev,
7032 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
7033 			err, &features, &dev->features);
7034 		/* return non-0 since some features might have changed and
7035 		 * it's better to fire a spurious notification than miss it
7036 		 */
7037 		return -1;
7038 	}
7039 
7040 sync_lower:
7041 	/* some features must be disabled on lower devices when disabled
7042 	 * on an upper device (think: bonding master or bridge)
7043 	 */
7044 	netdev_for_each_lower_dev(dev, lower, iter)
7045 		netdev_sync_lower_features(dev, lower, features);
7046 
7047 	if (!err)
7048 		dev->features = features;
7049 
7050 	return err < 0 ? 0 : 1;
7051 }
7052 
7053 /**
7054  *	netdev_update_features - recalculate device features
7055  *	@dev: the device to check
7056  *
7057  *	Recalculate dev->features set and send notifications if it
7058  *	has changed. Should be called after driver or hardware dependent
7059  *	conditions might have changed that influence the features.
7060  */
7061 void netdev_update_features(struct net_device *dev)
7062 {
7063 	if (__netdev_update_features(dev))
7064 		netdev_features_change(dev);
7065 }
7066 EXPORT_SYMBOL(netdev_update_features);
7067 
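/* A minimal sketch of a typical caller: after a hardware or firmware
 * capability changes, a driver adjusts dev->hw_features under RTNL and asks
 * the core to renegotiate. foo_hw_caps_changed() is hypothetical.
 */
#if 0	/* illustrative only */
static void foo_hw_caps_changed(struct net_device *dev, bool tso_ok)
{
	rtnl_lock();
	if (tso_ok)
		dev->hw_features |= NETIF_F_TSO;
	else
		dev->hw_features &= ~NETIF_F_TSO;
	/* Recomputes dev->features and notifies user space if it changed. */
	netdev_update_features(dev);
	rtnl_unlock();
}
#endif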
7068 /**
7069  *	netdev_change_features - recalculate device features
7070  *	@dev: the device to check
7071  *
7072  *	Recalculate dev->features set and send notifications even
7073  *	if they have not changed. Should be called instead of
7074  *	netdev_update_features() if also dev->vlan_features might
7075  *	have changed to allow the changes to be propagated to stacked
7076  *	VLAN devices.
7077  */
7078 void netdev_change_features(struct net_device *dev)
7079 {
7080 	__netdev_update_features(dev);
7081 	netdev_features_change(dev);
7082 }
7083 EXPORT_SYMBOL(netdev_change_features);
7084 
7085 /**
7086  *	netif_stacked_transfer_operstate -	transfer operstate
7087  *	@rootdev: the root or lower level device to transfer state from
7088  *	@dev: the device to transfer operstate to
7089  *
7090  *	Transfer operational state from root to device. This is normally
7091  *	called when a stacking relationship exists between the root
7092  *	device and the device (a leaf device).
7093  */
7094 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7095 					struct net_device *dev)
7096 {
7097 	if (rootdev->operstate == IF_OPER_DORMANT)
7098 		netif_dormant_on(dev);
7099 	else
7100 		netif_dormant_off(dev);
7101 
7102 	if (netif_carrier_ok(rootdev)) {
7103 		if (!netif_carrier_ok(dev))
7104 			netif_carrier_on(dev);
7105 	} else {
7106 		if (netif_carrier_ok(dev))
7107 			netif_carrier_off(dev);
7108 	}
7109 }
7110 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7111 
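/* A minimal sketch of the usual caller pattern: a stacking driver mirrors
 * its lower device's state from a netdevice notifier. A real driver would
 * also check that the devices involved are its own; foo_netdev_event() is
 * hypothetical.
 */
#if 0	/* illustrative only */
static int foo_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *ptr)
{
	struct net_device *lower = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper;

	if (event != NETDEV_CHANGE)
		return NOTIFY_DONE;

	/* RTNL is held by the notifier caller. */
	upper = netdev_master_upper_dev_get(lower);
	if (upper)
		netif_stacked_transfer_operstate(lower, upper);

	return NOTIFY_DONE;
}
#endif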
7112 #ifdef CONFIG_SYSFS
7113 static int netif_alloc_rx_queues(struct net_device *dev)
7114 {
7115 	unsigned int i, count = dev->num_rx_queues;
7116 	struct netdev_rx_queue *rx;
7117 	size_t sz = count * sizeof(*rx);
7118 
7119 	BUG_ON(count < 1);
7120 
7121 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7122 	if (!rx) {
7123 		rx = vzalloc(sz);
7124 		if (!rx)
7125 			return -ENOMEM;
7126 	}
7127 	dev->_rx = rx;
7128 
7129 	for (i = 0; i < count; i++)
7130 		rx[i].dev = dev;
7131 	return 0;
7132 }
7133 #endif
7134 
7135 static void netdev_init_one_queue(struct net_device *dev,
7136 				  struct netdev_queue *queue, void *_unused)
7137 {
7138 	/* Initialize queue lock */
7139 	spin_lock_init(&queue->_xmit_lock);
7140 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7141 	queue->xmit_lock_owner = -1;
7142 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7143 	queue->dev = dev;
7144 #ifdef CONFIG_BQL
7145 	dql_init(&queue->dql, HZ);
7146 #endif
7147 }
7148 
7149 static void netif_free_tx_queues(struct net_device *dev)
7150 {
7151 	kvfree(dev->_tx);
7152 }
7153 
7154 static int netif_alloc_netdev_queues(struct net_device *dev)
7155 {
7156 	unsigned int count = dev->num_tx_queues;
7157 	struct netdev_queue *tx;
7158 	size_t sz = count * sizeof(*tx);
7159 
7160 	if (count < 1 || count > 0xffff)
7161 		return -EINVAL;
7162 
7163 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7164 	if (!tx) {
7165 		tx = vzalloc(sz);
7166 		if (!tx)
7167 			return -ENOMEM;
7168 	}
7169 	dev->_tx = tx;
7170 
7171 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7172 	spin_lock_init(&dev->tx_global_lock);
7173 
7174 	return 0;
7175 }
7176 
7177 void netif_tx_stop_all_queues(struct net_device *dev)
7178 {
7179 	unsigned int i;
7180 
7181 	for (i = 0; i < dev->num_tx_queues; i++) {
7182 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7183 		netif_tx_stop_queue(txq);
7184 	}
7185 }
7186 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7187 
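/* A minimal sketch of when a driver stops and restarts all TX queues, e.g.
 * around loss of link or a hardware reset. foo_link_change() is hypothetical.
 */
#if 0	/* illustrative only */
static void foo_link_change(struct net_device *dev, bool link_up)
{
	if (link_up) {
		netif_carrier_on(dev);
		netif_tx_wake_all_queues(dev);
	} else {
		netif_carrier_off(dev);
		/* Packets now back up in the qdisc instead of being handed
		 * to hardware that cannot transmit them.
		 */
		netif_tx_stop_all_queues(dev);
	}
}
#endif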
7188 /**
7189  *	register_netdevice	- register a network device
7190  *	@dev: device to register
7191  *
7192  *	Take a completed network device structure and add it to the kernel
7193  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7194  *	chain. 0 is returned on success. A negative errno code is returned
7195  *	on a failure to set up the device, or if the name is a duplicate.
7196  *
7197  *	Callers must hold the rtnl semaphore. You may want
7198  *	register_netdev() instead of this.
7199  *
7200  *	BUGS:
7201  *	The locking appears insufficient to guarantee two parallel registers
7202  *	will not get the same name.
7203  */
7204 
7205 int register_netdevice(struct net_device *dev)
7206 {
7207 	int ret;
7208 	struct net *net = dev_net(dev);
7209 
7210 	BUG_ON(dev_boot_phase);
7211 	ASSERT_RTNL();
7212 
7213 	might_sleep();
7214 
7215 	/* When net_device's are persistent, this will be fatal. */
7216 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7217 	BUG_ON(!net);
7218 
7219 	spin_lock_init(&dev->addr_list_lock);
7220 	netdev_set_addr_lockdep_class(dev);
7221 
7222 	ret = dev_get_valid_name(net, dev, dev->name);
7223 	if (ret < 0)
7224 		goto out;
7225 
7226 	/* Init, if this function is available */
7227 	if (dev->netdev_ops->ndo_init) {
7228 		ret = dev->netdev_ops->ndo_init(dev);
7229 		if (ret) {
7230 			if (ret > 0)
7231 				ret = -EIO;
7232 			goto out;
7233 		}
7234 	}
7235 
7236 	if (((dev->hw_features | dev->features) &
7237 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7238 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7239 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7240 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7241 		ret = -EINVAL;
7242 		goto err_uninit;
7243 	}
7244 
7245 	ret = -EBUSY;
7246 	if (!dev->ifindex)
7247 		dev->ifindex = dev_new_index(net);
7248 	else if (__dev_get_by_index(net, dev->ifindex))
7249 		goto err_uninit;
7250 
7251 	/* Transfer changeable features to wanted_features and enable
7252 	 * software offloads (GSO and GRO).
7253 	 */
7254 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7255 	dev->features |= NETIF_F_SOFT_FEATURES;
7256 	dev->wanted_features = dev->features & dev->hw_features;
7257 
7258 	if (!(dev->flags & IFF_LOOPBACK))
7259 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7260 
7261 	/* If IPv4 TCP segmentation offload is supported we should also
7262 	 * allow the device to enable segmenting the frame with the option
7263 	 * of ignoring a static IP ID value.  This doesn't enable the
7264 	 * feature itself but allows the user to enable it later.
7265 	 */
7266 	if (dev->hw_features & NETIF_F_TSO)
7267 		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7268 	if (dev->vlan_features & NETIF_F_TSO)
7269 		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7270 	if (dev->mpls_features & NETIF_F_TSO)
7271 		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7272 	if (dev->hw_enc_features & NETIF_F_TSO)
7273 		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7274 
7275 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7276 	 */
7277 	dev->vlan_features |= NETIF_F_HIGHDMA;
7278 
7279 	/* Make NETIF_F_SG inheritable to tunnel devices.
7280 	 */
7281 	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7282 
7283 	/* Make NETIF_F_SG inheritable to MPLS.
7284 	 */
7285 	dev->mpls_features |= NETIF_F_SG;
7286 
7287 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7288 	ret = notifier_to_errno(ret);
7289 	if (ret)
7290 		goto err_uninit;
7291 
7292 	ret = netdev_register_kobject(dev);
7293 	if (ret)
7294 		goto err_uninit;
7295 	dev->reg_state = NETREG_REGISTERED;
7296 
7297 	__netdev_update_features(dev);
7298 
7299 	/*
7300 	 *	Default initial state at registration is that the
7301 	 *	device is present.
7302 	 */
7303 
7304 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7305 
7306 	linkwatch_init_dev(dev);
7307 
7308 	dev_init_scheduler(dev);
7309 	dev_hold(dev);
7310 	list_netdevice(dev);
7311 	add_device_randomness(dev->dev_addr, dev->addr_len);
7312 
7313 	/* If the device has a permanent device address, the driver should
7314 	 * set dev_addr, and addr_assign_type should be set to
7315 	 * NET_ADDR_PERM (default value).
7316 	 */
7317 	if (dev->addr_assign_type == NET_ADDR_PERM)
7318 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7319 
7320 	/* Notify protocols that a new device appeared. */
7321 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7322 	ret = notifier_to_errno(ret);
7323 	if (ret) {
7324 		rollback_registered(dev);
7325 		dev->reg_state = NETREG_UNREGISTERED;
7326 	}
7327 	/*
7328 	 *	Prevent userspace races by waiting until the network
7329 	 *	device is fully setup before sending notifications.
7330 	 */
7331 	if (!dev->rtnl_link_ops ||
7332 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7333 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7334 
7335 out:
7336 	return ret;
7337 
7338 err_uninit:
7339 	if (dev->netdev_ops->ndo_uninit)
7340 		dev->netdev_ops->ndo_uninit(dev);
7341 	goto out;
7342 }
7343 EXPORT_SYMBOL(register_netdevice);
7344 
7345 /**
7346  *	init_dummy_netdev	- init a dummy network device for NAPI
7347  *	@dev: device to init
7348  *
7349  *	This takes a network device structure and initializes the minimum
7350  *	set of fields so it can be used to schedule NAPI polls without
7351  *	registering a full-blown interface. This is to be used by drivers
7352  *	that need to tie several hardware interfaces to a single NAPI
7353  *	poll scheduler due to HW limitations.
7354  */
7355 int init_dummy_netdev(struct net_device *dev)
7356 {
7357 	/* Clear everything. Note we don't initialize spinlocks
7358 	 * as they aren't supposed to be taken by any of the
7359 	 * NAPI code and this dummy netdev is supposed to be
7360 	 * only ever used for NAPI polls
7361 	 */
7362 	memset(dev, 0, sizeof(struct net_device));
7363 
7364 	/* make sure we BUG if trying to hit standard
7365 	 * register/unregister code path
7366 	 */
7367 	dev->reg_state = NETREG_DUMMY;
7368 
7369 	/* NAPI wants this */
7370 	INIT_LIST_HEAD(&dev->napi_list);
7371 
7372 	/* a dummy interface is started by default */
7373 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7374 	set_bit(__LINK_STATE_START, &dev->state);
7375 
7376 	/* Note: we don't allocate pcpu_refcnt for dummy devices,
7377 	 * because users of this 'device' don't need to change
7378 	 * its refcount.
7379 	 */
7380 
7381 	return 0;
7382 }
7383 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7384 
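/* A minimal sketch of the intended use: several hardware channels share one
 * NAPI context anchored to a dummy netdev embedded in the driver's private
 * state. The foo_* names are hypothetical.
 */
#if 0	/* illustrative only */
struct foo_adapter {
	struct net_device napi_dev;	/* dummy device, NAPI anchor only */
	struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
	/* ... process up to @budget packets from the shared rings ... */
	napi_complete(napi);
	return 0;
}

static void foo_adapter_init(struct foo_adapter *adap)
{
	init_dummy_netdev(&adap->napi_dev);
	netif_napi_add(&adap->napi_dev, &adap->napi, foo_poll, 64);
	napi_enable(&adap->napi);
}
#endif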
7385 
7386 /**
7387  *	register_netdev	- register a network device
7388  *	@dev: device to register
7389  *
7390  *	Take a completed network device structure and add it to the kernel
7391  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7392  *	chain. 0 is returned on success. A negative errno code is returned
7393  *	on a failure to set up the device, or if the name is a duplicate.
7394  *
7395  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
7396  *	and expands the device name if you passed a format string to
7397  *	alloc_netdev.
7398  */
7399 int register_netdev(struct net_device *dev)
7400 {
7401 	int err;
7402 
7403 	rtnl_lock();
7404 	err = register_netdevice(dev);
7405 	rtnl_unlock();
7406 	return err;
7407 }
7408 EXPORT_SYMBOL(register_netdev);
7409 
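/* A minimal sketch of the usual driver lifecycle around register_netdev():
 * allocate, fill in ops, register, and mirror that on removal. The foo_*
 * names, private struct and ops table are hypothetical.
 */
#if 0	/* illustrative only */
static int foo_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct foo_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &foo_netdev_ops;	/* assumed ops table */

	err = register_netdev(dev);		/* takes rtnl_lock itself */
	if (err)
		free_netdev(dev);
	return err;
}

static void foo_remove(struct net_device *dev)
{
	unregister_netdev(dev);			/* takes rtnl_lock itself */
	free_netdev(dev);			/* must not hold RTNL here */
}
#endif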
7410 int netdev_refcnt_read(const struct net_device *dev)
7411 {
7412 	int i, refcnt = 0;
7413 
7414 	for_each_possible_cpu(i)
7415 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7416 	return refcnt;
7417 }
7418 EXPORT_SYMBOL(netdev_refcnt_read);
7419 
7420 /**
7421  * netdev_wait_allrefs - wait until all references are gone.
7422  * @dev: target net_device
7423  *
7424  * This is called when unregistering network devices.
7425  *
7426  * Any protocol or device that holds a reference should register
7427  * for netdevice notification, and cleanup and put back the
7428  * reference if they receive an UNREGISTER event.
7429  * We can get stuck here if buggy protocols don't correctly
7430  * call dev_put.
7431  */
7432 static void netdev_wait_allrefs(struct net_device *dev)
7433 {
7434 	unsigned long rebroadcast_time, warning_time;
7435 	int refcnt;
7436 
7437 	linkwatch_forget_dev(dev);
7438 
7439 	rebroadcast_time = warning_time = jiffies;
7440 	refcnt = netdev_refcnt_read(dev);
7441 
7442 	while (refcnt != 0) {
7443 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7444 			rtnl_lock();
7445 
7446 			/* Rebroadcast unregister notification */
7447 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7448 
7449 			__rtnl_unlock();
7450 			rcu_barrier();
7451 			rtnl_lock();
7452 
7453 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7454 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7455 				     &dev->state)) {
7456 				/* We must not have linkwatch events
7457 				 * pending on unregister. If this
7458 				 * happens, we simply run the queue
7459 				 * unscheduled, resulting in a noop
7460 				 * for this device.
7461 				 */
7462 				linkwatch_run_queue();
7463 			}
7464 
7465 			__rtnl_unlock();
7466 
7467 			rebroadcast_time = jiffies;
7468 		}
7469 
7470 		msleep(250);
7471 
7472 		refcnt = netdev_refcnt_read(dev);
7473 
7474 		if (time_after(jiffies, warning_time + 10 * HZ)) {
7475 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7476 				 dev->name, refcnt);
7477 			warning_time = jiffies;
7478 		}
7479 	}
7480 }
7481 
7482 /* The sequence is:
7483  *
7484  *	rtnl_lock();
7485  *	...
7486  *	register_netdevice(x1);
7487  *	register_netdevice(x2);
7488  *	...
7489  *	unregister_netdevice(y1);
7490  *	unregister_netdevice(y2);
7491  *      ...
7492  *	rtnl_unlock();
7493  *	free_netdev(y1);
7494  *	free_netdev(y2);
7495  *
7496  * We are invoked by rtnl_unlock().
7497  * This allows us to deal with problems:
7498  * 1) We can delete sysfs objects which invoke hotplug
7499  *    without deadlocking with linkwatch via keventd.
7500  * 2) Since we run with the RTNL semaphore not held, we can sleep
7501  *    safely in order to wait for the netdev refcnt to drop to zero.
7502  *
7503  * We must not return until all unregister events added during
7504  * the interval the lock was held have been completed.
7505  */
7506 void netdev_run_todo(void)
7507 {
7508 	struct list_head list;
7509 
7510 	/* Snapshot list, allow later requests */
7511 	list_replace_init(&net_todo_list, &list);
7512 
7513 	__rtnl_unlock();
7514 
7515 
7516 	/* Wait for rcu callbacks to finish before next phase */
7517 	if (!list_empty(&list))
7518 		rcu_barrier();
7519 
7520 	while (!list_empty(&list)) {
7521 		struct net_device *dev
7522 			= list_first_entry(&list, struct net_device, todo_list);
7523 		list_del(&dev->todo_list);
7524 
7525 		rtnl_lock();
7526 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7527 		__rtnl_unlock();
7528 
7529 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7530 			pr_err("network todo '%s' but state %d\n",
7531 			       dev->name, dev->reg_state);
7532 			dump_stack();
7533 			continue;
7534 		}
7535 
7536 		dev->reg_state = NETREG_UNREGISTERED;
7537 
7538 		netdev_wait_allrefs(dev);
7539 
7540 		/* paranoia */
7541 		BUG_ON(netdev_refcnt_read(dev));
7542 		BUG_ON(!list_empty(&dev->ptype_all));
7543 		BUG_ON(!list_empty(&dev->ptype_specific));
7544 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7545 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7546 		WARN_ON(dev->dn_ptr);
7547 
7548 		if (dev->destructor)
7549 			dev->destructor(dev);
7550 
7551 		/* Report a network device has been unregistered */
7552 		rtnl_lock();
7553 		dev_net(dev)->dev_unreg_count--;
7554 		__rtnl_unlock();
7555 		wake_up(&netdev_unregistering_wq);
7556 
7557 		/* Free network device */
7558 		kobject_put(&dev->dev.kobj);
7559 	}
7560 }
7561 
7562 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7563  * all the same fields in the same order as net_device_stats, with only
7564  * the type differing, but rtnl_link_stats64 may have additional fields
7565  * at the end for newer counters.
7566  */
7567 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7568 			     const struct net_device_stats *netdev_stats)
7569 {
7570 #if BITS_PER_LONG == 64
7571 	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7572 	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
7573 	/* zero out counters that only exist in rtnl_link_stats64 */
7574 	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7575 	       sizeof(*stats64) - sizeof(*netdev_stats));
7576 #else
7577 	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7578 	const unsigned long *src = (const unsigned long *)netdev_stats;
7579 	u64 *dst = (u64 *)stats64;
7580 
7581 	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7582 	for (i = 0; i < n; i++)
7583 		dst[i] = src[i];
7584 	/* zero out counters that only exist in rtnl_link_stats64 */
7585 	memset((char *)stats64 + n * sizeof(u64), 0,
7586 	       sizeof(*stats64) - n * sizeof(u64));
7587 #endif
7588 }
7589 EXPORT_SYMBOL(netdev_stats_to_stats64);
7590 
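/* A minimal sketch of the common pattern in a driver's statistics hook
 * (using the void-returning ndo_get_stats64 form that dev_get_stats() below
 * expects): convert the generic software counters first, then add counters
 * kept by hardware. foo_get_stats64() and struct foo_priv are hypothetical.
 */
#if 0	/* illustrative only */
static void foo_get_stats64(struct net_device *dev,
			    struct rtnl_link_stats64 *storage)
{
	struct foo_priv *priv = netdev_priv(dev);

	netdev_stats_to_stats64(storage, &dev->stats);
	storage->rx_crc_errors += priv->hw_rx_crc_errors;
}
#endif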
7591 /**
7592  *	dev_get_stats	- get network device statistics
7593  *	@dev: device to get statistics from
7594  *	@storage: place to store stats
7595  *
7596  *	Get network statistics from device. Return @storage.
7597  *	The device driver may provide its own method by setting
7598  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7599  *	otherwise the internal statistics structure is used.
7600  */
7601 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7602 					struct rtnl_link_stats64 *storage)
7603 {
7604 	const struct net_device_ops *ops = dev->netdev_ops;
7605 
7606 	if (ops->ndo_get_stats64) {
7607 		memset(storage, 0, sizeof(*storage));
7608 		ops->ndo_get_stats64(dev, storage);
7609 	} else if (ops->ndo_get_stats) {
7610 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7611 	} else {
7612 		netdev_stats_to_stats64(storage, &dev->stats);
7613 	}
7614 	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
7615 	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
7616 	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
7617 	return storage;
7618 }
7619 EXPORT_SYMBOL(dev_get_stats);
7620 
7621 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7622 {
7623 	struct netdev_queue *queue = dev_ingress_queue(dev);
7624 
7625 #ifdef CONFIG_NET_CLS_ACT
7626 	if (queue)
7627 		return queue;
7628 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7629 	if (!queue)
7630 		return NULL;
7631 	netdev_init_one_queue(dev, queue, NULL);
7632 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7633 	queue->qdisc_sleeping = &noop_qdisc;
7634 	rcu_assign_pointer(dev->ingress_queue, queue);
7635 #endif
7636 	return queue;
7637 }
7638 
7639 static const struct ethtool_ops default_ethtool_ops;
7640 
7641 void netdev_set_default_ethtool_ops(struct net_device *dev,
7642 				    const struct ethtool_ops *ops)
7643 {
7644 	if (dev->ethtool_ops == &default_ethtool_ops)
7645 		dev->ethtool_ops = ops;
7646 }
7647 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7648 
7649 void netdev_freemem(struct net_device *dev)
7650 {
7651 	char *addr = (char *)dev - dev->padded;
7652 
7653 	kvfree(addr);
7654 }
7655 
7656 /**
7657  *	alloc_netdev_mqs - allocate network device
7658  *	@sizeof_priv:		size of private data to allocate space for
7659  *	@name:			device name format string
7660  *	@name_assign_type: 	origin of device name
7661  *	@setup:			callback to initialize device
7662  *	@txqs:			the number of TX subqueues to allocate
7663  *	@rxqs:			the number of RX subqueues to allocate
7664  *
7665  *	Allocates a struct net_device with private data area for driver use
7666  *	and performs basic initialization.  Also allocates subqueue structs
7667  *	for each queue on the device.
7668  */
7669 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7670 		unsigned char name_assign_type,
7671 		void (*setup)(struct net_device *),
7672 		unsigned int txqs, unsigned int rxqs)
7673 {
7674 	struct net_device *dev;
7675 	size_t alloc_size;
7676 	struct net_device *p;
7677 
7678 	BUG_ON(strlen(name) >= sizeof(dev->name));
7679 
7680 	if (txqs < 1) {
7681 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7682 		return NULL;
7683 	}
7684 
7685 #ifdef CONFIG_SYSFS
7686 	if (rxqs < 1) {
7687 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7688 		return NULL;
7689 	}
7690 #endif
7691 
7692 	alloc_size = sizeof(struct net_device);
7693 	if (sizeof_priv) {
7694 		/* ensure 32-byte alignment of private area */
7695 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7696 		alloc_size += sizeof_priv;
7697 	}
7698 	/* ensure 32-byte alignment of whole construct */
7699 	alloc_size += NETDEV_ALIGN - 1;
7700 
7701 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7702 	if (!p)
7703 		p = vzalloc(alloc_size);
7704 	if (!p)
7705 		return NULL;
7706 
7707 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7708 	dev->padded = (char *)dev - (char *)p;
7709 
7710 	dev->pcpu_refcnt = alloc_percpu(int);
7711 	if (!dev->pcpu_refcnt)
7712 		goto free_dev;
7713 
7714 	if (dev_addr_init(dev))
7715 		goto free_pcpu;
7716 
7717 	dev_mc_init(dev);
7718 	dev_uc_init(dev);
7719 
7720 	dev_net_set(dev, &init_net);
7721 
7722 	dev->gso_max_size = GSO_MAX_SIZE;
7723 	dev->gso_max_segs = GSO_MAX_SEGS;
7724 
7725 	INIT_LIST_HEAD(&dev->napi_list);
7726 	INIT_LIST_HEAD(&dev->unreg_list);
7727 	INIT_LIST_HEAD(&dev->close_list);
7728 	INIT_LIST_HEAD(&dev->link_watch_list);
7729 	INIT_LIST_HEAD(&dev->adj_list.upper);
7730 	INIT_LIST_HEAD(&dev->adj_list.lower);
7731 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
7732 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
7733 	INIT_LIST_HEAD(&dev->ptype_all);
7734 	INIT_LIST_HEAD(&dev->ptype_specific);
7735 #ifdef CONFIG_NET_SCHED
7736 	hash_init(dev->qdisc_hash);
7737 #endif
7738 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7739 	setup(dev);
7740 
7741 	if (!dev->tx_queue_len) {
7742 		dev->priv_flags |= IFF_NO_QUEUE;
7743 		dev->tx_queue_len = 1;
7744 	}
7745 
7746 	dev->num_tx_queues = txqs;
7747 	dev->real_num_tx_queues = txqs;
7748 	if (netif_alloc_netdev_queues(dev))
7749 		goto free_all;
7750 
7751 #ifdef CONFIG_SYSFS
7752 	dev->num_rx_queues = rxqs;
7753 	dev->real_num_rx_queues = rxqs;
7754 	if (netif_alloc_rx_queues(dev))
7755 		goto free_all;
7756 #endif
7757 
7758 	strcpy(dev->name, name);
7759 	dev->name_assign_type = name_assign_type;
7760 	dev->group = INIT_NETDEV_GROUP;
7761 	if (!dev->ethtool_ops)
7762 		dev->ethtool_ops = &default_ethtool_ops;
7763 
7764 	nf_hook_ingress_init(dev);
7765 
7766 	return dev;
7767 
7768 free_all:
7769 	free_netdev(dev);
7770 	return NULL;
7771 
7772 free_pcpu:
7773 	free_percpu(dev->pcpu_refcnt);
7774 free_dev:
7775 	netdev_freemem(dev);
7776 	return NULL;
7777 }
7778 EXPORT_SYMBOL(alloc_netdev_mqs);
7779 
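/* A minimal sketch of an alloc_netdev_mqs() call for an Ethernet-style
 * device with a configurable number of TX/RX queues; "foo%d" is expanded
 * when the device is registered. struct foo_priv is hypothetical.
 */
#if 0	/* illustrative only */
static struct net_device *foo_alloc(unsigned int nr_queues)
{
	return alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
				NET_NAME_UNKNOWN, ether_setup,
				nr_queues, nr_queues);
}
#endif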
7780 /**
7781  *	free_netdev - free network device
7782  *	@dev: device
7783  *
7784  *	This function does the last stage of destroying an allocated device
7785  * 	interface. The reference to the device object is released.
7786  *	If this is the last reference then it will be freed.
7787  *	Must be called in process context.
7788  */
7789 void free_netdev(struct net_device *dev)
7790 {
7791 	struct napi_struct *p, *n;
7792 
7793 	might_sleep();
7794 	netif_free_tx_queues(dev);
7795 #ifdef CONFIG_SYSFS
7796 	kvfree(dev->_rx);
7797 #endif
7798 
7799 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7800 
7801 	/* Flush device addresses */
7802 	dev_addr_flush(dev);
7803 
7804 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7805 		netif_napi_del(p);
7806 
7807 	free_percpu(dev->pcpu_refcnt);
7808 	dev->pcpu_refcnt = NULL;
7809 
7810 	/*  Compatibility with error handling in drivers */
7811 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7812 		netdev_freemem(dev);
7813 		return;
7814 	}
7815 
7816 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7817 	dev->reg_state = NETREG_RELEASED;
7818 
7819 	/* will free via device release */
7820 	put_device(&dev->dev);
7821 }
7822 EXPORT_SYMBOL(free_netdev);
7823 
7824 /**
7825  *	synchronize_net -  Synchronize with packet receive processing
7826  *
7827  *	Wait for packets currently being received to be done.
7828  *	Does not block later packets from starting.
7829  */
7830 void synchronize_net(void)
7831 {
7832 	might_sleep();
7833 	if (rtnl_is_locked())
7834 		synchronize_rcu_expedited();
7835 	else
7836 		synchronize_rcu();
7837 }
7838 EXPORT_SYMBOL(synchronize_net);
7839 
7840 /**
7841  *	unregister_netdevice_queue - remove device from the kernel
7842  *	@dev: device
7843  *	@head: list
7844  *
7845  *	This function shuts down a device interface and removes it
7846  *	from the kernel tables.
7847  *	If head not NULL, device is queued to be unregistered later.
7848  *
7849  *	Callers must hold the rtnl semaphore.  You may want
7850  *	unregister_netdev() instead of this.
7851  */
7852 
7853 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7854 {
7855 	ASSERT_RTNL();
7856 
7857 	if (head) {
7858 		list_move_tail(&dev->unreg_list, head);
7859 	} else {
7860 		rollback_registered(dev);
7861 		/* Finish processing unregister after unlock */
7862 		net_set_todo(dev);
7863 	}
7864 }
7865 EXPORT_SYMBOL(unregister_netdevice_queue);
7866 
7867 /**
7868  *	unregister_netdevice_many - unregister many devices
7869  *	@head: list of devices
7870  *
7871  *  Note: As most callers use a stack-allocated list_head,
7872  *  we force a list_del() to make sure the stack won't be corrupted later.
7873  */
7874 void unregister_netdevice_many(struct list_head *head)
7875 {
7876 	struct net_device *dev;
7877 
7878 	if (!list_empty(head)) {
7879 		rollback_registered_many(head);
7880 		list_for_each_entry(dev, head, unreg_list)
7881 			net_set_todo(dev);
7882 		list_del(head);
7883 	}
7884 }
7885 EXPORT_SYMBOL(unregister_netdevice_many);
7886 
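/* A minimal sketch of batched teardown: queue every device on a stack list
 * under RTNL and unregister them in one pass, so notifier and sysfs work is
 * amortized over the whole batch. struct foo_priv and its members are
 * hypothetical.
 */
#if 0	/* illustrative only */
static void foo_destroy_all(struct list_head *foo_devices)
{
	struct foo_priv *priv;
	LIST_HEAD(kill_list);

	rtnl_lock();
	list_for_each_entry(priv, foo_devices, list)
		unregister_netdevice_queue(priv->dev, &kill_list);
	unregister_netdevice_many(&kill_list);	/* also list_del()s kill_list */
	rtnl_unlock();
}
#endif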
7887 /**
7888  *	unregister_netdev - remove device from the kernel
7889  *	@dev: device
7890  *
7891  *	This function shuts down a device interface and removes it
7892  *	from the kernel tables.
7893  *
7894  *	This is just a wrapper for unregister_netdevice that takes
7895  *	the rtnl semaphore.  In general you want to use this and not
7896  *	unregister_netdevice.
7897  */
7898 void unregister_netdev(struct net_device *dev)
7899 {
7900 	rtnl_lock();
7901 	unregister_netdevice(dev);
7902 	rtnl_unlock();
7903 }
7904 EXPORT_SYMBOL(unregister_netdev);
7905 
7906 /**
7907  *	dev_change_net_namespace - move device to a different network namespace
7908  *	@dev: device
7909  *	@net: network namespace
7910  *	@pat: If not NULL name pattern to try if the current device name
7911  *	      is already taken in the destination network namespace.
7912  *
7913  *	This function shuts down a device interface and moves it
7914  *	to a new network namespace. On success 0 is returned, on
7915  *	a failure a negative errno code is returned.
7916  *
7917  *	Callers must hold the rtnl semaphore.
7918  */
7919 
7920 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7921 {
7922 	int err;
7923 
7924 	ASSERT_RTNL();
7925 
7926 	/* Don't allow namespace local devices to be moved. */
7927 	err = -EINVAL;
7928 	if (dev->features & NETIF_F_NETNS_LOCAL)
7929 		goto out;
7930 
7931 	/* Ensure the device has been registered */
7932 	if (dev->reg_state != NETREG_REGISTERED)
7933 		goto out;
7934 
7935 	/* Get out if there is nothing to do */
7936 	err = 0;
7937 	if (net_eq(dev_net(dev), net))
7938 		goto out;
7939 
7940 	/* Pick the destination device name, and ensure
7941 	 * we can use it in the destination network namespace.
7942 	 */
7943 	err = -EEXIST;
7944 	if (__dev_get_by_name(net, dev->name)) {
7945 		/* We get here if we can't use the current device name */
7946 		if (!pat)
7947 			goto out;
7948 		if (dev_get_valid_name(net, dev, pat) < 0)
7949 			goto out;
7950 	}
7951 
7952 	/*
7953 	 * And now a mini version of register_netdevice/unregister_netdevice.
7954 	 */
7955 
7956 	/* If device is running close it first. */
7957 	dev_close(dev);
7958 
7959 	/* And unlink it from device chain */
7960 	err = -ENODEV;
7961 	unlist_netdevice(dev);
7962 
7963 	synchronize_net();
7964 
7965 	/* Shutdown queueing discipline. */
7966 	dev_shutdown(dev);
7967 
7968 	/* Notify protocols that we are about to destroy
7969 	   this device. They should clean up all of their state.
7970 
7971 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7972 	   This is wanted because this way 8021q and macvlan know
7973 	   the device is just moving and can keep their slaves up.
7974 	*/
7975 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7976 	rcu_barrier();
7977 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7978 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7979 
7980 	/*
7981 	 *	Flush the unicast and multicast chains
7982 	 */
7983 	dev_uc_flush(dev);
7984 	dev_mc_flush(dev);
7985 
7986 	/* Send a netdev-removed uevent to the old namespace */
7987 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7988 	netdev_adjacent_del_links(dev);
7989 
7990 	/* Actually switch the network namespace */
7991 	dev_net_set(dev, net);
7992 
7993 	/* If there is an ifindex conflict assign a new one */
7994 	if (__dev_get_by_index(net, dev->ifindex))
7995 		dev->ifindex = dev_new_index(net);
7996 
7997 	/* Send a netdev-add uevent to the new namespace */
7998 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7999 	netdev_adjacent_add_links(dev);
8000 
8001 	/* Fixup kobjects */
8002 	err = device_rename(&dev->dev, dev->name);
8003 	WARN_ON(err);
8004 
8005 	/* Add the device back in the hashes */
8006 	list_netdevice(dev);
8007 
8008 	/* Notify protocols that a new device appeared. */
8009 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
8010 
8011 	/*
8012 	 *	Prevent userspace races by waiting until the network
8013 	 *	device is fully setup before sending notifications.
8014 	 */
8015 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8016 
8017 	synchronize_net();
8018 	err = 0;
8019 out:
8020 	return err;
8021 }
8022 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8023 
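/* A minimal sketch of a caller: resolve the target namespace, take RTNL and
 * move the device, falling back to a "dev%d" name if the current one is
 * already taken there. foo_move_to_netns() is hypothetical.
 */
#if 0	/* illustrative only */
static int foo_move_to_netns(struct net_device *dev, int netns_fd)
{
	struct net *net = get_net_ns_by_fd(netns_fd);
	int err;

	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();

	put_net(net);
	return err;
}
#endif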
8024 static int dev_cpu_callback(struct notifier_block *nfb,
8025 			    unsigned long action,
8026 			    void *ocpu)
8027 {
8028 	struct sk_buff **list_skb;
8029 	struct sk_buff *skb;
8030 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
8031 	struct softnet_data *sd, *oldsd;
8032 
8033 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
8034 		return NOTIFY_OK;
8035 
8036 	local_irq_disable();
8037 	cpu = smp_processor_id();
8038 	sd = &per_cpu(softnet_data, cpu);
8039 	oldsd = &per_cpu(softnet_data, oldcpu);
8040 
8041 	/* Find end of our completion_queue. */
8042 	list_skb = &sd->completion_queue;
8043 	while (*list_skb)
8044 		list_skb = &(*list_skb)->next;
8045 	/* Append completion queue from offline CPU. */
8046 	*list_skb = oldsd->completion_queue;
8047 	oldsd->completion_queue = NULL;
8048 
8049 	/* Append output queue from offline CPU. */
8050 	if (oldsd->output_queue) {
8051 		*sd->output_queue_tailp = oldsd->output_queue;
8052 		sd->output_queue_tailp = oldsd->output_queue_tailp;
8053 		oldsd->output_queue = NULL;
8054 		oldsd->output_queue_tailp = &oldsd->output_queue;
8055 	}
8056 	/* Append NAPI poll list from offline CPU, with one exception:
8057 	 * process_backlog() must be called by the CPU owning the percpu backlog.
8058 	 * We properly handle process_queue & input_pkt_queue later.
8059 	 */
8060 	while (!list_empty(&oldsd->poll_list)) {
8061 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8062 							    struct napi_struct,
8063 							    poll_list);
8064 
8065 		list_del_init(&napi->poll_list);
8066 		if (napi->poll == process_backlog)
8067 			napi->state = 0;
8068 		else
8069 			____napi_schedule(sd, napi);
8070 	}
8071 
8072 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
8073 	local_irq_enable();
8074 
8075 	/* Process offline CPU's input_pkt_queue */
8076 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8077 		netif_rx_ni(skb);
8078 		input_queue_head_incr(oldsd);
8079 	}
8080 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8081 		netif_rx_ni(skb);
8082 		input_queue_head_incr(oldsd);
8083 	}
8084 
8085 	return NOTIFY_OK;
8086 }
8087 
8088 
8089 /**
8090  *	netdev_increment_features - increment feature set by one
8091  *	@all: current feature set
8092  *	@one: new feature set
8093  *	@mask: mask feature set
8094  *
8095  *	Computes a new feature set after adding a device with feature set
8096  *	@one to the master device with current feature set @all.  Will not
8097  *	enable anything that is off in @mask. Returns the new feature set.
8098  */
8099 netdev_features_t netdev_increment_features(netdev_features_t all,
8100 	netdev_features_t one, netdev_features_t mask)
8101 {
8102 	if (mask & NETIF_F_HW_CSUM)
8103 		mask |= NETIF_F_CSUM_MASK;
8104 	mask |= NETIF_F_VLAN_CHALLENGED;
8105 
8106 	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8107 	all &= one | ~NETIF_F_ALL_FOR_ALL;
8108 
8109 	/* If one device supports hw checksumming, set for all. */
8110 	if (all & NETIF_F_HW_CSUM)
8111 		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8112 
8113 	return all;
8114 }
8115 EXPORT_SYMBOL(netdev_increment_features);
8116 
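/* A minimal sketch of how a master device (bonding/bridge style) typically
 * uses this helper: start from the "one for all" subset of the mask and fold
 * each lower device's features in. foo_compute_master_features() is
 * hypothetical; the caller is assumed to hold RTNL for the adjacency walk.
 */
#if 0	/* illustrative only */
static netdev_features_t foo_compute_master_features(struct net_device *master)
{
	netdev_features_t mask = master->vlan_features;
	netdev_features_t features = mask & NETIF_F_ALL_FOR_ALL;
	struct net_device *lower;
	struct list_head *iter;

	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features, mask);
	return features;
}
#endif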
8117 static struct hlist_head * __net_init netdev_create_hash(void)
8118 {
8119 	int i;
8120 	struct hlist_head *hash;
8121 
8122 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8123 	if (hash != NULL)
8124 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
8125 			INIT_HLIST_HEAD(&hash[i]);
8126 
8127 	return hash;
8128 }
8129 
8130 /* Initialize per network namespace state */
8131 static int __net_init netdev_init(struct net *net)
8132 {
8133 	if (net != &init_net)
8134 		INIT_LIST_HEAD(&net->dev_base_head);
8135 
8136 	net->dev_name_head = netdev_create_hash();
8137 	if (net->dev_name_head == NULL)
8138 		goto err_name;
8139 
8140 	net->dev_index_head = netdev_create_hash();
8141 	if (net->dev_index_head == NULL)
8142 		goto err_idx;
8143 
8144 	return 0;
8145 
8146 err_idx:
8147 	kfree(net->dev_name_head);
8148 err_name:
8149 	return -ENOMEM;
8150 }
8151 
8152 /**
8153  *	netdev_drivername - network driver for the device
8154  *	@dev: network device
8155  *
8156  *	Determine network driver for device.
8157  */
8158 const char *netdev_drivername(const struct net_device *dev)
8159 {
8160 	const struct device_driver *driver;
8161 	const struct device *parent;
8162 	const char *empty = "";
8163 
8164 	parent = dev->dev.parent;
8165 	if (!parent)
8166 		return empty;
8167 
8168 	driver = parent->driver;
8169 	if (driver && driver->name)
8170 		return driver->name;
8171 	return empty;
8172 }
8173 
8174 static void __netdev_printk(const char *level, const struct net_device *dev,
8175 			    struct va_format *vaf)
8176 {
8177 	if (dev && dev->dev.parent) {
8178 		dev_printk_emit(level[1] - '0',
8179 				dev->dev.parent,
8180 				"%s %s %s%s: %pV",
8181 				dev_driver_string(dev->dev.parent),
8182 				dev_name(dev->dev.parent),
8183 				netdev_name(dev), netdev_reg_state(dev),
8184 				vaf);
8185 	} else if (dev) {
8186 		printk("%s%s%s: %pV",
8187 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
8188 	} else {
8189 		printk("%s(NULL net_device): %pV", level, vaf);
8190 	}
8191 }
8192 
8193 void netdev_printk(const char *level, const struct net_device *dev,
8194 		   const char *format, ...)
8195 {
8196 	struct va_format vaf;
8197 	va_list args;
8198 
8199 	va_start(args, format);
8200 
8201 	vaf.fmt = format;
8202 	vaf.va = &args;
8203 
8204 	__netdev_printk(level, dev, &vaf);
8205 
8206 	va_end(args);
8207 }
8208 EXPORT_SYMBOL(netdev_printk);
8209 
8210 #define define_netdev_printk_level(func, level)			\
8211 void func(const struct net_device *dev, const char *fmt, ...)	\
8212 {								\
8213 	struct va_format vaf;					\
8214 	va_list args;						\
8215 								\
8216 	va_start(args, fmt);					\
8217 								\
8218 	vaf.fmt = fmt;						\
8219 	vaf.va = &args;						\
8220 								\
8221 	__netdev_printk(level, dev, &vaf);			\
8222 								\
8223 	va_end(args);						\
8224 }								\
8225 EXPORT_SYMBOL(func);
8226 
8227 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8228 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8229 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8230 define_netdev_printk_level(netdev_err, KERN_ERR);
8231 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8232 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8233 define_netdev_printk_level(netdev_info, KERN_INFO);
8234 
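/* A minimal sketch of driver-side use of the per-device log helpers defined
 * above; netdev_dbg() comes from netdevice.h and compiles away unless
 * DEBUG/dynamic debug is enabled. foo_report_link() is hypothetical.
 */
#if 0	/* illustrative only */
static void foo_report_link(struct net_device *dev, bool up, int speed)
{
	if (up)
		netdev_info(dev, "link up, %d Mbps\n", speed);
	else
		netdev_warn(dev, "link down\n");

	netdev_dbg(dev, "link state change handled\n");
}
#endif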
8235 static void __net_exit netdev_exit(struct net *net)
8236 {
8237 	kfree(net->dev_name_head);
8238 	kfree(net->dev_index_head);
8239 }
8240 
8241 static struct pernet_operations __net_initdata netdev_net_ops = {
8242 	.init = netdev_init,
8243 	.exit = netdev_exit,
8244 };
8245 
8246 static void __net_exit default_device_exit(struct net *net)
8247 {
8248 	struct net_device *dev, *aux;
8249 	/*
8250 	 * Push all migratable network devices back to the
8251 	 * initial network namespace
8252 	 */
8253 	rtnl_lock();
8254 	for_each_netdev_safe(net, dev, aux) {
8255 		int err;
8256 		char fb_name[IFNAMSIZ];
8257 
8258 		/* Ignore unmoveable devices (i.e. loopback) */
8259 		if (dev->features & NETIF_F_NETNS_LOCAL)
8260 			continue;
8261 
8262 		/* Leave virtual devices for the generic cleanup */
8263 		if (dev->rtnl_link_ops)
8264 			continue;
8265 
8266 		/* Push remaining network devices to init_net */
8267 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8268 		err = dev_change_net_namespace(dev, &init_net, fb_name);
8269 		if (err) {
8270 			pr_emerg("%s: failed to move %s to init_net: %d\n",
8271 				 __func__, dev->name, err);
8272 			BUG();
8273 		}
8274 	}
8275 	rtnl_unlock();
8276 }
8277 
8278 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8279 {
8280 	/* Return with the rtnl_lock held when there are no network
8281 	 * devices unregistering in any network namespace in net_list.
8282 	 */
8283 	struct net *net;
8284 	bool unregistering;
8285 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8286 
8287 	add_wait_queue(&netdev_unregistering_wq, &wait);
8288 	for (;;) {
8289 		unregistering = false;
8290 		rtnl_lock();
8291 		list_for_each_entry(net, net_list, exit_list) {
8292 			if (net->dev_unreg_count > 0) {
8293 				unregistering = true;
8294 				break;
8295 			}
8296 		}
8297 		if (!unregistering)
8298 			break;
8299 		__rtnl_unlock();
8300 
8301 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8302 	}
8303 	remove_wait_queue(&netdev_unregistering_wq, &wait);
8304 }
8305 
8306 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8307 {
8308 	/* At exit all network devices must be removed from a network
8309 	 * namespace.  Do this in the reverse order of registration.
8310 	 * Do this across as many network namespaces as possible to
8311 	 * improve batching efficiency.
8312 	 */
8313 	struct net_device *dev;
8314 	struct net *net;
8315 	LIST_HEAD(dev_kill_list);
8316 
8317 	/* To prevent network device cleanup code from dereferencing
8318 	 * loopback devices or network devices that have been freed
8319 	 * wait here for all pending unregistrations to complete,
8320 	 * before unregistering the loopback device and allowing the
8321 	 * network namespace to be freed.
8322 	 *
8323 	 * The netdev todo list containing all network devices
8324 	 * unregistrations that happen in default_device_exit_batch
8325 	 * will run in the rtnl_unlock() at the end of
8326 	 * default_device_exit_batch.
8327 	 */
8328 	rtnl_lock_unregistering(net_list);
8329 	list_for_each_entry(net, net_list, exit_list) {
8330 		for_each_netdev_reverse(net, dev) {
8331 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8332 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8333 			else
8334 				unregister_netdevice_queue(dev, &dev_kill_list);
8335 		}
8336 	}
8337 	unregister_netdevice_many(&dev_kill_list);
8338 	rtnl_unlock();
8339 }
8340 
8341 static struct pernet_operations __net_initdata default_device_ops = {
8342 	.exit = default_device_exit,
8343 	.exit_batch = default_device_exit_batch,
8344 };
8345 
8346 /*
8347  *	Initialize the DEV module. At boot time this walks the device list and
8348  *	unhooks any devices that fail to initialise (normally hardware not
8349  *	present) and leaves us with a valid list of present and active devices.
8350  *
8351  */
8352 
8353 /*
8354  *       This is called single threaded during boot, so no need
8355  *       to take the rtnl semaphore.
8356  */
8357 static int __init net_dev_init(void)
8358 {
8359 	int i, rc = -ENOMEM;
8360 
8361 	BUG_ON(!dev_boot_phase);
8362 
8363 	if (dev_proc_init())
8364 		goto out;
8365 
8366 	if (netdev_kobject_init())
8367 		goto out;
8368 
8369 	INIT_LIST_HEAD(&ptype_all);
8370 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8371 		INIT_LIST_HEAD(&ptype_base[i]);
8372 
8373 	INIT_LIST_HEAD(&offload_base);
8374 
8375 	if (register_pernet_subsys(&netdev_net_ops))
8376 		goto out;
8377 
8378 	/*
8379 	 *	Initialise the packet receive queues.
8380 	 */
8381 
8382 	for_each_possible_cpu(i) {
8383 		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8384 		struct softnet_data *sd = &per_cpu(softnet_data, i);
8385 
8386 		INIT_WORK(flush, flush_backlog);
8387 
8388 		skb_queue_head_init(&sd->input_pkt_queue);
8389 		skb_queue_head_init(&sd->process_queue);
8390 		INIT_LIST_HEAD(&sd->poll_list);
8391 		sd->output_queue_tailp = &sd->output_queue;
8392 #ifdef CONFIG_RPS
8393 		sd->csd.func = rps_trigger_softirq;
8394 		sd->csd.info = sd;
8395 		sd->cpu = i;
8396 #endif
8397 
8398 		sd->backlog.poll = process_backlog;
8399 		sd->backlog.weight = weight_p;
8400 	}
8401 
8402 	dev_boot_phase = 0;
8403 
8404 	/* The loopback device is special: if any other network device
8405 	 * is present in a network namespace, the loopback device must
8406 	 * be present too. Since we now dynamically allocate and free the
8407 	 * loopback device, ensure this invariant is maintained by
8408 	 * keeping the loopback device the first device on the
8409 	 * list of network devices, so that it is the first device
8410 	 * that appears and the last network device
8411 	 * that disappears.
8412 	 */
8413 	if (register_pernet_device(&loopback_net_ops))
8414 		goto out;
8415 
8416 	if (register_pernet_device(&default_device_ops))
8417 		goto out;
8418 
8419 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8420 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8421 
8422 	hotcpu_notifier(dev_cpu_callback, 0);
8423 	dst_subsys_init();
8424 	rc = 0;
8425 out:
8426 	return rc;
8427 }
8428 
8429 subsys_initcall(net_dev_init);
8430