1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/sched.h>
83 #include <linux/mutex.h>
84 #include <linux/string.h>
85 #include <linux/mm.h>
86 #include <linux/socket.h>
87 #include <linux/sockios.h>
88 #include <linux/errno.h>
89 #include <linux/interrupt.h>
90 #include <linux/if_ether.h>
91 #include <linux/netdevice.h>
92 #include <linux/etherdevice.h>
93 #include <linux/ethtool.h>
94 #include <linux/notifier.h>
95 #include <linux/skbuff.h>
96 #include <net/net_namespace.h>
97 #include <net/sock.h>
98 #include <linux/rtnetlink.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/stat.h>
102 #include <linux/if_bridge.h>
103 #include <linux/if_macvlan.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 
130 #include "net-sysfs.h"
131 
132 /* Instead of increasing this, you should create a hash table. */
133 #define MAX_GRO_SKBS 8
134 
135 /* This should be increased if a protocol with a bigger head is added. */
136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
137 
138 /*
139  *	The list of packet types we will receive (as opposed to discard)
140  *	and the routines to invoke.
141  *
142  *	Why 16. Because with 16 the only overlap we get on a hash of the
143  *	low nibble of the protocol value is RARP/SNAP/X.25.
144  *
145  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
146  *             sure which should go first, but I bet it won't make much
147  *             difference if we are running VLANs.  The good news is that
148  *             this protocol won't be in the list unless compiled in, so
149  *             the average user (w/out VLANs) will not be adversely affected.
150  *             --BLG
151  *
152  *		0800	IP
153  *		8100    802.1Q VLAN
154  *		0001	802.3
155  *		0002	AX.25
156  *		0004	802.2
157  *		8035	RARP
158  *		0005	SNAP
159  *		0805	X.25
160  *		0806	ARP
161  *		8137	IPX
162  *		0009	Localtalk
163  *		86DD	IPv6
164  */
165 
166 #define PTYPE_HASH_SIZE	(16)
167 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
168 
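/*
 * Illustrative note (not part of the original file): a protocol type is
 * placed in bucket (ntohs(type) & PTYPE_HASH_MASK).  For example ARP
 * (0x0806) lands in bucket 6, while RARP (0x8035), SNAP (0x0005) and
 * X.25 (0x0805) all share bucket 5, which is the overlap mentioned above.
 */
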
169 static DEFINE_SPINLOCK(ptype_lock);
170 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
171 static struct list_head ptype_all __read_mostly;	/* Taps */
172 
173 /*
174  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175  * semaphore.
176  *
177  * Pure readers hold dev_base_lock for reading.
178  *
179  * Writers must hold the rtnl semaphore while they loop through the
180  * dev_base_head list, and hold dev_base_lock for writing when they do the
181  * actual updates.  This allows pure readers to access the list even
182  * while a writer is preparing to update it.
183  *
184  * To put it another way, dev_base_lock is held for writing only to
185  * protect against pure readers; the rtnl semaphore provides the
186  * protection against other writers.
187  *
188  * See, for example usages, register_netdevice() and
189  * unregister_netdevice(), which must be called with the rtnl
190  * semaphore held.
191  */
192 DEFINE_RWLOCK(dev_base_lock);
193 
194 EXPORT_SYMBOL(dev_base_lock);
195 
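/*
 * Illustrative sketch (not part of the original file): a "pure reader"
 * walking the device list under the rules described above.  Given a
 * struct net *net, it only needs dev_base_lock for reading; writers also
 * hold the RTNL semaphore.
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(net, dev)
 *		pr_debug("%s\n", dev->name);
 *	read_unlock(&dev_base_lock);
 */
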
196 #define NETDEV_HASHBITS	8
197 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
198 
199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
200 {
201 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
202 	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
208 }
209 
210 /* Device list insertion */
211 static int list_netdevice(struct net_device *dev)
212 {
213 	struct net *net = dev_net(dev);
214 
215 	ASSERT_RTNL();
216 
217 	write_lock_bh(&dev_base_lock);
218 	list_add_tail(&dev->dev_list, &net->dev_base_head);
219 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
220 	hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 	ASSERT_RTNL();
229 
230 	/* Unlink dev from the device chain */
231 	write_lock_bh(&dev_base_lock);
232 	list_del(&dev->dev_list);
233 	hlist_del(&dev->name_hlist);
234 	hlist_del(&dev->index_hlist);
235 	write_unlock_bh(&dev_base_lock);
236 }
237 
238 /*
239  *	Our notifier list
240  */
241 
242 static RAW_NOTIFIER_HEAD(netdev_chain);
243 
244 /*
245  *	Device drivers call our routines to queue packets here. We empty the
246  *	queue in the local softnet handler.
247  */
248 
249 DEFINE_PER_CPU(struct softnet_data, softnet_data);
250 
251 #ifdef CONFIG_LOCKDEP
252 /*
253  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
254  * according to dev->type
255  */
256 static const unsigned short netdev_lock_type[] =
257 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
258 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
259 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
260 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
261 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
262 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
263 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
264 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
265 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
266 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
267 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
268 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
269 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
270 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
271 	 ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
272 
273 static const char *netdev_lock_name[] =
274 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
275 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
276 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
277 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
278 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
279 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
280 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
281 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
282 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
283 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
284 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
285 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
286 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
287 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
288 	 "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
289 
290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
292 
293 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
294 {
295 	int i;
296 
297 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
298 		if (netdev_lock_type[i] == dev_type)
299 			return i;
300 	/* the last key is used by default */
301 	return ARRAY_SIZE(netdev_lock_type) - 1;
302 }
303 
304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
305 						 unsigned short dev_type)
306 {
307 	int i;
308 
309 	i = netdev_lock_pos(dev_type);
310 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
311 				   netdev_lock_name[i]);
312 }
313 
314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
315 {
316 	int i;
317 
318 	i = netdev_lock_pos(dev->type);
319 	lockdep_set_class_and_name(&dev->addr_list_lock,
320 				   &netdev_addr_lock_key[i],
321 				   netdev_lock_name[i]);
322 }
323 #else
324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
325 						 unsigned short dev_type)
326 {
327 }
328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
329 {
330 }
331 #endif
332 
333 /*******************************************************************************
334 
335 		Protocol management and registration routines
336 
337 *******************************************************************************/
338 
339 /*
340  *	Add a protocol ID to the list. Now that the input handler is
341  *	smarter we can dispense with all the messy stuff that used to be
342  *	here.
343  *
344  *	BEWARE!!! Protocol handlers, mangling input packets,
345  *	MUST BE last in hash buckets and checking protocol handlers
346  *	MUST start from promiscuous ptype_all chain in net_bh.
347  *	It is true now, do not change it.
348  *	Explanation follows: if protocol handler, mangling packet, will
349  *	be the first on list, it is not able to sense, that packet
350  *	is cloned and should be copied-on-write, so that it will
351  *	change it and subsequent readers will get broken packet.
352  *							--ANK (980803)
353  */
354 
355 /**
356  *	dev_add_pack - add packet handler
357  *	@pt: packet type declaration
358  *
359  *	Add a protocol handler to the networking stack. The passed &packet_type
360  *	is linked into kernel lists and may not be freed until it has been
361  *	removed from the kernel lists.
362  *
363  *	This call does not sleep, therefore it cannot
364  *	guarantee that all CPUs that are in the middle of receiving packets
365  *	will see the new packet type (until the next received packet).
366  */
367 
368 void dev_add_pack(struct packet_type *pt)
369 {
370 	int hash;
371 
372 	spin_lock_bh(&ptype_lock);
373 	if (pt->type == htons(ETH_P_ALL))
374 		list_add_rcu(&pt->list, &ptype_all);
375 	else {
376 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
377 		list_add_rcu(&pt->list, &ptype_base[hash]);
378 	}
379 	spin_unlock_bh(&ptype_lock);
380 }
381 
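/*
 * Illustrative sketch (not part of the original file): a minimal tap
 * registered with dev_add_pack().  The names "example_rcv" and
 * "example_pt" are hypothetical.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// taps receive their own clone
 *		return 0;
 *	}
 *
 *	static struct packet_type example_pt = {
 *		.type = htons(ETH_P_ALL),
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_pt);
 *	// ... later: dev_remove_pack(&example_pt);
 */
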
382 /**
383  *	__dev_remove_pack	 - remove packet handler
384  *	@pt: packet type declaration
385  *
386  *	Remove a protocol handler that was previously added to the kernel
387  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
388  *	from the kernel lists and can be freed or reused once this function
389  *	returns.
390  *
391  *      The packet type might still be in use by receivers
392  *	and must not be freed until after all the CPUs have gone
393  *	through a quiescent state.
394  */
395 void __dev_remove_pack(struct packet_type *pt)
396 {
397 	struct list_head *head;
398 	struct packet_type *pt1;
399 
400 	spin_lock_bh(&ptype_lock);
401 
402 	if (pt->type == htons(ETH_P_ALL))
403 		head = &ptype_all;
404 	else
405 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
406 
407 	list_for_each_entry(pt1, head, list) {
408 		if (pt == pt1) {
409 			list_del_rcu(&pt->list);
410 			goto out;
411 		}
412 	}
413 
414 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
415 out:
416 	spin_unlock_bh(&ptype_lock);
417 }
418 /**
419  *	dev_remove_pack	 - remove packet handler
420  *	@pt: packet type declaration
421  *
422  *	Remove a protocol handler that was previously added to the kernel
423  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
424  *	from the kernel lists and can be freed or reused once this function
425  *	returns.
426  *
427  *	This call sleeps to guarantee that no CPU is looking at the packet
428  *	type after return.
429  */
430 void dev_remove_pack(struct packet_type *pt)
431 {
432 	__dev_remove_pack(pt);
433 
434 	synchronize_net();
435 }
436 
437 /******************************************************************************
438 
439 		      Device Boot-time Settings Routines
440 
441 *******************************************************************************/
442 
443 /* Boot time configuration table */
444 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
445 
446 /**
447  *	netdev_boot_setup_add	- add new setup entry
448  *	@name: name of the device
449  *	@map: configured settings for the device
450  *
451  *	Adds new setup entry to the dev_boot_setup list.  The function
452  *	returns 0 on error and 1 on success.  This is a generic routine for
453  *	all netdevices.
454  */
455 static int netdev_boot_setup_add(char *name, struct ifmap *map)
456 {
457 	struct netdev_boot_setup *s;
458 	int i;
459 
460 	s = dev_boot_setup;
461 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
462 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
463 			memset(s[i].name, 0, sizeof(s[i].name));
464 			strlcpy(s[i].name, name, IFNAMSIZ);
465 			memcpy(&s[i].map, map, sizeof(s[i].map));
466 			break;
467 		}
468 	}
469 
470 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
471 }
472 
473 /**
474  *	netdev_boot_setup_check	- check boot time settings
475  *	@dev: the netdevice
476  *
477  * 	Check boot time settings for the device.
478  *	The found settings are set for the device to be used
479  *	later in the device probing.
480  *	Returns 0 if no settings were found, 1 if they were.
481  */
482 int netdev_boot_setup_check(struct net_device *dev)
483 {
484 	struct netdev_boot_setup *s = dev_boot_setup;
485 	int i;
486 
487 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
488 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
489 		    !strcmp(dev->name, s[i].name)) {
490 			dev->irq 	= s[i].map.irq;
491 			dev->base_addr 	= s[i].map.base_addr;
492 			dev->mem_start 	= s[i].map.mem_start;
493 			dev->mem_end 	= s[i].map.mem_end;
494 			return 1;
495 		}
496 	}
497 	return 0;
498 }
499 
500 
501 /**
502  *	netdev_boot_base	- get address from boot time settings
503  *	@prefix: prefix for network device
504  *	@unit: id for network device
505  *
506  * 	Check boot time settings for the base address of device.
507  *	The found settings are set for the device to be used
508  *	later in the device probing.
509  *	Returns 0 if no settings found.
510  */
511 unsigned long netdev_boot_base(const char *prefix, int unit)
512 {
513 	const struct netdev_boot_setup *s = dev_boot_setup;
514 	char name[IFNAMSIZ];
515 	int i;
516 
517 	sprintf(name, "%s%d", prefix, unit);
518 
519 	/*
520 	 * If device already registered then return base of 1
521 	 * to indicate not to probe for this interface
522 	 */
523 	if (__dev_get_by_name(&init_net, name))
524 		return 1;
525 
526 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
527 		if (!strcmp(name, s[i].name))
528 			return s[i].map.base_addr;
529 	return 0;
530 }
531 
532 /*
533  * Saves at boot time configured settings for any netdevice.
534  */
535 int __init netdev_boot_setup(char *str)
536 {
537 	int ints[5];
538 	struct ifmap map;
539 
540 	str = get_options(str, ARRAY_SIZE(ints), ints);
541 	if (!str || !*str)
542 		return 0;
543 
544 	/* Save settings */
545 	memset(&map, 0, sizeof(map));
546 	if (ints[0] > 0)
547 		map.irq = ints[1];
548 	if (ints[0] > 1)
549 		map.base_addr = ints[2];
550 	if (ints[0] > 2)
551 		map.mem_start = ints[3];
552 	if (ints[0] > 3)
553 		map.mem_end = ints[4];
554 
555 	/* Add new entry to the list */
556 	return netdev_boot_setup_add(str, &map);
557 }
558 
559 __setup("netdev=", netdev_boot_setup);
560 
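/*
 * Illustrative note (not part of the original file): as parsed above, the
 * parameter takes up to four integers followed by the interface name,
 * e.g.
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * which records irq=9 and base_addr=0x300 for "eth0" so that
 * netdev_boot_setup_check() can apply them when the device is probed.
 */
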
561 /*******************************************************************************
562 
563 			    Device Interface Subroutines
564 
565 *******************************************************************************/
566 
567 /**
568  *	__dev_get_by_name	- find a device by its name
569  *	@net: the applicable net namespace
570  *	@name: name to find
571  *
572  *	Find an interface by name. Must be called under RTNL semaphore
573  *	or @dev_base_lock. If the name is found a pointer to the device
574  *	is returned. If the name is not found then %NULL is returned. The
575  *	reference counters are not incremented so the caller must be
576  *	careful with locks.
577  */
578 
579 struct net_device *__dev_get_by_name(struct net *net, const char *name)
580 {
581 	struct hlist_node *p;
582 
583 	hlist_for_each(p, dev_name_hash(net, name)) {
584 		struct net_device *dev
585 			= hlist_entry(p, struct net_device, name_hlist);
586 		if (!strncmp(dev->name, name, IFNAMSIZ))
587 			return dev;
588 	}
589 	return NULL;
590 }
591 
592 /**
593  *	dev_get_by_name		- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. This can be called from any
598  *	context and does its own locking. The returned handle has
599  *	the usage count incremented and the caller must use dev_put() to
600  *	release it when it is no longer needed. %NULL is returned if no
601  *	matching device is found.
602  */
603 
604 struct net_device *dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct net_device *dev;
607 
608 	read_lock(&dev_base_lock);
609 	dev = __dev_get_by_name(net, name);
610 	if (dev)
611 		dev_hold(dev);
612 	read_unlock(&dev_base_lock);
613 	return dev;
614 }
615 
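/*
 * Illustrative sketch (not part of the original file): typical use of the
 * reference-counted lookup.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */
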
616 /**
617  *	__dev_get_by_index - find a device by its ifindex
618  *	@net: the applicable net namespace
619  *	@ifindex: index of device
620  *
621  *	Search for an interface by index. Returns %NULL if the device
622  *	is not found or a pointer to the device. The device has not
623  *	had its reference counter increased so the caller must be careful
624  *	about locking. The caller must hold either the RTNL semaphore
625  *	or @dev_base_lock.
626  */
627 
628 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
629 {
630 	struct hlist_node *p;
631 
632 	hlist_for_each(p, dev_index_hash(net, ifindex)) {
633 		struct net_device *dev
634 			= hlist_entry(p, struct net_device, index_hlist);
635 		if (dev->ifindex == ifindex)
636 			return dev;
637 	}
638 	return NULL;
639 }
640 
641 
642 /**
643  *	dev_get_by_index - find a device by its ifindex
644  *	@net: the applicable net namespace
645  *	@ifindex: index of device
646  *
647  *	Search for an interface by index. Returns NULL if the device
648  *	is not found or a pointer to the device. The device returned has
649  *	had a reference added and the pointer is safe until the user calls
650  *	dev_put to indicate they have finished with it.
651  */
652 
653 struct net_device *dev_get_by_index(struct net *net, int ifindex)
654 {
655 	struct net_device *dev;
656 
657 	read_lock(&dev_base_lock);
658 	dev = __dev_get_by_index(net, ifindex);
659 	if (dev)
660 		dev_hold(dev);
661 	read_unlock(&dev_base_lock);
662 	return dev;
663 }
664 
665 /**
666  *	dev_getbyhwaddr - find a device by its hardware address
667  *	@net: the applicable net namespace
668  *	@type: media type of device
669  *	@ha: hardware address
670  *
671  *	Search for an interface by MAC address. Returns NULL if the device
672  *	is not found or a pointer to the device. The caller must hold the
673  *	rtnl semaphore. The returned device has not had its ref count increased
674  *	and the caller must therefore be careful about locking
675  *
676  *	BUGS:
677  *	If the API was consistent this would be __dev_get_by_hwaddr
678  */
679 
680 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
681 {
682 	struct net_device *dev;
683 
684 	ASSERT_RTNL();
685 
686 	for_each_netdev(net, dev)
687 		if (dev->type == type &&
688 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
689 			return dev;
690 
691 	return NULL;
692 }
693 
694 EXPORT_SYMBOL(dev_getbyhwaddr);
695 
696 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
697 {
698 	struct net_device *dev;
699 
700 	ASSERT_RTNL();
701 	for_each_netdev(net, dev)
702 		if (dev->type == type)
703 			return dev;
704 
705 	return NULL;
706 }
707 
708 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
709 
710 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
711 {
712 	struct net_device *dev;
713 
714 	rtnl_lock();
715 	dev = __dev_getfirstbyhwtype(net, type);
716 	if (dev)
717 		dev_hold(dev);
718 	rtnl_unlock();
719 	return dev;
720 }
721 
722 EXPORT_SYMBOL(dev_getfirstbyhwtype);
723 
724 /**
725  *	dev_get_by_flags - find any device with given flags
726  *	@net: the applicable net namespace
727  *	@if_flags: IFF_* values
728  *	@mask: bitmask of bits in if_flags to check
729  *
730  *	Search for any interface with the given flags. Returns NULL if a device
731  *	is not found or a pointer to the device. The device returned has
732  *	had a reference added and the pointer is safe until the user calls
733  *	dev_put to indicate they have finished with it.
734  */
735 
736 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
737 {
738 	struct net_device *dev, *ret;
739 
740 	ret = NULL;
741 	read_lock(&dev_base_lock);
742 	for_each_netdev(net, dev) {
743 		if (((dev->flags ^ if_flags) & mask) == 0) {
744 			dev_hold(dev);
745 			ret = dev;
746 			break;
747 		}
748 	}
749 	read_unlock(&dev_base_lock);
750 	return ret;
751 }
752 
753 /**
754  *	dev_valid_name - check if name is okay for network device
755  *	@name: name string
756  *
757  *	Network device names need to be valid file names
758  *	to allow sysfs to work.  We also disallow any kind of
759  *	whitespace.
760  */
761 int dev_valid_name(const char *name)
762 {
763 	if (*name == '\0')
764 		return 0;
765 	if (strlen(name) >= IFNAMSIZ)
766 		return 0;
767 	if (!strcmp(name, ".") || !strcmp(name, ".."))
768 		return 0;
769 
770 	while (*name) {
771 		if (*name == '/' || isspace(*name))
772 			return 0;
773 		name++;
774 	}
775 	return 1;
776 }
777 
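/*
 * Illustrative note (not part of the original file): under these rules
 * "eth0" and "wlan0" are valid, while "", ".", "..", "my dev" (embedded
 * whitespace), "a/b" (slash) and any name of IFNAMSIZ or more characters
 * are rejected.
 */
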
778 /**
779  *	__dev_alloc_name - allocate a name for a device
780  *	@net: network namespace to allocate the device name in
781  *	@name: name format string
782  *	@buf:  scratch buffer and result name string
783  *
784  *	Passed a format string - eg "lt%d" it will try and find a suitable
785  *	id. It scans list of devices to build up a free map, then chooses
786  *	the first empty slot. The caller must hold the dev_base or rtnl lock
787  *	while allocating the name and adding the device in order to avoid
788  *	duplicates.
789  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
790  *	Returns the number of the unit assigned or a negative errno code.
791  */
792 
793 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
794 {
795 	int i = 0;
796 	const char *p;
797 	const int max_netdevices = 8*PAGE_SIZE;
798 	unsigned long *inuse;
799 	struct net_device *d;
800 
801 	p = strnchr(name, IFNAMSIZ-1, '%');
802 	if (p) {
803 		/*
804 		 * Verify the string as this thing may have come from
805 		 * the user.  There must be either one "%d" and no other "%"
806 		 * characters.
807 		 */
808 		if (p[1] != 'd' || strchr(p + 2, '%'))
809 			return -EINVAL;
810 
811 		/* Use one page as a bit array of possible slots */
812 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
813 		if (!inuse)
814 			return -ENOMEM;
815 
816 		for_each_netdev(net, d) {
817 			if (!sscanf(d->name, name, &i))
818 				continue;
819 			if (i < 0 || i >= max_netdevices)
820 				continue;
821 
822 			/*  avoid cases where sscanf is not exact inverse of printf */
823 			snprintf(buf, IFNAMSIZ, name, i);
824 			if (!strncmp(buf, d->name, IFNAMSIZ))
825 				set_bit(i, inuse);
826 		}
827 
828 		i = find_first_zero_bit(inuse, max_netdevices);
829 		free_page((unsigned long) inuse);
830 	}
831 
832 	snprintf(buf, IFNAMSIZ, name, i);
833 	if (!__dev_get_by_name(net, buf))
834 		return i;
835 
836 	/* It is possible to run out of possible slots
837 	 * when the name is long and there isn't enough space left
838 	 * for the digits, or if all bits are used.
839 	 */
840 	return -ENFILE;
841 }
842 
843 /**
844  *	dev_alloc_name - allocate a name for a device
845  *	@dev: device
846  *	@name: name format string
847  *
848  *	Passed a format string - eg "lt%d" it will try and find a suitable
849  *	id. It scans list of devices to build up a free map, then chooses
850  *	the first empty slot. The caller must hold the dev_base or rtnl lock
851  *	while allocating the name and adding the device in order to avoid
852  *	duplicates.
853  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
854  *	Returns the number of the unit assigned or a negative errno code.
855  */
856 
857 int dev_alloc_name(struct net_device *dev, const char *name)
858 {
859 	char buf[IFNAMSIZ];
860 	struct net *net;
861 	int ret;
862 
863 	BUG_ON(!dev_net(dev));
864 	net = dev_net(dev);
865 	ret = __dev_alloc_name(net, name, buf);
866 	if (ret >= 0)
867 		strlcpy(dev->name, buf, IFNAMSIZ);
868 	return ret;
869 }
870 
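/*
 * Illustrative sketch (not part of the original file): a driver asking
 * for the first free "eth%d" slot while holding the rtnl lock.
 *
 *	rtnl_lock();
 *	err = dev_alloc_name(dev, "eth%d");	// e.g. fills dev->name with "eth2"
 *	rtnl_unlock();
 *	if (err < 0)
 *		return err;
 */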
871 
872 /**
873  *	dev_change_name - change name of a device
874  *	@dev: device
875  *	@newname: name (or format string) must be at least IFNAMSIZ
876  *
877  *	Change the name of a device. A format string such as "eth%d" can be passed
878  *	for wildcarding.
879  */
880 int dev_change_name(struct net_device *dev, const char *newname)
881 {
882 	char oldname[IFNAMSIZ];
883 	int err = 0;
884 	int ret;
885 	struct net *net;
886 
887 	ASSERT_RTNL();
888 	BUG_ON(!dev_net(dev));
889 
890 	net = dev_net(dev);
891 	if (dev->flags & IFF_UP)
892 		return -EBUSY;
893 
894 	if (!dev_valid_name(newname))
895 		return -EINVAL;
896 
897 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
898 		return 0;
899 
900 	memcpy(oldname, dev->name, IFNAMSIZ);
901 
902 	if (strchr(newname, '%')) {
903 		err = dev_alloc_name(dev, newname);
904 		if (err < 0)
905 			return err;
906 	}
907 	else if (__dev_get_by_name(net, newname))
908 		return -EEXIST;
909 	else
910 		strlcpy(dev->name, newname, IFNAMSIZ);
911 
912 rollback:
913 	/* For now only devices in the initial network namespace
914 	 * are in sysfs.
915 	 */
916 	if (net == &init_net) {
917 		ret = device_rename(&dev->dev, dev->name);
918 		if (ret) {
919 			memcpy(dev->name, oldname, IFNAMSIZ);
920 			return ret;
921 		}
922 	}
923 
924 	write_lock_bh(&dev_base_lock);
925 	hlist_del(&dev->name_hlist);
926 	hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
927 	write_unlock_bh(&dev_base_lock);
928 
929 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
930 	ret = notifier_to_errno(ret);
931 
932 	if (ret) {
933 		if (err) {
934 			printk(KERN_ERR
935 			       "%s: name change rollback failed: %d.\n",
936 			       dev->name, ret);
937 		} else {
938 			err = ret;
939 			memcpy(dev->name, oldname, IFNAMSIZ);
940 			goto rollback;
941 		}
942 	}
943 
944 	return err;
945 }
946 
947 /**
948  *	dev_set_alias - change ifalias of a device
949  *	@dev: device
950  *	@alias: name up to IFALIASZ
951  *	@len: limit of bytes to copy from info
952  *
953  *	Set ifalias for a device.
954  */
955 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
956 {
957 	ASSERT_RTNL();
958 
959 	if (len >= IFALIASZ)
960 		return -EINVAL;
961 
962 	if (!len) {
963 		if (dev->ifalias) {
964 			kfree(dev->ifalias);
965 			dev->ifalias = NULL;
966 		}
967 		return 0;
968 	}
969 
970 	dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
971 	if (!dev->ifalias)
972 		return -ENOMEM;
973 
974 	strlcpy(dev->ifalias, alias, len+1);
975 	return len;
976 }
977 
978 
979 /**
980  *	netdev_features_change - device changes features
981  *	@dev: device to cause notification
982  *
983  *	Called to indicate a device has changed features.
984  */
985 void netdev_features_change(struct net_device *dev)
986 {
987 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
988 }
989 EXPORT_SYMBOL(netdev_features_change);
990 
991 /**
992  *	netdev_state_change - device changes state
993  *	@dev: device to cause notification
994  *
995  *	Called to indicate a device has changed state. This function calls
996  *	the notifier chains for netdev_chain and sends a NEWLINK message
997  *	to the routing socket.
998  */
999 void netdev_state_change(struct net_device *dev)
1000 {
1001 	if (dev->flags & IFF_UP) {
1002 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1003 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1004 	}
1005 }
1006 
1007 void netdev_bonding_change(struct net_device *dev)
1008 {
1009 	call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1010 }
1011 EXPORT_SYMBOL(netdev_bonding_change);
1012 
1013 /**
1014  *	dev_load 	- load a network module
1015  *	@net: the applicable net namespace
1016  *	@name: name of interface
1017  *
1018  *	If a network interface is not present and the process has suitable
1019  *	privileges this function loads the module. If module loading is not
1020  *	available in this kernel then it becomes a nop.
1021  */
1022 
1023 void dev_load(struct net *net, const char *name)
1024 {
1025 	struct net_device *dev;
1026 
1027 	read_lock(&dev_base_lock);
1028 	dev = __dev_get_by_name(net, name);
1029 	read_unlock(&dev_base_lock);
1030 
1031 	if (!dev && capable(CAP_SYS_MODULE))
1032 		request_module("%s", name);
1033 }
1034 
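/*
 * Illustrative sketch (not part of the original file): ioctl-style code
 * can call dev_load() before a lookup so that a modular driver aliased to
 * the interface name gets a chance to load first ("ifr" here is a
 * hypothetical struct ifreq from the caller).
 *
 *	dev_load(net, ifr.ifr_name);
 *	dev = dev_get_by_name(net, ifr.ifr_name);
 */
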
1035 /**
1036  *	dev_open	- prepare an interface for use.
1037  *	@dev:	device to open
1038  *
1039  *	Takes a device from down to up state. The device's private open
1040  *	function is invoked and then the multicast lists are loaded. Finally
1041  *	the device is moved into the up state and a %NETDEV_UP message is
1042  *	sent to the netdev notifier chain.
1043  *
1044  *	Calling this function on an active interface is a nop. On a failure
1045  *	a negative errno code is returned.
1046  */
1047 int dev_open(struct net_device *dev)
1048 {
1049 	const struct net_device_ops *ops = dev->netdev_ops;
1050 	int ret = 0;
1051 
1052 	ASSERT_RTNL();
1053 
1054 	/*
1055 	 *	Is it already up?
1056 	 */
1057 
1058 	if (dev->flags & IFF_UP)
1059 		return 0;
1060 
1061 	/*
1062 	 *	Is it even present?
1063 	 */
1064 	if (!netif_device_present(dev))
1065 		return -ENODEV;
1066 
1067 	/*
1068 	 *	Call device private open method
1069 	 */
1070 	set_bit(__LINK_STATE_START, &dev->state);
1071 
1072 	if (ops->ndo_validate_addr)
1073 		ret = ops->ndo_validate_addr(dev);
1074 
1075 	if (!ret && ops->ndo_open)
1076 		ret = ops->ndo_open(dev);
1077 
1078 	/*
1079 	 *	If it went open OK then:
1080 	 */
1081 
1082 	if (ret)
1083 		clear_bit(__LINK_STATE_START, &dev->state);
1084 	else {
1085 		/*
1086 		 *	Set the flags.
1087 		 */
1088 		dev->flags |= IFF_UP;
1089 
1090 		/*
1091 		 *	Enable NET_DMA
1092 		 */
1093 		net_dmaengine_get();
1094 
1095 		/*
1096 		 *	Initialize multicasting status
1097 		 */
1098 		dev_set_rx_mode(dev);
1099 
1100 		/*
1101 		 *	Wakeup transmit queue engine
1102 		 */
1103 		dev_activate(dev);
1104 
1105 		/*
1106 		 *	... and announce new interface.
1107 		 */
1108 		call_netdevice_notifiers(NETDEV_UP, dev);
1109 	}
1110 
1111 	return ret;
1112 }
1113 
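/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface up from kernel code, under the RTNL semaphore as required.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */
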
1114 /**
1115  *	dev_close - shutdown an interface.
1116  *	@dev: device to shutdown
1117  *
1118  *	This function moves an active device into down state. A
1119  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1120  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1121  *	chain.
1122  */
1123 int dev_close(struct net_device *dev)
1124 {
1125 	const struct net_device_ops *ops = dev->netdev_ops;
1126 	ASSERT_RTNL();
1127 
1128 	might_sleep();
1129 
1130 	if (!(dev->flags & IFF_UP))
1131 		return 0;
1132 
1133 	/*
1134 	 *	Tell people we are going down, so that they can
1135 	 *	prepare for death while the device is still operating.
1136 	 */
1137 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1138 
1139 	clear_bit(__LINK_STATE_START, &dev->state);
1140 
1141 	/* Synchronize to scheduled poll. We cannot touch poll list,
1142 	 * it can even be on a different CPU. So just clear netif_running().
1143 	 *
1144 	 * dev->stop() will invoke napi_disable() on all of its
1145 	 * napi_struct instances on this device.
1146 	 */
1147 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1148 
1149 	dev_deactivate(dev);
1150 
1151 	/*
1152 	 *	Call the device specific close. This cannot fail.
1153 	 *	Only if device is UP
1154 	 *
1155 	 *	We allow it to be called even after a DETACH hot-plug
1156 	 *	event.
1157 	 */
1158 	if (ops->ndo_stop)
1159 		ops->ndo_stop(dev);
1160 
1161 	/*
1162 	 *	Device is now down.
1163 	 */
1164 
1165 	dev->flags &= ~IFF_UP;
1166 
1167 	/*
1168 	 * Tell people we are down
1169 	 */
1170 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1171 
1172 	/*
1173 	 *	Shutdown NET_DMA
1174 	 */
1175 	net_dmaengine_put();
1176 
1177 	return 0;
1178 }
1179 
1180 
1181 /**
1182  *	dev_disable_lro - disable Large Receive Offload on a device
1183  *	@dev: device
1184  *
1185  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1186  *	called under RTNL.  This is needed if received packets may be
1187  *	forwarded to another interface.
1188  */
1189 void dev_disable_lro(struct net_device *dev)
1190 {
1191 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1192 	    dev->ethtool_ops->set_flags) {
1193 		u32 flags = dev->ethtool_ops->get_flags(dev);
1194 		if (flags & ETH_FLAG_LRO) {
1195 			flags &= ~ETH_FLAG_LRO;
1196 			dev->ethtool_ops->set_flags(dev, flags);
1197 		}
1198 	}
1199 	WARN_ON(dev->features & NETIF_F_LRO);
1200 }
1201 EXPORT_SYMBOL(dev_disable_lro);
1202 
1203 
1204 static int dev_boot_phase = 1;
1205 
1206 /*
1207  *	Device change register/unregister. These are not inline or static
1208  *	as we export them to the world.
1209  */
1210 
1211 /**
1212  *	register_netdevice_notifier - register a network notifier block
1213  *	@nb: notifier
1214  *
1215  *	Register a notifier to be called when network device events occur.
1216  *	The notifier passed is linked into the kernel structures and must
1217  *	not be reused until it has been unregistered. A negative errno code
1218  *	is returned on a failure.
1219  *
1220  * 	When registered, all registration and up events are replayed
1221  *	to the new notifier so that it has a race-free
1222  *	view of the network device list.
1223  */
1224 
1225 int register_netdevice_notifier(struct notifier_block *nb)
1226 {
1227 	struct net_device *dev;
1228 	struct net_device *last;
1229 	struct net *net;
1230 	int err;
1231 
1232 	rtnl_lock();
1233 	err = raw_notifier_chain_register(&netdev_chain, nb);
1234 	if (err)
1235 		goto unlock;
1236 	if (dev_boot_phase)
1237 		goto unlock;
1238 	for_each_net(net) {
1239 		for_each_netdev(net, dev) {
1240 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1241 			err = notifier_to_errno(err);
1242 			if (err)
1243 				goto rollback;
1244 
1245 			if (!(dev->flags & IFF_UP))
1246 				continue;
1247 
1248 			nb->notifier_call(nb, NETDEV_UP, dev);
1249 		}
1250 	}
1251 
1252 unlock:
1253 	rtnl_unlock();
1254 	return err;
1255 
1256 rollback:
1257 	last = dev;
1258 	for_each_net(net) {
1259 		for_each_netdev(net, dev) {
1260 			if (dev == last)
1261 				break;
1262 
1263 			if (dev->flags & IFF_UP) {
1264 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1265 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1266 			}
1267 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1268 		}
1269 	}
1270 
1271 	raw_notifier_chain_unregister(&netdev_chain, nb);
1272 	goto unlock;
1273 }
1274 
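/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * that logs NETDEV_UP events.  The names "example_event" and "example_nb"
 * are hypothetical.
 *
 *	static int example_event(struct notifier_block *nb,
 *				 unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 */
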
1275 /**
1276  *	unregister_netdevice_notifier - unregister a network notifier block
1277  *	@nb: notifier
1278  *
1279  *	Unregister a notifier previously registered by
1280  *	register_netdevice_notifier(). The notifier is unlinked from the
1281  *	kernel structures and may then be reused. A negative errno code
1282  *	is returned on a failure.
1283  */
1284 
1285 int unregister_netdevice_notifier(struct notifier_block *nb)
1286 {
1287 	int err;
1288 
1289 	rtnl_lock();
1290 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1291 	rtnl_unlock();
1292 	return err;
1293 }
1294 
1295 /**
1296  *	call_netdevice_notifiers - call all network notifier blocks
1297  *      @val: value passed unmodified to notifier function
1298  *      @dev: net_device pointer passed unmodified to notifier function
1299  *
1300  *	Call all network notifier blocks.  Parameters and return value
1301  *	are as for raw_notifier_call_chain().
1302  */
1303 
1304 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1305 {
1306 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1307 }
1308 
1309 /* When > 0 there are consumers of rx skb time stamps */
1310 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1311 
1312 void net_enable_timestamp(void)
1313 {
1314 	atomic_inc(&netstamp_needed);
1315 }
1316 
1317 void net_disable_timestamp(void)
1318 {
1319 	atomic_dec(&netstamp_needed);
1320 }
1321 
1322 static inline void net_timestamp(struct sk_buff *skb)
1323 {
1324 	if (atomic_read(&netstamp_needed))
1325 		__net_timestamp(skb);
1326 	else
1327 		skb->tstamp.tv64 = 0;
1328 }
1329 
1330 /*
1331  *	Support routine. Sends outgoing frames to any network
1332  *	taps currently in use.
1333  */
1334 
1335 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1336 {
1337 	struct packet_type *ptype;
1338 
1339 	net_timestamp(skb);
1340 
1341 	rcu_read_lock();
1342 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1343 		/* Never send packets back to the socket
1344 		 * they originated from - MvS (miquels@drinkel.ow.org)
1345 		 */
1346 		if ((ptype->dev == dev || !ptype->dev) &&
1347 		    (ptype->af_packet_priv == NULL ||
1348 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1349 			struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1350 			if (!skb2)
1351 				break;
1352 
1353 			/* skb->nh should be correctly
1354 			   set by sender, so that the second statement is
1355 			   just protection against buggy protocols.
1356 			 */
1357 			skb_reset_mac_header(skb2);
1358 
1359 			if (skb_network_header(skb2) < skb2->data ||
1360 			    skb2->network_header > skb2->tail) {
1361 				if (net_ratelimit())
1362 					printk(KERN_CRIT "protocol %04x is "
1363 					       "buggy, dev %s\n",
1364 					       skb2->protocol, dev->name);
1365 				skb_reset_network_header(skb2);
1366 			}
1367 
1368 			skb2->transport_header = skb2->network_header;
1369 			skb2->pkt_type = PACKET_OUTGOING;
1370 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1371 		}
1372 	}
1373 	rcu_read_unlock();
1374 }
1375 
1376 
1377 static inline void __netif_reschedule(struct Qdisc *q)
1378 {
1379 	struct softnet_data *sd;
1380 	unsigned long flags;
1381 
1382 	local_irq_save(flags);
1383 	sd = &__get_cpu_var(softnet_data);
1384 	q->next_sched = sd->output_queue;
1385 	sd->output_queue = q;
1386 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1387 	local_irq_restore(flags);
1388 }
1389 
1390 void __netif_schedule(struct Qdisc *q)
1391 {
1392 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1393 		__netif_reschedule(q);
1394 }
1395 EXPORT_SYMBOL(__netif_schedule);
1396 
1397 void dev_kfree_skb_irq(struct sk_buff *skb)
1398 {
1399 	if (atomic_dec_and_test(&skb->users)) {
1400 		struct softnet_data *sd;
1401 		unsigned long flags;
1402 
1403 		local_irq_save(flags);
1404 		sd = &__get_cpu_var(softnet_data);
1405 		skb->next = sd->completion_queue;
1406 		sd->completion_queue = skb;
1407 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1408 		local_irq_restore(flags);
1409 	}
1410 }
1411 EXPORT_SYMBOL(dev_kfree_skb_irq);
1412 
1413 void dev_kfree_skb_any(struct sk_buff *skb)
1414 {
1415 	if (in_irq() || irqs_disabled())
1416 		dev_kfree_skb_irq(skb);
1417 	else
1418 		dev_kfree_skb(skb);
1419 }
1420 EXPORT_SYMBOL(dev_kfree_skb_any);
1421 
1422 
1423 /**
1424  * netif_device_detach - mark device as removed
1425  * @dev: network device
1426  *
1427  * Mark device as removed from the system and therefore no longer available.
1428  */
1429 void netif_device_detach(struct net_device *dev)
1430 {
1431 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1432 	    netif_running(dev)) {
1433 		netif_stop_queue(dev);
1434 	}
1435 }
1436 EXPORT_SYMBOL(netif_device_detach);
1437 
1438 /**
1439  * netif_device_attach - mark device as attached
1440  * @dev: network device
1441  *
1442  * Mark device as attached to the system and restart if needed.
1443  */
1444 void netif_device_attach(struct net_device *dev)
1445 {
1446 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1447 	    netif_running(dev)) {
1448 		netif_wake_queue(dev);
1449 		__netdev_watchdog_up(dev);
1450 	}
1451 }
1452 EXPORT_SYMBOL(netif_device_attach);
1453 
1454 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1455 {
1456 	return ((features & NETIF_F_GEN_CSUM) ||
1457 		((features & NETIF_F_IP_CSUM) &&
1458 		 protocol == htons(ETH_P_IP)) ||
1459 		((features & NETIF_F_IPV6_CSUM) &&
1460 		 protocol == htons(ETH_P_IPV6)));
1461 }
1462 
1463 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1464 {
1465 	if (can_checksum_protocol(dev->features, skb->protocol))
1466 		return true;
1467 
1468 	if (skb->protocol == htons(ETH_P_8021Q)) {
1469 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1470 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1471 					  veh->h_vlan_encapsulated_proto))
1472 			return true;
1473 	}
1474 
1475 	return false;
1476 }
1477 
1478 /*
1479  * Invalidate hardware checksum when packet is to be mangled, and
1480  * complete checksum manually on outgoing path.
1481  */
1482 int skb_checksum_help(struct sk_buff *skb)
1483 {
1484 	__wsum csum;
1485 	int ret = 0, offset;
1486 
1487 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1488 		goto out_set_summed;
1489 
1490 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1491 		/* Let GSO fix up the checksum. */
1492 		goto out_set_summed;
1493 	}
1494 
1495 	offset = skb->csum_start - skb_headroom(skb);
1496 	BUG_ON(offset >= skb_headlen(skb));
1497 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1498 
1499 	offset += skb->csum_offset;
1500 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1501 
1502 	if (skb_cloned(skb) &&
1503 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1504 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1505 		if (ret)
1506 			goto out;
1507 	}
1508 
1509 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1510 out_set_summed:
1511 	skb->ip_summed = CHECKSUM_NONE;
1512 out:
1513 	return ret;
1514 }
1515 
1516 /**
1517  *	skb_gso_segment - Perform segmentation on skb.
1518  *	@skb: buffer to segment
1519  *	@features: features for the output path (see dev->features)
1520  *
1521  *	This function segments the given skb and returns a list of segments.
1522  *
1523  *	It may return NULL if the skb requires no segmentation.  This is
1524  *	only possible when GSO is used for verifying header integrity.
1525  */
1526 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1527 {
1528 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1529 	struct packet_type *ptype;
1530 	__be16 type = skb->protocol;
1531 	int err;
1532 
1533 	skb_reset_mac_header(skb);
1534 	skb->mac_len = skb->network_header - skb->mac_header;
1535 	__skb_pull(skb, skb->mac_len);
1536 
1537 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1538 		struct net_device *dev = skb->dev;
1539 		struct ethtool_drvinfo info = {};
1540 
1541 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1542 			dev->ethtool_ops->get_drvinfo(dev, &info);
1543 
1544 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1545 			"ip_summed=%d",
1546 		     info.driver, dev ? dev->features : 0L,
1547 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1548 		     skb->len, skb->data_len, skb->ip_summed);
1549 
1550 		if (skb_header_cloned(skb) &&
1551 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1552 			return ERR_PTR(err);
1553 	}
1554 
1555 	rcu_read_lock();
1556 	list_for_each_entry_rcu(ptype,
1557 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1558 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1559 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1560 				err = ptype->gso_send_check(skb);
1561 				segs = ERR_PTR(err);
1562 				if (err || skb_gso_ok(skb, features))
1563 					break;
1564 				__skb_push(skb, (skb->data -
1565 						 skb_network_header(skb)));
1566 			}
1567 			segs = ptype->gso_segment(skb, features);
1568 			break;
1569 		}
1570 	}
1571 	rcu_read_unlock();
1572 
1573 	__skb_push(skb, skb->data - skb_mac_header(skb));
1574 
1575 	return segs;
1576 }
1577 
1578 EXPORT_SYMBOL(skb_gso_segment);
1579 
1580 /* Take action when hardware reception checksum errors are detected. */
1581 #ifdef CONFIG_BUG
1582 void netdev_rx_csum_fault(struct net_device *dev)
1583 {
1584 	if (net_ratelimit()) {
1585 		printk(KERN_ERR "%s: hw csum failure.\n",
1586 			dev ? dev->name : "<unknown>");
1587 		dump_stack();
1588 	}
1589 }
1590 EXPORT_SYMBOL(netdev_rx_csum_fault);
1591 #endif
1592 
1593 /* Actually, we should eliminate this check as soon as we know, that:
1594  * 1. IOMMU is present and allows to map all the memory.
1595  * 2. No high memory really exists on this machine.
1596  */
1597 
1598 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1599 {
1600 #ifdef CONFIG_HIGHMEM
1601 	int i;
1602 
1603 	if (dev->features & NETIF_F_HIGHDMA)
1604 		return 0;
1605 
1606 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1607 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1608 			return 1;
1609 
1610 #endif
1611 	return 0;
1612 }
1613 
1614 struct dev_gso_cb {
1615 	void (*destructor)(struct sk_buff *skb);
1616 };
1617 
1618 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1619 
1620 static void dev_gso_skb_destructor(struct sk_buff *skb)
1621 {
1622 	struct dev_gso_cb *cb;
1623 
1624 	do {
1625 		struct sk_buff *nskb = skb->next;
1626 
1627 		skb->next = nskb->next;
1628 		nskb->next = NULL;
1629 		kfree_skb(nskb);
1630 	} while (skb->next);
1631 
1632 	cb = DEV_GSO_CB(skb);
1633 	if (cb->destructor)
1634 		cb->destructor(skb);
1635 }
1636 
1637 /**
1638  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1639  *	@skb: buffer to segment
1640  *
1641  *	This function segments the given skb and stores the list of segments
1642  *	in skb->next.
1643  */
1644 static int dev_gso_segment(struct sk_buff *skb)
1645 {
1646 	struct net_device *dev = skb->dev;
1647 	struct sk_buff *segs;
1648 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1649 					 NETIF_F_SG : 0);
1650 
1651 	segs = skb_gso_segment(skb, features);
1652 
1653 	/* Verifying header integrity only. */
1654 	if (!segs)
1655 		return 0;
1656 
1657 	if (IS_ERR(segs))
1658 		return PTR_ERR(segs);
1659 
1660 	skb->next = segs;
1661 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1662 	skb->destructor = dev_gso_skb_destructor;
1663 
1664 	return 0;
1665 }
1666 
1667 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1668 			struct netdev_queue *txq)
1669 {
1670 	const struct net_device_ops *ops = dev->netdev_ops;
1671 
1672 	prefetch(&dev->netdev_ops->ndo_start_xmit);
1673 	if (likely(!skb->next)) {
1674 		if (!list_empty(&ptype_all))
1675 			dev_queue_xmit_nit(skb, dev);
1676 
1677 		if (netif_needs_gso(dev, skb)) {
1678 			if (unlikely(dev_gso_segment(skb)))
1679 				goto out_kfree_skb;
1680 			if (skb->next)
1681 				goto gso;
1682 		}
1683 
1684 		return ops->ndo_start_xmit(skb, dev);
1685 	}
1686 
1687 gso:
1688 	do {
1689 		struct sk_buff *nskb = skb->next;
1690 		int rc;
1691 
1692 		skb->next = nskb->next;
1693 		nskb->next = NULL;
1694 		rc = ops->ndo_start_xmit(nskb, dev);
1695 		if (unlikely(rc)) {
1696 			nskb->next = skb->next;
1697 			skb->next = nskb;
1698 			return rc;
1699 		}
1700 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1701 			return NETDEV_TX_BUSY;
1702 	} while (skb->next);
1703 
1704 	skb->destructor = DEV_GSO_CB(skb)->destructor;
1705 
1706 out_kfree_skb:
1707 	kfree_skb(skb);
1708 	return 0;
1709 }
1710 
1711 static u32 simple_tx_hashrnd;
1712 static int simple_tx_hashrnd_initialized = 0;
1713 
1714 static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb)
1715 {
1716 	u32 addr1, addr2, ports;
1717 	u32 hash, ihl;
1718 	u8 ip_proto = 0;
1719 
1720 	if (unlikely(!simple_tx_hashrnd_initialized)) {
1721 		get_random_bytes(&simple_tx_hashrnd, 4);
1722 		simple_tx_hashrnd_initialized = 1;
1723 	}
1724 
1725 	switch (skb->protocol) {
1726 	case htons(ETH_P_IP):
1727 		if (!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)))
1728 			ip_proto = ip_hdr(skb)->protocol;
1729 		addr1 = ip_hdr(skb)->saddr;
1730 		addr2 = ip_hdr(skb)->daddr;
1731 		ihl = ip_hdr(skb)->ihl;
1732 		break;
1733 	case htons(ETH_P_IPV6):
1734 		ip_proto = ipv6_hdr(skb)->nexthdr;
1735 		addr1 = ipv6_hdr(skb)->saddr.s6_addr32[3];
1736 		addr2 = ipv6_hdr(skb)->daddr.s6_addr32[3];
1737 		ihl = (40 >> 2);
1738 		break;
1739 	default:
1740 		return 0;
1741 	}
1742 
1743 
1744 	switch (ip_proto) {
1745 	case IPPROTO_TCP:
1746 	case IPPROTO_UDP:
1747 	case IPPROTO_DCCP:
1748 	case IPPROTO_ESP:
1749 	case IPPROTO_AH:
1750 	case IPPROTO_SCTP:
1751 	case IPPROTO_UDPLITE:
1752 		ports = *((u32 *) (skb_network_header(skb) + (ihl * 4)));
1753 		break;
1754 
1755 	default:
1756 		ports = 0;
1757 		break;
1758 	}
1759 
1760 	hash = jhash_3words(addr1, addr2, ports, simple_tx_hashrnd);
1761 
1762 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1763 }
1764 
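/*
 * Illustrative note (not part of the original file): the return statement
 * above scales the 32-bit jhash value into [0, real_num_tx_queues) with a
 * multiply-and-shift rather than a modulo; e.g. with 4 tx queues a hash
 * of 0xC0000000 selects queue ((u64)0xC0000000 * 4) >> 32 == 3.
 */
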
1765 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1766 					struct sk_buff *skb)
1767 {
1768 	const struct net_device_ops *ops = dev->netdev_ops;
1769 	u16 queue_index = 0;
1770 
1771 	if (ops->ndo_select_queue)
1772 		queue_index = ops->ndo_select_queue(dev, skb);
1773 	else if (dev->real_num_tx_queues > 1)
1774 		queue_index = simple_tx_hash(dev, skb);
1775 
1776 	skb_set_queue_mapping(skb, queue_index);
1777 	return netdev_get_tx_queue(dev, queue_index);
1778 }
1779 
1780 /**
1781  *	dev_queue_xmit - transmit a buffer
1782  *	@skb: buffer to transmit
1783  *
1784  *	Queue a buffer for transmission to a network device. The caller must
1785  *	have set the device and priority and built the buffer before calling
1786  *	this function. The function can be called from an interrupt.
1787  *
1788  *	A negative errno code is returned on a failure. A success does not
1789  *	guarantee the frame will be transmitted as it may be dropped due
1790  *	to congestion or traffic shaping.
1791  *
1792  * -----------------------------------------------------------------------------------
1793  *      I notice this method can also return errors from the queue disciplines,
1794  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1795  *      be positive.
1796  *
1797  *      Regardless of the return value, the skb is consumed, so it is currently
1798  *      difficult to retry a send to this method.  (You can bump the ref count
1799  *      before sending to hold a reference for retry if you are careful.)
1800  *
1801  *      When calling this method, interrupts MUST be enabled.  This is because
1802  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1803  *          --BLG
1804  */
1805 int dev_queue_xmit(struct sk_buff *skb)
1806 {
1807 	struct net_device *dev = skb->dev;
1808 	struct netdev_queue *txq;
1809 	struct Qdisc *q;
1810 	int rc = -ENOMEM;
1811 
1812 	/* GSO will handle the following emulations directly. */
1813 	if (netif_needs_gso(dev, skb))
1814 		goto gso;
1815 
1816 	if (skb_shinfo(skb)->frag_list &&
1817 	    !(dev->features & NETIF_F_FRAGLIST) &&
1818 	    __skb_linearize(skb))
1819 		goto out_kfree_skb;
1820 
1821 	/* Fragmented skb is linearized if device does not support SG,
1822 	 * or if at least one of fragments is in highmem and device
1823 	 * does not support DMA from it.
1824 	 */
1825 	if (skb_shinfo(skb)->nr_frags &&
1826 	    (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1827 	    __skb_linearize(skb))
1828 		goto out_kfree_skb;
1829 
1830 	/* If packet is not checksummed and device does not support
1831 	 * checksumming for this protocol, complete checksumming here.
1832 	 */
1833 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
1834 		skb_set_transport_header(skb, skb->csum_start -
1835 					      skb_headroom(skb));
1836 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1837 			goto out_kfree_skb;
1838 	}
1839 
1840 gso:
1841 	/* Disable soft irqs for various locks below. Also
1842 	 * stops preemption for RCU.
1843 	 */
1844 	rcu_read_lock_bh();
1845 
1846 	txq = dev_pick_tx(dev, skb);
1847 	q = rcu_dereference(txq->qdisc);
1848 
1849 #ifdef CONFIG_NET_CLS_ACT
1850 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1851 #endif
1852 	if (q->enqueue) {
1853 		spinlock_t *root_lock = qdisc_lock(q);
1854 
1855 		spin_lock(root_lock);
1856 
1857 		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1858 			kfree_skb(skb);
1859 			rc = NET_XMIT_DROP;
1860 		} else {
1861 			rc = qdisc_enqueue_root(skb, q);
1862 			qdisc_run(q);
1863 		}
1864 		spin_unlock(root_lock);
1865 
1866 		goto out;
1867 	}
1868 
1869 	/* The device has no queue. Common case for software devices:
1870 	   loopback, all the sorts of tunnels...
1871 
1872 	   Really, it is unlikely that netif_tx_lock protection is necessary
1873 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1874 	   counters.)
1875 	   However, it is possible that they rely on the protection
1876 	   we provide here.
1877 
1878 	   Check this and take the lock anyway. It is not prone to deadlocks.
1879 	   Or just shoot the noqueue qdisc - that is even simpler 8)
1880 	 */
1881 	if (dev->flags & IFF_UP) {
1882 		int cpu = smp_processor_id(); /* ok because BHs are off */
1883 
1884 		if (txq->xmit_lock_owner != cpu) {
1885 
1886 			HARD_TX_LOCK(dev, txq, cpu);
1887 
1888 			if (!netif_tx_queue_stopped(txq)) {
1889 				rc = 0;
1890 				if (!dev_hard_start_xmit(skb, dev, txq)) {
1891 					HARD_TX_UNLOCK(dev, txq);
1892 					goto out;
1893 				}
1894 			}
1895 			HARD_TX_UNLOCK(dev, txq);
1896 			if (net_ratelimit())
1897 				printk(KERN_CRIT "Virtual device %s asks to "
1898 				       "queue packet!\n", dev->name);
1899 		} else {
1900 			/* Recursion is detected! It is possible,
1901 			 * unfortunately */
1902 			if (net_ratelimit())
1903 				printk(KERN_CRIT "Dead loop on virtual device "
1904 				       "%s, fix it urgently!\n", dev->name);
1905 		}
1906 	}
1907 
1908 	rc = -ENETDOWN;
1909 	rcu_read_unlock_bh();
1910 
1911 out_kfree_skb:
1912 	kfree_skb(skb);
1913 	return rc;
1914 out:
1915 	rcu_read_unlock_bh();
1916 	return rc;
1917 }
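
/*
 * Editorial sketch (assumption, not in the original file): a minimal caller
 * of dev_queue_xmit().  The caller sets skb->dev and skb->priority, calls
 * with IRQs enabled, and treats the skb as consumed whatever the return
 * value is.  "example_send" is a hypothetical helper; TC_PRIO_CONTROL is
 * assumed to come from <linux/pkt_sched.h>.
 */
static int example_send(struct sk_buff *skb, struct net_device *dev)
{
	skb->dev = dev;				/* must be set before transmit */
	skb->priority = TC_PRIO_CONTROL;	/* any valid priority will do */

	/* May return a negative errno or a positive NET_XMIT_* code;
	 * either way the skb must not be touched again by the caller. */
	return dev_queue_xmit(skb);
}
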
1918 
1919 
1920 /*=======================================================================
1921 			Receiver routines
1922   =======================================================================*/
1923 
1924 int netdev_max_backlog __read_mostly = 1000;
1925 int netdev_budget __read_mostly = 300;
1926 int weight_p __read_mostly = 64;            /* old backlog weight */
1927 
1928 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1929 
1930 
1931 /**
1932  *	netif_rx	-	post buffer to the network code
1933  *	@skb: buffer to post
1934  *
1935  *	This function receives a packet from a device driver and queues it for
1936  *	the upper (protocol) levels to process.  It always succeeds. The buffer
1937  *	may be dropped during processing for congestion control or by the
1938  *	protocol layers.
1939  *
1940  *	return values:
1941  *	NET_RX_SUCCESS	(no congestion)
1942  *	NET_RX_DROP     (packet was dropped)
1943  *
1944  */
1945 
1946 int netif_rx(struct sk_buff *skb)
1947 {
1948 	struct softnet_data *queue;
1949 	unsigned long flags;
1950 
1951 	/* if netpoll wants it, pretend we never saw it */
1952 	if (netpoll_rx(skb))
1953 		return NET_RX_DROP;
1954 
1955 	if (!skb->tstamp.tv64)
1956 		net_timestamp(skb);
1957 
1958 	/*
1959 	 * The code is rearranged so that the path is shortest
1960 	 * when the CPU is congested but still operating.
1961 	 */
1962 	local_irq_save(flags);
1963 	queue = &__get_cpu_var(softnet_data);
1964 
1965 	__get_cpu_var(netdev_rx_stat).total++;
1966 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1967 		if (queue->input_pkt_queue.qlen) {
1968 enqueue:
1969 			__skb_queue_tail(&queue->input_pkt_queue, skb);
1970 			local_irq_restore(flags);
1971 			return NET_RX_SUCCESS;
1972 		}
1973 
1974 		napi_schedule(&queue->backlog);
1975 		goto enqueue;
1976 	}
1977 
1978 	__get_cpu_var(netdev_rx_stat).dropped++;
1979 	local_irq_restore(flags);
1980 
1981 	kfree_skb(skb);
1982 	return NET_RX_DROP;
1983 }
1984 
1985 int netif_rx_ni(struct sk_buff *skb)
1986 {
1987 	int err;
1988 
1989 	preempt_disable();
1990 	err = netif_rx(skb);
1991 	if (local_softirq_pending())
1992 		do_softirq();
1993 	preempt_enable();
1994 
1995 	return err;
1996 }
1997 
1998 EXPORT_SYMBOL(netif_rx_ni);
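
/*
 * Editorial sketch (assumption): how a non-NAPI driver hands a received
 * frame to the stack.  From hard interrupt context netif_rx() is used;
 * from process context (e.g. a workqueue) netif_rx_ni() is used so a
 * raised softirq gets a chance to run.  "example_deliver" is hypothetical;
 * eth_type_trans() is assumed from <linux/etherdevice.h>.
 */
static void example_deliver(struct sk_buff *skb, struct net_device *dev,
			    int in_irq_context)
{
	skb->protocol = eth_type_trans(skb, dev);	/* set packet type first */

	if (in_irq_context)
		netif_rx(skb);		/* interrupt context */
	else
		netif_rx_ni(skb);	/* process context */
}
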
1999 
2000 static void net_tx_action(struct softirq_action *h)
2001 {
2002 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2003 
2004 	if (sd->completion_queue) {
2005 		struct sk_buff *clist;
2006 
2007 		local_irq_disable();
2008 		clist = sd->completion_queue;
2009 		sd->completion_queue = NULL;
2010 		local_irq_enable();
2011 
2012 		while (clist) {
2013 			struct sk_buff *skb = clist;
2014 			clist = clist->next;
2015 
2016 			WARN_ON(atomic_read(&skb->users));
2017 			__kfree_skb(skb);
2018 		}
2019 	}
2020 
2021 	if (sd->output_queue) {
2022 		struct Qdisc *head;
2023 
2024 		local_irq_disable();
2025 		head = sd->output_queue;
2026 		sd->output_queue = NULL;
2027 		local_irq_enable();
2028 
2029 		while (head) {
2030 			struct Qdisc *q = head;
2031 			spinlock_t *root_lock;
2032 
2033 			head = head->next_sched;
2034 
2035 			root_lock = qdisc_lock(q);
2036 			if (spin_trylock(root_lock)) {
2037 				smp_mb__before_clear_bit();
2038 				clear_bit(__QDISC_STATE_SCHED,
2039 					  &q->state);
2040 				qdisc_run(q);
2041 				spin_unlock(root_lock);
2042 			} else {
2043 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2044 					      &q->state)) {
2045 					__netif_reschedule(q);
2046 				} else {
2047 					smp_mb__before_clear_bit();
2048 					clear_bit(__QDISC_STATE_SCHED,
2049 						  &q->state);
2050 				}
2051 			}
2052 		}
2053 	}
2054 }
2055 
2056 static inline int deliver_skb(struct sk_buff *skb,
2057 			      struct packet_type *pt_prev,
2058 			      struct net_device *orig_dev)
2059 {
2060 	atomic_inc(&skb->users);
2061 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2062 }
2063 
2064 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2065 /* These hooks are defined here for ATM */
2066 struct net_bridge;
2067 struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
2068 						unsigned char *addr);
2069 void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;
2070 
2071 /*
2072  * If the bridge module is loaded, call the bridging hook.
2073  *  returns NULL if the packet was consumed.
2074  */
2075 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2076 					struct sk_buff *skb) __read_mostly;
2077 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2078 					    struct packet_type **pt_prev, int *ret,
2079 					    struct net_device *orig_dev)
2080 {
2081 	struct net_bridge_port *port;
2082 
2083 	if (skb->pkt_type == PACKET_LOOPBACK ||
2084 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2085 		return skb;
2086 
2087 	if (*pt_prev) {
2088 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2089 		*pt_prev = NULL;
2090 	}
2091 
2092 	return br_handle_frame_hook(port, skb);
2093 }
2094 #else
2095 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2096 #endif
2097 
2098 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2099 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2100 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2101 
2102 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2103 					     struct packet_type **pt_prev,
2104 					     int *ret,
2105 					     struct net_device *orig_dev)
2106 {
2107 	if (skb->dev->macvlan_port == NULL)
2108 		return skb;
2109 
2110 	if (*pt_prev) {
2111 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2112 		*pt_prev = NULL;
2113 	}
2114 	return macvlan_handle_frame_hook(skb);
2115 }
2116 #else
2117 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2118 #endif
2119 
2120 #ifdef CONFIG_NET_CLS_ACT
2121 /* TODO: Maybe we should just force sch_ingress to be compiled in
2122  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
2123  * instructions (a compare and 2 extra stores) when the ingress
2124  * scheduler is not built but CONFIG_NET_CLS_ACT is.
2125  * NOTE: This doesn't remove any functionality; if you don't have
2126  * the ingress scheduler, you just can't add policies on ingress.
2127  *
2128  */
2129 static int ing_filter(struct sk_buff *skb)
2130 {
2131 	struct net_device *dev = skb->dev;
2132 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2133 	struct netdev_queue *rxq;
2134 	int result = TC_ACT_OK;
2135 	struct Qdisc *q;
2136 
2137 	if (MAX_RED_LOOP < ttl++) {
2138 		printk(KERN_WARNING
2139 		       "Redir loop detected Dropping packet (%d->%d)\n",
2140 		       skb->iif, dev->ifindex);
2141 		return TC_ACT_SHOT;
2142 	}
2143 
2144 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2145 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2146 
2147 	rxq = &dev->rx_queue;
2148 
2149 	q = rxq->qdisc;
2150 	if (q != &noop_qdisc) {
2151 		spin_lock(qdisc_lock(q));
2152 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2153 			result = qdisc_enqueue_root(skb, q);
2154 		spin_unlock(qdisc_lock(q));
2155 	}
2156 
2157 	return result;
2158 }
2159 
2160 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2161 					 struct packet_type **pt_prev,
2162 					 int *ret, struct net_device *orig_dev)
2163 {
2164 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2165 		goto out;
2166 
2167 	if (*pt_prev) {
2168 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2169 		*pt_prev = NULL;
2170 	} else {
2171 		/* Huh? Why does turning on AF_PACKET affect this? */
2172 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2173 	}
2174 
2175 	switch (ing_filter(skb)) {
2176 	case TC_ACT_SHOT:
2177 	case TC_ACT_STOLEN:
2178 		kfree_skb(skb);
2179 		return NULL;
2180 	}
2181 
2182 out:
2183 	skb->tc_verd = 0;
2184 	return skb;
2185 }
2186 #endif
2187 
2188 /*
2189  * 	netif_nit_deliver - deliver received packets to network taps
2190  * 	@skb: buffer
2191  *
2192  * 	This function is used to deliver incoming packets to network
2193  * 	taps. It should be used when the normal netif_receive_skb path
2194  * 	is bypassed, for example because of VLAN acceleration.
2195  */
2196 void netif_nit_deliver(struct sk_buff *skb)
2197 {
2198 	struct packet_type *ptype;
2199 
2200 	if (list_empty(&ptype_all))
2201 		return;
2202 
2203 	skb_reset_network_header(skb);
2204 	skb_reset_transport_header(skb);
2205 	skb->mac_len = skb->network_header - skb->mac_header;
2206 
2207 	rcu_read_lock();
2208 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2209 		if (!ptype->dev || ptype->dev == skb->dev)
2210 			deliver_skb(skb, ptype, skb->dev);
2211 	}
2212 	rcu_read_unlock();
2213 }
2214 
2215 /**
2216  *	netif_receive_skb - process receive buffer from network
2217  *	@skb: buffer to process
2218  *
2219  *	netif_receive_skb() is the main receive data processing function.
2220  *	It always succeeds. The buffer may be dropped during processing
2221  *	for congestion control or by the protocol layers.
2222  *
2223  *	This function may only be called from softirq context and interrupts
2224  *	should be enabled.
2225  *
2226  *	Return values (usually ignored):
2227  *	NET_RX_SUCCESS: no congestion
2228  *	NET_RX_DROP: packet was dropped
2229  */
2230 int netif_receive_skb(struct sk_buff *skb)
2231 {
2232 	struct packet_type *ptype, *pt_prev;
2233 	struct net_device *orig_dev;
2234 	struct net_device *null_or_orig;
2235 	int ret = NET_RX_DROP;
2236 	__be16 type;
2237 
2238 	if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2239 		return NET_RX_SUCCESS;
2240 
2241 	/* if we've gotten here through NAPI, check netpoll */
2242 	if (netpoll_receive_skb(skb))
2243 		return NET_RX_DROP;
2244 
2245 	if (!skb->tstamp.tv64)
2246 		net_timestamp(skb);
2247 
2248 	if (!skb->iif)
2249 		skb->iif = skb->dev->ifindex;
2250 
2251 	null_or_orig = NULL;
2252 	orig_dev = skb->dev;
2253 	if (orig_dev->master) {
2254 		if (skb_bond_should_drop(skb))
2255 			null_or_orig = orig_dev; /* deliver only exact match */
2256 		else
2257 			skb->dev = orig_dev->master;
2258 	}
2259 
2260 	__get_cpu_var(netdev_rx_stat).total++;
2261 
2262 	skb_reset_network_header(skb);
2263 	skb_reset_transport_header(skb);
2264 	skb->mac_len = skb->network_header - skb->mac_header;
2265 
2266 	pt_prev = NULL;
2267 
2268 	rcu_read_lock();
2269 
2270 #ifdef CONFIG_NET_CLS_ACT
2271 	if (skb->tc_verd & TC_NCLS) {
2272 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2273 		goto ncls;
2274 	}
2275 #endif
2276 
2277 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2278 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2279 		    ptype->dev == orig_dev) {
2280 			if (pt_prev)
2281 				ret = deliver_skb(skb, pt_prev, orig_dev);
2282 			pt_prev = ptype;
2283 		}
2284 	}
2285 
2286 #ifdef CONFIG_NET_CLS_ACT
2287 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2288 	if (!skb)
2289 		goto out;
2290 ncls:
2291 #endif
2292 
2293 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2294 	if (!skb)
2295 		goto out;
2296 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2297 	if (!skb)
2298 		goto out;
2299 
2300 	type = skb->protocol;
2301 	list_for_each_entry_rcu(ptype,
2302 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2303 		if (ptype->type == type &&
2304 		    (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2305 		     ptype->dev == orig_dev)) {
2306 			if (pt_prev)
2307 				ret = deliver_skb(skb, pt_prev, orig_dev);
2308 			pt_prev = ptype;
2309 		}
2310 	}
2311 
2312 	if (pt_prev) {
2313 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2314 	} else {
2315 		kfree_skb(skb);
2316 		/* Jamal, now you will not be able to escape explaining
2317 		 * to me how you were going to use this. :-)
2318 		 */
2319 		ret = NET_RX_DROP;
2320 	}
2321 
2322 out:
2323 	rcu_read_unlock();
2324 	return ret;
2325 }
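
/*
 * Editorial sketch (assumption): netif_receive_skb() is normally called
 * from a driver's NAPI ->poll() callback, i.e. softirq context with
 * interrupts enabled, once per completed receive descriptor.
 * "example_rx_one" is a hypothetical driver helper that pops one finished
 * skb off the hardware ring, or returns NULL when the ring is empty.
 */
static struct sk_buff *example_rx_one(struct net_device *dev);	/* hypothetical */

static int example_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int done = 0;

	while (done < budget && (skb = example_rx_one(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		netif_receive_skb(skb);
		done++;
	}

	if (done < budget)
		napi_complete(napi);	/* caught up: re-enable device IRQs after this */

	return done;
}
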
2326 
2327 /* Network device is going away, flush any packets still pending  */
2328 static void flush_backlog(void *arg)
2329 {
2330 	struct net_device *dev = arg;
2331 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2332 	struct sk_buff *skb, *tmp;
2333 
2334 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2335 		if (skb->dev == dev) {
2336 			__skb_unlink(skb, &queue->input_pkt_queue);
2337 			kfree_skb(skb);
2338 		}
2339 }
2340 
2341 static int napi_gro_complete(struct sk_buff *skb)
2342 {
2343 	struct packet_type *ptype;
2344 	__be16 type = skb->protocol;
2345 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2346 	int err = -ENOENT;
2347 
2348 	if (NAPI_GRO_CB(skb)->count == 1)
2349 		goto out;
2350 
2351 	rcu_read_lock();
2352 	list_for_each_entry_rcu(ptype, head, list) {
2353 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2354 			continue;
2355 
2356 		err = ptype->gro_complete(skb);
2357 		break;
2358 	}
2359 	rcu_read_unlock();
2360 
2361 	if (err) {
2362 		WARN_ON(&ptype->list == head);
2363 		kfree_skb(skb);
2364 		return NET_RX_SUCCESS;
2365 	}
2366 
2367 out:
2368 	skb_shinfo(skb)->gso_size = 0;
2369 	__skb_push(skb, -skb_network_offset(skb));
2370 	return netif_receive_skb(skb);
2371 }
2372 
2373 void napi_gro_flush(struct napi_struct *napi)
2374 {
2375 	struct sk_buff *skb, *next;
2376 
2377 	for (skb = napi->gro_list; skb; skb = next) {
2378 		next = skb->next;
2379 		skb->next = NULL;
2380 		napi_gro_complete(skb);
2381 	}
2382 
2383 	napi->gro_list = NULL;
2384 }
2385 EXPORT_SYMBOL(napi_gro_flush);
2386 
2387 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2388 {
2389 	struct sk_buff **pp = NULL;
2390 	struct packet_type *ptype;
2391 	__be16 type = skb->protocol;
2392 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2393 	int count = 0;
2394 	int same_flow;
2395 	int mac_len;
2396 	int free;
2397 
2398 	if (!(skb->dev->features & NETIF_F_GRO))
2399 		goto normal;
2400 
2401 	if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2402 		goto normal;
2403 
2404 	rcu_read_lock();
2405 	list_for_each_entry_rcu(ptype, head, list) {
2406 		struct sk_buff *p;
2407 
2408 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2409 			continue;
2410 
2411 		skb_reset_network_header(skb);
2412 		mac_len = skb->network_header - skb->mac_header;
2413 		skb->mac_len = mac_len;
2414 		NAPI_GRO_CB(skb)->same_flow = 0;
2415 		NAPI_GRO_CB(skb)->flush = 0;
2416 		NAPI_GRO_CB(skb)->free = 0;
2417 
2418 		for (p = napi->gro_list; p; p = p->next) {
2419 			count++;
2420 
2421 			if (!NAPI_GRO_CB(p)->same_flow)
2422 				continue;
2423 
2424 			if (p->mac_len != mac_len ||
2425 			    memcmp(skb_mac_header(p), skb_mac_header(skb),
2426 				   mac_len))
2427 				NAPI_GRO_CB(p)->same_flow = 0;
2428 		}
2429 
2430 		pp = ptype->gro_receive(&napi->gro_list, skb);
2431 		break;
2432 	}
2433 	rcu_read_unlock();
2434 
2435 	if (&ptype->list == head)
2436 		goto normal;
2437 
2438 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2439 	free = NAPI_GRO_CB(skb)->free;
2440 
2441 	if (pp) {
2442 		struct sk_buff *nskb = *pp;
2443 
2444 		*pp = nskb->next;
2445 		nskb->next = NULL;
2446 		napi_gro_complete(nskb);
2447 		count--;
2448 	}
2449 
2450 	if (same_flow)
2451 		goto ok;
2452 
2453 	if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) {
2454 		__skb_push(skb, -skb_network_offset(skb));
2455 		goto normal;
2456 	}
2457 
2458 	NAPI_GRO_CB(skb)->count = 1;
2459 	skb_shinfo(skb)->gso_size = skb->len;
2460 	skb->next = napi->gro_list;
2461 	napi->gro_list = skb;
2462 
2463 ok:
2464 	return free;
2465 
2466 normal:
2467 	return -1;
2468 }
2469 EXPORT_SYMBOL(dev_gro_receive);
2470 
2471 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2472 {
2473 	struct sk_buff *p;
2474 
2475 	for (p = napi->gro_list; p; p = p->next) {
2476 		NAPI_GRO_CB(p)->same_flow = 1;
2477 		NAPI_GRO_CB(p)->flush = 0;
2478 	}
2479 
2480 	return dev_gro_receive(napi, skb);
2481 }
2482 
2483 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2484 {
2485 	if (netpoll_receive_skb(skb))
2486 		return NET_RX_DROP;
2487 
2488 	switch (__napi_gro_receive(napi, skb)) {
2489 	case -1:
2490 		return netif_receive_skb(skb);
2491 
2492 	case 1:
2493 		kfree_skb(skb);
2494 		break;
2495 	}
2496 
2497 	return NET_RX_SUCCESS;
2498 }
2499 EXPORT_SYMBOL(napi_gro_receive);
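
/*
 * Editorial sketch (assumption): a GRO-capable driver simply substitutes
 * napi_gro_receive() for netif_receive_skb() in its ->poll() loop; frames
 * that cannot be merged fall through to netif_receive_skb() internally
 * (the -1 case handled above).  "example_gro_deliver" is hypothetical.
 */
static void example_gro_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);	/* merge into napi->gro_list or deliver */
}
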
2500 
2501 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2502 {
2503 	__skb_pull(skb, skb_headlen(skb));
2504 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2505 
2506 	napi->skb = skb;
2507 }
2508 EXPORT_SYMBOL(napi_reuse_skb);
2509 
2510 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2511 				  struct napi_gro_fraginfo *info)
2512 {
2513 	struct net_device *dev = napi->dev;
2514 	struct sk_buff *skb = napi->skb;
2515 
2516 	napi->skb = NULL;
2517 
2518 	if (!skb) {
2519 		skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2520 		if (!skb)
2521 			goto out;
2522 
2523 		skb_reserve(skb, NET_IP_ALIGN);
2524 	}
2525 
2526 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2527 	skb_shinfo(skb)->nr_frags = info->nr_frags;
2528 	memcpy(skb_shinfo(skb)->frags, info->frags, sizeof(info->frags));
2529 
2530 	skb->data_len = info->len;
2531 	skb->len += info->len;
2532 	skb->truesize += info->len;
2533 
2534 	if (!pskb_may_pull(skb, ETH_HLEN)) {
2535 		napi_reuse_skb(napi, skb);
2536 		skb = NULL;
2537 		goto out;
2538 	}
2539 
2540 	skb->protocol = eth_type_trans(skb, dev);
2541 
2542 	skb->ip_summed = info->ip_summed;
2543 	skb->csum = info->csum;
2544 
2545 out:
2546 	return skb;
2547 }
2548 EXPORT_SYMBOL(napi_fraginfo_skb);
2549 
2550 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2551 {
2552 	struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2553 	int err = NET_RX_DROP;
2554 
2555 	if (!skb)
2556 		goto out;
2557 
2558 	if (netpoll_receive_skb(skb))
2559 		goto out;
2560 
2561 	err = NET_RX_SUCCESS;
2562 
2563 	switch (__napi_gro_receive(napi, skb)) {
2564 	case -1:
2565 		return netif_receive_skb(skb);
2566 
2567 	case 0:
2568 		goto out;
2569 	}
2570 
2571 	napi_reuse_skb(napi, skb);
2572 
2573 out:
2574 	return err;
2575 }
2576 EXPORT_SYMBOL(napi_gro_frags);
2577 
2578 static int process_backlog(struct napi_struct *napi, int quota)
2579 {
2580 	int work = 0;
2581 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2582 	unsigned long start_time = jiffies;
2583 
2584 	napi->weight = weight_p;
2585 	do {
2586 		struct sk_buff *skb;
2587 
2588 		local_irq_disable();
2589 		skb = __skb_dequeue(&queue->input_pkt_queue);
2590 		if (!skb) {
2591 			__napi_complete(napi);
2592 			local_irq_enable();
2593 			break;
2594 		}
2595 		local_irq_enable();
2596 
2597 		netif_receive_skb(skb);
2598 	} while (++work < quota && jiffies == start_time);
2599 
2600 	return work;
2601 }
2602 
2603 /**
2604  * __napi_schedule - schedule for receive
2605  * @n: entry to schedule
2606  *
2607  * The entry's receive function will be scheduled to run
2608  */
2609 void __napi_schedule(struct napi_struct *n)
2610 {
2611 	unsigned long flags;
2612 
2613 	local_irq_save(flags);
2614 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2615 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2616 	local_irq_restore(flags);
2617 }
2618 EXPORT_SYMBOL(__napi_schedule);
2619 
2620 void __napi_complete(struct napi_struct *n)
2621 {
2622 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2623 	BUG_ON(n->gro_list);
2624 
2625 	list_del(&n->poll_list);
2626 	smp_mb__before_clear_bit();
2627 	clear_bit(NAPI_STATE_SCHED, &n->state);
2628 }
2629 EXPORT_SYMBOL(__napi_complete);
2630 
2631 void napi_complete(struct napi_struct *n)
2632 {
2633 	unsigned long flags;
2634 
2635 	/*
2636 	 * don't let napi dequeue from the cpu poll list
2637 	 * just in case it's running on a different cpu
2638 	 */
2639 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2640 		return;
2641 
2642 	napi_gro_flush(n);
2643 	local_irq_save(flags);
2644 	__napi_complete(n);
2645 	local_irq_restore(flags);
2646 }
2647 EXPORT_SYMBOL(napi_complete);
2648 
2649 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2650 		    int (*poll)(struct napi_struct *, int), int weight)
2651 {
2652 	INIT_LIST_HEAD(&napi->poll_list);
2653 	napi->gro_list = NULL;
2654 	napi->skb = NULL;
2655 	napi->poll = poll;
2656 	napi->weight = weight;
2657 	list_add(&napi->dev_list, &dev->napi_list);
2658 	napi->dev = dev;
2659 #ifdef CONFIG_NETPOLL
2660 	spin_lock_init(&napi->poll_lock);
2661 	napi->poll_owner = -1;
2662 #endif
2663 	set_bit(NAPI_STATE_SCHED, &napi->state);
2664 }
2665 EXPORT_SYMBOL(netif_napi_add);
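
/*
 * Editorial sketch (assumption): typical NAPI registration in a driver's
 * probe path, paired with napi_enable() at open time, napi_schedule() from
 * the interrupt handler and netif_napi_del() on teardown.  "example_priv",
 * "example_interrupt" and the example_poll() sketch shown after
 * netif_receive_skb() above are all hypothetical driver names;
 * irqreturn_t/IRQ_HANDLED are assumed from <linux/interrupt.h>.
 */
struct example_priv {
	struct napi_struct napi;
	/* ... device state ... */
};

static int example_setup_napi(struct net_device *dev, struct example_priv *priv)
{
	netif_napi_add(dev, &priv->napi, example_poll, 64);	/* weight 64 */
	return 0;	/* napi_enable(&priv->napi) is done in ndo_open */
}

static irqreturn_t example_interrupt(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	/* mask further RX interrupts on the device here, then ... */
	napi_schedule(&priv->napi);	/* ->poll() runs from NET_RX_SOFTIRQ */
	return IRQ_HANDLED;
}
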
2666 
2667 void netif_napi_del(struct napi_struct *napi)
2668 {
2669 	struct sk_buff *skb, *next;
2670 
2671 	list_del_init(&napi->dev_list);
2672 	kfree_skb(napi->skb);
2673 
2674 	for (skb = napi->gro_list; skb; skb = next) {
2675 		next = skb->next;
2676 		skb->next = NULL;
2677 		kfree_skb(skb);
2678 	}
2679 
2680 	napi->gro_list = NULL;
2681 }
2682 EXPORT_SYMBOL(netif_napi_del);
2683 
2684 
2685 static void net_rx_action(struct softirq_action *h)
2686 {
2687 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2688 	unsigned long time_limit = jiffies + 2;
2689 	int budget = netdev_budget;
2690 	void *have;
2691 
2692 	local_irq_disable();
2693 
2694 	while (!list_empty(list)) {
2695 		struct napi_struct *n;
2696 		int work, weight;
2697 
2698 		/* If the softirq window is exhausted then punt.
2699 		 * Allow this to run for 2 jiffies, which allows
2700 		 * an average latency of 1.5/HZ.
2701 		 */
2702 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2703 			goto softnet_break;
2704 
2705 		local_irq_enable();
2706 
2707 		/* Even though interrupts have been re-enabled, this
2708 		 * access is safe because interrupts can only add new
2709 		 * entries to the tail of this list, and only ->poll()
2710 		 * calls can remove this head entry from the list.
2711 		 */
2712 		n = list_entry(list->next, struct napi_struct, poll_list);
2713 
2714 		have = netpoll_poll_lock(n);
2715 
2716 		weight = n->weight;
2717 
2718 		/* This NAPI_STATE_SCHED test is for avoiding a race
2719 		 * with netpoll's poll_napi().  Only the entity which
2720 		 * obtains the lock and sees NAPI_STATE_SCHED set will
2721 		 * actually make the ->poll() call.  Therefore we avoid
2722 		 * accidentally calling ->poll() when NAPI is not scheduled.
2723 		 */
2724 		work = 0;
2725 		if (test_bit(NAPI_STATE_SCHED, &n->state))
2726 			work = n->poll(n, weight);
2727 
2728 		WARN_ON_ONCE(work > weight);
2729 
2730 		budget -= work;
2731 
2732 		local_irq_disable();
2733 
2734 		/* Drivers must not modify the NAPI state if they
2735 		 * consume the entire weight.  In such cases this code
2736 		 * still "owns" the NAPI instance and therefore can
2737 		 * move the instance around on the list at-will.
2738 		 */
2739 		if (unlikely(work == weight)) {
2740 			if (unlikely(napi_disable_pending(n)))
2741 				__napi_complete(n);
2742 			else
2743 				list_move_tail(&n->poll_list, list);
2744 		}
2745 
2746 		netpoll_poll_unlock(have);
2747 	}
2748 out:
2749 	local_irq_enable();
2750 
2751 #ifdef CONFIG_NET_DMA
2752 	/*
2753 	 * There may not be any more sk_buffs coming right now, so push
2754 	 * any pending DMA copies to hardware
2755 	 */
2756 	dma_issue_pending_all();
2757 #endif
2758 
2759 	return;
2760 
2761 softnet_break:
2762 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
2763 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2764 	goto out;
2765 }
2766 
2767 static gifconf_func_t *gifconf_list[NPROTO];
2768 
2769 /**
2770  *	register_gifconf	-	register a SIOCGIF handler
2771  *	@family: Address family
2772  *	@gifconf: Function handler
2773  *
2774  *	Register protocol dependent address dumping routines. The handler
2775  *	that is passed must not be freed or reused until it has been replaced
2776  *	by another handler.
2777  */
2778 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2779 {
2780 	if (family >= NPROTO)
2781 		return -EINVAL;
2782 	gifconf_list[family] = gifconf;
2783 	return 0;
2784 }
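
/*
 * Editorial sketch (assumption): an address family registers its
 * SIOCGIFCONF dumper once at init time; IPv4 does this with its inet
 * handler.  "example_gifconf" is a hypothetical stub showing the expected
 * gifconf_func_t shape (device, user buffer, remaining length).
 */
static int example_gifconf(struct net_device *dev, char __user *buf, int len)
{
	/* Write one struct ifreq per address on @dev into @buf, or return
	 * the space that would be needed when @buf is NULL. */
	return 0;
}

static int __init example_gifconf_init(void)
{
	return register_gifconf(PF_INET, example_gifconf);	/* family < NPROTO */
}
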
2785 
2786 
2787 /*
2788  *	Map an interface index to its name (SIOCGIFNAME)
2789  */
2790 
2791 /*
2792  *	We need this ioctl for efficient implementation of the
2793  *	if_indextoname() function required by the IPv6 API.  Without
2794  *	it, we would have to search all the interfaces to find a
2795  *	match.  --pb
2796  */
2797 
2798 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2799 {
2800 	struct net_device *dev;
2801 	struct ifreq ifr;
2802 
2803 	/*
2804 	 *	Fetch the caller's info block.
2805 	 */
2806 
2807 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2808 		return -EFAULT;
2809 
2810 	read_lock(&dev_base_lock);
2811 	dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2812 	if (!dev) {
2813 		read_unlock(&dev_base_lock);
2814 		return -ENODEV;
2815 	}
2816 
2817 	strcpy(ifr.ifr_name, dev->name);
2818 	read_unlock(&dev_base_lock);
2819 
2820 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2821 		return -EFAULT;
2822 	return 0;
2823 }
2824 
2825 /*
2826  *	Perform a SIOCGIFCONF call. This structure will change
2827  *	size eventually, and there is nothing I can do about it.
2828  *	Thus we will need a 'compatibility mode'.
2829  */
2830 
2831 static int dev_ifconf(struct net *net, char __user *arg)
2832 {
2833 	struct ifconf ifc;
2834 	struct net_device *dev;
2835 	char __user *pos;
2836 	int len;
2837 	int total;
2838 	int i;
2839 
2840 	/*
2841 	 *	Fetch the caller's info block.
2842 	 */
2843 
2844 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2845 		return -EFAULT;
2846 
2847 	pos = ifc.ifc_buf;
2848 	len = ifc.ifc_len;
2849 
2850 	/*
2851 	 *	Loop over the interfaces, and write an info block for each.
2852 	 */
2853 
2854 	total = 0;
2855 	for_each_netdev(net, dev) {
2856 		for (i = 0; i < NPROTO; i++) {
2857 			if (gifconf_list[i]) {
2858 				int done;
2859 				if (!pos)
2860 					done = gifconf_list[i](dev, NULL, 0);
2861 				else
2862 					done = gifconf_list[i](dev, pos + total,
2863 							       len - total);
2864 				if (done < 0)
2865 					return -EFAULT;
2866 				total += done;
2867 			}
2868 		}
2869 	}
2870 
2871 	/*
2872 	 *	All done.  Write the updated control block back to the caller.
2873 	 */
2874 	ifc.ifc_len = total;
2875 
2876 	/*
2877 	 * 	Both BSD and Solaris return 0 here, so we do too.
2878 	 */
2879 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2880 }
2881 
2882 #ifdef CONFIG_PROC_FS
2883 /*
2884  *	This is invoked by the /proc filesystem handler to display a device
2885  *	in detail.
2886  */
2887 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2888 	__acquires(dev_base_lock)
2889 {
2890 	struct net *net = seq_file_net(seq);
2891 	loff_t off;
2892 	struct net_device *dev;
2893 
2894 	read_lock(&dev_base_lock);
2895 	if (!*pos)
2896 		return SEQ_START_TOKEN;
2897 
2898 	off = 1;
2899 	for_each_netdev(net, dev)
2900 		if (off++ == *pos)
2901 			return dev;
2902 
2903 	return NULL;
2904 }
2905 
2906 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2907 {
2908 	struct net *net = seq_file_net(seq);
2909 	++*pos;
2910 	return v == SEQ_START_TOKEN ?
2911 		first_net_device(net) : next_net_device((struct net_device *)v);
2912 }
2913 
2914 void dev_seq_stop(struct seq_file *seq, void *v)
2915 	__releases(dev_base_lock)
2916 {
2917 	read_unlock(&dev_base_lock);
2918 }
2919 
2920 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2921 {
2922 	const struct net_device_stats *stats = dev_get_stats(dev);
2923 
2924 	seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2925 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2926 		   dev->name, stats->rx_bytes, stats->rx_packets,
2927 		   stats->rx_errors,
2928 		   stats->rx_dropped + stats->rx_missed_errors,
2929 		   stats->rx_fifo_errors,
2930 		   stats->rx_length_errors + stats->rx_over_errors +
2931 		    stats->rx_crc_errors + stats->rx_frame_errors,
2932 		   stats->rx_compressed, stats->multicast,
2933 		   stats->tx_bytes, stats->tx_packets,
2934 		   stats->tx_errors, stats->tx_dropped,
2935 		   stats->tx_fifo_errors, stats->collisions,
2936 		   stats->tx_carrier_errors +
2937 		    stats->tx_aborted_errors +
2938 		    stats->tx_window_errors +
2939 		    stats->tx_heartbeat_errors,
2940 		   stats->tx_compressed);
2941 }
2942 
2943 /*
2944  *	Called from the PROCfs module. This now uses the new arbitrary sized
2945  *	/proc/net interface to create /proc/net/dev
2946  */
2947 static int dev_seq_show(struct seq_file *seq, void *v)
2948 {
2949 	if (v == SEQ_START_TOKEN)
2950 		seq_puts(seq, "Inter-|   Receive                            "
2951 			      "                    |  Transmit\n"
2952 			      " face |bytes    packets errs drop fifo frame "
2953 			      "compressed multicast|bytes    packets errs "
2954 			      "drop fifo colls carrier compressed\n");
2955 	else
2956 		dev_seq_printf_stats(seq, v);
2957 	return 0;
2958 }
2959 
2960 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2961 {
2962 	struct netif_rx_stats *rc = NULL;
2963 
2964 	while (*pos < nr_cpu_ids)
2965 		if (cpu_online(*pos)) {
2966 			rc = &per_cpu(netdev_rx_stat, *pos);
2967 			break;
2968 		} else
2969 			++*pos;
2970 	return rc;
2971 }
2972 
2973 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2974 {
2975 	return softnet_get_online(pos);
2976 }
2977 
2978 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2979 {
2980 	++*pos;
2981 	return softnet_get_online(pos);
2982 }
2983 
2984 static void softnet_seq_stop(struct seq_file *seq, void *v)
2985 {
2986 }
2987 
2988 static int softnet_seq_show(struct seq_file *seq, void *v)
2989 {
2990 	struct netif_rx_stats *s = v;
2991 
2992 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2993 		   s->total, s->dropped, s->time_squeeze, 0,
2994 		   0, 0, 0, 0, /* was fastroute */
2995 		   s->cpu_collision );
2996 	return 0;
2997 }
2998 
2999 static const struct seq_operations dev_seq_ops = {
3000 	.start = dev_seq_start,
3001 	.next  = dev_seq_next,
3002 	.stop  = dev_seq_stop,
3003 	.show  = dev_seq_show,
3004 };
3005 
3006 static int dev_seq_open(struct inode *inode, struct file *file)
3007 {
3008 	return seq_open_net(inode, file, &dev_seq_ops,
3009 			    sizeof(struct seq_net_private));
3010 }
3011 
3012 static const struct file_operations dev_seq_fops = {
3013 	.owner	 = THIS_MODULE,
3014 	.open    = dev_seq_open,
3015 	.read    = seq_read,
3016 	.llseek  = seq_lseek,
3017 	.release = seq_release_net,
3018 };
3019 
3020 static const struct seq_operations softnet_seq_ops = {
3021 	.start = softnet_seq_start,
3022 	.next  = softnet_seq_next,
3023 	.stop  = softnet_seq_stop,
3024 	.show  = softnet_seq_show,
3025 };
3026 
3027 static int softnet_seq_open(struct inode *inode, struct file *file)
3028 {
3029 	return seq_open(file, &softnet_seq_ops);
3030 }
3031 
3032 static const struct file_operations softnet_seq_fops = {
3033 	.owner	 = THIS_MODULE,
3034 	.open    = softnet_seq_open,
3035 	.read    = seq_read,
3036 	.llseek  = seq_lseek,
3037 	.release = seq_release,
3038 };
3039 
3040 static void *ptype_get_idx(loff_t pos)
3041 {
3042 	struct packet_type *pt = NULL;
3043 	loff_t i = 0;
3044 	int t;
3045 
3046 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3047 		if (i == pos)
3048 			return pt;
3049 		++i;
3050 	}
3051 
3052 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3053 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3054 			if (i == pos)
3055 				return pt;
3056 			++i;
3057 		}
3058 	}
3059 	return NULL;
3060 }
3061 
3062 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3063 	__acquires(RCU)
3064 {
3065 	rcu_read_lock();
3066 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3067 }
3068 
3069 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3070 {
3071 	struct packet_type *pt;
3072 	struct list_head *nxt;
3073 	int hash;
3074 
3075 	++*pos;
3076 	if (v == SEQ_START_TOKEN)
3077 		return ptype_get_idx(0);
3078 
3079 	pt = v;
3080 	nxt = pt->list.next;
3081 	if (pt->type == htons(ETH_P_ALL)) {
3082 		if (nxt != &ptype_all)
3083 			goto found;
3084 		hash = 0;
3085 		nxt = ptype_base[0].next;
3086 	} else
3087 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3088 
3089 	while (nxt == &ptype_base[hash]) {
3090 		if (++hash >= PTYPE_HASH_SIZE)
3091 			return NULL;
3092 		nxt = ptype_base[hash].next;
3093 	}
3094 found:
3095 	return list_entry(nxt, struct packet_type, list);
3096 }
3097 
3098 static void ptype_seq_stop(struct seq_file *seq, void *v)
3099 	__releases(RCU)
3100 {
3101 	rcu_read_unlock();
3102 }
3103 
3104 static int ptype_seq_show(struct seq_file *seq, void *v)
3105 {
3106 	struct packet_type *pt = v;
3107 
3108 	if (v == SEQ_START_TOKEN)
3109 		seq_puts(seq, "Type Device      Function\n");
3110 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3111 		if (pt->type == htons(ETH_P_ALL))
3112 			seq_puts(seq, "ALL ");
3113 		else
3114 			seq_printf(seq, "%04x", ntohs(pt->type));
3115 
3116 		seq_printf(seq, " %-8s %pF\n",
3117 			   pt->dev ? pt->dev->name : "", pt->func);
3118 	}
3119 
3120 	return 0;
3121 }
3122 
3123 static const struct seq_operations ptype_seq_ops = {
3124 	.start = ptype_seq_start,
3125 	.next  = ptype_seq_next,
3126 	.stop  = ptype_seq_stop,
3127 	.show  = ptype_seq_show,
3128 };
3129 
3130 static int ptype_seq_open(struct inode *inode, struct file *file)
3131 {
3132 	return seq_open_net(inode, file, &ptype_seq_ops,
3133 			sizeof(struct seq_net_private));
3134 }
3135 
3136 static const struct file_operations ptype_seq_fops = {
3137 	.owner	 = THIS_MODULE,
3138 	.open    = ptype_seq_open,
3139 	.read    = seq_read,
3140 	.llseek  = seq_lseek,
3141 	.release = seq_release_net,
3142 };
3143 
3144 
3145 static int __net_init dev_proc_net_init(struct net *net)
3146 {
3147 	int rc = -ENOMEM;
3148 
3149 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3150 		goto out;
3151 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3152 		goto out_dev;
3153 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3154 		goto out_softnet;
3155 
3156 	if (wext_proc_init(net))
3157 		goto out_ptype;
3158 	rc = 0;
3159 out:
3160 	return rc;
3161 out_ptype:
3162 	proc_net_remove(net, "ptype");
3163 out_softnet:
3164 	proc_net_remove(net, "softnet_stat");
3165 out_dev:
3166 	proc_net_remove(net, "dev");
3167 	goto out;
3168 }
3169 
3170 static void __net_exit dev_proc_net_exit(struct net *net)
3171 {
3172 	wext_proc_exit(net);
3173 
3174 	proc_net_remove(net, "ptype");
3175 	proc_net_remove(net, "softnet_stat");
3176 	proc_net_remove(net, "dev");
3177 }
3178 
3179 static struct pernet_operations __net_initdata dev_proc_ops = {
3180 	.init = dev_proc_net_init,
3181 	.exit = dev_proc_net_exit,
3182 };
3183 
3184 static int __init dev_proc_init(void)
3185 {
3186 	return register_pernet_subsys(&dev_proc_ops);
3187 }
3188 #else
3189 #define dev_proc_init() 0
3190 #endif	/* CONFIG_PROC_FS */
3191 
3192 
3193 /**
3194  *	netdev_set_master	-	set up master/slave pair
3195  *	@slave: slave device
3196  *	@master: new master device
3197  *
3198  *	Changes the master device of the slave. Pass %NULL to break the
3199  *	bonding. The caller must hold the RTNL semaphore. On a failure
3200  *	a negative errno code is returned. On success the reference counts
3201  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3202  *	function returns zero.
3203  */
3204 int netdev_set_master(struct net_device *slave, struct net_device *master)
3205 {
3206 	struct net_device *old = slave->master;
3207 
3208 	ASSERT_RTNL();
3209 
3210 	if (master) {
3211 		if (old)
3212 			return -EBUSY;
3213 		dev_hold(master);
3214 	}
3215 
3216 	slave->master = master;
3217 
3218 	synchronize_net();
3219 
3220 	if (old)
3221 		dev_put(old);
3222 
3223 	if (master)
3224 		slave->flags |= IFF_SLAVE;
3225 	else
3226 		slave->flags &= ~IFF_SLAVE;
3227 
3228 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3229 	return 0;
3230 }
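
/*
 * Editorial sketch (assumption): how a bonding-style driver enslaves and
 * releases a lower device.  RTNL must be held, and passing %NULL breaks
 * the pair, as the kerneldoc above states.  Names are hypothetical.
 */
static int example_enslave(struct net_device *bond_dev, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();				/* caller took rtnl_lock() */
	err = netdev_set_master(slave, bond_dev);
	if (err)
		return err;			/* -EBUSY: already has a master */
	/* ... bonding-specific setup ... */
	return 0;
}

static void example_release(struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_set_master(slave, NULL);		/* break the master/slave pair */
}
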
3231 
3232 static void dev_change_rx_flags(struct net_device *dev, int flags)
3233 {
3234 	const struct net_device_ops *ops = dev->netdev_ops;
3235 
3236 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3237 		ops->ndo_change_rx_flags(dev, flags);
3238 }
3239 
3240 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3241 {
3242 	unsigned short old_flags = dev->flags;
3243 	uid_t uid;
3244 	gid_t gid;
3245 
3246 	ASSERT_RTNL();
3247 
3248 	dev->flags |= IFF_PROMISC;
3249 	dev->promiscuity += inc;
3250 	if (dev->promiscuity == 0) {
3251 		/*
3252 		 * Avoid overflow.
3253 		 * If inc causes overflow, untouch promisc and return error.
3254 		 */
3255 		if (inc < 0)
3256 			dev->flags &= ~IFF_PROMISC;
3257 		else {
3258 			dev->promiscuity -= inc;
3259 			printk(KERN_WARNING "%s: promiscuity touches roof, "
3260 				"set promiscuity failed, promiscuity feature "
3261 				"of device might be broken.\n", dev->name);
3262 			return -EOVERFLOW;
3263 		}
3264 	}
3265 	if (dev->flags != old_flags) {
3266 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3267 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3268 							       "left");
3269 		if (audit_enabled) {
3270 			current_uid_gid(&uid, &gid);
3271 			audit_log(current->audit_context, GFP_ATOMIC,
3272 				AUDIT_ANOM_PROMISCUOUS,
3273 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3274 				dev->name, (dev->flags & IFF_PROMISC),
3275 				(old_flags & IFF_PROMISC),
3276 				audit_get_loginuid(current),
3277 				uid, gid,
3278 				audit_get_sessionid(current));
3279 		}
3280 
3281 		dev_change_rx_flags(dev, IFF_PROMISC);
3282 	}
3283 	return 0;
3284 }
3285 
3286 /**
3287  *	dev_set_promiscuity	- update promiscuity count on a device
3288  *	@dev: device
3289  *	@inc: modifier
3290  *
3291  *	Add or remove promiscuity from a device. While the count in the device
3292  *	remains above zero the interface remains promiscuous. Once it hits zero
3293  *	the device reverts back to normal filtering operation. A negative inc
3294  *	value is used to drop promiscuity on the device.
3295  *	Return 0 if successful or a negative errno code on error.
3296  */
3297 int dev_set_promiscuity(struct net_device *dev, int inc)
3298 {
3299 	unsigned short old_flags = dev->flags;
3300 	int err;
3301 
3302 	err = __dev_set_promiscuity(dev, inc);
3303 	if (err < 0)
3304 		return err;
3305 	if (dev->flags != old_flags)
3306 		dev_set_rx_mode(dev);
3307 	return err;
3308 }
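
/*
 * Editorial sketch (assumption): a capture-style user takes one promiscuity
 * reference while it is bound to the device and drops it on unbind; the
 * device only leaves promiscuous mode once the count reaches zero.  RTNL
 * must be held.  Names are hypothetical.
 */
static int example_capture_bind(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);	/* take one reference */
}

static void example_capture_unbind(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
}
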
3309 
3310 /**
3311  *	dev_set_allmulti	- update allmulti count on a device
3312  *	@dev: device
3313  *	@inc: modifier
3314  *
3315  *	Add or remove reception of all multicast frames to a device. While the
3316  *	count in the device remains above zero the interface remains listening
3317  *	to all interfaces. Once it hits zero the device reverts back to normal
3318  *	filtering operation. A negative @inc value is used to drop the counter
3319  *	when releasing a resource needing all multicasts.
3320  *	Return 0 if successful or a negative errno code on error.
3321  */
3322 
3323 int dev_set_allmulti(struct net_device *dev, int inc)
3324 {
3325 	unsigned short old_flags = dev->flags;
3326 
3327 	ASSERT_RTNL();
3328 
3329 	dev->flags |= IFF_ALLMULTI;
3330 	dev->allmulti += inc;
3331 	if (dev->allmulti == 0) {
3332 		/*
3333 		 * Avoid overflow.
3334 		 * If inc causes overflow, untouch allmulti and return error.
3335 		 */
3336 		if (inc < 0)
3337 			dev->flags &= ~IFF_ALLMULTI;
3338 		else {
3339 			dev->allmulti -= inc;
3340 			printk(KERN_WARNING "%s: allmulti touches roof, "
3341 				"set allmulti failed, allmulti feature of "
3342 				"device might be broken.\n", dev->name);
3343 			return -EOVERFLOW;
3344 		}
3345 	}
3346 	if (dev->flags ^ old_flags) {
3347 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3348 		dev_set_rx_mode(dev);
3349 	}
3350 	return 0;
3351 }
3352 
3353 /*
3354  *	Upload unicast and multicast address lists to device and
3355  *	configure RX filtering. When the device doesn't support unicast
3356  *	filtering it is put in promiscuous mode while unicast addresses
3357  *	are present.
3358  */
3359 void __dev_set_rx_mode(struct net_device *dev)
3360 {
3361 	const struct net_device_ops *ops = dev->netdev_ops;
3362 
3363 	/* dev_open will call this function so the list will stay sane. */
3364 	if (!(dev->flags&IFF_UP))
3365 		return;
3366 
3367 	if (!netif_device_present(dev))
3368 		return;
3369 
3370 	if (ops->ndo_set_rx_mode)
3371 		ops->ndo_set_rx_mode(dev);
3372 	else {
3373 		/* Unicast address changes may only happen under the rtnl,
3374 		 * therefore calling __dev_set_promiscuity here is safe.
3375 		 */
3376 		if (dev->uc_count > 0 && !dev->uc_promisc) {
3377 			__dev_set_promiscuity(dev, 1);
3378 			dev->uc_promisc = 1;
3379 		} else if (dev->uc_count == 0 && dev->uc_promisc) {
3380 			__dev_set_promiscuity(dev, -1);
3381 			dev->uc_promisc = 0;
3382 		}
3383 
3384 		if (ops->ndo_set_multicast_list)
3385 			ops->ndo_set_multicast_list(dev);
3386 	}
3387 }
3388 
3389 void dev_set_rx_mode(struct net_device *dev)
3390 {
3391 	netif_addr_lock_bh(dev);
3392 	__dev_set_rx_mode(dev);
3393 	netif_addr_unlock_bh(dev);
3394 }
3395 
3396 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3397 		      void *addr, int alen, int glbl)
3398 {
3399 	struct dev_addr_list *da;
3400 
3401 	for (; (da = *list) != NULL; list = &da->next) {
3402 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3403 		    alen == da->da_addrlen) {
3404 			if (glbl) {
3405 				int old_glbl = da->da_gusers;
3406 				da->da_gusers = 0;
3407 				if (old_glbl == 0)
3408 					break;
3409 			}
3410 			if (--da->da_users)
3411 				return 0;
3412 
3413 			*list = da->next;
3414 			kfree(da);
3415 			(*count)--;
3416 			return 0;
3417 		}
3418 	}
3419 	return -ENOENT;
3420 }
3421 
3422 int __dev_addr_add(struct dev_addr_list **list, int *count,
3423 		   void *addr, int alen, int glbl)
3424 {
3425 	struct dev_addr_list *da;
3426 
3427 	for (da = *list; da != NULL; da = da->next) {
3428 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3429 		    da->da_addrlen == alen) {
3430 			if (glbl) {
3431 				int old_glbl = da->da_gusers;
3432 				da->da_gusers = 1;
3433 				if (old_glbl)
3434 					return 0;
3435 			}
3436 			da->da_users++;
3437 			return 0;
3438 		}
3439 	}
3440 
3441 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
3442 	if (da == NULL)
3443 		return -ENOMEM;
3444 	memcpy(da->da_addr, addr, alen);
3445 	da->da_addrlen = alen;
3446 	da->da_users = 1;
3447 	da->da_gusers = glbl ? 1 : 0;
3448 	da->next = *list;
3449 	*list = da;
3450 	(*count)++;
3451 	return 0;
3452 }
3453 
3454 /**
3455  *	dev_unicast_delete	- Release secondary unicast address.
3456  *	@dev: device
3457  *	@addr: address to delete
3458  *	@alen: length of @addr
3459  *
3460  *	Release reference to a secondary unicast address and remove it
3461  *	from the device if the reference count drops to zero.
3462  *
3463  * 	The caller must hold the rtnl_mutex.
3464  */
3465 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3466 {
3467 	int err;
3468 
3469 	ASSERT_RTNL();
3470 
3471 	netif_addr_lock_bh(dev);
3472 	err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3473 	if (!err)
3474 		__dev_set_rx_mode(dev);
3475 	netif_addr_unlock_bh(dev);
3476 	return err;
3477 }
3478 EXPORT_SYMBOL(dev_unicast_delete);
3479 
3480 /**
3481  *	dev_unicast_add		- add a secondary unicast address
3482  *	@dev: device
3483  *	@addr: address to add
3484  *	@alen: length of @addr
3485  *
3486  *	Add a secondary unicast address to the device or increase
3487  *	the reference count if it already exists.
3488  *
3489  *	The caller must hold the rtnl_mutex.
3490  */
3491 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3492 {
3493 	int err;
3494 
3495 	ASSERT_RTNL();
3496 
3497 	netif_addr_lock_bh(dev);
3498 	err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3499 	if (!err)
3500 		__dev_set_rx_mode(dev);
3501 	netif_addr_unlock_bh(dev);
3502 	return err;
3503 }
3504 EXPORT_SYMBOL(dev_unicast_add);
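
/*
 * Editorial sketch (assumption): a stacked device forwards a secondary
 * unicast MAC to its lower device under RTNL, and removes it again when
 * the upper device goes away.  "lower" and "mac" are hypothetical names.
 */
static int example_add_secondary_mac(struct net_device *lower, unsigned char *mac)
{
	ASSERT_RTNL();
	return dev_unicast_add(lower, mac, ETH_ALEN);
}

static void example_del_secondary_mac(struct net_device *lower, unsigned char *mac)
{
	ASSERT_RTNL();
	dev_unicast_delete(lower, mac, ETH_ALEN);
}
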
3505 
3506 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3507 		    struct dev_addr_list **from, int *from_count)
3508 {
3509 	struct dev_addr_list *da, *next;
3510 	int err = 0;
3511 
3512 	da = *from;
3513 	while (da != NULL) {
3514 		next = da->next;
3515 		if (!da->da_synced) {
3516 			err = __dev_addr_add(to, to_count,
3517 					     da->da_addr, da->da_addrlen, 0);
3518 			if (err < 0)
3519 				break;
3520 			da->da_synced = 1;
3521 			da->da_users++;
3522 		} else if (da->da_users == 1) {
3523 			__dev_addr_delete(to, to_count,
3524 					  da->da_addr, da->da_addrlen, 0);
3525 			__dev_addr_delete(from, from_count,
3526 					  da->da_addr, da->da_addrlen, 0);
3527 		}
3528 		da = next;
3529 	}
3530 	return err;
3531 }
3532 
3533 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3534 		       struct dev_addr_list **from, int *from_count)
3535 {
3536 	struct dev_addr_list *da, *next;
3537 
3538 	da = *from;
3539 	while (da != NULL) {
3540 		next = da->next;
3541 		if (da->da_synced) {
3542 			__dev_addr_delete(to, to_count,
3543 					  da->da_addr, da->da_addrlen, 0);
3544 			da->da_synced = 0;
3545 			__dev_addr_delete(from, from_count,
3546 					  da->da_addr, da->da_addrlen, 0);
3547 		}
3548 		da = next;
3549 	}
3550 }
3551 
3552 /**
3553  *	dev_unicast_sync - Synchronize device's unicast list to another device
3554  *	@to: destination device
3555  *	@from: source device
3556  *
3557  *	Add newly added addresses to the destination device and release
3558  *	addresses that have no users left. The source device must be
3559  *	locked by netif_tx_lock_bh.
3560  *
3561  *	This function is intended to be called from the dev->set_rx_mode
3562  *	function of layered software devices.
3563  */
3564 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3565 {
3566 	int err = 0;
3567 
3568 	netif_addr_lock_bh(to);
3569 	err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3570 			      &from->uc_list, &from->uc_count);
3571 	if (!err)
3572 		__dev_set_rx_mode(to);
3573 	netif_addr_unlock_bh(to);
3574 	return err;
3575 }
3576 EXPORT_SYMBOL(dev_unicast_sync);
3577 
3578 /**
3579  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
3580  *	@to: destination device
3581  *	@from: source device
3582  *
3583  *	Remove all addresses that were added to the destination device by
3584  *	dev_unicast_sync(). This function is intended to be called from the
3585  *	dev->stop function of layered software devices.
3586  */
3587 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3588 {
3589 	netif_addr_lock_bh(from);
3590 	netif_addr_lock(to);
3591 
3592 	__dev_addr_unsync(&to->uc_list, &to->uc_count,
3593 			  &from->uc_list, &from->uc_count);
3594 	__dev_set_rx_mode(to);
3595 
3596 	netif_addr_unlock(to);
3597 	netif_addr_unlock_bh(from);
3598 }
3599 EXPORT_SYMBOL(dev_unicast_unsync);
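
/*
 * Editorial sketch (assumption): the intended pairing for a layered device
 * such as a VLAN - sync the upper device's unicast list down to the real
 * device from ndo_set_rx_mode, unsync again in ndo_stop.
 * "example_get_lower" is a hypothetical accessor for the underlying device.
 */
static struct net_device *example_get_lower(struct net_device *upper);	/* hypothetical */

static void example_upper_set_rx_mode(struct net_device *upper)
{
	struct net_device *lower = example_get_lower(upper);

	dev_unicast_sync(lower, upper);		/* push newly added addresses down */
}

static int example_upper_stop(struct net_device *upper)
{
	struct net_device *lower = example_get_lower(upper);

	dev_unicast_unsync(lower, upper);	/* remove what we synced earlier */
	return 0;
}
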
3600 
3601 static void __dev_addr_discard(struct dev_addr_list **list)
3602 {
3603 	struct dev_addr_list *tmp;
3604 
3605 	while (*list != NULL) {
3606 		tmp = *list;
3607 		*list = tmp->next;
3608 		if (tmp->da_users > tmp->da_gusers)
3609 			printk("__dev_addr_discard: address leakage! "
3610 			       "da_users=%d\n", tmp->da_users);
3611 		kfree(tmp);
3612 	}
3613 }
3614 
3615 static void dev_addr_discard(struct net_device *dev)
3616 {
3617 	netif_addr_lock_bh(dev);
3618 
3619 	__dev_addr_discard(&dev->uc_list);
3620 	dev->uc_count = 0;
3621 
3622 	__dev_addr_discard(&dev->mc_list);
3623 	dev->mc_count = 0;
3624 
3625 	netif_addr_unlock_bh(dev);
3626 }
3627 
3628 /**
3629  *	dev_get_flags - get flags reported to userspace
3630  *	@dev: device
3631  *
3632  *	Get the combination of flag bits exported through APIs to userspace.
3633  */
3634 unsigned dev_get_flags(const struct net_device *dev)
3635 {
3636 	unsigned flags;
3637 
3638 	flags = (dev->flags & ~(IFF_PROMISC |
3639 				IFF_ALLMULTI |
3640 				IFF_RUNNING |
3641 				IFF_LOWER_UP |
3642 				IFF_DORMANT)) |
3643 		(dev->gflags & (IFF_PROMISC |
3644 				IFF_ALLMULTI));
3645 
3646 	if (netif_running(dev)) {
3647 		if (netif_oper_up(dev))
3648 			flags |= IFF_RUNNING;
3649 		if (netif_carrier_ok(dev))
3650 			flags |= IFF_LOWER_UP;
3651 		if (netif_dormant(dev))
3652 			flags |= IFF_DORMANT;
3653 	}
3654 
3655 	return flags;
3656 }
3657 
3658 /**
3659  *	dev_change_flags - change device settings
3660  *	@dev: device
3661  *	@flags: device state flags
3662  *
3663  *	Change settings on a device based on the state flags. The flags
3664  *	are in the userspace-exported format.
3665  */
3666 int dev_change_flags(struct net_device *dev, unsigned flags)
3667 {
3668 	int ret, changes;
3669 	int old_flags = dev->flags;
3670 
3671 	ASSERT_RTNL();
3672 
3673 	/*
3674 	 *	Set the flags on our device.
3675 	 */
3676 
3677 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3678 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3679 			       IFF_AUTOMEDIA)) |
3680 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3681 				    IFF_ALLMULTI));
3682 
3683 	/*
3684 	 *	Load in the correct multicast list now the flags have changed.
3685 	 */
3686 
3687 	if ((old_flags ^ flags) & IFF_MULTICAST)
3688 		dev_change_rx_flags(dev, IFF_MULTICAST);
3689 
3690 	dev_set_rx_mode(dev);
3691 
3692 	/*
3693 	 *	Have we downed the interface? We handle IFF_UP ourselves
3694 	 *	according to user attempts to set it, rather than blindly
3695 	 *	setting it.
3696 	 */
3697 
3698 	ret = 0;
3699 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
3700 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3701 
3702 		if (!ret)
3703 			dev_set_rx_mode(dev);
3704 	}
3705 
3706 	if (dev->flags & IFF_UP &&
3707 	    ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3708 					  IFF_VOLATILE)))
3709 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
3710 
3711 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
3712 		int inc = (flags & IFF_PROMISC) ? +1 : -1;
3713 		dev->gflags ^= IFF_PROMISC;
3714 		dev_set_promiscuity(dev, inc);
3715 	}
3716 
3717 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3718 	   is important. Some (broken) drivers set IFF_PROMISC when
3719 	   IFF_ALLMULTI is requested, without asking us and without reporting.
3720 	 */
3721 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3722 		int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3723 		dev->gflags ^= IFF_ALLMULTI;
3724 		dev_set_allmulti(dev, inc);
3725 	}
3726 
3727 	/* Exclude state transition flags, already notified */
3728 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3729 	if (changes)
3730 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3731 
3732 	return ret;
3733 }
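
/*
 * Illustrative sketch: bringing an interface administratively up from
 * inside the kernel.  The (hypothetical) caller must hold the RTNL
 * semaphore, and the flags are in the userspace-exported format, so the
 * current set is read back with dev_get_flags() first.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
 *	rtnl_unlock();
 */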
3734 
3735 /**
3736  *	dev_set_mtu - Change maximum transfer unit
3737  *	@dev: device
3738  *	@new_mtu: new transfer unit
3739  *
3740  *	Change the maximum transfer size of the network device.
3741  */
3742 int dev_set_mtu(struct net_device *dev, int new_mtu)
3743 {
3744 	const struct net_device_ops *ops = dev->netdev_ops;
3745 	int err;
3746 
3747 	if (new_mtu == dev->mtu)
3748 		return 0;
3749 
3750 	/*	MTU must be positive.	 */
3751 	if (new_mtu < 0)
3752 		return -EINVAL;
3753 
3754 	if (!netif_device_present(dev))
3755 		return -ENODEV;
3756 
3757 	err = 0;
3758 	if (ops->ndo_change_mtu)
3759 		err = ops->ndo_change_mtu(dev, new_mtu);
3760 	else
3761 		dev->mtu = new_mtu;
3762 
3763 	if (!err && dev->flags & IFF_UP)
3764 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3765 	return err;
3766 }
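
/*
 * Illustrative sketch: callers are expected to hold the RTNL semaphore,
 * as the SIOCSIFMTU path in dev_ifsioc() does.  A hypothetical in-kernel
 * user might do:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 1280);	// e.g. the IPv6 minimum MTU
 *	rtnl_unlock();
 */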
3767 
3768 /**
3769  *	dev_set_mac_address - Change Media Access Control Address
3770  *	@dev: device
3771  *	@sa: new address
3772  *
3773  *	Change the hardware (MAC) address of the device
3774  */
3775 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3776 {
3777 	const struct net_device_ops *ops = dev->netdev_ops;
3778 	int err;
3779 
3780 	if (!ops->ndo_set_mac_address)
3781 		return -EOPNOTSUPP;
3782 	if (sa->sa_family != dev->type)
3783 		return -EINVAL;
3784 	if (!netif_device_present(dev))
3785 		return -ENODEV;
3786 	err = ops->ndo_set_mac_address(dev, sa);
3787 	if (!err)
3788 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3789 	return err;
3790 }
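
/*
 * Illustrative sketch: the new address is passed as a struct sockaddr
 * whose sa_family must match dev->type.  For an Ethernet device (assumed
 * here) that is ARPHRD_ETHER with a 6 byte address in sa_data.
 *
 *	struct sockaddr sa;
 *	u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *	sa.sa_family = dev->type;		// ARPHRD_ETHER
 *	memcpy(sa.sa_data, addr, ETH_ALEN);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */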
3791 
3792 /*
3793  *	Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3794  */
3795 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3796 {
3797 	int err;
3798 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3799 
3800 	if (!dev)
3801 		return -ENODEV;
3802 
3803 	switch (cmd) {
3804 		case SIOCGIFFLAGS:	/* Get interface flags */
3805 			ifr->ifr_flags = dev_get_flags(dev);
3806 			return 0;
3807 
3808 		case SIOCGIFMETRIC:	/* Get the metric on the interface
3809 					   (currently unused) */
3810 			ifr->ifr_metric = 0;
3811 			return 0;
3812 
3813 		case SIOCGIFMTU:	/* Get the MTU of a device */
3814 			ifr->ifr_mtu = dev->mtu;
3815 			return 0;
3816 
3817 		case SIOCGIFHWADDR:
3818 			if (!dev->addr_len)
3819 				memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3820 			else
3821 				memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3822 				       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3823 			ifr->ifr_hwaddr.sa_family = dev->type;
3824 			return 0;
3825 
3826 		case SIOCGIFSLAVE:
3827 			err = -EINVAL;
3828 			break;
3829 
3830 		case SIOCGIFMAP:
3831 			ifr->ifr_map.mem_start = dev->mem_start;
3832 			ifr->ifr_map.mem_end   = dev->mem_end;
3833 			ifr->ifr_map.base_addr = dev->base_addr;
3834 			ifr->ifr_map.irq       = dev->irq;
3835 			ifr->ifr_map.dma       = dev->dma;
3836 			ifr->ifr_map.port      = dev->if_port;
3837 			return 0;
3838 
3839 		case SIOCGIFINDEX:
3840 			ifr->ifr_ifindex = dev->ifindex;
3841 			return 0;
3842 
3843 		case SIOCGIFTXQLEN:
3844 			ifr->ifr_qlen = dev->tx_queue_len;
3845 			return 0;
3846 
3847 		default:
3848 			/* dev_ioctl() should ensure this case
3849 			 * is never reached
3850 			 */
3851 			WARN_ON(1);
3852 			err = -EINVAL;
3853 			break;
3854 
3855 	}
3856 	return err;
3857 }
3858 
3859 /*
3860  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
3861  */
3862 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3863 {
3864 	int err;
3865 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3866 	const struct net_device_ops *ops;
3867 
3868 	if (!dev)
3869 		return -ENODEV;
3870 
3871 	ops = dev->netdev_ops;
3872 
3873 	switch (cmd) {
3874 		case SIOCSIFFLAGS:	/* Set interface flags */
3875 			return dev_change_flags(dev, ifr->ifr_flags);
3876 
3877 		case SIOCSIFMETRIC:	/* Set the metric on the interface
3878 					   (currently unused) */
3879 			return -EOPNOTSUPP;
3880 
3881 		case SIOCSIFMTU:	/* Set the MTU of a device */
3882 			return dev_set_mtu(dev, ifr->ifr_mtu);
3883 
3884 		case SIOCSIFHWADDR:
3885 			return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3886 
3887 		case SIOCSIFHWBROADCAST:
3888 			if (ifr->ifr_hwaddr.sa_family != dev->type)
3889 				return -EINVAL;
3890 			memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3891 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3892 			call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3893 			return 0;
3894 
3895 		case SIOCSIFMAP:
3896 			if (ops->ndo_set_config) {
3897 				if (!netif_device_present(dev))
3898 					return -ENODEV;
3899 				return ops->ndo_set_config(dev, &ifr->ifr_map);
3900 			}
3901 			return -EOPNOTSUPP;
3902 
3903 		case SIOCADDMULTI:
3904 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3905 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3906 				return -EINVAL;
3907 			if (!netif_device_present(dev))
3908 				return -ENODEV;
3909 			return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3910 					  dev->addr_len, 1);
3911 
3912 		case SIOCDELMULTI:
3913 			if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3914 			    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3915 				return -EINVAL;
3916 			if (!netif_device_present(dev))
3917 				return -ENODEV;
3918 			return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3919 					     dev->addr_len, 1);
3920 
3921 		case SIOCSIFTXQLEN:
3922 			if (ifr->ifr_qlen < 0)
3923 				return -EINVAL;
3924 			dev->tx_queue_len = ifr->ifr_qlen;
3925 			return 0;
3926 
3927 		case SIOCSIFNAME:
3928 			ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3929 			return dev_change_name(dev, ifr->ifr_newname);
3930 
3931 		/*
3932 		 *	Unknown or private ioctl
3933 		 */
3934 
3935 		default:
3936 			if ((cmd >= SIOCDEVPRIVATE &&
3937 			    cmd <= SIOCDEVPRIVATE + 15) ||
3938 			    cmd == SIOCBONDENSLAVE ||
3939 			    cmd == SIOCBONDRELEASE ||
3940 			    cmd == SIOCBONDSETHWADDR ||
3941 			    cmd == SIOCBONDSLAVEINFOQUERY ||
3942 			    cmd == SIOCBONDINFOQUERY ||
3943 			    cmd == SIOCBONDCHANGEACTIVE ||
3944 			    cmd == SIOCGMIIPHY ||
3945 			    cmd == SIOCGMIIREG ||
3946 			    cmd == SIOCSMIIREG ||
3947 			    cmd == SIOCBRADDIF ||
3948 			    cmd == SIOCBRDELIF ||
3949 			    cmd == SIOCWANDEV) {
3950 				err = -EOPNOTSUPP;
3951 				if (ops->ndo_do_ioctl) {
3952 					if (netif_device_present(dev))
3953 						err = ops->ndo_do_ioctl(dev, ifr, cmd);
3954 					else
3955 						err = -ENODEV;
3956 				}
3957 			} else
3958 				err = -EINVAL;
3959 
3960 	}
3961 	return err;
3962 }
3963 
3964 /*
3965  *	This function handles all "interface"-type I/O control requests. The actual
3966  *	'doing' part of this is dev_ifsioc above.
3967  */
3968 
3969 /**
3970  *	dev_ioctl	-	network device ioctl
3971  *	@net: the applicable net namespace
3972  *	@cmd: command to issue
3973  *	@arg: pointer to a struct ifreq in user space
3974  *
3975  *	Issue ioctl functions to devices. This is normally called by the
3976  *	user space syscall interfaces but can sometimes be useful for
3977  *	other purposes. The return value is the return from the syscall if
3978  *	positive or a negative errno code on error.
3979  */
3980 
3981 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3982 {
3983 	struct ifreq ifr;
3984 	int ret;
3985 	char *colon;
3986 
3987 	/* One special case: SIOCGIFCONF takes an ifconf argument
3988 	   and requires a shared lock, because it sleeps writing
3989 	   to user space.
3990 	 */
3991 
3992 	if (cmd == SIOCGIFCONF) {
3993 		rtnl_lock();
3994 		ret = dev_ifconf(net, (char __user *) arg);
3995 		rtnl_unlock();
3996 		return ret;
3997 	}
3998 	if (cmd == SIOCGIFNAME)
3999 		return dev_ifname(net, (struct ifreq __user *)arg);
4000 
4001 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4002 		return -EFAULT;
4003 
4004 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4005 
4006 	colon = strchr(ifr.ifr_name, ':');
4007 	if (colon)
4008 		*colon = 0;
4009 
4010 	/*
4011 	 *	See which interface the caller is talking about.
4012 	 */
4013 
4014 	switch (cmd) {
4015 		/*
4016 		 *	These ioctl calls:
4017 		 *	- can be done by all.
4018 		 *	- atomic and do not require locking.
4019 		 *	- return a value
4020 		 */
4021 		case SIOCGIFFLAGS:
4022 		case SIOCGIFMETRIC:
4023 		case SIOCGIFMTU:
4024 		case SIOCGIFHWADDR:
4025 		case SIOCGIFSLAVE:
4026 		case SIOCGIFMAP:
4027 		case SIOCGIFINDEX:
4028 		case SIOCGIFTXQLEN:
4029 			dev_load(net, ifr.ifr_name);
4030 			read_lock(&dev_base_lock);
4031 			ret = dev_ifsioc_locked(net, &ifr, cmd);
4032 			read_unlock(&dev_base_lock);
4033 			if (!ret) {
4034 				if (colon)
4035 					*colon = ':';
4036 				if (copy_to_user(arg, &ifr,
4037 						 sizeof(struct ifreq)))
4038 					ret = -EFAULT;
4039 			}
4040 			return ret;
4041 
4042 		case SIOCETHTOOL:
4043 			dev_load(net, ifr.ifr_name);
4044 			rtnl_lock();
4045 			ret = dev_ethtool(net, &ifr);
4046 			rtnl_unlock();
4047 			if (!ret) {
4048 				if (colon)
4049 					*colon = ':';
4050 				if (copy_to_user(arg, &ifr,
4051 						 sizeof(struct ifreq)))
4052 					ret = -EFAULT;
4053 			}
4054 			return ret;
4055 
4056 		/*
4057 		 *	These ioctl calls:
4058 		 *	- require superuser power.
4059 		 *	- require strict serialization.
4060 		 *	- return a value
4061 		 */
4062 		case SIOCGMIIPHY:
4063 		case SIOCGMIIREG:
4064 		case SIOCSIFNAME:
4065 			if (!capable(CAP_NET_ADMIN))
4066 				return -EPERM;
4067 			dev_load(net, ifr.ifr_name);
4068 			rtnl_lock();
4069 			ret = dev_ifsioc(net, &ifr, cmd);
4070 			rtnl_unlock();
4071 			if (!ret) {
4072 				if (colon)
4073 					*colon = ':';
4074 				if (copy_to_user(arg, &ifr,
4075 						 sizeof(struct ifreq)))
4076 					ret = -EFAULT;
4077 			}
4078 			return ret;
4079 
4080 		/*
4081 		 *	These ioctl calls:
4082 		 *	- require superuser power.
4083 		 *	- require strict serialization.
4084 		 *	- do not return a value
4085 		 */
4086 		case SIOCSIFFLAGS:
4087 		case SIOCSIFMETRIC:
4088 		case SIOCSIFMTU:
4089 		case SIOCSIFMAP:
4090 		case SIOCSIFHWADDR:
4091 		case SIOCSIFSLAVE:
4092 		case SIOCADDMULTI:
4093 		case SIOCDELMULTI:
4094 		case SIOCSIFHWBROADCAST:
4095 		case SIOCSIFTXQLEN:
4096 		case SIOCSMIIREG:
4097 		case SIOCBONDENSLAVE:
4098 		case SIOCBONDRELEASE:
4099 		case SIOCBONDSETHWADDR:
4100 		case SIOCBONDCHANGEACTIVE:
4101 		case SIOCBRADDIF:
4102 		case SIOCBRDELIF:
4103 			if (!capable(CAP_NET_ADMIN))
4104 				return -EPERM;
4105 			/* fall through */
4106 		case SIOCBONDSLAVEINFOQUERY:
4107 		case SIOCBONDINFOQUERY:
4108 			dev_load(net, ifr.ifr_name);
4109 			rtnl_lock();
4110 			ret = dev_ifsioc(net, &ifr, cmd);
4111 			rtnl_unlock();
4112 			return ret;
4113 
4114 		case SIOCGIFMEM:
4115 			/* Get the per device memory space. We can add this but
4116 			 * currently do not support it */
4117 		case SIOCSIFMEM:
4118 			/* Set the per device memory buffer space.
4119 			 * Not applicable in our case */
4120 		case SIOCSIFLINK:
4121 			return -EINVAL;
4122 
4123 		/*
4124 		 *	Unknown or private ioctl.
4125 		 */
4126 		default:
4127 			if (cmd == SIOCWANDEV ||
4128 			    (cmd >= SIOCDEVPRIVATE &&
4129 			     cmd <= SIOCDEVPRIVATE + 15)) {
4130 				dev_load(net, ifr.ifr_name);
4131 				rtnl_lock();
4132 				ret = dev_ifsioc(net, &ifr, cmd);
4133 				rtnl_unlock();
4134 				if (!ret && copy_to_user(arg, &ifr,
4135 							 sizeof(struct ifreq)))
4136 					ret = -EFAULT;
4137 				return ret;
4138 			}
4139 			/* Take care of Wireless Extensions */
4140 			if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4141 				return wext_handle_ioctl(net, &ifr, cmd, arg);
4142 			return -EINVAL;
4143 	}
4144 }
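
/*
 * Illustrative sketch, seen from user space: the SIOCGIFMTU branch above
 * is normally reached through an ioctl() on an ordinary socket.  The
 * interface name is only an example.
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */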
4145 
4146 
4147 /**
4148  *	dev_new_index	-	allocate an ifindex
4149  *	@net: the applicable net namespace
4150  *
4151  *	Returns a suitable unique value for a new device interface
4152  *	number.  The caller must hold the rtnl semaphore or the
4153  *	dev_base_lock to be sure it remains unique.
4154  */
4155 static int dev_new_index(struct net *net)
4156 {
4157 	static int ifindex;
4158 	for (;;) {
4159 		if (++ifindex <= 0)
4160 			ifindex = 1;
4161 		if (!__dev_get_by_index(net, ifindex))
4162 			return ifindex;
4163 	}
4164 }
4165 
4166 /* Delayed registration/unregistration */
4167 static LIST_HEAD(net_todo_list);
4168 
4169 static void net_set_todo(struct net_device *dev)
4170 {
4171 	list_add_tail(&dev->todo_list, &net_todo_list);
4172 }
4173 
4174 static void rollback_registered(struct net_device *dev)
4175 {
4176 	BUG_ON(dev_boot_phase);
4177 	ASSERT_RTNL();
4178 
4179 	/* Some devices call this without having registered, to unwind initialization. */
4180 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4181 		printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4182 				  "was registered\n", dev->name, dev);
4183 
4184 		WARN_ON(1);
4185 		return;
4186 	}
4187 
4188 	BUG_ON(dev->reg_state != NETREG_REGISTERED);
4189 
4190 	/* If device is running, close it first. */
4191 	dev_close(dev);
4192 
4193 	/* And unlink it from device chain. */
4194 	unlist_netdevice(dev);
4195 
4196 	dev->reg_state = NETREG_UNREGISTERING;
4197 
4198 	synchronize_net();
4199 
4200 	/* Shutdown queueing discipline. */
4201 	dev_shutdown(dev);
4202 
4203 
4204 	/* Notify protocols, that we are about to destroy
4205 	   this device. They should clean all the things.
4206 	*/
4207 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4208 
4209 	/*
4210 	 *	Flush the unicast and multicast chains
4211 	 */
4212 	dev_addr_discard(dev);
4213 
4214 	if (dev->netdev_ops->ndo_uninit)
4215 		dev->netdev_ops->ndo_uninit(dev);
4216 
4217 	/* Notifier chain MUST detach us from master device. */
4218 	WARN_ON(dev->master);
4219 
4220 	/* Remove entries from kobject tree */
4221 	netdev_unregister_kobject(dev);
4222 
4223 	synchronize_net();
4224 
4225 	dev_put(dev);
4226 }
4227 
4228 static void __netdev_init_queue_locks_one(struct net_device *dev,
4229 					  struct netdev_queue *dev_queue,
4230 					  void *_unused)
4231 {
4232 	spin_lock_init(&dev_queue->_xmit_lock);
4233 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4234 	dev_queue->xmit_lock_owner = -1;
4235 }
4236 
4237 static void netdev_init_queue_locks(struct net_device *dev)
4238 {
4239 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4240 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4241 }
4242 
4243 unsigned long netdev_fix_features(unsigned long features, const char *name)
4244 {
4245 	/* Fix illegal SG+CSUM combinations. */
4246 	if ((features & NETIF_F_SG) &&
4247 	    !(features & NETIF_F_ALL_CSUM)) {
4248 		if (name)
4249 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4250 			       "checksum feature.\n", name);
4251 		features &= ~NETIF_F_SG;
4252 	}
4253 
4254 	/* TSO requires that SG is present as well. */
4255 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4256 		if (name)
4257 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4258 			       "SG feature.\n", name);
4259 		features &= ~NETIF_F_TSO;
4260 	}
4261 
4262 	if (features & NETIF_F_UFO) {
4263 		if (!(features & NETIF_F_GEN_CSUM)) {
4264 			if (name)
4265 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4266 				       "since no NETIF_F_HW_CSUM feature.\n",
4267 				       name);
4268 			features &= ~NETIF_F_UFO;
4269 		}
4270 
4271 		if (!(features & NETIF_F_SG)) {
4272 			if (name)
4273 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4274 				       "since no NETIF_F_SG feature.\n", name);
4275 			features &= ~NETIF_F_UFO;
4276 		}
4277 	}
4278 
4279 	return features;
4280 }
4281 EXPORT_SYMBOL(netdev_fix_features);
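
/*
 * Illustrative sketch: a driver (or a stacked device computing its
 * feature set) can use netdev_fix_features() to drop combinations the
 * core considers illegal, e.g. TSO requested without SG.  The feature
 * set below is only an example.
 *
 *	unsigned long wanted = NETIF_F_IP_CSUM | NETIF_F_TSO;	// no NETIF_F_SG
 *
 *	dev->features = netdev_fix_features(wanted, dev->name);
 *	// NETIF_F_TSO has been cleared and a notice printed
 */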
4282 
4283 /* Some devices need to (re-)set their netdev_ops inside
4284  * ->init() or similar.  If that happens, we have to set up
4285  * the compat pointers again.
4286  */
4287 void netdev_resync_ops(struct net_device *dev)
4288 {
4289 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4290 	const struct net_device_ops *ops = dev->netdev_ops;
4291 
4292 	dev->init = ops->ndo_init;
4293 	dev->uninit = ops->ndo_uninit;
4294 	dev->open = ops->ndo_open;
4295 	dev->change_rx_flags = ops->ndo_change_rx_flags;
4296 	dev->set_rx_mode = ops->ndo_set_rx_mode;
4297 	dev->set_multicast_list = ops->ndo_set_multicast_list;
4298 	dev->set_mac_address = ops->ndo_set_mac_address;
4299 	dev->validate_addr = ops->ndo_validate_addr;
4300 	dev->do_ioctl = ops->ndo_do_ioctl;
4301 	dev->set_config = ops->ndo_set_config;
4302 	dev->change_mtu = ops->ndo_change_mtu;
4303 	dev->neigh_setup = ops->ndo_neigh_setup;
4304 	dev->tx_timeout = ops->ndo_tx_timeout;
4305 	dev->get_stats = ops->ndo_get_stats;
4306 	dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4307 	dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4308 	dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4309 #ifdef CONFIG_NET_POLL_CONTROLLER
4310 	dev->poll_controller = ops->ndo_poll_controller;
4311 #endif
4312 #endif
4313 }
4314 EXPORT_SYMBOL(netdev_resync_ops);
4315 
4316 /**
4317  *	register_netdevice	- register a network device
4318  *	@dev: device to register
4319  *
4320  *	Take a completed network device structure and add it to the kernel
4321  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4322  *	chain. 0 is returned on success. A negative errno code is returned
4323  *	on a failure to set up the device, or if the name is a duplicate.
4324  *
4325  *	Callers must hold the rtnl semaphore. You may want
4326  *	register_netdev() instead of this.
4327  *
4328  *	BUGS:
4329  *	The locking appears insufficient to guarantee two parallel registers
4330  *	will not get the same name.
4331  */
4332 
4333 int register_netdevice(struct net_device *dev)
4334 {
4335 	struct hlist_head *head;
4336 	struct hlist_node *p;
4337 	int ret;
4338 	struct net *net = dev_net(dev);
4339 
4340 	BUG_ON(dev_boot_phase);
4341 	ASSERT_RTNL();
4342 
4343 	might_sleep();
4344 
4345 	/* When net_devices are persistent, this will be fatal. */
4346 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4347 	BUG_ON(!net);
4348 
4349 	spin_lock_init(&dev->addr_list_lock);
4350 	netdev_set_addr_lockdep_class(dev);
4351 	netdev_init_queue_locks(dev);
4352 
4353 	dev->iflink = -1;
4354 
4355 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4356 	/* Netdevice_ops API compatibility support.
4357 	 * This is temporary until all network devices are converted.
4358 	 */
4359 	if (dev->netdev_ops) {
4360 		netdev_resync_ops(dev);
4361 	} else {
4362 		char drivername[64];
4363 		pr_info("%s (%s): not using net_device_ops yet\n",
4364 			dev->name, netdev_drivername(dev, drivername, 64));
4365 
4366 		/* This works only because net_device_ops and the
4367 		   compatibility structure are the same. */
4368 		dev->netdev_ops = (void *) &(dev->init);
4369 	}
4370 #endif
4371 
4372 	/* Init, if this function is available */
4373 	if (dev->netdev_ops->ndo_init) {
4374 		ret = dev->netdev_ops->ndo_init(dev);
4375 		if (ret) {
4376 			if (ret > 0)
4377 				ret = -EIO;
4378 			goto out;
4379 		}
4380 	}
4381 
4382 	if (!dev_valid_name(dev->name)) {
4383 		ret = -EINVAL;
4384 		goto err_uninit;
4385 	}
4386 
4387 	dev->ifindex = dev_new_index(net);
4388 	if (dev->iflink == -1)
4389 		dev->iflink = dev->ifindex;
4390 
4391 	/* Check for existence of name */
4392 	head = dev_name_hash(net, dev->name);
4393 	hlist_for_each(p, head) {
4394 		struct net_device *d
4395 			= hlist_entry(p, struct net_device, name_hlist);
4396 		if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4397 			ret = -EEXIST;
4398 			goto err_uninit;
4399 		}
4400 	}
4401 
4402 	/* Fix illegal checksum combinations */
4403 	if ((dev->features & NETIF_F_HW_CSUM) &&
4404 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4405 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4406 		       dev->name);
4407 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4408 	}
4409 
4410 	if ((dev->features & NETIF_F_NO_CSUM) &&
4411 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4412 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4413 		       dev->name);
4414 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4415 	}
4416 
4417 	dev->features = netdev_fix_features(dev->features, dev->name);
4418 
4419 	/* Enable software GSO if SG is supported. */
4420 	if (dev->features & NETIF_F_SG)
4421 		dev->features |= NETIF_F_GSO;
4422 
4423 	netdev_initialize_kobject(dev);
4424 	ret = netdev_register_kobject(dev);
4425 	if (ret)
4426 		goto err_uninit;
4427 	dev->reg_state = NETREG_REGISTERED;
4428 
4429 	/*
4430 	 *	Default initial state at registration is that the
4431 	 *	device is present.
4432 	 */
4433 
4434 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4435 
4436 	dev_init_scheduler(dev);
4437 	dev_hold(dev);
4438 	list_netdevice(dev);
4439 
4440 	/* Notify protocols, that a new device appeared. */
4441 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4442 	ret = notifier_to_errno(ret);
4443 	if (ret) {
4444 		rollback_registered(dev);
4445 		dev->reg_state = NETREG_UNREGISTERED;
4446 	}
4447 
4448 out:
4449 	return ret;
4450 
4451 err_uninit:
4452 	if (dev->netdev_ops->ndo_uninit)
4453 		dev->netdev_ops->ndo_uninit(dev);
4454 	goto out;
4455 }
4456 
4457 /**
4458  *	init_dummy_netdev	- init a dummy network device for NAPI
4459  *	@dev: device to init
4460  *
4461  *	This takes a network device structure and initializes the minimum
4462  *	number of fields so it can be used to schedule NAPI polls without
4463  *	registering a full blown interface. This is to be used by drivers
4464  *	that need to tie several hardware interfaces to a single NAPI
4465  *	poll scheduler due to HW limitations.
4466  */
4467 int init_dummy_netdev(struct net_device *dev)
4468 {
4469 	/* Clear everything. Note we don't initialize spinlocks
4470 	 * as they aren't supposed to be taken by any of the
4471 	 * NAPI code and this dummy netdev is supposed to be
4472 	 * only ever used for NAPI polls
4473 	 */
4474 	memset(dev, 0, sizeof(struct net_device));
4475 
4476 	/* make sure we BUG if trying to hit standard
4477 	 * register/unregister code path
4478 	 */
4479 	dev->reg_state = NETREG_DUMMY;
4480 
4481 	/* initialize the ref count */
4482 	atomic_set(&dev->refcnt, 1);
4483 
4484 	/* NAPI wants this */
4485 	INIT_LIST_HEAD(&dev->napi_list);
4486 
4487 	/* a dummy interface is started by default */
4488 	set_bit(__LINK_STATE_PRESENT, &dev->state);
4489 	set_bit(__LINK_STATE_START, &dev->state);
4490 
4491 	return 0;
4492 }
4493 EXPORT_SYMBOL_GPL(init_dummy_netdev);
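
/*
 * Illustrative sketch, assuming a hypothetical driver: hardware with a
 * single interrupt but several logical interfaces can hang its NAPI
 * context off a dummy netdev instead of a registered one.
 *
 *	struct my_hw {
 *		struct net_device napi_dev;	// never registered
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&hw->napi_dev);
 *	netif_napi_add(&hw->napi_dev, &hw->napi, my_poll, 64);
 *	...
 *	napi_schedule(&hw->napi);	// e.g. from the interrupt handler
 */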
4494 
4495 
4496 /**
4497  *	register_netdev	- register a network device
4498  *	@dev: device to register
4499  *
4500  *	Take a completed network device structure and add it to the kernel
4501  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4502  *	chain. 0 is returned on success. A negative errno code is returned
4503  *	on a failure to set up the device, or if the name is a duplicate.
4504  *
4505  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
4506  *	and expands the device name if you passed a format string to
4507  *	alloc_netdev.
4508  */
4509 int register_netdev(struct net_device *dev)
4510 {
4511 	int err;
4512 
4513 	rtnl_lock();
4514 
4515 	/*
4516 	 * If the name is a format string the caller wants us to do a
4517 	 * name allocation.
4518 	 */
4519 	if (strchr(dev->name, '%')) {
4520 		err = dev_alloc_name(dev, dev->name);
4521 		if (err < 0)
4522 			goto out;
4523 	}
4524 
4525 	err = register_netdevice(dev);
4526 out:
4527 	rtnl_unlock();
4528 	return err;
4529 }
4530 EXPORT_SYMBOL(register_netdev);
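
/*
 * Illustrative sketch of a typical driver probe path (struct my_priv and
 * my_netdev_ops are hypothetical): allocate, fill in the ops, register,
 * and free the device again if registration fails.
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);	// takes the RTNL lock itself
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */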
4531 
4532 /*
4533  * netdev_wait_allrefs - wait until all references are gone.
4534  *
4535  * This is called when unregistering network devices.
4536  *
4537  * Any protocol or device that holds a reference should register
4538  * for netdevice notification, and cleanup and put back the
4539  * reference if they receive an UNREGISTER event.
4540  * We can get stuck here if buggy protocols don't correctly
4541  * call dev_put.
4542  */
4543 static void netdev_wait_allrefs(struct net_device *dev)
4544 {
4545 	unsigned long rebroadcast_time, warning_time;
4546 
4547 	rebroadcast_time = warning_time = jiffies;
4548 	while (atomic_read(&dev->refcnt) != 0) {
4549 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4550 			rtnl_lock();
4551 
4552 			/* Rebroadcast unregister notification */
4553 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4554 
4555 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4556 				     &dev->state)) {
4557 				/* We must not have linkwatch events
4558 				 * pending on unregister. If this
4559 				 * happens, we simply run the queue
4560 				 * unscheduled, resulting in a noop
4561 				 * for this device.
4562 				 */
4563 				linkwatch_run_queue();
4564 			}
4565 
4566 			__rtnl_unlock();
4567 
4568 			rebroadcast_time = jiffies;
4569 		}
4570 
4571 		msleep(250);
4572 
4573 		if (time_after(jiffies, warning_time + 10 * HZ)) {
4574 			printk(KERN_EMERG "unregister_netdevice: "
4575 			       "waiting for %s to become free. Usage "
4576 			       "count = %d\n",
4577 			       dev->name, atomic_read(&dev->refcnt));
4578 			warning_time = jiffies;
4579 		}
4580 	}
4581 }
4582 
4583 /* The sequence is:
4584  *
4585  *	rtnl_lock();
4586  *	...
4587  *	register_netdevice(x1);
4588  *	register_netdevice(x2);
4589  *	...
4590  *	unregister_netdevice(y1);
4591  *	unregister_netdevice(y2);
4592  *      ...
4593  *	rtnl_unlock();
4594  *	free_netdev(y1);
4595  *	free_netdev(y2);
4596  *
4597  * We are invoked by rtnl_unlock().
4598  * This allows us to deal with problems:
4599  * 1) We can delete sysfs objects which invoke hotplug
4600  *    without deadlocking with linkwatch via keventd.
4601  * 2) Since we run with the RTNL semaphore not held, we can sleep
4602  *    safely in order to wait for the netdev refcnt to drop to zero.
4603  *
4604  * We must not return until all unregister events added during
4605  * the interval the lock was held have been completed.
4606  */
4607 void netdev_run_todo(void)
4608 {
4609 	struct list_head list;
4610 
4611 	/* Snapshot list, allow later requests */
4612 	list_replace_init(&net_todo_list, &list);
4613 
4614 	__rtnl_unlock();
4615 
4616 	while (!list_empty(&list)) {
4617 		struct net_device *dev
4618 			= list_entry(list.next, struct net_device, todo_list);
4619 		list_del(&dev->todo_list);
4620 
4621 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4622 			printk(KERN_ERR "network todo '%s' but state %d\n",
4623 			       dev->name, dev->reg_state);
4624 			dump_stack();
4625 			continue;
4626 		}
4627 
4628 		dev->reg_state = NETREG_UNREGISTERED;
4629 
4630 		on_each_cpu(flush_backlog, dev, 1);
4631 
4632 		netdev_wait_allrefs(dev);
4633 
4634 		/* paranoia */
4635 		BUG_ON(atomic_read(&dev->refcnt));
4636 		WARN_ON(dev->ip_ptr);
4637 		WARN_ON(dev->ip6_ptr);
4638 		WARN_ON(dev->dn_ptr);
4639 
4640 		if (dev->destructor)
4641 			dev->destructor(dev);
4642 
4643 		/* Free network device */
4644 		kobject_put(&dev->dev.kobj);
4645 	}
4646 }
4647 
4648 /**
4649  *	dev_get_stats	- get network device statistics
4650  *	@dev: device to get statistics from
4651  *
4652  *	Get network statistics from device. The device driver may provide
4653  *	its own method by setting dev->netdev_ops->ndo_get_stats; otherwise
4654  *	the internal statistics structure is used.
4655  */
4656 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4657 {
4658 	const struct net_device_ops *ops = dev->netdev_ops;
4659 
4660 	if (ops->ndo_get_stats)
4661 		return ops->ndo_get_stats(dev);
4662 	else
4663 		return &dev->stats;
4664 }
4665 EXPORT_SYMBOL(dev_get_stats);
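
/*
 * Illustrative sketch, assuming a hypothetical driver: a driver that
 * keeps its own counters can override the default by supplying
 * ndo_get_stats; otherwise the core simply returns &dev->stats.
 *
 *	static struct net_device_stats *my_get_stats(struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		priv->stats.rx_dropped = my_read_hw_drops(priv);	// made-up helper
 *		return &priv->stats;
 *	}
 *
 *	static const struct net_device_ops my_netdev_ops = {
 *		.ndo_get_stats	= my_get_stats,
 *	};
 */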
4666 
4667 static void netdev_init_one_queue(struct net_device *dev,
4668 				  struct netdev_queue *queue,
4669 				  void *_unused)
4670 {
4671 	queue->dev = dev;
4672 }
4673 
4674 static void netdev_init_queues(struct net_device *dev)
4675 {
4676 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4677 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4678 	spin_lock_init(&dev->tx_global_lock);
4679 }
4680 
4681 /**
4682  *	alloc_netdev_mq - allocate network device
4683  *	@sizeof_priv:	size of private data to allocate space for
4684  *	@name:		device name format string
4685  *	@setup:		callback to initialize device
4686  *	@queue_count:	the number of subqueues to allocate
4687  *
4688  *	Allocates a struct net_device with private data area for driver use
4689  *	and performs basic initialization.  Also allocates subqueue structs
4690  *	for each queue on the device at the end of the netdevice.
4691  */
4692 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4693 		void (*setup)(struct net_device *), unsigned int queue_count)
4694 {
4695 	struct netdev_queue *tx;
4696 	struct net_device *dev;
4697 	size_t alloc_size;
4698 	void *p;
4699 
4700 	BUG_ON(strlen(name) >= sizeof(dev->name));
4701 
4702 	alloc_size = sizeof(struct net_device);
4703 	if (sizeof_priv) {
4704 		/* ensure 32-byte alignment of private area */
4705 		alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4706 		alloc_size += sizeof_priv;
4707 	}
4708 	/* ensure 32-byte alignment of whole construct */
4709 	alloc_size += NETDEV_ALIGN_CONST;
4710 
4711 	p = kzalloc(alloc_size, GFP_KERNEL);
4712 	if (!p) {
4713 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4714 		return NULL;
4715 	}
4716 
4717 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4718 	if (!tx) {
4719 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
4720 		       "tx qdiscs.\n");
4721 		kfree(p);
4722 		return NULL;
4723 	}
4724 
4725 	dev = (struct net_device *)
4726 		(((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4727 	dev->padded = (char *)dev - (char *)p;
4728 	dev_net_set(dev, &init_net);
4729 
4730 	dev->_tx = tx;
4731 	dev->num_tx_queues = queue_count;
4732 	dev->real_num_tx_queues = queue_count;
4733 
4734 	dev->gso_max_size = GSO_MAX_SIZE;
4735 
4736 	netdev_init_queues(dev);
4737 
4738 	INIT_LIST_HEAD(&dev->napi_list);
4739 	setup(dev);
4740 	strcpy(dev->name, name);
4741 	return dev;
4742 }
4743 EXPORT_SYMBOL(alloc_netdev_mq);
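
/*
 * Illustrative sketch: a multiqueue-capable driver allocates one
 * struct netdev_queue per hardware TX queue.  struct my_priv and
 * MY_NUM_TX_QUEUES are hypothetical.
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
 *			      ether_setup, MY_NUM_TX_QUEUES);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 */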
4744 
4745 /**
4746  *	free_netdev - free network device
4747  *	@dev: device
4748  *
4749  *	This function does the last stage of destroying an allocated device
4750  * 	interface. The reference to the device object is released.
4751  *	If this is the last reference then it will be freed.
4752  */
4753 void free_netdev(struct net_device *dev)
4754 {
4755 	struct napi_struct *p, *n;
4756 
4757 	release_net(dev_net(dev));
4758 
4759 	kfree(dev->_tx);
4760 
4761 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4762 		netif_napi_del(p);
4763 
4764 	/*  Compatibility with error handling in drivers */
4765 	if (dev->reg_state == NETREG_UNINITIALIZED) {
4766 		kfree((char *)dev - dev->padded);
4767 		return;
4768 	}
4769 
4770 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4771 	dev->reg_state = NETREG_RELEASED;
4772 
4773 	/* will free via device release */
4774 	put_device(&dev->dev);
4775 }
4776 
4777 /**
4778  *	synchronize_net -  Synchronize with packet receive processing
4779  *
4780  *	Wait for packets currently being received to be done.
4781  *	Does not block later packets from starting.
4782  */
4783 void synchronize_net(void)
4784 {
4785 	might_sleep();
4786 	synchronize_rcu();
4787 }
4788 
4789 /**
4790  *	unregister_netdevice - remove device from the kernel
4791  *	@dev: device
4792  *
4793  *	This function shuts down a device interface and removes it
4794  *	from the kernel tables.
4795  *
4796  *	Callers must hold the rtnl semaphore.  You may want
4797  *	unregister_netdev() instead of this.
4798  */
4799 
4800 void unregister_netdevice(struct net_device *dev)
4801 {
4802 	ASSERT_RTNL();
4803 
4804 	rollback_registered(dev);
4805 	/* Finish processing unregister after unlock */
4806 	net_set_todo(dev);
4807 }
4808 
4809 /**
4810  *	unregister_netdev - remove device from the kernel
4811  *	@dev: device
4812  *
4813  *	This function shuts down a device interface and removes it
4814  *	from the kernel tables.
4815  *
4816  *	This is just a wrapper for unregister_netdevice that takes
4817  *	the rtnl semaphore.  In general you want to use this and not
4818  *	unregister_netdevice.
4819  */
4820 void unregister_netdev(struct net_device *dev)
4821 {
4822 	rtnl_lock();
4823 	unregister_netdevice(dev);
4824 	rtnl_unlock();
4825 }
4826 
4827 EXPORT_SYMBOL(unregister_netdev);
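
/*
 * Illustrative sketch: the usual teardown in a driver's remove path
 * mirrors the alloc/register pair above; free_netdev() may only be
 * called once the device has been unregistered.
 *
 *	unregister_netdev(dev);		// takes and releases the RTNL lock
 *	free_netdev(dev);
 */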
4828 
4829 /**
4830  *	dev_change_net_namespace - move device to a different network namespace
4831  *	@dev: device
4832  *	@net: network namespace
4833  *	@pat: If not NULL name pattern to try if the current device name
4834  *	      is already taken in the destination network namespace.
4835  *
4836  *	This function shuts down a device interface and moves it
4837  *	to a new network namespace. On success 0 is returned, on
4838  *	a failure a negative errno code is returned.
4839  *
4840  *	Callers must hold the rtnl semaphore.
4841  */
4842 
4843 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4844 {
4845 	char buf[IFNAMSIZ];
4846 	const char *destname;
4847 	int err;
4848 
4849 	ASSERT_RTNL();
4850 
4851 	/* Don't allow namespace local devices to be moved. */
4852 	err = -EINVAL;
4853 	if (dev->features & NETIF_F_NETNS_LOCAL)
4854 		goto out;
4855 
4856 #ifdef CONFIG_SYSFS
4857 	/* Don't allow real devices to be moved when sysfs
4858 	 * is enabled.
4859 	 */
4860 	err = -EINVAL;
4861 	if (dev->dev.parent)
4862 		goto out;
4863 #endif
4864 
4865 	/* Ensure the device has been registered */
4866 	err = -EINVAL;
4867 	if (dev->reg_state != NETREG_REGISTERED)
4868 		goto out;
4869 
4870 	/* Get out if there is nothing to do */
4871 	err = 0;
4872 	if (net_eq(dev_net(dev), net))
4873 		goto out;
4874 
4875 	/* Pick the destination device name, and ensure
4876 	 * we can use it in the destination network namespace.
4877 	 */
4878 	err = -EEXIST;
4879 	destname = dev->name;
4880 	if (__dev_get_by_name(net, destname)) {
4881 		/* We get here if we can't use the current device name */
4882 		if (!pat)
4883 			goto out;
4884 		if (!dev_valid_name(pat))
4885 			goto out;
4886 		if (strchr(pat, '%')) {
4887 			if (__dev_alloc_name(net, pat, buf) < 0)
4888 				goto out;
4889 			destname = buf;
4890 		} else
4891 			destname = pat;
4892 		if (__dev_get_by_name(net, destname))
4893 			goto out;
4894 	}
4895 
4896 	/*
4897 	 * And now a mini version of register_netdevice and unregister_netdevice.
4898 	 */
4899 
4900 	/* If device is running, close it first. */
4901 	dev_close(dev);
4902 
4903 	/* And unlink it from device chain */
4904 	err = -ENODEV;
4905 	unlist_netdevice(dev);
4906 
4907 	synchronize_net();
4908 
4909 	/* Shutdown queueing discipline. */
4910 	dev_shutdown(dev);
4911 
4912 	/* Notify protocols, that we are about to destroy
4913 	   this device. They should clean all the things.
4914 	*/
4915 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4916 
4917 	/*
4918 	 *	Flush the unicast and multicast chains
4919 	 */
4920 	dev_addr_discard(dev);
4921 
4922 	netdev_unregister_kobject(dev);
4923 
4924 	/* Actually switch the network namespace */
4925 	dev_net_set(dev, net);
4926 
4927 	/* Assign the new device name */
4928 	if (destname != dev->name)
4929 		strcpy(dev->name, destname);
4930 
4931 	/* If there is an ifindex conflict assign a new one */
4932 	if (__dev_get_by_index(net, dev->ifindex)) {
4933 		int iflink = (dev->iflink == dev->ifindex);
4934 		dev->ifindex = dev_new_index(net);
4935 		if (iflink)
4936 			dev->iflink = dev->ifindex;
4937 	}
4938 
4939 	/* Fixup kobjects */
4940 	err = netdev_register_kobject(dev);
4941 	WARN_ON(err);
4942 
4943 	/* Add the device back in the hashes */
4944 	list_netdevice(dev);
4945 
4946 	/* Notify protocols, that a new device appeared. */
4947 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
4948 
4949 	synchronize_net();
4950 	err = 0;
4951 out:
4952 	return err;
4953 }
4954 
4955 static int dev_cpu_callback(struct notifier_block *nfb,
4956 			    unsigned long action,
4957 			    void *ocpu)
4958 {
4959 	struct sk_buff **list_skb;
4960 	struct Qdisc **list_net;
4961 	struct sk_buff *skb;
4962 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
4963 	struct softnet_data *sd, *oldsd;
4964 
4965 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
4966 		return NOTIFY_OK;
4967 
4968 	local_irq_disable();
4969 	cpu = smp_processor_id();
4970 	sd = &per_cpu(softnet_data, cpu);
4971 	oldsd = &per_cpu(softnet_data, oldcpu);
4972 
4973 	/* Find end of our completion_queue. */
4974 	list_skb = &sd->completion_queue;
4975 	while (*list_skb)
4976 		list_skb = &(*list_skb)->next;
4977 	/* Append completion queue from offline CPU. */
4978 	*list_skb = oldsd->completion_queue;
4979 	oldsd->completion_queue = NULL;
4980 
4981 	/* Find end of our output_queue. */
4982 	list_net = &sd->output_queue;
4983 	while (*list_net)
4984 		list_net = &(*list_net)->next_sched;
4985 	/* Append output queue from offline CPU. */
4986 	*list_net = oldsd->output_queue;
4987 	oldsd->output_queue = NULL;
4988 
4989 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
4990 	local_irq_enable();
4991 
4992 	/* Process offline CPU's input_pkt_queue */
4993 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
4994 		netif_rx(skb);
4995 
4996 	return NOTIFY_OK;
4997 }
4998 
4999 
5000 /**
5001  *	netdev_increment_features - increment feature set by one
5002  *	@all: current feature set
5003  *	@one: new feature set
5004  *	@mask: mask feature set
5005  *
5006  *	Computes a new feature set after adding a device with feature set
5007  *	@one to the master device with current feature set @all.  Will not
5008  *	enable anything that is off in @mask. Returns the new feature set.
5009  */
5010 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5011 					unsigned long mask)
5012 {
5013 	/* If device needs checksumming, downgrade to it. */
5014 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5015 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5016 	else if (mask & NETIF_F_ALL_CSUM) {
5017 		/* If one device supports v4/v6 checksumming, set for all. */
5018 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5019 		    !(all & NETIF_F_GEN_CSUM)) {
5020 			all &= ~NETIF_F_ALL_CSUM;
5021 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5022 		}
5023 
5024 		/* If one device supports hw checksumming, set for all. */
5025 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5026 			all &= ~NETIF_F_ALL_CSUM;
5027 			all |= NETIF_F_HW_CSUM;
5028 		}
5029 	}
5030 
5031 	one |= NETIF_F_ALL_CSUM;
5032 
5033 	one |= all & NETIF_F_ONE_FOR_ALL;
5034 	all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5035 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5036 
5037 	return all;
5038 }
5039 EXPORT_SYMBOL(netdev_increment_features);
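
/*
 * Illustrative sketch: a bonding- or bridge-like master could fold a
 * newly added slave's feature set into its own like this; "master" and
 * "slave" are hypothetical net_device pointers and the mask choice is
 * only an example.
 *
 *	master->features = netdev_increment_features(master->features,
 *						     slave->features,
 *						     NETIF_F_ONE_FOR_ALL);
 */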
5040 
5041 static struct hlist_head *netdev_create_hash(void)
5042 {
5043 	int i;
5044 	struct hlist_head *hash;
5045 
5046 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5047 	if (hash != NULL)
5048 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5049 			INIT_HLIST_HEAD(&hash[i]);
5050 
5051 	return hash;
5052 }
5053 
5054 /* Initialize per network namespace state */
5055 static int __net_init netdev_init(struct net *net)
5056 {
5057 	INIT_LIST_HEAD(&net->dev_base_head);
5058 
5059 	net->dev_name_head = netdev_create_hash();
5060 	if (net->dev_name_head == NULL)
5061 		goto err_name;
5062 
5063 	net->dev_index_head = netdev_create_hash();
5064 	if (net->dev_index_head == NULL)
5065 		goto err_idx;
5066 
5067 	return 0;
5068 
5069 err_idx:
5070 	kfree(net->dev_name_head);
5071 err_name:
5072 	return -ENOMEM;
5073 }
5074 
5075 /**
5076  *	netdev_drivername - network driver for the device
5077  *	@dev: network device
5078  *	@buffer: buffer for resulting name
5079  *	@len: size of buffer
5080  *
5081  *	Determine network driver for device.
5082  */
5083 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5084 {
5085 	const struct device_driver *driver;
5086 	const struct device *parent;
5087 
5088 	if (len <= 0 || !buffer)
5089 		return buffer;
5090 	buffer[0] = 0;
5091 
5092 	parent = dev->dev.parent;
5093 
5094 	if (!parent)
5095 		return buffer;
5096 
5097 	driver = parent->driver;
5098 	if (driver && driver->name)
5099 		strlcpy(buffer, driver->name, len);
5100 	return buffer;
5101 }
5102 
5103 static void __net_exit netdev_exit(struct net *net)
5104 {
5105 	kfree(net->dev_name_head);
5106 	kfree(net->dev_index_head);
5107 }
5108 
5109 static struct pernet_operations __net_initdata netdev_net_ops = {
5110 	.init = netdev_init,
5111 	.exit = netdev_exit,
5112 };
5113 
5114 static void __net_exit default_device_exit(struct net *net)
5115 {
5116 	struct net_device *dev;
5117 	/*
5118 	 * Push all migratable network devices back to the
5119 	 * initial network namespace.
5120 	 */
5121 	rtnl_lock();
5122 restart:
5123 	for_each_netdev(net, dev) {
5124 		int err;
5125 		char fb_name[IFNAMSIZ];
5126 
5127 		/* Ignore unmoveable devices (i.e. loopback) */
5128 		if (dev->features & NETIF_F_NETNS_LOCAL)
5129 			continue;
5130 
5131 		/* Delete virtual devices */
5132 		if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5133 			dev->rtnl_link_ops->dellink(dev);
5134 			goto restart;
5135 		}
5136 
5137 		/* Push remaining network devices to init_net */
5138 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5139 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5140 		if (err) {
5141 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5142 				__func__, dev->name, err);
5143 			BUG();
5144 		}
5145 		goto restart;
5146 	}
5147 	rtnl_unlock();
5148 }
5149 
5150 static struct pernet_operations __net_initdata default_device_ops = {
5151 	.exit = default_device_exit,
5152 };
5153 
5154 /*
5155  *	Initialize the DEV module. At boot time this walks the device list and
5156  *	unhooks any devices that fail to initialise (normally hardware not
5157  *	present) and leaves us with a valid list of present and active devices.
5158  *
5159  */
5160 
5161 /*
5162  *       This is called single threaded during boot, so no need
5163  *       to take the rtnl semaphore.
5164  */
5165 static int __init net_dev_init(void)
5166 {
5167 	int i, rc = -ENOMEM;
5168 
5169 	BUG_ON(!dev_boot_phase);
5170 
5171 	if (dev_proc_init())
5172 		goto out;
5173 
5174 	if (netdev_kobject_init())
5175 		goto out;
5176 
5177 	INIT_LIST_HEAD(&ptype_all);
5178 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5179 		INIT_LIST_HEAD(&ptype_base[i]);
5180 
5181 	if (register_pernet_subsys(&netdev_net_ops))
5182 		goto out;
5183 
5184 	/*
5185 	 *	Initialise the packet receive queues.
5186 	 */
5187 
5188 	for_each_possible_cpu(i) {
5189 		struct softnet_data *queue;
5190 
5191 		queue = &per_cpu(softnet_data, i);
5192 		skb_queue_head_init(&queue->input_pkt_queue);
5193 		queue->completion_queue = NULL;
5194 		INIT_LIST_HEAD(&queue->poll_list);
5195 
5196 		queue->backlog.poll = process_backlog;
5197 		queue->backlog.weight = weight_p;
5198 		queue->backlog.gro_list = NULL;
5199 	}
5200 
5201 	dev_boot_phase = 0;
5202 
5203 	/* The loopback device is special: if any other network device
5204 	 * is present in a network namespace, the loopback device must
5205 	 * be present too. Since we now dynamically allocate and free the
5206 	 * loopback device, ensure this invariant is maintained by
5207 	 * keeping the loopback device as the first device on the
5208 	 * list of network devices.  Ensure the loopback device
5209 	 * is the first device that appears and the last network device
5210 	 * that disappears.
5211 	 */
5212 	if (register_pernet_device(&loopback_net_ops))
5213 		goto out;
5214 
5215 	if (register_pernet_device(&default_device_ops))
5216 		goto out;
5217 
5218 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5219 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5220 
5221 	hotcpu_notifier(dev_cpu_callback, 0);
5222 	dst_init();
5223 	dev_mcast_init();
5224 	rc = 0;
5225 out:
5226 	return rc;
5227 }
5228 
5229 subsys_initcall(net_dev_init);
5230 
5231 EXPORT_SYMBOL(__dev_get_by_index);
5232 EXPORT_SYMBOL(__dev_get_by_name);
5233 EXPORT_SYMBOL(__dev_remove_pack);
5234 EXPORT_SYMBOL(dev_valid_name);
5235 EXPORT_SYMBOL(dev_add_pack);
5236 EXPORT_SYMBOL(dev_alloc_name);
5237 EXPORT_SYMBOL(dev_close);
5238 EXPORT_SYMBOL(dev_get_by_flags);
5239 EXPORT_SYMBOL(dev_get_by_index);
5240 EXPORT_SYMBOL(dev_get_by_name);
5241 EXPORT_SYMBOL(dev_open);
5242 EXPORT_SYMBOL(dev_queue_xmit);
5243 EXPORT_SYMBOL(dev_remove_pack);
5244 EXPORT_SYMBOL(dev_set_allmulti);
5245 EXPORT_SYMBOL(dev_set_promiscuity);
5246 EXPORT_SYMBOL(dev_change_flags);
5247 EXPORT_SYMBOL(dev_set_mtu);
5248 EXPORT_SYMBOL(dev_set_mac_address);
5249 EXPORT_SYMBOL(free_netdev);
5250 EXPORT_SYMBOL(netdev_boot_setup_check);
5251 EXPORT_SYMBOL(netdev_set_master);
5252 EXPORT_SYMBOL(netdev_state_change);
5253 EXPORT_SYMBOL(netif_receive_skb);
5254 EXPORT_SYMBOL(netif_rx);
5255 EXPORT_SYMBOL(register_gifconf);
5256 EXPORT_SYMBOL(register_netdevice);
5257 EXPORT_SYMBOL(register_netdevice_notifier);
5258 EXPORT_SYMBOL(skb_checksum_help);
5259 EXPORT_SYMBOL(synchronize_net);
5260 EXPORT_SYMBOL(unregister_netdevice);
5261 EXPORT_SYMBOL(unregister_netdevice_notifier);
5262 EXPORT_SYMBOL(net_enable_timestamp);
5263 EXPORT_SYMBOL(net_disable_timestamp);
5264 EXPORT_SYMBOL(dev_get_flags);
5265 
5266 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
5267 EXPORT_SYMBOL(br_handle_frame_hook);
5268 EXPORT_SYMBOL(br_fdb_get_hook);
5269 EXPORT_SYMBOL(br_fdb_put_hook);
5270 #endif
5271 
5272 EXPORT_SYMBOL(dev_load);
5273 
5274 EXPORT_PER_CPU_SYMBOL(softnet_data);
5275