• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *	Paul `Rusty' Russell		properly handle non-linear skbs
23  *	Harald Welte			don't use nfcache
24  *
25  */
26 
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
48 
49 
/*
 * Symbols exported from the IPVS core for use by other modules.
 * The order of the exports carries no meaning; they are grouped by topic.
 */

/* scheduler registration */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);

/* connection entries */
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif

/* miscellaneous helpers */
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
64 
65 
/*
 * ID used in ICMP lookups: the echo identifier field of an ICMP / ICMPv6
 * header. Both macros take a pointer to the header; the argument is fully
 * parenthesized so that expressions such as "hdr + 1" expand correctly.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
69 
/*
 * Return a printable name for an IP protocol number.
 *
 * Well-known protocols map to string literals. Any other value is
 * formatted as "IP_<number>" into a static buffer, so that result is
 * overwritten by the next call taking the default path and the function
 * is not reentrant for unknown protocols.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned: print it with %u and bound the write */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
92 
ip_vs_init_hash_table(struct list_head * table,int rows)93 void ip_vs_init_hash_table(struct list_head *table, int rows)
94 {
95 	while (--rows >= 0)
96 		INIT_LIST_HEAD(&table[rows]);
97 }
98 
99 static inline void
ip_vs_in_stats(struct ip_vs_conn * cp,struct sk_buff * skb)100 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
101 {
102 	struct ip_vs_dest *dest = cp->dest;
103 	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
104 		spin_lock(&dest->stats.lock);
105 		dest->stats.ustats.inpkts++;
106 		dest->stats.ustats.inbytes += skb->len;
107 		spin_unlock(&dest->stats.lock);
108 
109 		spin_lock(&dest->svc->stats.lock);
110 		dest->svc->stats.ustats.inpkts++;
111 		dest->svc->stats.ustats.inbytes += skb->len;
112 		spin_unlock(&dest->svc->stats.lock);
113 
114 		spin_lock(&ip_vs_stats.lock);
115 		ip_vs_stats.ustats.inpkts++;
116 		ip_vs_stats.ustats.inbytes += skb->len;
117 		spin_unlock(&ip_vs_stats.lock);
118 	}
119 }
120 
121 
122 static inline void
ip_vs_out_stats(struct ip_vs_conn * cp,struct sk_buff * skb)123 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
124 {
125 	struct ip_vs_dest *dest = cp->dest;
126 	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
127 		spin_lock(&dest->stats.lock);
128 		dest->stats.ustats.outpkts++;
129 		dest->stats.ustats.outbytes += skb->len;
130 		spin_unlock(&dest->stats.lock);
131 
132 		spin_lock(&dest->svc->stats.lock);
133 		dest->svc->stats.ustats.outpkts++;
134 		dest->svc->stats.ustats.outbytes += skb->len;
135 		spin_unlock(&dest->svc->stats.lock);
136 
137 		spin_lock(&ip_vs_stats.lock);
138 		ip_vs_stats.ustats.outpkts++;
139 		ip_vs_stats.ustats.outbytes += skb->len;
140 		spin_unlock(&ip_vs_stats.lock);
141 	}
142 }
143 
144 
145 static inline void
ip_vs_conn_stats(struct ip_vs_conn * cp,struct ip_vs_service * svc)146 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
147 {
148 	spin_lock(&cp->dest->stats.lock);
149 	cp->dest->stats.ustats.conns++;
150 	spin_unlock(&cp->dest->stats.lock);
151 
152 	spin_lock(&svc->stats.lock);
153 	svc->stats.ustats.conns++;
154 	spin_unlock(&svc->stats.lock);
155 
156 	spin_lock(&ip_vs_stats.lock);
157 	ip_vs_stats.ustats.conns++;
158 	spin_unlock(&ip_vs_stats.lock);
159 }
160 
161 
162 static inline int
ip_vs_set_state(struct ip_vs_conn * cp,int direction,const struct sk_buff * skb,struct ip_vs_protocol * pp)163 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
164 		const struct sk_buff *skb,
165 		struct ip_vs_protocol *pp)
166 {
167 	if (unlikely(!pp->state_transition))
168 		return 0;
169 	return pp->state_transition(cp, direction, skb, pp);
170 }
171 
172 
173 /*
174  *  IPVS persistent scheduling function
175  *  It creates a connection entry according to its template if exists,
176  *  or selects a server and creates a connection entry plus a template.
177  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
178  *  Protocols supported: TCP, UDP
179  */
180 static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service * svc,const struct sk_buff * skb,__be16 ports[2])181 ip_vs_sched_persist(struct ip_vs_service *svc,
182 		    const struct sk_buff *skb,
183 		    __be16 ports[2])
184 {
185 	struct ip_vs_conn *cp = NULL;
186 	struct ip_vs_iphdr iph;
187 	struct ip_vs_dest *dest;
188 	struct ip_vs_conn *ct;
189 	__be16  dport;			/* destination port to forward */
190 	union nf_inet_addr snet;	/* source network of the client,
191 					   after masking */
192 
193 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
194 
195 	/* Mask saddr with the netmask to adjust template granularity */
196 #ifdef CONFIG_IP_VS_IPV6
197 	if (svc->af == AF_INET6)
198 		ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
199 	else
200 #endif
201 		snet.ip = iph.saddr.ip & svc->netmask;
202 
203 	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
204 		      "mnet %s\n",
205 		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
206 		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
207 		      IP_VS_DBG_ADDR(svc->af, &snet));
208 
209 	/*
210 	 * As far as we know, FTP is a very complicated network protocol, and
211 	 * it uses control connection and data connections. For active FTP,
212 	 * FTP server initialize data connection to the client, its source port
213 	 * is often 20. For passive FTP, FTP server tells the clients the port
214 	 * that it passively listens to,  and the client issues the data
215 	 * connection. In the tunneling or direct routing mode, the load
216 	 * balancer is on the client-to-server half of connection, the port
217 	 * number is unknown to the load balancer. So, a conn template like
218 	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
219 	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
220 	 * is created for other persistent services.
221 	 */
222 	if (ports[1] == svc->port) {
223 		/* Check if a template already exists */
224 		if (svc->port != FTPPORT)
225 			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
226 					     &iph.daddr, ports[1]);
227 		else
228 			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
229 					     &iph.daddr, 0);
230 
231 		if (!ct || !ip_vs_check_template(ct)) {
232 			/*
233 			 * No template found or the dest of the connection
234 			 * template is not available.
235 			 */
236 			dest = svc->scheduler->schedule(svc, skb);
237 			if (dest == NULL) {
238 				IP_VS_DBG(1, "p-schedule: no dest found.\n");
239 				return NULL;
240 			}
241 
242 			/*
243 			 * Create a template like <protocol,caddr,0,
244 			 * vaddr,vport,daddr,dport> for non-ftp service,
245 			 * and <protocol,caddr,0,vaddr,0,daddr,0>
246 			 * for ftp service.
247 			 */
248 			if (svc->port != FTPPORT)
249 				ct = ip_vs_conn_new(svc->af, iph.protocol,
250 						    &snet, 0,
251 						    &iph.daddr,
252 						    ports[1],
253 						    &dest->addr, dest->port,
254 						    IP_VS_CONN_F_TEMPLATE,
255 						    dest);
256 			else
257 				ct = ip_vs_conn_new(svc->af, iph.protocol,
258 						    &snet, 0,
259 						    &iph.daddr, 0,
260 						    &dest->addr, 0,
261 						    IP_VS_CONN_F_TEMPLATE,
262 						    dest);
263 			if (ct == NULL)
264 				return NULL;
265 
266 			ct->timeout = svc->timeout;
267 		} else {
268 			/* set destination with the found template */
269 			dest = ct->dest;
270 		}
271 		dport = dest->port;
272 	} else {
273 		/*
274 		 * Note: persistent fwmark-based services and persistent
275 		 * port zero service are handled here.
276 		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
277 		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
278 		 */
279 		if (svc->fwmark) {
280 			union nf_inet_addr fwmark = {
281 				.all = { 0, 0, 0, htonl(svc->fwmark) }
282 			};
283 
284 			ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
285 					     &fwmark, 0);
286 		} else
287 			ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
288 					     &iph.daddr, 0);
289 
290 		if (!ct || !ip_vs_check_template(ct)) {
291 			/*
292 			 * If it is not persistent port zero, return NULL,
293 			 * otherwise create a connection template.
294 			 */
295 			if (svc->port)
296 				return NULL;
297 
298 			dest = svc->scheduler->schedule(svc, skb);
299 			if (dest == NULL) {
300 				IP_VS_DBG(1, "p-schedule: no dest found.\n");
301 				return NULL;
302 			}
303 
304 			/*
305 			 * Create a template according to the service
306 			 */
307 			if (svc->fwmark) {
308 				union nf_inet_addr fwmark = {
309 					.all = { 0, 0, 0, htonl(svc->fwmark) }
310 				};
311 
312 				ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
313 						    &snet, 0,
314 						    &fwmark, 0,
315 						    &dest->addr, 0,
316 						    IP_VS_CONN_F_TEMPLATE,
317 						    dest);
318 			} else
319 				ct = ip_vs_conn_new(svc->af, iph.protocol,
320 						    &snet, 0,
321 						    &iph.daddr, 0,
322 						    &dest->addr, 0,
323 						    IP_VS_CONN_F_TEMPLATE,
324 						    dest);
325 			if (ct == NULL)
326 				return NULL;
327 
328 			ct->timeout = svc->timeout;
329 		} else {
330 			/* set destination with the found template */
331 			dest = ct->dest;
332 		}
333 		dport = ports[1];
334 	}
335 
336 	/*
337 	 *    Create a new connection according to the template
338 	 */
339 	cp = ip_vs_conn_new(svc->af, iph.protocol,
340 			    &iph.saddr, ports[0],
341 			    &iph.daddr, ports[1],
342 			    &dest->addr, dport,
343 			    0,
344 			    dest);
345 	if (cp == NULL) {
346 		ip_vs_conn_put(ct);
347 		return NULL;
348 	}
349 
350 	/*
351 	 *    Add its control
352 	 */
353 	ip_vs_control_add(cp, ct);
354 	ip_vs_conn_put(ct);
355 
356 	ip_vs_conn_stats(cp, svc);
357 	return cp;
358 }
359 
360 
361 /*
362  *  IPVS main scheduling function
363  *  It selects a server according to the virtual service, and
364  *  creates a connection entry.
365  *  Protocols supported: TCP, UDP
366  */
367 struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service * svc,const struct sk_buff * skb)368 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
369 {
370 	struct ip_vs_conn *cp = NULL;
371 	struct ip_vs_iphdr iph;
372 	struct ip_vs_dest *dest;
373 	__be16 _ports[2], *pptr;
374 
375 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
376 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
377 	if (pptr == NULL)
378 		return NULL;
379 
380 	/*
381 	 *    Persistent service
382 	 */
383 	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
384 		return ip_vs_sched_persist(svc, skb, pptr);
385 
386 	/*
387 	 *    Non-persistent service
388 	 */
389 	if (!svc->fwmark && pptr[1] != svc->port) {
390 		if (!svc->port)
391 			IP_VS_ERR("Schedule: port zero only supported "
392 				  "in persistent services, "
393 				  "check your ipvs configuration\n");
394 		return NULL;
395 	}
396 
397 	dest = svc->scheduler->schedule(svc, skb);
398 	if (dest == NULL) {
399 		IP_VS_DBG(1, "Schedule: no dest found.\n");
400 		return NULL;
401 	}
402 
403 	/*
404 	 *    Create a connection entry.
405 	 */
406 	cp = ip_vs_conn_new(svc->af, iph.protocol,
407 			    &iph.saddr, pptr[0],
408 			    &iph.daddr, pptr[1],
409 			    &dest->addr, dest->port ? dest->port : pptr[1],
410 			    0,
411 			    dest);
412 	if (cp == NULL)
413 		return NULL;
414 
415 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
416 		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
417 		      ip_vs_fwd_tag(cp),
418 		      IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
419 		      IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
420 		      IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
421 		      cp->flags, atomic_read(&cp->refcnt));
422 
423 	ip_vs_conn_stats(cp, svc);
424 	return cp;
425 }
426 
427 
428 /*
429  *  Pass or drop the packet.
430  *  Called by ip_vs_in, when the virtual service is available but
431  *  no destination is available for a new connection.
432  */
ip_vs_leave(struct ip_vs_service * svc,struct sk_buff * skb,struct ip_vs_protocol * pp)433 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
434 		struct ip_vs_protocol *pp)
435 {
436 	__be16 _ports[2], *pptr;
437 	struct ip_vs_iphdr iph;
438 	int unicast;
439 	ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
440 
441 	pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
442 	if (pptr == NULL) {
443 		ip_vs_service_put(svc);
444 		return NF_DROP;
445 	}
446 
447 #ifdef CONFIG_IP_VS_IPV6
448 	if (svc->af == AF_INET6)
449 		unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
450 	else
451 #endif
452 		unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
453 
454 	/* if it is fwmark-based service, the cache_bypass sysctl is up
455 	   and the destination is a non-local unicast, then create
456 	   a cache_bypass connection entry */
457 	if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
458 		int ret, cs;
459 		struct ip_vs_conn *cp;
460 		union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
461 
462 		ip_vs_service_put(svc);
463 
464 		/* create a new connection entry */
465 		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
466 		cp = ip_vs_conn_new(svc->af, iph.protocol,
467 				    &iph.saddr, pptr[0],
468 				    &iph.daddr, pptr[1],
469 				    &daddr, 0,
470 				    IP_VS_CONN_F_BYPASS,
471 				    NULL);
472 		if (cp == NULL)
473 			return NF_DROP;
474 
475 		/* statistics */
476 		ip_vs_in_stats(cp, skb);
477 
478 		/* set state */
479 		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
480 
481 		/* transmit the first SYN packet */
482 		ret = cp->packet_xmit(skb, cp, pp);
483 		/* do not touch skb anymore */
484 
485 		atomic_inc(&cp->in_pkts);
486 		ip_vs_conn_put(cp);
487 		return ret;
488 	}
489 
490 	/*
491 	 * When the virtual ftp service is presented, packets destined
492 	 * for other services on the VIP may get here (except services
493 	 * listed in the ipvs table), pass the packets, because it is
494 	 * not ipvs job to decide to drop the packets.
495 	 */
496 	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
497 		ip_vs_service_put(svc);
498 		return NF_ACCEPT;
499 	}
500 
501 	ip_vs_service_put(svc);
502 
503 	/*
504 	 * Notify the client that the destination is unreachable, and
505 	 * release the socket buffer.
506 	 * Since it is in IP layer, the TCP socket is not actually
507 	 * created, the TCP RST packet cannot be sent, instead that
508 	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
509 	 */
510 #ifdef CONFIG_IP_VS_IPV6
511 	if (svc->af == AF_INET6)
512 		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
513 			    skb->dev);
514 	else
515 #endif
516 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
517 
518 	return NF_DROP;
519 }
520 
521 
522 /*
523  *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
524  *      chain, and is used for VS/NAT.
525  *      It detects packets for VS/NAT connections and sends the packets
526  *      immediately. This can avoid that iptable_nat mangles the packets
527  *      for VS/NAT.
528  */
ip_vs_post_routing(unsigned int hooknum,struct sk_buff * skb,const struct net_device * in,const struct net_device * out,int (* okfn)(struct sk_buff *))529 static unsigned int ip_vs_post_routing(unsigned int hooknum,
530 				       struct sk_buff *skb,
531 				       const struct net_device *in,
532 				       const struct net_device *out,
533 				       int (*okfn)(struct sk_buff *))
534 {
535 	if (!skb->ipvs_property)
536 		return NF_ACCEPT;
537 	/* The packet was sent from IPVS, exit this chain */
538 	return NF_STOP;
539 }
540 
ip_vs_checksum_complete(struct sk_buff * skb,int offset)541 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542 {
543 	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544 }
545 
/*
 * Hand an IPv4 fragment to ip_defrag(). When ip_defrag() reports
 * success (0) the IP header checksum of the resulting packet is
 * recomputed. Returns ip_defrag()'s result unchanged.
 */
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	int rc = ip_defrag(skb, user);

	if (rc)
		return rc;

	ip_send_check(ip_hdr(skb));
	return 0;
}

#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_gather_frags(). No reassembly is done:
 * the packet is left untouched and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
#endif
563 
564 /*
565  * Packet has been made sufficiently writable in caller
566  * - inout: 1=in->out, 0=out->in
567  */
ip_vs_nat_icmp(struct sk_buff * skb,struct ip_vs_protocol * pp,struct ip_vs_conn * cp,int inout)568 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
569 		    struct ip_vs_conn *cp, int inout)
570 {
571 	struct iphdr *iph	 = ip_hdr(skb);
572 	unsigned int icmp_offset = iph->ihl*4;
573 	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
574 						      icmp_offset);
575 	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
576 
577 	if (inout) {
578 		iph->saddr = cp->vaddr.ip;
579 		ip_send_check(iph);
580 		ciph->daddr = cp->vaddr.ip;
581 		ip_send_check(ciph);
582 	} else {
583 		iph->daddr = cp->daddr.ip;
584 		ip_send_check(iph);
585 		ciph->saddr = cp->daddr.ip;
586 		ip_send_check(ciph);
587 	}
588 
589 	/* the TCP/UDP port */
590 	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
591 		__be16 *ports = (void *)ciph + ciph->ihl*4;
592 
593 		if (inout)
594 			ports[1] = cp->vport;
595 		else
596 			ports[0] = cp->dport;
597 	}
598 
599 	/* And finally the ICMP checksum */
600 	icmph->checksum = 0;
601 	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
602 	skb->ip_summed = CHECKSUM_UNNECESSARY;
603 
604 	if (inout)
605 		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
606 			"Forwarding altered outgoing ICMP");
607 	else
608 		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
609 			"Forwarding altered incoming ICMP");
610 }
611 
612 #ifdef CONFIG_IP_VS_IPV6
ip_vs_nat_icmp_v6(struct sk_buff * skb,struct ip_vs_protocol * pp,struct ip_vs_conn * cp,int inout)613 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
614 		    struct ip_vs_conn *cp, int inout)
615 {
616 	struct ipv6hdr *iph	 = ipv6_hdr(skb);
617 	unsigned int icmp_offset = sizeof(struct ipv6hdr);
618 	struct icmp6hdr *icmph	 = (struct icmp6hdr *)(skb_network_header(skb) +
619 						      icmp_offset);
620 	struct ipv6hdr *ciph	 = (struct ipv6hdr *)(icmph + 1);
621 
622 	if (inout) {
623 		iph->saddr = cp->vaddr.in6;
624 		ciph->daddr = cp->vaddr.in6;
625 	} else {
626 		iph->daddr = cp->daddr.in6;
627 		ciph->saddr = cp->daddr.in6;
628 	}
629 
630 	/* the TCP/UDP port */
631 	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
632 		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
633 
634 		if (inout)
635 			ports[1] = cp->vport;
636 		else
637 			ports[0] = cp->dport;
638 	}
639 
640 	/* And finally the ICMP checksum */
641 	icmph->icmp6_cksum = 0;
642 	/* TODO IPv6: is this correct for ICMPv6? */
643 	ip_vs_checksum_complete(skb, icmp_offset);
644 	skb->ip_summed = CHECKSUM_UNNECESSARY;
645 
646 	if (inout)
647 		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
648 			"Forwarding altered outgoing ICMPv6");
649 	else
650 		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
651 			"Forwarding altered incoming ICMPv6");
652 }
653 #endif
654 
655 /* Handle relevant response ICMP messages - forward to the right
656  * destination host. Used for NAT and local client.
657  */
handle_response_icmp(int af,struct sk_buff * skb,union nf_inet_addr * snet,__u8 protocol,struct ip_vs_conn * cp,struct ip_vs_protocol * pp,unsigned int offset,unsigned int ihl)658 static int handle_response_icmp(int af, struct sk_buff *skb,
659 				union nf_inet_addr *snet,
660 				__u8 protocol, struct ip_vs_conn *cp,
661 				struct ip_vs_protocol *pp,
662 				unsigned int offset, unsigned int ihl)
663 {
664 	unsigned int verdict = NF_DROP;
665 
666 	if (IP_VS_FWD_METHOD(cp) != 0) {
667 		IP_VS_ERR("shouldn't reach here, because the box is on the "
668 			  "half connection in the tun/dr module.\n");
669 	}
670 
671 	/* Ensure the checksum is correct */
672 	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
673 		/* Failed checksum! */
674 		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
675 			      IP_VS_DBG_ADDR(af, snet));
676 		goto out;
677 	}
678 
679 	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
680 		offset += 2 * sizeof(__u16);
681 	if (!skb_make_writable(skb, offset))
682 		goto out;
683 
684 #ifdef CONFIG_IP_VS_IPV6
685 	if (af == AF_INET6)
686 		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687 	else
688 #endif
689 		ip_vs_nat_icmp(skb, pp, cp, 1);
690 
691 	/* do the statistics and put it back */
692 	ip_vs_out_stats(cp, skb);
693 
694 	skb->ipvs_property = 1;
695 	verdict = NF_ACCEPT;
696 
697 out:
698 	__ip_vs_conn_put(cp);
699 
700 	return verdict;
701 }
702 
703 /*
704  *	Handle ICMP messages in the inside-to-outside direction (outgoing).
705  *	Find any that might be relevant, check against existing connections.
706  *	Currently handles error types - unreachable, quench, ttl exceeded.
707  */
ip_vs_out_icmp(struct sk_buff * skb,int * related)708 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
709 {
710 	struct iphdr *iph;
711 	struct icmphdr	_icmph, *ic;
712 	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
713 	struct ip_vs_iphdr ciph;
714 	struct ip_vs_conn *cp;
715 	struct ip_vs_protocol *pp;
716 	unsigned int offset, ihl;
717 	union nf_inet_addr snet;
718 
719 	*related = 1;
720 
721 	/* reassemble IP fragments */
722 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723 		if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
724 			return NF_STOLEN;
725 	}
726 
727 	iph = ip_hdr(skb);
728 	offset = ihl = iph->ihl * 4;
729 	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
730 	if (ic == NULL)
731 		return NF_DROP;
732 
733 	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
734 		  ic->type, ntohs(icmp_id(ic)),
735 		  &iph->saddr, &iph->daddr);
736 
737 	/*
738 	 * Work through seeing if this is for us.
739 	 * These checks are supposed to be in an order that means easy
740 	 * things are checked first to speed up processing.... however
741 	 * this means that some packets will manage to get a long way
742 	 * down this stack and then be rejected, but that's life.
743 	 */
744 	if ((ic->type != ICMP_DEST_UNREACH) &&
745 	    (ic->type != ICMP_SOURCE_QUENCH) &&
746 	    (ic->type != ICMP_TIME_EXCEEDED)) {
747 		*related = 0;
748 		return NF_ACCEPT;
749 	}
750 
751 	/* Now find the contained IP header */
752 	offset += sizeof(_icmph);
753 	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
754 	if (cih == NULL)
755 		return NF_ACCEPT; /* The packet looks wrong, ignore */
756 
757 	pp = ip_vs_proto_get(cih->protocol);
758 	if (!pp)
759 		return NF_ACCEPT;
760 
761 	/* Is the embedded protocol header present? */
762 	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
763 		     pp->dont_defrag))
764 		return NF_ACCEPT;
765 
766 	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
767 
768 	offset += cih->ihl * 4;
769 
770 	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
771 	/* The embedded headers contain source and dest in reverse order */
772 	cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
773 	if (!cp)
774 		return NF_ACCEPT;
775 
776 	snet.ip = iph->saddr;
777 	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778 				    pp, offset, ihl);
779 }
780 
781 #ifdef CONFIG_IP_VS_IPV6
ip_vs_out_icmp_v6(struct sk_buff * skb,int * related)782 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
783 {
784 	struct ipv6hdr *iph;
785 	struct icmp6hdr	_icmph, *ic;
786 	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
787 					   within the ICMP */
788 	struct ip_vs_iphdr ciph;
789 	struct ip_vs_conn *cp;
790 	struct ip_vs_protocol *pp;
791 	unsigned int offset;
792 	union nf_inet_addr snet;
793 
794 	*related = 1;
795 
796 	/* reassemble IP fragments */
797 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798 		if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799 			return NF_STOLEN;
800 	}
801 
802 	iph = ipv6_hdr(skb);
803 	offset = sizeof(struct ipv6hdr);
804 	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
805 	if (ic == NULL)
806 		return NF_DROP;
807 
808 	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
809 		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
810 		  &iph->saddr, &iph->daddr);
811 
812 	/*
813 	 * Work through seeing if this is for us.
814 	 * These checks are supposed to be in an order that means easy
815 	 * things are checked first to speed up processing.... however
816 	 * this means that some packets will manage to get a long way
817 	 * down this stack and then be rejected, but that's life.
818 	 */
819 	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820 	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821 	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822 		*related = 0;
823 		return NF_ACCEPT;
824 	}
825 
826 	/* Now find the contained IP header */
827 	offset += sizeof(_icmph);
828 	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
829 	if (cih == NULL)
830 		return NF_ACCEPT; /* The packet looks wrong, ignore */
831 
832 	pp = ip_vs_proto_get(cih->nexthdr);
833 	if (!pp)
834 		return NF_ACCEPT;
835 
836 	/* Is the embedded protocol header present? */
837 	/* TODO: we don't support fragmentation at the moment anyways */
838 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839 		return NF_ACCEPT;
840 
841 	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
842 
843 	offset += sizeof(struct ipv6hdr);
844 
845 	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846 	/* The embedded headers contain source and dest in reverse order */
847 	cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848 	if (!cp)
849 		return NF_ACCEPT;
850 
851 	ipv6_addr_copy(&snet.in6, &iph->saddr);
852 	return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853 				    pp, offset, sizeof(struct ipv6hdr));
854 }
855 #endif
856 
is_tcp_reset(const struct sk_buff * skb,int nh_len)857 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
858 {
859 	struct tcphdr _tcph, *th;
860 
861 	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
862 	if (th == NULL)
863 		return 0;
864 	return th->rst;
865 }
866 
867 /* Handle response packets: rewrite addresses and send away...
868  * Used for NAT and local client.
869  */
870 static unsigned int
handle_response(int af,struct sk_buff * skb,struct ip_vs_protocol * pp,struct ip_vs_conn * cp,int ihl)871 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
872 		struct ip_vs_conn *cp, int ihl)
873 {
874 	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
875 
876 	if (!skb_make_writable(skb, ihl))
877 		goto drop;
878 
879 	/* mangle the packet */
880 	if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
881 		goto drop;
882 
883 #ifdef CONFIG_IP_VS_IPV6
884 	if (af == AF_INET6)
885 		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
886 	else
887 #endif
888 	{
889 		ip_hdr(skb)->saddr = cp->vaddr.ip;
890 		ip_send_check(ip_hdr(skb));
891 	}
892 
893 	/* For policy routing, packets originating from this
894 	 * machine itself may be routed differently to packets
895 	 * passing through.  We want this packet to be routed as
896 	 * if it came from this machine itself.  So re-compute
897 	 * the routing information.
898 	 */
899 #ifdef CONFIG_IP_VS_IPV6
900 	if (af == AF_INET6) {
901 		if (ip6_route_me_harder(skb) != 0)
902 			goto drop;
903 	} else
904 #endif
905 		if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
906 			goto drop;
907 
908 	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
909 
910 	ip_vs_out_stats(cp, skb);
911 	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
912 	ip_vs_conn_put(cp);
913 
914 	skb->ipvs_property = 1;
915 
916 	LeaveFunction(11);
917 	return NF_ACCEPT;
918 
919 drop:
920 	ip_vs_conn_put(cp);
921 	kfree_skb(skb);
922 	return NF_STOLEN;
923 }
924 
925 /*
926  *	It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
927  *	Check if outgoing packet belongs to the established ip_vs_conn.
928  */
929 static unsigned int
ip_vs_out(unsigned int hooknum,struct sk_buff * skb,const struct net_device * in,const struct net_device * out,int (* okfn)(struct sk_buff *))930 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
931 	  const struct net_device *in, const struct net_device *out,
932 	  int (*okfn)(struct sk_buff *))
933 {
934 	struct ip_vs_iphdr iph;
935 	struct ip_vs_protocol *pp;
936 	struct ip_vs_conn *cp;
937 	int af;
938 
939 	EnterFunction(11);
940 
941 	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
942 
943 	if (skb->ipvs_property)
944 		return NF_ACCEPT;
945 
946 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
947 #ifdef CONFIG_IP_VS_IPV6
948 	if (af == AF_INET6) {
949 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
950 			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
951 
952 			if (related)
953 				return verdict;
954 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
955 		}
956 	} else
957 #endif
958 		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
959 			int related, verdict = ip_vs_out_icmp(skb, &related);
960 
961 			if (related)
962 				return verdict;
963 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
964 		}
965 
966 	pp = ip_vs_proto_get(iph.protocol);
967 	if (unlikely(!pp))
968 		return NF_ACCEPT;
969 
970 	/* reassemble IP fragments */
971 #ifdef CONFIG_IP_VS_IPV6
972 	if (af == AF_INET6) {
973 		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
974 			int related, verdict = ip_vs_out_icmp_v6(skb, &related);
975 
976 			if (related)
977 				return verdict;
978 
979 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
980 		}
981 	} else
982 #endif
983 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
984 			     !pp->dont_defrag)) {
985 			if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
986 				return NF_STOLEN;
987 
988 			ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
989 		}
990 
991 	/*
992 	 * Check if the packet belongs to an existing entry
993 	 */
994 	cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
995 
996 	if (unlikely(!cp)) {
997 		if (sysctl_ip_vs_nat_icmp_send &&
998 		    (pp->protocol == IPPROTO_TCP ||
999 		     pp->protocol == IPPROTO_UDP)) {
1000 			__be16 _ports[2], *pptr;
1001 
1002 			pptr = skb_header_pointer(skb, iph.len,
1003 						  sizeof(_ports), _ports);
1004 			if (pptr == NULL)
1005 				return NF_ACCEPT;	/* Not for me */
1006 			if (ip_vs_lookup_real_service(af, iph.protocol,
1007 						      &iph.saddr,
1008 						      pptr[0])) {
1009 				/*
1010 				 * Notify the real server: there is no
1011 				 * existing entry if it is not RST
1012 				 * packet or not TCP packet.
1013 				 */
1014 				if (iph.protocol != IPPROTO_TCP
1015 				    || !is_tcp_reset(skb, iph.len)) {
1016 #ifdef CONFIG_IP_VS_IPV6
1017 					if (af == AF_INET6)
1018 						icmpv6_send(skb,
1019 							    ICMPV6_DEST_UNREACH,
1020 							    ICMPV6_PORT_UNREACH,
1021 							    0, skb->dev);
1022 					else
1023 #endif
1024 						icmp_send(skb,
1025 							  ICMP_DEST_UNREACH,
1026 							  ICMP_PORT_UNREACH, 0);
1027 					return NF_DROP;
1028 				}
1029 			}
1030 		}
1031 		IP_VS_DBG_PKT(12, pp, skb, 0,
1032 			      "packet continues traversal as normal");
1033 		return NF_ACCEPT;
1034 	}
1035 
1036 	return handle_response(af, skb, pp, cp, iph.len);
1037 }
1038 
1039 
1040 /*
1041  *	Handle ICMP messages in the outside-to-inside direction (incoming).
1042  *	Find any that might be relevant, check against existing connections,
1043  *	forward to the right destination host if relevant.
1044  *	Currently handles error types - unreachable, quench, ttl exceeded.
1045  */
1046 static int
ip_vs_in_icmp(struct sk_buff * skb,int * related,unsigned int hooknum)1047 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1048 {
1049 	struct iphdr *iph;
1050 	struct icmphdr	_icmph, *ic;
1051 	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
1052 	struct ip_vs_iphdr ciph;
1053 	struct ip_vs_conn *cp;
1054 	struct ip_vs_protocol *pp;
1055 	unsigned int offset, ihl, verdict;
1056 	union nf_inet_addr snet;
1057 
1058 	*related = 1;
1059 
1060 	/* reassemble IP fragments */
1061 	if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1062 		if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1063 					    IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1064 			return NF_STOLEN;
1065 	}
1066 
1067 	iph = ip_hdr(skb);
1068 	offset = ihl = iph->ihl * 4;
1069 	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1070 	if (ic == NULL)
1071 		return NF_DROP;
1072 
1073 	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1074 		  ic->type, ntohs(icmp_id(ic)),
1075 		  &iph->saddr, &iph->daddr);
1076 
1077 	/*
1078 	 * Work through seeing if this is for us.
1079 	 * These checks are supposed to be in an order that means easy
1080 	 * things are checked first to speed up processing.... however
1081 	 * this means that some packets will manage to get a long way
1082 	 * down this stack and then be rejected, but that's life.
1083 	 */
1084 	if ((ic->type != ICMP_DEST_UNREACH) &&
1085 	    (ic->type != ICMP_SOURCE_QUENCH) &&
1086 	    (ic->type != ICMP_TIME_EXCEEDED)) {
1087 		*related = 0;
1088 		return NF_ACCEPT;
1089 	}
1090 
1091 	/* Now find the contained IP header */
1092 	offset += sizeof(_icmph);
1093 	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1094 	if (cih == NULL)
1095 		return NF_ACCEPT; /* The packet looks wrong, ignore */
1096 
1097 	pp = ip_vs_proto_get(cih->protocol);
1098 	if (!pp)
1099 		return NF_ACCEPT;
1100 
1101 	/* Is the embedded protocol header present? */
1102 	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1103 		     pp->dont_defrag))
1104 		return NF_ACCEPT;
1105 
1106 	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1107 
1108 	offset += cih->ihl * 4;
1109 
1110 	ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1111 	/* The embedded headers contain source and dest in reverse order */
1112 	cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1113 	if (!cp) {
1114 		/* The packet could also belong to a local client */
1115 		cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1116 		if (cp) {
1117 			snet.ip = iph->saddr;
1118 			return handle_response_icmp(AF_INET, skb, &snet,
1119 						    cih->protocol, cp, pp,
1120 						    offset, ihl);
1121 		}
1122 		return NF_ACCEPT;
1123 	}
1124 
1125 	verdict = NF_DROP;
1126 
1127 	/* Ensure the checksum is correct */
1128 	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1129 		/* Failed checksum! */
1130 		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1131 			  &iph->saddr);
1132 		goto out;
1133 	}
1134 
1135 	/* do the statistics and put it back */
1136 	ip_vs_in_stats(cp, skb);
1137 	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1138 		offset += 2 * sizeof(__u16);
1139 	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1140 	/* do not touch skb anymore */
1141 
1142   out:
1143 	__ip_vs_conn_put(cp);
1144 
1145 	return verdict;
1146 }
1147 
1148 #ifdef CONFIG_IP_VS_IPV6
1149 static int
ip_vs_in_icmp_v6(struct sk_buff * skb,int * related,unsigned int hooknum)1150 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1151 {
1152 	struct ipv6hdr *iph;
1153 	struct icmp6hdr	_icmph, *ic;
1154 	struct ipv6hdr	_ciph, *cih;	/* The ip header contained
1155 					   within the ICMP */
1156 	struct ip_vs_iphdr ciph;
1157 	struct ip_vs_conn *cp;
1158 	struct ip_vs_protocol *pp;
1159 	unsigned int offset, verdict;
1160 	union nf_inet_addr snet;
1161 
1162 	*related = 1;
1163 
1164 	/* reassemble IP fragments */
1165 	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1166 		if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1167 					       IP_DEFRAG_VS_IN :
1168 					       IP_DEFRAG_VS_FWD))
1169 			return NF_STOLEN;
1170 	}
1171 
1172 	iph = ipv6_hdr(skb);
1173 	offset = sizeof(struct ipv6hdr);
1174 	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1175 	if (ic == NULL)
1176 		return NF_DROP;
1177 
1178 	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1179 		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
1180 		  &iph->saddr, &iph->daddr);
1181 
1182 	/*
1183 	 * Work through seeing if this is for us.
1184 	 * These checks are supposed to be in an order that means easy
1185 	 * things are checked first to speed up processing.... however
1186 	 * this means that some packets will manage to get a long way
1187 	 * down this stack and then be rejected, but that's life.
1188 	 */
1189 	if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1190 	    (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1191 	    (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1192 		*related = 0;
1193 		return NF_ACCEPT;
1194 	}
1195 
1196 	/* Now find the contained IP header */
1197 	offset += sizeof(_icmph);
1198 	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1199 	if (cih == NULL)
1200 		return NF_ACCEPT; /* The packet looks wrong, ignore */
1201 
1202 	pp = ip_vs_proto_get(cih->nexthdr);
1203 	if (!pp)
1204 		return NF_ACCEPT;
1205 
1206 	/* Is the embedded protocol header present? */
1207 	/* TODO: we don't support fragmentation at the moment anyways */
1208 	if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1209 		return NF_ACCEPT;
1210 
1211 	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1212 
1213 	offset += sizeof(struct ipv6hdr);
1214 
1215 	ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1216 	/* The embedded headers contain source and dest in reverse order */
1217 	cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1218 	if (!cp) {
1219 		/* The packet could also belong to a local client */
1220 		cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1221 		if (cp) {
1222 			ipv6_addr_copy(&snet.in6, &iph->saddr);
1223 			return handle_response_icmp(AF_INET6, skb, &snet,
1224 						    cih->nexthdr,
1225 						    cp, pp, offset,
1226 						    sizeof(struct ipv6hdr));
1227 		}
1228 		return NF_ACCEPT;
1229 	}
1230 
1231 	verdict = NF_DROP;
1232 
1233 	/* do the statistics and put it back */
1234 	ip_vs_in_stats(cp, skb);
1235 	if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1236 		offset += 2 * sizeof(__u16);
1237 	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1238 	/* do not touch skb anymore */
1239 
1240 	__ip_vs_conn_put(cp);
1241 
1242 	return verdict;
1243 }
1244 #endif
1245 
1246 
1247 /*
1248  *	Check if it's for virtual services, look it up,
1249  *	and send it on its way...
1250  */
1251 static unsigned int
ip_vs_in(unsigned int hooknum,struct sk_buff * skb,const struct net_device * in,const struct net_device * out,int (* okfn)(struct sk_buff *))1252 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1253 	 const struct net_device *in, const struct net_device *out,
1254 	 int (*okfn)(struct sk_buff *))
1255 {
1256 	struct ip_vs_iphdr iph;
1257 	struct ip_vs_protocol *pp;
1258 	struct ip_vs_conn *cp;
1259 	int ret, restart, af;
1260 
1261 	af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1262 
1263 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1264 
1265 	/*
1266 	 *	Big tappo: only PACKET_HOST, including loopback for local client
1267 	 *	Don't handle local packets on IPv6 for now
1268 	 */
1269 	if (unlikely(skb->pkt_type != PACKET_HOST)) {
1270 		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1271 			      skb->pkt_type,
1272 			      iph.protocol,
1273 			      IP_VS_DBG_ADDR(af, &iph.daddr));
1274 		return NF_ACCEPT;
1275 	}
1276 
1277 	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1278 		int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1279 
1280 		if (related)
1281 			return verdict;
1282 		ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1283 	}
1284 
1285 	/* Protocol supported? */
1286 	pp = ip_vs_proto_get(iph.protocol);
1287 	if (unlikely(!pp))
1288 		return NF_ACCEPT;
1289 
1290 	/*
1291 	 * Check if the packet belongs to an existing connection entry
1292 	 */
1293 	cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1294 
1295 	if (unlikely(!cp)) {
1296 		int v;
1297 
1298 		/* For local client packets, it could be a response */
1299 		cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1300 		if (cp)
1301 			return handle_response(af, skb, pp, cp, iph.len);
1302 
1303 		if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1304 			return v;
1305 	}
1306 
1307 	if (unlikely(!cp)) {
1308 		/* sorry, all this trouble for a no-hit :) */
1309 		IP_VS_DBG_PKT(12, pp, skb, 0,
1310 			      "packet continues traversal as normal");
1311 		return NF_ACCEPT;
1312 	}
1313 
1314 	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1315 
1316 	/* Check the server status */
1317 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1318 		/* the destination server is not available */
1319 
1320 		if (sysctl_ip_vs_expire_nodest_conn) {
1321 			/* try to expire the connection immediately */
1322 			ip_vs_conn_expire_now(cp);
1323 		}
1324 		/* don't restart its timer, and silently
1325 		   drop the packet. */
1326 		__ip_vs_conn_put(cp);
1327 		return NF_DROP;
1328 	}
1329 
1330 	ip_vs_in_stats(cp, skb);
1331 	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1332 	if (cp->packet_xmit)
1333 		ret = cp->packet_xmit(skb, cp, pp);
1334 		/* do not touch skb anymore */
1335 	else {
1336 		IP_VS_DBG_RL("warning: packet_xmit is null");
1337 		ret = NF_ACCEPT;
1338 	}
1339 
1340 	/* Increase its packet counter and check if it is needed
1341 	 * to be synchronized
1342 	 *
1343 	 * Sync connection if it is about to close to
1344 	 * encorage the standby servers to update the connections timeout
1345 	 */
1346 	atomic_inc(&cp->in_pkts);
1347 	if (af == AF_INET &&
1348 	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1349 	    (((cp->protocol != IPPROTO_TCP ||
1350 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1351 	      (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1352 	       == sysctl_ip_vs_sync_threshold[0])) ||
1353 	     ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1354 	      ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1355 	       (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1356 	       (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1357 		ip_vs_sync_conn(cp);
1358 	cp->old_state = cp->state;
1359 
1360 	ip_vs_conn_put(cp);
1361 	return ret;
1362 }
1363 
1364 
1365 /*
1366  *	It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1367  *      related packets destined for 0.0.0.0/0.
1368  *      When fwmark-based virtual service is used, such as transparent
1369  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1370  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1371  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1372  *      and send them to ip_vs_in_icmp.
1373  */
1374 static unsigned int
ip_vs_forward_icmp(unsigned int hooknum,struct sk_buff * skb,const struct net_device * in,const struct net_device * out,int (* okfn)(struct sk_buff *))1375 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1376 		   const struct net_device *in, const struct net_device *out,
1377 		   int (*okfn)(struct sk_buff *))
1378 {
1379 	int r;
1380 
1381 	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1382 		return NF_ACCEPT;
1383 
1384 	return ip_vs_in_icmp(skb, &r, hooknum);
1385 }
1386 
1387 #ifdef CONFIG_IP_VS_IPV6
1388 static unsigned int
ip_vs_forward_icmp_v6(unsigned int hooknum,struct sk_buff * skb,const struct net_device * in,const struct net_device * out,int (* okfn)(struct sk_buff *))1389 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1390 		      const struct net_device *in, const struct net_device *out,
1391 		      int (*okfn)(struct sk_buff *))
1392 {
1393 	int r;
1394 
1395 	if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1396 		return NF_ACCEPT;
1397 
1398 	return ip_vs_in_icmp_v6(skb, &r, hooknum);
1399 }
1400 #endif
1401 
1402 
1403 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1404 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
1405 	 * or VS/NAT(change destination), so that filtering rules can be
1406 	 * applied to IPVS. */
1407 	{
1408 		.hook		= ip_vs_in,
1409 		.owner		= THIS_MODULE,
1410 		.pf		= PF_INET,
1411 		.hooknum        = NF_INET_LOCAL_IN,
1412 		.priority       = 100,
1413 	},
1414 	/* After packet filtering, change source only for VS/NAT */
1415 	{
1416 		.hook		= ip_vs_out,
1417 		.owner		= THIS_MODULE,
1418 		.pf		= PF_INET,
1419 		.hooknum        = NF_INET_FORWARD,
1420 		.priority       = 100,
1421 	},
1422 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1423 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1424 	{
1425 		.hook		= ip_vs_forward_icmp,
1426 		.owner		= THIS_MODULE,
1427 		.pf		= PF_INET,
1428 		.hooknum        = NF_INET_FORWARD,
1429 		.priority       = 99,
1430 	},
1431 	/* Before the netfilter connection tracking, exit from POST_ROUTING */
1432 	{
1433 		.hook		= ip_vs_post_routing,
1434 		.owner		= THIS_MODULE,
1435 		.pf		= PF_INET,
1436 		.hooknum        = NF_INET_POST_ROUTING,
1437 		.priority       = NF_IP_PRI_NAT_SRC-1,
1438 	},
1439 #ifdef CONFIG_IP_VS_IPV6
1440 	/* After packet filtering, forward packet through VS/DR, VS/TUN,
1441 	 * or VS/NAT(change destination), so that filtering rules can be
1442 	 * applied to IPVS. */
1443 	{
1444 		.hook		= ip_vs_in,
1445 		.owner		= THIS_MODULE,
1446 		.pf		= PF_INET6,
1447 		.hooknum        = NF_INET_LOCAL_IN,
1448 		.priority       = 100,
1449 	},
1450 	/* After packet filtering, change source only for VS/NAT */
1451 	{
1452 		.hook		= ip_vs_out,
1453 		.owner		= THIS_MODULE,
1454 		.pf		= PF_INET6,
1455 		.hooknum        = NF_INET_FORWARD,
1456 		.priority       = 100,
1457 	},
1458 	/* After packet filtering (but before ip_vs_out_icmp), catch icmp
1459 	 * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1460 	{
1461 		.hook		= ip_vs_forward_icmp_v6,
1462 		.owner		= THIS_MODULE,
1463 		.pf		= PF_INET6,
1464 		.hooknum        = NF_INET_FORWARD,
1465 		.priority       = 99,
1466 	},
1467 	/* Before the netfilter connection tracking, exit from POST_ROUTING */
1468 	{
1469 		.hook		= ip_vs_post_routing,
1470 		.owner		= THIS_MODULE,
1471 		.pf		= PF_INET6,
1472 		.hooknum        = NF_INET_POST_ROUTING,
1473 		.priority       = NF_IP6_PRI_NAT_SRC-1,
1474 	},
1475 #endif
1476 };
1477 
1478 
1479 /*
1480  *	Initialize IP Virtual Server
1481  */
ip_vs_init(void)1482 static int __init ip_vs_init(void)
1483 {
1484 	int ret;
1485 
1486 	ip_vs_estimator_init();
1487 
1488 	ret = ip_vs_control_init();
1489 	if (ret < 0) {
1490 		IP_VS_ERR("can't setup control.\n");
1491 		goto cleanup_estimator;
1492 	}
1493 
1494 	ip_vs_protocol_init();
1495 
1496 	ret = ip_vs_app_init();
1497 	if (ret < 0) {
1498 		IP_VS_ERR("can't setup application helper.\n");
1499 		goto cleanup_protocol;
1500 	}
1501 
1502 	ret = ip_vs_conn_init();
1503 	if (ret < 0) {
1504 		IP_VS_ERR("can't setup connection table.\n");
1505 		goto cleanup_app;
1506 	}
1507 
1508 	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1509 	if (ret < 0) {
1510 		IP_VS_ERR("can't register hooks.\n");
1511 		goto cleanup_conn;
1512 	}
1513 
1514 	IP_VS_INFO("ipvs loaded.\n");
1515 	return ret;
1516 
1517   cleanup_conn:
1518 	ip_vs_conn_cleanup();
1519   cleanup_app:
1520 	ip_vs_app_cleanup();
1521   cleanup_protocol:
1522 	ip_vs_protocol_cleanup();
1523 	ip_vs_control_cleanup();
1524   cleanup_estimator:
1525 	ip_vs_estimator_cleanup();
1526 	return ret;
1527 }
1528 
ip_vs_cleanup(void)1529 static void __exit ip_vs_cleanup(void)
1530 {
1531 	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1532 	ip_vs_conn_cleanup();
1533 	ip_vs_app_cleanup();
1534 	ip_vs_protocol_cleanup();
1535 	ip_vs_control_cleanup();
1536 	ip_vs_estimator_cleanup();
1537 	IP_VS_INFO("ipvs unloaded.\n");
1538 }
1539 
1540 module_init(ip_vs_init);
1541 module_exit(ip_vs_cleanup);
1542 MODULE_LICENSE("GPL");
1543