1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Version: $Id: ip_output.c,v 1.87 2000/10/25 20:07:22 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Donald Becker, <becker@super.org>
13 * Alan Cox, <Alan.Cox@linux.org>
14 * Richard Underwood
15 * Stefan Becker, <stefanb@yello.ping.de>
16 * Jorge Cwik, <jorge@laser.satlink.net>
17 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18 *
19 * See ip_input.c for original log
20 *
21 * Fixes:
22 * Alan Cox : Missing nonblock feature in ip_build_xmit.
23 * Mike Kilburn : htons() missing in ip_build_xmit.
24 * Bradford Johnson: Fix faulty handling of some frames when
25 * no route is found.
26 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
27 * (in case if packet not accepted by
28 * output firewall rules)
29 * Mike McLagan : Routing by source
30 * Alexey Kuznetsov: use new route cache
31 * Andi Kleen: Fix broken PMTU recovery and remove
32 * some redundant tests.
33 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
34 * Andi Kleen : Replace ip_reply with ip_send_reply.
35 * Andi Kleen : Split fast and slow ip_build_xmit path
36 * for decreased register pressure on x86
37 * and more readibility.
38 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
39 * silently drop skb instead of failing with -EPERM.
40 */
41
42 #include <asm/uaccess.h>
43 #include <asm/system.h>
44 #include <linux/types.h>
45 #include <linux/kernel.h>
46 #include <linux/sched.h>
47 #include <linux/mm.h>
48 #include <linux/string.h>
49 #include <linux/errno.h>
50 #include <linux/config.h>
51
52 #include <linux/socket.h>
53 #include <linux/sockios.h>
54 #include <linux/in.h>
55 #include <linux/inet.h>
56 #include <linux/netdevice.h>
57 #include <linux/etherdevice.h>
58 #include <linux/proc_fs.h>
59 #include <linux/stat.h>
60 #include <linux/init.h>
61
62 #include <net/snmp.h>
63 #include <net/ip.h>
64 #include <net/protocol.h>
65 #include <net/route.h>
66 #include <net/tcp.h>
67 #include <net/udp.h>
68 #include <linux/skbuff.h>
69 #include <net/sock.h>
70 #include <net/arp.h>
71 #include <net/icmp.h>
72 #include <net/raw.h>
73 #include <net/checksum.h>
74 #include <net/inetpeer.h>
75 #include <linux/igmp.h>
76 #include <linux/netfilter_ipv4.h>
77 #include <linux/mroute.h>
78 #include <linux/netlink.h>
79
80 /*
81 * Shall we try to damage output packets if routing dev changes?
82 */
83
84 int sysctl_ip_dynaddr = 0;
85 int sysctl_ip_default_ttl = IPDEFTTL;
86
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90 iph->check = 0;
91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93
94 /* dev_loopback_xmit for use with netfilter. */
95 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
96 {
97 newskb->mac.raw = newskb->data;
98 skb_pull(newskb, newskb->nh.raw - newskb->data);
99 newskb->pkt_type = PACKET_LOOPBACK;
100 newskb->ip_summed = CHECKSUM_UNNECESSARY;
101 BUG_TRAP(newskb->dst);
102
103 #ifdef CONFIG_NETFILTER_DEBUG
104 nf_debug_ip_loopback_xmit(newskb);
105 #endif
106 netif_rx(newskb);
107 return 0;
108 }
109
110 /* Don't just hand NF_HOOK skb->dst->output, in case netfilter hook
111 changes route */
112 static inline int
113 output_maybe_reroute(struct sk_buff *skb)
114 {
115 return skb->dst->output(skb);
116 }
117
118 /*
119 * Add an ip header to a skbuff and send it out.
120 */
121 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
122 u32 saddr, u32 daddr, struct ip_options *opt)
123 {
124 struct rtable *rt = (struct rtable *)skb->dst;
125 struct iphdr *iph;
126
127 /* Build the IP header. */
128 if (opt)
129 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
130 else
131 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
132
133 iph->version = 4;
134 iph->ihl = 5;
135 iph->tos = sk->protinfo.af_inet.tos;
136 iph->frag_off = 0;
137 if (ip_dont_fragment(sk, &rt->u.dst))
138 iph->frag_off |= htons(IP_DF);
139 iph->ttl = sk->protinfo.af_inet.ttl;
140 iph->daddr = rt->rt_dst;
141 iph->saddr = rt->rt_src;
142 iph->protocol = sk->protocol;
143 iph->tot_len = htons(skb->len);
144 ip_select_ident(iph, &rt->u.dst);
145 skb->nh.iph = iph;
146
147 if (opt && opt->optlen) {
148 iph->ihl += opt->optlen>>2;
149 ip_options_build(skb, opt, daddr, rt, 0);
150 }
151 ip_send_check(iph);
152
153 /* Send it out. */
154 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
155 output_maybe_reroute);
156 }
157
158 static inline int ip_finish_output2(struct sk_buff *skb)
159 {
160 struct dst_entry *dst = skb->dst;
161 struct hh_cache *hh = dst->hh;
162
163 #ifdef CONFIG_NETFILTER_DEBUG
164 nf_debug_ip_finish_output2(skb);
165 #endif /*CONFIG_NETFILTER_DEBUG*/
166
167 if (hh) {
168 read_lock_bh(&hh->hh_lock);
169 memcpy(skb->data - 16, hh->hh_data, 16);
170 read_unlock_bh(&hh->hh_lock);
171 skb_push(skb, hh->hh_len);
172 return hh->hh_output(skb);
173 } else if (dst->neighbour)
174 return dst->neighbour->output(skb);
175
176 printk(KERN_DEBUG "khm\n");
177 kfree_skb(skb);
178 return -EINVAL;
179 }
180
181 __inline__ int ip_finish_output(struct sk_buff *skb)
182 {
183 struct net_device *dev = skb->dst->dev;
184
185 skb->dev = dev;
186 skb->protocol = __constant_htons(ETH_P_IP);
187
188 return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
189 ip_finish_output2);
190 }
191
192 int ip_mc_output(struct sk_buff *skb)
193 {
194 struct sock *sk = skb->sk;
195 struct rtable *rt = (struct rtable*)skb->dst;
196 struct net_device *dev = rt->u.dst.dev;
197
198 /*
199 * If the indicated interface is up and running, send the packet.
200 */
201 IP_INC_STATS(IpOutRequests);
202 #ifdef CONFIG_IP_ROUTE_NAT
203 if (rt->rt_flags & RTCF_NAT)
204 ip_do_nat(skb);
205 #endif
206
207 skb->dev = dev;
208 skb->protocol = __constant_htons(ETH_P_IP);
209
210 /*
211 * Multicasts are looped back for other local users
212 */
213
214 if (rt->rt_flags&RTCF_MULTICAST && (!sk || sk->protinfo.af_inet.mc_loop)) {
215 #ifdef CONFIG_IP_MROUTE
216 /* Small optimization: do not loopback not local frames,
217 which returned after forwarding; they will be dropped
218 by ip_mr_input in any case.
219 Note, that local frames are looped back to be delivered
220 to local recipients.
221
222 This check is duplicated in ip_mr_input at the moment.
223 */
224 if ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
225 #endif
226 {
227 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
228 if (newskb)
229 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
230 newskb->dev,
231 ip_dev_loopback_xmit);
232 }
233
234 /* Multicasts with ttl 0 must not go beyond the host */
235
236 if (skb->nh.iph->ttl == 0) {
237 kfree_skb(skb);
238 return 0;
239 }
240 }
241
242 if (rt->rt_flags&RTCF_BROADCAST) {
243 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
244 if (newskb)
245 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
246 newskb->dev, ip_dev_loopback_xmit);
247 }
248
249 return ip_finish_output(skb);
250 }
251
252 int ip_output(struct sk_buff *skb)
253 {
254 #ifdef CONFIG_IP_ROUTE_NAT
255 struct rtable *rt = (struct rtable*)skb->dst;
256 #endif
257
258 IP_INC_STATS(IpOutRequests);
259
260 #ifdef CONFIG_IP_ROUTE_NAT
261 if (rt->rt_flags&RTCF_NAT)
262 ip_do_nat(skb);
263 #endif
264
265 return ip_finish_output(skb);
266 }
267
268 /* Queues a packet to be sent, and starts the transmitter if necessary.
269 * This routine also needs to put in the total length and compute the
270 * checksum. We use to do this in two stages, ip_build_header() then
271 * this, but that scheme created a mess when routes disappeared etc.
272 * So we do it all here, and the TCP send engine has been changed to
273 * match. (No more unroutable FIN disasters, etc. wheee...) This will
274 * most likely make other reliable transport layers above IP easier
275 * to implement under Linux.
276 */
277 static inline int ip_queue_xmit2(struct sk_buff *skb)
278 {
279 struct sock *sk = skb->sk;
280 struct rtable *rt = (struct rtable *)skb->dst;
281 struct net_device *dev;
282 struct iphdr *iph = skb->nh.iph;
283
284 dev = rt->u.dst.dev;
285
286 /* This can happen when the transport layer has segments queued
287 * with a cached route, and by the time we get here things are
288 * re-routed to a device with a different MTU than the original
289 * device. Sick, but we must cover it.
290 */
291 if (skb_headroom(skb) < dev->hard_header_len && dev->hard_header) {
292 struct sk_buff *skb2;
293
294 skb2 = skb_realloc_headroom(skb, (dev->hard_header_len + 15) & ~15);
295 kfree_skb(skb);
296 if (skb2 == NULL)
297 return -ENOMEM;
298 if (sk)
299 skb_set_owner_w(skb2, sk);
300 skb = skb2;
301 iph = skb->nh.iph;
302 }
303
304 if (skb->len > rt->u.dst.pmtu)
305 goto fragment;
306
307 if (ip_dont_fragment(sk, &rt->u.dst))
308 iph->frag_off |= __constant_htons(IP_DF);
309
310 ip_select_ident(iph, &rt->u.dst);
311
312 /* Add an IP checksum. */
313 ip_send_check(iph);
314
315 skb->priority = sk->priority;
316 return skb->dst->output(skb);
317
318 fragment:
319 if (ip_dont_fragment(sk, &rt->u.dst)) {
320 /* Reject packet ONLY if TCP might fragment
321 * it itself, if were careful enough.
322 */
323 iph->frag_off |= __constant_htons(IP_DF);
324 NETDEBUG(printk(KERN_DEBUG "sending pkt_too_big to self\n"));
325
326 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
327 htonl(rt->u.dst.pmtu));
328 kfree_skb(skb);
329 return -EMSGSIZE;
330 }
331 ip_select_ident(iph, &rt->u.dst);
332 return ip_fragment(skb, skb->dst->output);
333 }
334
335 int ip_queue_xmit(struct sk_buff *skb)
336 {
337 struct sock *sk = skb->sk;
338 struct ip_options *opt = sk->protinfo.af_inet.opt;
339 struct rtable *rt;
340 struct iphdr *iph;
341
342 /* Make sure we can route this packet. */
343 rt = (struct rtable *)__sk_dst_check(sk, 0);
344 if (rt == NULL) {
345 u32 daddr;
346
347 /* Use correct destination address if we have options. */
348 daddr = sk->daddr;
349 if(opt && opt->srr)
350 daddr = opt->faddr;
351
352 /* If this fails, retransmit mechanism of transport layer will
353 * keep trying until route appears or the connection times itself
354 * out.
355 */
356 if (ip_route_output(&rt, daddr, sk->saddr,
357 RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
358 sk->bound_dev_if))
359 goto no_route;
360 __sk_dst_set(sk, &rt->u.dst);
361 }
362 skb->dst = dst_clone(&rt->u.dst);
363
364 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
365 goto no_route;
366
367 /* OK, we know where to send it, allocate and build IP header. */
368 iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
369 *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
370 iph->tot_len = htons(skb->len);
371 iph->frag_off = 0;
372 iph->ttl = sk->protinfo.af_inet.ttl;
373 iph->protocol = sk->protocol;
374 iph->saddr = rt->rt_src;
375 iph->daddr = rt->rt_dst;
376 skb->nh.iph = iph;
377 /* Transport layer set skb->h.foo itself. */
378
379 if(opt && opt->optlen) {
380 iph->ihl += opt->optlen >> 2;
381 ip_options_build(skb, opt, sk->daddr, rt, 0);
382 }
383
384 return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
385 ip_queue_xmit2);
386
387 no_route:
388 IP_INC_STATS(IpOutNoRoutes);
389 kfree_skb(skb);
390 return -EHOSTUNREACH;
391 }
392
393 /*
394 * Build and send a packet, with as little as one copy
395 *
396 * Doesn't care much about ip options... option length can be
397 * different for fragment at 0 and other fragments.
398 *
399 * Note that the fragment at the highest offset is sent first,
400 * so the getfrag routine can fill in the TCP/UDP checksum header
401 * field in the last fragment it sends... actually it also helps
402 * the reassemblers, they can put most packets in at the head of
403 * the fragment queue, and they know the total size in advance. This
404 * last feature will measurably improve the Linux fragment handler one
405 * day.
406 *
407 * The callback has five args, an arbitrary pointer (copy of frag),
408 * the source IP address (may depend on the routing table), the
409 * destination address (char *), the offset to copy from, and the
410 * length to be copied.
411 */
412
413 static int ip_build_xmit_slow(struct sock *sk,
414 int getfrag (const void *,
415 char *,
416 unsigned int,
417 unsigned int),
418 const void *frag,
419 unsigned length,
420 struct ipcm_cookie *ipc,
421 struct rtable *rt,
422 int flags)
423 {
424 unsigned int fraglen, maxfraglen, fragheaderlen;
425 int err;
426 int offset, mf;
427 int mtu;
428 u16 id = 0;
429
430 int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
431 int nfrags=0;
432 struct ip_options *opt = ipc->opt;
433 int df = 0;
434
435 mtu = rt->u.dst.pmtu;
436 if (ip_dont_fragment(sk, &rt->u.dst))
437 df = htons(IP_DF);
438
439 length -= sizeof(struct iphdr);
440
441 if (opt) {
442 fragheaderlen = sizeof(struct iphdr) + opt->optlen;
443 maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
444 } else {
445 fragheaderlen = sizeof(struct iphdr);
446
447 /*
448 * Fragheaderlen is the size of 'overhead' on each buffer. Now work
449 * out the size of the frames to send.
450 */
451
452 maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
453 }
454
455 if (length + fragheaderlen > 0xFFFF) {
456 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
457 return -EMSGSIZE;
458 }
459
460 /*
461 * Start at the end of the frame by handling the remainder.
462 */
463
464 offset = length - (length % (maxfraglen - fragheaderlen));
465
466 /*
467 * Amount of memory to allocate for final fragment.
468 */
469
470 fraglen = length - offset + fragheaderlen;
471
472 if (length-offset==0) {
473 fraglen = maxfraglen;
474 offset -= maxfraglen-fragheaderlen;
475 }
476
477 /*
478 * The last fragment will not have MF (more fragments) set.
479 */
480
481 mf = 0;
482
483 /*
484 * Don't fragment packets for path mtu discovery.
485 */
486
487 if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
488 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
489 return -EMSGSIZE;
490 }
491 if (flags&MSG_PROBE)
492 goto out;
493
494 /*
495 * Begin outputting the bytes.
496 */
497
498 do {
499 char *data;
500 struct sk_buff * skb;
501
502 /*
503 * Get the memory we require with some space left for alignment.
504 */
505
506 skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
507 if (skb == NULL)
508 goto error;
509
510 /*
511 * Fill in the control structures
512 */
513
514 skb->priority = sk->priority;
515 skb->dst = dst_clone(&rt->u.dst);
516 skb_reserve(skb, hh_len);
517
518 /*
519 * Find where to start putting bytes.
520 */
521
522 data = skb_put(skb, fraglen);
523 skb->nh.iph = (struct iphdr *)data;
524
525 /*
526 * Only write IP header onto non-raw packets
527 */
528
529 {
530 struct iphdr *iph = (struct iphdr *)data;
531
532 iph->version = 4;
533 iph->ihl = 5;
534 if (opt) {
535 iph->ihl += opt->optlen>>2;
536 ip_options_build(skb, opt,
537 ipc->addr, rt, offset);
538 }
539 iph->tos = sk->protinfo.af_inet.tos;
540 iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
541 iph->frag_off = htons(offset>>3)|mf|df;
542 iph->id = id;
543 if (!mf) {
544 if (offset || !df) {
545 /* Select an unpredictable ident only
546 * for packets without DF or having
547 * been fragmented.
548 */
549 __ip_select_ident(iph, &rt->u.dst);
550 id = iph->id;
551 }
552
553 /*
554 * Any further fragments will have MF set.
555 */
556 mf = htons(IP_MF);
557 }
558 if (rt->rt_type == RTN_MULTICAST)
559 iph->ttl = sk->protinfo.af_inet.mc_ttl;
560 else
561 iph->ttl = sk->protinfo.af_inet.ttl;
562 iph->protocol = sk->protocol;
563 iph->check = 0;
564 iph->saddr = rt->rt_src;
565 iph->daddr = rt->rt_dst;
566 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
567 data += iph->ihl*4;
568 }
569
570 /*
571 * User data callback
572 */
573
574 if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
575 err = -EFAULT;
576 kfree_skb(skb);
577 goto error;
578 }
579
580 offset -= (maxfraglen-fragheaderlen);
581 fraglen = maxfraglen;
582
583 nfrags++;
584
585 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
586 skb->dst->dev, output_maybe_reroute);
587 if (err) {
588 if (err > 0)
589 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
590 if (err)
591 goto error;
592 }
593 } while (offset >= 0);
594
595 if (nfrags>1)
596 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
597 out:
598 return 0;
599
600 error:
601 IP_INC_STATS(IpOutDiscards);
602 if (nfrags>1)
603 ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
604 return err;
605 }
606
607 /*
608 * Fast path for unfragmented packets.
609 */
610 int ip_build_xmit(struct sock *sk,
611 int getfrag (const void *,
612 char *,
613 unsigned int,
614 unsigned int),
615 const void *frag,
616 unsigned length,
617 struct ipcm_cookie *ipc,
618 struct rtable *rt,
619 int flags)
620 {
621 int err;
622 struct sk_buff *skb;
623 int df;
624 struct iphdr *iph;
625
626 /*
627 * Try the simple case first. This leaves fragmented frames, and by
628 * choice RAW frames within 20 bytes of maximum size(rare) to the long path
629 */
630
631 if (!sk->protinfo.af_inet.hdrincl) {
632 length += sizeof(struct iphdr);
633
634 /*
635 * Check for slow path.
636 */
637 if (length > rt->u.dst.pmtu || ipc->opt != NULL)
638 return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
639 } else {
640 if (length > rt->u.dst.dev->mtu) {
641 ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
642 return -EMSGSIZE;
643 }
644 }
645 if (flags&MSG_PROBE)
646 goto out;
647
648 /*
649 * Do path mtu discovery if needed.
650 */
651 df = 0;
652 if (ip_dont_fragment(sk, &rt->u.dst))
653 df = htons(IP_DF);
654
655 /*
656 * Fast path for unfragmented frames without options.
657 */
658 {
659 int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
660
661 skb = sock_alloc_send_skb(sk, length+hh_len+15,
662 0, flags&MSG_DONTWAIT, &err);
663 if(skb==NULL)
664 goto error;
665 skb_reserve(skb, hh_len);
666 }
667
668 skb->priority = sk->priority;
669 skb->dst = dst_clone(&rt->u.dst);
670
671 skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
672
673 if(!sk->protinfo.af_inet.hdrincl) {
674 iph->version=4;
675 iph->ihl=5;
676 iph->tos=sk->protinfo.af_inet.tos;
677 iph->tot_len = htons(length);
678 iph->frag_off = df;
679 iph->ttl=sk->protinfo.af_inet.mc_ttl;
680 ip_select_ident(iph, &rt->u.dst);
681 if (rt->rt_type != RTN_MULTICAST)
682 iph->ttl=sk->protinfo.af_inet.ttl;
683 iph->protocol=sk->protocol;
684 iph->saddr=rt->rt_src;
685 iph->daddr=rt->rt_dst;
686 iph->check=0;
687 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
688 err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
689 }
690 else
691 err = getfrag(frag, (void *)iph, 0, length);
692
693 if (err)
694 goto error_fault;
695
696 err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
697 output_maybe_reroute);
698 if (err > 0)
699 err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
700 if (err)
701 goto error;
702 out:
703 return 0;
704
705 error_fault:
706 err = -EFAULT;
707 kfree_skb(skb);
708 error:
709 IP_INC_STATS(IpOutDiscards);
710 return err;
711 }
712
713 /*
714 * This IP datagram is too large to be sent in one piece. Break it up into
715 * smaller pieces (each of size equal to IP header plus
716 * a block of the data of the original IP data part) that will yet fit in a
717 * single device frame, and queue such a frame for sending.
718 *
719 * Yes this is inefficient, feel free to submit a quicker one.
720 */
721
722 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
723 {
724 struct iphdr *iph;
725 unsigned char *raw;
726 unsigned char *ptr;
727 struct net_device *dev;
728 struct sk_buff *skb2;
729 unsigned int mtu, hlen, left, len;
730 int offset;
731 int not_last_frag;
732 struct rtable *rt = (struct rtable*)skb->dst;
733 int err = 0;
734
735 dev = rt->u.dst.dev;
736
737 /*
738 * Point into the IP datagram header.
739 */
740
741 raw = skb->nh.raw;
742 iph = (struct iphdr*)raw;
743
744 /*
745 * Setup starting values.
746 */
747
748 hlen = iph->ihl * 4;
749 left = ntohs(iph->tot_len) - hlen; /* Space per frame */
750 mtu = rt->u.dst.pmtu - hlen; /* Size of data space */
751 ptr = raw + hlen; /* Where to start from */
752
753 /*
754 * Fragment the datagram.
755 */
756
757 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
758 not_last_frag = iph->frag_off & htons(IP_MF);
759
760 /*
761 * Keep copying data until we run out.
762 */
763
764 while(left > 0) {
765 len = left;
766 /* IF: it doesn't fit, use 'mtu' - the data space left */
767 if (len > mtu)
768 len = mtu;
769 /* IF: we are not sending upto and including the packet end
770 then align the next start on an eight byte boundary */
771 if (len < left) {
772 len &= ~7;
773 }
774 /*
775 * Allocate buffer.
776 */
777
778 if ((skb2 = alloc_skb(len+hlen+dev->hard_header_len+15,GFP_ATOMIC)) == NULL) {
779 NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
780 err = -ENOMEM;
781 goto fail;
782 }
783
784 /*
785 * Set up data on packet
786 */
787
788 skb2->pkt_type = skb->pkt_type;
789 skb2->priority = skb->priority;
790 skb_reserve(skb2, (dev->hard_header_len+15)&~15);
791 skb_put(skb2, len + hlen);
792 skb2->nh.raw = skb2->data;
793 skb2->h.raw = skb2->data + hlen;
794
795 /*
796 * Charge the memory for the fragment to any owner
797 * it might possess
798 */
799
800 if (skb->sk)
801 skb_set_owner_w(skb2, skb->sk);
802 skb2->dst = dst_clone(skb->dst);
803 skb2->dev = skb->dev;
804
805 /*
806 * Copy the packet header into the new buffer.
807 */
808
809 memcpy(skb2->nh.raw, raw, hlen);
810
811 /*
812 * Copy a block of the IP datagram.
813 */
814 memcpy(skb2->h.raw, ptr, len);
815 left -= len;
816
817 /*
818 * Fill in the new header fields.
819 */
820 iph = skb2->nh.iph;
821 iph->frag_off = htons((offset >> 3));
822
823 /* ANK: dirty, but effective trick. Upgrade options only if
824 * the segment to be fragmented was THE FIRST (otherwise,
825 * options are already fixed) and make it ONCE
826 * on the initial skb, so that all the following fragments
827 * will inherit fixed options.
828 */
829 if (offset == 0)
830 ip_options_fragment(skb);
831
832 /*
833 * Added AC : If we are fragmenting a fragment that's not the
834 * last fragment then keep MF on each bit
835 */
836 if (left > 0 || not_last_frag)
837 iph->frag_off |= htons(IP_MF);
838 ptr += len;
839 offset += len;
840
841 #ifdef CONFIG_NETFILTER
842 /* Connection association is same as pre-frag packet */
843 skb2->nfct = skb->nfct;
844 nf_conntrack_get(skb2->nfct);
845 #ifdef CONFIG_NETFILTER_DEBUG
846 skb2->nf_debug = skb->nf_debug;
847 #endif
848 #endif
849
850 /*
851 * Put this fragment into the sending queue.
852 */
853
854 IP_INC_STATS(IpFragCreates);
855
856 iph->tot_len = htons(len + hlen);
857
858 ip_send_check(iph);
859
860 err = output(skb2);
861 if (err)
862 goto fail;
863 }
864 kfree_skb(skb);
865 IP_INC_STATS(IpFragOKs);
866 return err;
867
868 fail:
869 kfree_skb(skb);
870 IP_INC_STATS(IpFragFails);
871 return err;
872 }
873
874 /*
875 * Fetch data from kernel space and fill in checksum if needed.
876 */
877 static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
878 unsigned int fraglen)
879 {
880 struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
881 u16 *pktp = (u16 *)to;
882 struct iovec *iov;
883 int len;
884 int hdrflag = 1;
885
886 iov = &dp->iov[0];
887 if (offset >= iov->iov_len) {
888 offset -= iov->iov_len;
889 iov++;
890 hdrflag = 0;
891 }
892 len = iov->iov_len - offset;
893 if (fraglen > len) { /* overlapping. */
894 dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
895 dp->csum);
896 offset = 0;
897 fraglen -= len;
898 to += len;
899 iov++;
900 }
901
902 dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
903 dp->csum);
904
905 if (hdrflag && dp->csumoffset)
906 *(pktp + dp->csumoffset) = csum_fold(dp->csum); /* fill in checksum */
907 return 0;
908 }
909
910 /*
911 * Generic function to send a packet as reply to another packet.
912 * Used to send TCP resets so far. ICMP should use this function too.
913 *
914 * Should run single threaded per socket because it uses the sock
915 * structure to pass arguments.
916 */
917 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
918 unsigned int len)
919 {
920 struct {
921 struct ip_options opt;
922 char data[40];
923 } replyopts;
924 struct ipcm_cookie ipc;
925 u32 daddr;
926 struct rtable *rt = (struct rtable*)skb->dst;
927
928 if (ip_options_echo(&replyopts.opt, skb))
929 return;
930
931 daddr = ipc.addr = rt->rt_src;
932 ipc.opt = NULL;
933
934 if (replyopts.opt.optlen) {
935 ipc.opt = &replyopts.opt;
936
937 if (ipc.opt->srr)
938 daddr = replyopts.opt.faddr;
939 }
940
941 if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
942 return;
943
944 /* And let IP do all the hard work.
945
946 This chunk is not reenterable, hence spinlock.
947 Note that it uses the fact, that this function is called
948 with locally disabled BH and that sk cannot be already spinlocked.
949 */
950 bh_lock_sock(sk);
951 sk->protinfo.af_inet.tos = skb->nh.iph->tos;
952 sk->priority = skb->priority;
953 sk->protocol = skb->nh.iph->protocol;
954 ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
955 bh_unlock_sock(sk);
956
957 ip_rt_put(rt);
958 }
959
960 /*
961 * IP protocol layer initialiser
962 */
963
964 static struct packet_type ip_packet_type =
965 {
966 __constant_htons(ETH_P_IP),
967 NULL, /* All devices */
968 ip_rcv,
969 (void*)1,
970 NULL,
971 };
972
973 /*
974 * IP registers the packet type and then calls the subprotocol initialisers
975 */
976
977 void __init ip_init(void)
978 {
979 dev_add_pack(&ip_packet_type);
980
981 ip_rt_init();
982 inet_initpeers();
983
984 #ifdef CONFIG_IP_MULTICAST
985 proc_net_create("igmp", 0, ip_mc_procinfo);
986 #endif
987 }
988
/*
 * This page was automatically generated by the LXR engine.
 * Visit the LXR main site for more information.
 */