1 /*
2 * net/sched/sch_csz.c Clark-Shenker-Zhang scheduler.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <asm/uaccess.h>
16 #include <asm/system.h>
17 #include <asm/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/if_ether.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/notifier.h>
33 #include <net/ip.h>
34 #include <net/route.h>
35 #include <linux/skbuff.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
38
39
40 /* Clark-Shenker-Zhang algorithm.
41 =======================================
42
43 SOURCE.
44
45 David D. Clark, Scott Shenker and Lixia Zhang
46 "Supporting Real-Time Applications in an Integrated Services Packet
47 Network: Architecture and Mechanism".
48
49 CBQ presents a flexible universal algorithm for packet scheduling,
50 but it has pretty poor delay characteristics.
51 Round-robin scheduling and link-sharing goals
52 apparently contradict minimization of network delay and jitter.
53 Moreover, correct handling of predictive flows seems to be
54 impossible in CBQ.
55
56 CSZ presents a more precise but less flexible and less efficient
57 approach. As I understand it, the main idea is to create
58 WFQ flows for each guaranteed service and to allocate
59 the rest of bandwith to dummy flow-0. Flow-0 comprises
60 the predictive services and the best effort traffic;
61 it is handled by a priority scheduler with the highest
62 priority band allocated for predictive services, and the rest ---
63 to the best effort packets.
64
65 Note that in CSZ flows are NOT limited to their bandwidth. It
66 is supposed that the flow passed admission control at the edge
67 of the QoS network and it doesn't need further shaping. Any
68 attempt to improve the flow or to shape it to a token bucket
69 at intermediate hops will introduce undesired delays and raise
70 jitter.
71
72 At the moment CSZ is the only scheduler that provides
73 true guaranteed service. Another schemes (including CBQ)
74 do not provide guaranteed delay and randomize jitter.
75 There is a proof (Sally Floyd), that delay
76 can be estimated by a IntServ compliant formula.
77 This result is true formally, but it is wrong in principle.
78 It takes into account only round-robin delays,
79 ignoring delays introduced by link sharing i.e. overlimiting.
80 Note that temporary overlimits are inevitable because
81 real links are not ideal, and the real algorithm must take this
82 into account.
83
84 ALGORITHM.
85
86 --- Notations.
87
88 $B$ is link bandwidth (bits/sec).
89
90 $I$ is set of all flows, including flow $0$.
91 Every flow $a \in I$ has associated bandwidth slice $r_a < 1$ and
92 $\sum_{a \in I} r_a = 1$.
93
94 --- Flow model.
95
96 Let $m_a$ is the number of backlogged bits in flow $a$.
97 The flow is {\em active}, if $m_a > 0$.
98 This number is a discontinuous function of time;
99 when a packet $i$ arrives:
100 \[
101 m_a(t_i+0) - m_a(t_i-0) = L^i,
102 \]
103 where $L^i$ is the length of the arrived packet.
104 The flow queue is drained continuously until $m_a == 0$:
105 \[
106 {d m_a \over dt} = - { B r_a \over \sum_{b \in A} r_b}.
107 \]
108 I.e. flow rates are their allocated rates proportionally
109 scaled to take all available link bandwidth. Apparently,
110 it is not the only possible policy. F.e. CBQ classes
111 without borrowing would be modelled by:
112 \[
113 {d m_a \over dt} = - B r_a .
114 \]
115 More complicated hierarchical bandwidth allocation
116 policies are possible, but unfortunately, the basic
117 flow equations have a simple solution only for proportional
118 scaling.
119
120 --- Departure times.
121
122 We calculate the time until the last bit of packet is sent:
123 \[
124 E_a^i(t) = { m_a(t_i) - \delta_a(t) \over r_a },
125 \]
126 where $\delta_a(t)$ is number of bits drained since $t_i$.
127 We have to evaluate $E_a^i$ for all queued packets,
128 then find the packet with minimal $E_a^i$ and send it.
129
130 This sounds good, but direct implementation of the algorithm
131 is absolutely infeasible. Luckily, if flow rates
132 are scaled proportionally, the equations have a simple solution.
133
134 The differential equation for $E_a^i$ is
135 \[
136 {d E_a^i (t) \over dt } = - { d \delta_a(t) \over dt} { 1 \over r_a} =
137 { B \over \sum_{b \in A} r_b}
138 \]
139 with initial condition
140 \[
141 E_a^i (t_i) = { m_a(t_i) \over r_a } .
142 \]
143
144 Let's introduce an auxiliary function $R(t)$:
145
146 --- Round number.
147
148 Consider the following model: we rotate over active flows,
149 sending $r_a B$ bits from every flow, so that we send
150 $B \sum_{a \in A} r_a$ bits per round, that takes
151 $\sum_{a \in A} r_a$ seconds.
152
153 Hence, $R(t)$ (round number) is a monotonically increasing
154 linear function of time when $A$ is not changed
155 \[
156 { d R(t) \over dt } = { 1 \over \sum_{a \in A} r_a }
157 \]
158 and it is continuous when $A$ changes.
159
160 The central observation is that the quantity
161 $F_a^i = R(t) + E_a^i(t)/B$ does not depend on time at all!
162 $R(t)$ does not depend on flow, so that $F_a^i$ can be
163 calculated only once on packet arrival, and we need not
164 recalculate $E$ numbers and resorting queues.
165 The number $F_a^i$ is called finish number of the packet.
166 It is just the value of $R(t)$ when the last bit of packet
167 is sent out.
168
169 Maximal finish number on flow is called finish number of flow
170 and minimal one is "start number of flow".
171 Apparently, flow is active if and only if $F_a \leq R$.
172
173 When a packet of length $L_i$ bit arrives to flow $a$ at time $t_i$,
174 we calculate $F_a^i$ as:
175
176 If flow was inactive ($F_a < R$):
177 $F_a^i = R(t) + {L_i \over B r_a}$
178 otherwise
179 $F_a^i = F_a + {L_i \over B r_a}$
180
181 These equations complete the algorithm specification.
182
183 It looks pretty hairy, but there is a simple
184 procedure for solving these equations.
185 See procedure csz_update(), that is a generalization of
186 the algorithm from S. Keshav's thesis Chapter 3
187 "Efficient Implementation of Fair Queeing".
188
189 NOTES.
190
191 * We implement only the simplest variant of CSZ,
192 when flow-0 is a explicit 4band priority fifo.
193 This is bad, but we need a "peek" operation in addition
194 to "dequeue" to implement complete CSZ.
195 I do not want to do that, unless it is absolutely
196 necessary.
197
198 * A primitive support for token bucket filtering
199 presents itself too. It directly contradicts CSZ, but
200 even though the Internet is on the globe ... :-)
201 "the edges of the network" really exist.
202
203 BUGS.
204
205 * Fixed point arithmetic is overcomplicated, suboptimal and even
206 wrong. Check it later. */
207
208
209 /* This number is arbitrary */
210
211 #define CSZ_GUARANTEED 16
212 #define CSZ_FLOWS (CSZ_GUARANTEED+4)
213
214 struct csz_head
215 {
216 struct csz_head *snext;
217 struct csz_head *sprev;
218 struct csz_head *fnext;
219 struct csz_head *fprev;
220 };
221
222 struct csz_flow
223 {
224 struct csz_head *snext;
225 struct csz_head *sprev;
226 struct csz_head *fnext;
227 struct csz_head *fprev;
228
229 /* Parameters */
230 struct tc_ratespec rate;
231 struct tc_ratespec slice;
232 u32 *L_tab; /* Lookup table for L/(B*r_a) values */
233 unsigned long limit; /* Maximal length of queue */
234 #ifdef CSZ_PLUS_TBF
235 struct tc_ratespec peakrate;
236 __u32 buffer; /* Depth of token bucket, normalized
237 as L/(B*r_a) */
238 __u32 mtu;
239 #endif
240
241 /* Variables */
242 #ifdef CSZ_PLUS_TBF
243 unsigned long tokens; /* Tokens number: usecs */
244 psched_time_t t_tbf;
245 unsigned long R_tbf;
246 int throttled;
247 #endif
248 unsigned peeked;
249 unsigned long start; /* Finish number of the first skb */
250 unsigned long finish; /* Finish number of the flow */
251
252 struct sk_buff_head q; /* FIFO queue */
253 };
254
255 #define L2R(f,L) ((f)->L_tab[(L)>>(f)->slice.cell_log])
256
257 struct csz_sched_data
258 {
259 /* Parameters */
260 unsigned char rate_log; /* fixed point position for rate;
261 * really we need not it */
262 unsigned char R_log; /* fixed point position for round number */
263 unsigned char delta_log; /* 1<<delta_log is maximal timeout in usecs;
264 * 21 <-> 2.1sec is MAXIMAL value */
265
266 /* Variables */
267 struct tcf_proto *filter_list;
268 u8 prio2band[TC_PRIO_MAX+1];
269 #ifdef CSZ_PLUS_TBF
270 struct timer_list wd_timer;
271 long wd_expires;
272 #endif
273 psched_time_t t_c; /* Time check-point */
274 unsigned long R_c; /* R-number check-point */
275 unsigned long rate; /* Current sum of rates of active flows */
276 struct csz_head s; /* Flows sorted by "start" */
277 struct csz_head f; /* Flows sorted by "finish" */
278
279 struct sk_buff_head other[4];/* Predicted (0) and the best efforts
280 classes (1,2,3) */
281 struct csz_flow flow[CSZ_GUARANTEED]; /* Array of flows */
282 };
283
284 /* These routines (csz_insert_finish and csz_insert_start) are
285 the most time consuming part of all the algorithm.
286
287 We insert to sorted list, so that time
288 is linear with respect to number of active flows in the worst case.
289 Note that we have not very large number of guaranteed flows,
290 so that logarithmic algorithms (heap etc.) are useless,
291 they are slower than linear one when length of list <= 32.
292
293 Heap would take sence if we used WFQ for best efforts
294 flows, but SFQ is better choice in this case.
295 */
296
297
298 /* Insert flow "this" to the list "b" before
299 flow with greater finish number.
300 */
301
302 #if 0
303 /* Scan forward */
304 extern __inline__ void csz_insert_finish(struct csz_head *b,
305 struct csz_flow *this)
306 {
307 struct csz_head *f = b->fnext;
308 unsigned long finish = this->finish;
309
310 while (f != b) {
311 if (((struct csz_flow*)f)->finish - finish > 0)
312 break;
313 f = f->fnext;
314 }
315 this->fnext = f;
316 this->fprev = f->fprev;
317 this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
318 }
319 #else
320 /* Scan backward */
321 extern __inline__ void csz_insert_finish(struct csz_head *b,
322 struct csz_flow *this)
323 {
324 struct csz_head *f = b->fprev;
325 unsigned long finish = this->finish;
326
327 while (f != b) {
328 if (((struct csz_flow*)f)->finish - finish <= 0)
329 break;
330 f = f->fprev;
331 }
332 this->fnext = f->fnext;
333 this->fprev = f;
334 this->fnext->fprev = this->fprev->fnext = (struct csz_head*)this;
335 }
336 #endif
337
338 /* Insert flow "this" to the list "b" before
339 flow with greater start number.
340 */
341
342 extern __inline__ void csz_insert_start(struct csz_head *b,
343 struct csz_flow *this)
344 {
345 struct csz_head *f = b->snext;
346 unsigned long start = this->start;
347
348 while (f != b) {
349 if (((struct csz_flow*)f)->start - start > 0)
350 break;
351 f = f->snext;
352 }
353 this->snext = f;
354 this->sprev = f->sprev;
355 this->snext->sprev = this->sprev->snext = (struct csz_head*)this;
356 }
357
358
359 /* Calculate and return current round number.
360 It is another time consuming part, but
361 it is impossible to avoid it.
362
363 It costs O(N) that make all the algorithm useful only
364 to play with closest to ideal fluid model.
365
366 There exist less academic, but more practical modifications,
367 which might have even better characteristics (WF2Q+, HPFQ, HFSC)
368 */
369
370 static unsigned long csz_update(struct Qdisc *sch)
371 {
372 struct csz_sched_data *q = (struct csz_sched_data*)sch->data;
373 struct csz_flow *a;
374 unsigned long F;
375 unsigned long tmp;
376 psched_time_t now;
377 unsigned long delay;
378 unsigned long R_c;
379
380 PSCHED_GET_TIME(now);
381 delay = PSCHED_TDIFF_SAFE(now, q->t_c, 0, goto do_reset);
382
383 if (delay>>q->delta_log) {
384 do_reset:
385 /* Delta is too large.
386 It is possible if MTU/BW > 1<<q->delta_log
387 (i.e. configuration error) or because of hardware
388 fault. We have no choice...
389 */
390 qdisc_reset(sch);
391 return 0;
392 }
393
394 q->t_c = now;
395
396 for (;;) {
397 a = (struct csz_flow*)q->f.fnext;
398
399 /* No more active flows. Reset R and exit. */
400 if (a == (struct csz_flow*)&q->f) {
401 #ifdef CSZ_DEBUG
402 if (q->rate) {
403 printk("csz_update: rate!=0 on inactive csz\n");
404 q->rate = 0;
405 }
406 #endif
407 q->R_c = 0;
408 return 0;
409 }
410
411 F = a->finish;
412
413 #ifdef CSZ_DEBUG
414 if (q->rate == 0) {
415 printk("csz_update: rate=0 on active csz\n");
416 goto do_reset;
417 }
418 #endif
419
420 /*
421 * tmp = (t - q->t_c)/q->rate;
422 */
423
424 tmp = ((delay<<(31-q->delta_log))/q->rate)>>(31-q->delta_log+q->R_log);
425
426 tmp += q->R_c;
427
428 /* OK, this flow (and all flows with greater
429 finish numbers) is still active */
430 if (F - tmp > 0)
431 break;
432
433 /* It is more not active */
434
435 a->fprev->fnext = a->fnext;
436 a->fnext->fprev = a->fprev;
437
438 /*
439 * q->t_c += (F - q->R_c)*q->rate
440 */
441
442 tmp = ((F-q->R_c)*q->rate)<<q->R_log;
443 R_c = F;
444 q->rate -= a->slice.rate;
445
446 if ((long)(delay - tmp) >= 0) {
447 delay -= tmp;
448 continue;
449 }
450 delay = 0;
451 }
452
453 q->R_c = tmp;
454 return tmp;
455 }
456
457 unsigned csz_classify(struct sk_buff *skb, struct csz_sched_data *q)
458 {
459 return CSZ_GUARANTEED;
460 }
461
462 static int
463 csz_enqueue(struct sk_buff *skb, struct Qdisc* sch)
464 {
465 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
466 unsigned flow_id = csz_classify(skb, q);
467 unsigned long R;
468 int prio = 0;
469 struct csz_flow *this;
470
471 if (flow_id >= CSZ_GUARANTEED) {
472 prio = flow_id - CSZ_GUARANTEED;
473 flow_id = 0;
474 }
475
476 this = &q->flow[flow_id];
477 if (this->q.qlen >= this->limit || this->L_tab == NULL) {
478 sch->stats.drops++;
479 kfree_skb(skb);
480 return NET_XMIT_DROP;
481 }
482
483 R = csz_update(sch);
484
485 if ((long)(this->finish - R) >= 0) {
486 /* It was active */
487 this->finish += L2R(this,skb->len);
488 } else {
489 /* It is inactive; activate it */
490 this->finish = R + L2R(this,skb->len);
491 q->rate += this->slice.rate;
492 csz_insert_finish(&q->f, this);
493 }
494
495 /* If this flow was empty, remember start number
496 and insert it into start queue */
497 if (this->q.qlen == 0) {
498 this->start = this->finish;
499 csz_insert_start(&q->s, this);
500 }
501 if (flow_id)
502 skb_queue_tail(&this->q, skb);
503 else
504 skb_queue_tail(&q->other[prio], skb);
505 sch->q.qlen++;
506 sch->stats.bytes += skb->len;
507 sch->stats.packets++;
508 return 0;
509 }
510
511 static __inline__ struct sk_buff *
512 skb_dequeue_best(struct csz_sched_data * q)
513 {
514 int i;
515 struct sk_buff *skb;
516
517 for (i=0; i<4; i++) {
518 skb = skb_dequeue(&q->other[i]);
519 if (skb) {
520 q->flow[0].q.qlen--;
521 return skb;
522 }
523 }
524 return NULL;
525 }
526
527 static __inline__ struct sk_buff *
528 skb_peek_best(struct csz_sched_data * q)
529 {
530 int i;
531 struct sk_buff *skb;
532
533 for (i=0; i<4; i++) {
534 skb = skb_peek(&q->other[i]);
535 if (skb)
536 return skb;
537 }
538 return NULL;
539 }
540
541 #ifdef CSZ_PLUS_TBF
542
543 static void csz_watchdog(unsigned long arg)
544 {
545 struct Qdisc *sch = (struct Qdisc*)arg;
546
547 qdisc_wakeup(sch->dev);
548 }
549
550 static __inline__ void
551 csz_move_queue(struct csz_flow *this, long delta)
552 {
553 this->fprev->fnext = this->fnext;
554 this->fnext->fprev = this->fprev;
555
556 this->start += delta;
557 this->finish += delta;
558
559 csz_insert_finish(this);
560 }
561
562 static __inline__ int csz_enough_tokens(struct csz_sched_data *q,
563 struct csz_flow *this,
564 struct sk_buff *skb)
565 {
566 long toks;
567 long shift;
568 psched_time_t now;
569
570 PSCHED_GET_TIME(now);
571
572 toks = PSCHED_TDIFF(now, t_tbf) + this->tokens - L2R(q,this,skb->len);
573
574 shift = 0;
575 if (this->throttled) {
576 /* Remember aposteriory delay */
577
578 unsigned long R = csz_update(q);
579 shift = R - this->R_tbf;
580 this->R_tbf = R;
581 }
582
583 if (toks >= 0) {
584 /* Now we have enough tokens to proceed */
585
586 this->tokens = toks <= this->depth ? toks : this->depth;
587 this->t_tbf = now;
588
589 if (!this->throttled)
590 return 1;
591
592 /* Flow was throttled. Update its start&finish numbers
593 with delay calculated aposteriori.
594 */
595
596 this->throttled = 0;
597 if (shift > 0)
598 csz_move_queue(this, shift);
599 return 1;
600 }
601
602 if (!this->throttled) {
603 /* Flow has just been throttled; remember
604 current round number to calculate aposteriori delay
605 */
606 this->throttled = 1;
607 this->R_tbf = csz_update(q);
608 }
609
610 /* Move all the queue to the time when it will be allowed to send.
611 We should translate time to round number, but it is impossible,
612 so that we made the most conservative estimate i.e. we suppose
613 that only this flow is active and, hence, R = t.
614 Really toks <= R <= toks/r_a.
615
616 This apriory shift in R will be adjusted later to reflect
617 real delay. We cannot avoid it because of:
618 - throttled flow continues to be active from the viewpoint
619 of CSZ, so that it would acquire the highest priority,
620 if you not adjusted start numbers.
621 - Eventually, finish number would become less than round
622 number and flow were declared inactive.
623 */
624
625 toks = -toks;
626
627 /* Remeber, that we should start watchdog */
628 if (toks < q->wd_expires)
629 q->wd_expires = toks;
630
631 toks >>= q->R_log;
632 shift += toks;
633 if (shift > 0) {
634 this->R_tbf += toks;
635 csz_move_queue(this, shift);
636 }
637 csz_insert_start(this);
638 return 0;
639 }
640 #endif
641
642
643 static struct sk_buff *
644 csz_dequeue(struct Qdisc* sch)
645 {
646 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
647 struct sk_buff *skb;
648 struct csz_flow *this;
649
650 #ifdef CSZ_PLUS_TBF
651 q->wd_expires = 0;
652 #endif
653 this = (struct csz_flow*)q->s.snext;
654
655 while (this != (struct csz_flow*)&q->s) {
656
657 /* First of all: unlink from start list */
658 this->sprev->snext = this->snext;
659 this->snext->sprev = this->sprev;
660
661 if (this != &q->flow[0]) { /* Guaranteed flow */
662 skb = __skb_dequeue(&this->q);
663 if (skb) {
664 #ifdef CSZ_PLUS_TBF
665 if (this->depth) {
666 if (!csz_enough_tokens(q, this, skb))
667 continue;
668 }
669 #endif
670 if (this->q.qlen) {
671 struct sk_buff *nskb = skb_peek(&this->q);
672 this->start += L2R(this,nskb->len);
673 csz_insert_start(&q->s, this);
674 }
675 sch->q.qlen--;
676 return skb;
677 }
678 } else { /* Predicted or best effort flow */
679 skb = skb_dequeue_best(q);
680 if (skb) {
681 unsigned peeked = this->peeked;
682 this->peeked = 0;
683
684 if (--this->q.qlen) {
685 struct sk_buff *nskb;
686 unsigned dequeued = L2R(this,skb->len);
687
688 /* We got not the same thing that
689 peeked earlier; adjust start number
690 */
691 if (peeked != dequeued && peeked)
692 this->start += dequeued - peeked;
693
694 nskb = skb_peek_best(q);
695 peeked = L2R(this,nskb->len);
696 this->start += peeked;
697 this->peeked = peeked;
698 csz_insert_start(&q->s, this);
699 }
700 sch->q.qlen--;
701 return skb;
702 }
703 }
704 }
705 #ifdef CSZ_PLUS_TBF
706 /* We are about to return no skb.
707 Schedule watchdog timer, if it occurred because of shaping.
708 */
709 if (q->wd_expires) {
710 unsigned long delay = PSCHED_US2JIFFIE(q->wd_expires);
711 del_timer(&q->wd_timer);
712 if (delay == 0)
713 delay = 1;
714 q->wd_timer.expires = jiffies + delay;
715 add_timer(&q->wd_timer);
716 sch->stats.overlimits++;
717 }
718 #endif
719 return NULL;
720 }
721
722 static void
723 csz_reset(struct Qdisc* sch)
724 {
725 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
726 int i;
727
728 for (i=0; i<4; i++)
729 skb_queue_purge(&q->other[i]);
730
731 for (i=0; i<CSZ_GUARANTEED; i++) {
732 struct csz_flow *this = q->flow + i;
733 skb_queue_purge(&this->q);
734 this->snext = this->sprev =
735 this->fnext = this->fprev = (struct csz_head*)this;
736 this->start = this->finish = 0;
737 }
738 q->s.snext = q->s.sprev = &q->s;
739 q->f.fnext = q->f.fprev = &q->f;
740 q->R_c = 0;
741 #ifdef CSZ_PLUS_TBF
742 PSCHED_GET_TIME(&q->t_tbf);
743 q->tokens = q->depth;
744 del_timer(&q->wd_timer);
745 #endif
746 sch->q.qlen = 0;
747 }
748
749 static void
750 csz_destroy(struct Qdisc* sch)
751 {
752 MOD_DEC_USE_COUNT;
753 }
754
755 static int csz_init(struct Qdisc *sch, struct rtattr *opt)
756 {
757 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
758 struct rtattr *tb[TCA_CSZ_PTAB];
759 struct tc_csz_qopt *qopt;
760 int i;
761
762 rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
763 if (tb[TCA_CSZ_PARMS-1] == NULL ||
764 RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*qopt))
765 return -EINVAL;
766 qopt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);
767
768 q->R_log = qopt->R_log;
769 q->delta_log = qopt->delta_log;
770 for (i=0; i<=TC_PRIO_MAX; i++) {
771 if (qopt->priomap[i] >= CSZ_FLOWS)
772 return -EINVAL;
773 q->prio2band[i] = qopt->priomap[i];
774 }
775
776 for (i=0; i<4; i++)
777 skb_queue_head_init(&q->other[i]);
778
779 for (i=0; i<CSZ_GUARANTEED; i++) {
780 struct csz_flow *this = q->flow + i;
781 skb_queue_head_init(&this->q);
782 this->snext = this->sprev =
783 this->fnext = this->fprev = (struct csz_head*)this;
784 this->start = this->finish = 0;
785 }
786 q->s.snext = q->s.sprev = &q->s;
787 q->f.fnext = q->f.fprev = &q->f;
788 q->R_c = 0;
789 #ifdef CSZ_PLUS_TBF
790 init_timer(&q->wd_timer);
791 q->wd_timer.data = (unsigned long)sch;
792 q->wd_timer.function = csz_watchdog;
793 #endif
794 MOD_INC_USE_COUNT;
795 return 0;
796 }
797
798 #ifdef CONFIG_RTNETLINK
799 static int csz_dump(struct Qdisc *sch, struct sk_buff *skb)
800 {
801 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
802 unsigned char *b = skb->tail;
803 struct rtattr *rta;
804 struct tc_csz_qopt opt;
805
806 rta = (struct rtattr*)b;
807 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
808
809 opt.flows = CSZ_FLOWS;
810 memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1);
811 RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
812 rta->rta_len = skb->tail - b;
813
814 return skb->len;
815
816 rtattr_failure:
817 skb_trim(skb, b - skb->data);
818 return -1;
819 }
820 #endif
821
822
823 static int csz_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
824 struct Qdisc **old)
825 {
826 return -EINVAL;
827 }
828
829 static struct Qdisc * csz_leaf(struct Qdisc *sch, unsigned long cl)
830 {
831 return NULL;
832 }
833
834
835 static unsigned long csz_get(struct Qdisc *sch, u32 classid)
836 {
837 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
838 unsigned long band = TC_H_MIN(classid) - 1;
839
840 if (band >= CSZ_FLOWS)
841 return 0;
842
843 if (band < CSZ_GUARANTEED && q->flow[band].L_tab == NULL)
844 return 0;
845
846 return band+1;
847 }
848
849 static unsigned long csz_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
850 {
851 return csz_get(sch, classid);
852 }
853
854
855 static void csz_put(struct Qdisc *sch, unsigned long cl)
856 {
857 return;
858 }
859
860 static int csz_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg)
861 {
862 unsigned long cl = *arg;
863 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
864 struct rtattr *opt = tca[TCA_OPTIONS-1];
865 struct rtattr *tb[TCA_CSZ_PTAB];
866 struct tc_csz_copt *copt;
867
868 rtattr_parse(tb, TCA_CSZ_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt));
869 if (tb[TCA_CSZ_PARMS-1] == NULL ||
870 RTA_PAYLOAD(tb[TCA_CSZ_PARMS-1]) < sizeof(*copt))
871 return -EINVAL;
872 copt = RTA_DATA(tb[TCA_CSZ_PARMS-1]);
873
874 if (tb[TCA_CSZ_RTAB-1] &&
875 RTA_PAYLOAD(tb[TCA_CSZ_RTAB-1]) < 1024)
876 return -EINVAL;
877
878 if (cl) {
879 struct csz_flow *a;
880 cl--;
881 if (cl >= CSZ_FLOWS)
882 return -ENOENT;
883 if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
884 return -EINVAL;
885
886 a = &q->flow[cl];
887
888 spin_lock_bh(&sch->dev->queue_lock);
889 #if 0
890 a->rate_log = copt->rate_log;
891 #endif
892 #ifdef CSZ_PLUS_TBF
893 a->limit = copt->limit;
894 a->rate = copt->rate;
895 a->buffer = copt->buffer;
896 a->mtu = copt->mtu;
897 #endif
898
899 if (tb[TCA_CSZ_RTAB-1])
900 memcpy(a->L_tab, RTA_DATA(tb[TCA_CSZ_RTAB-1]), 1024);
901
902 spin_unlock_bh(&sch->dev->queue_lock);
903 return 0;
904 }
905 /* NI */
906 return 0;
907 }
908
909 static int csz_delete(struct Qdisc *sch, unsigned long cl)
910 {
911 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
912 struct csz_flow *a;
913
914 cl--;
915
916 if (cl >= CSZ_FLOWS)
917 return -ENOENT;
918 if (cl >= CSZ_GUARANTEED || q->flow[cl].L_tab == NULL)
919 return -EINVAL;
920
921 a = &q->flow[cl];
922
923 spin_lock_bh(&sch->dev->queue_lock);
924 a->fprev->fnext = a->fnext;
925 a->fnext->fprev = a->fprev;
926 a->sprev->snext = a->snext;
927 a->snext->sprev = a->sprev;
928 a->start = a->finish = 0;
929 kfree(xchg(&q->flow[cl].L_tab, NULL));
930 spin_unlock_bh(&sch->dev->queue_lock);
931
932 return 0;
933 }
934
935 #ifdef CONFIG_RTNETLINK
936 static int csz_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm)
937 {
938 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
939 unsigned char *b = skb->tail;
940 struct rtattr *rta;
941 struct tc_csz_copt opt;
942
943 tcm->tcm_handle = sch->handle|cl;
944
945 cl--;
946
947 if (cl > CSZ_FLOWS)
948 goto rtattr_failure;
949
950 if (cl < CSZ_GUARANTEED) {
951 struct csz_flow *f = &q->flow[cl];
952
953 if (f->L_tab == NULL)
954 goto rtattr_failure;
955
956 rta = (struct rtattr*)b;
957 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
958
959 opt.limit = f->limit;
960 opt.rate = f->rate;
961 opt.slice = f->slice;
962 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
963 #ifdef CSZ_PLUS_TBF
964 opt.buffer = f->buffer;
965 opt.mtu = f->mtu;
966 #else
967 opt.buffer = 0;
968 opt.mtu = 0;
969 #endif
970
971 RTA_PUT(skb, TCA_CSZ_PARMS, sizeof(opt), &opt);
972 rta->rta_len = skb->tail - b;
973 }
974
975 return skb->len;
976
977 rtattr_failure:
978 skb_trim(skb, b - skb->data);
979 return -1;
980 }
981 #endif
982
983 static void csz_walk(struct Qdisc *sch, struct qdisc_walker *arg)
984 {
985 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
986 int prio = 0;
987
988 if (arg->stop)
989 return;
990
991 for (prio = 0; prio < CSZ_FLOWS; prio++) {
992 if (arg->count < arg->skip) {
993 arg->count++;
994 continue;
995 }
996 if (prio < CSZ_GUARANTEED && q->flow[prio].L_tab == NULL) {
997 arg->count++;
998 continue;
999 }
1000 if (arg->fn(sch, prio+1, arg) < 0) {
1001 arg->stop = 1;
1002 break;
1003 }
1004 arg->count++;
1005 }
1006 }
1007
1008 static struct tcf_proto ** csz_find_tcf(struct Qdisc *sch, unsigned long cl)
1009 {
1010 struct csz_sched_data *q = (struct csz_sched_data *)sch->data;
1011
1012 if (cl)
1013 return NULL;
1014
1015 return &q->filter_list;
1016 }
1017
1018 struct Qdisc_class_ops csz_class_ops =
1019 {
1020 csz_graft,
1021 csz_leaf,
1022
1023 csz_get,
1024 csz_put,
1025 csz_change,
1026 csz_delete,
1027 csz_walk,
1028
1029 csz_find_tcf,
1030 csz_bind,
1031 csz_put,
1032
1033 #ifdef CONFIG_RTNETLINK
1034 csz_dump_class,
1035 #endif
1036 };
1037
1038 struct Qdisc_ops csz_qdisc_ops =
1039 {
1040 NULL,
1041 &csz_class_ops,
1042 "csz",
1043 sizeof(struct csz_sched_data),
1044
1045 csz_enqueue,
1046 csz_dequeue,
1047 NULL,
1048 NULL,
1049
1050 csz_init,
1051 csz_reset,
1052 csz_destroy,
1053 NULL /* csz_change */,
1054
1055 #ifdef CONFIG_RTNETLINK
1056 csz_dump,
1057 #endif
1058 };
1059
1060
1061 #ifdef MODULE
1062 int init_module(void)
1063 {
1064 return register_qdisc(&csz_qdisc_ops);
1065 }
1066
1067 void cleanup_module(void)
1068 {
1069 unregister_qdisc(&csz_qdisc_ops);
1070 }
1071 #endif
1072
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.