1 /*
2 * net/sched/sch_tbf.c Token Bucket Filter queue.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13 #include <linux/config.h>
14 #include <linux/module.h>
15 #include <asm/uaccess.h>
16 #include <asm/system.h>
17 #include <asm/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/sched.h>
21 #include <linux/string.h>
22 #include <linux/mm.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/in.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/if_ether.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/etherdevice.h>
32 #include <linux/notifier.h>
33 #include <net/ip.h>
34 #include <net/route.h>
35 #include <linux/skbuff.h>
36 #include <net/sock.h>
37 #include <net/pkt_sched.h>
38
39
40 /* Simple Token Bucket Filter.
41 =======================================
42
43 SOURCE.
44 -------
45
46 None.
47
48 Description.
49 ------------
50
51 A data flow obeys TBF with rate R and depth B, if for any
52 time interval t_i...t_f the number of transmitted bits
53 does not exceed B + R*(t_f-t_i).
54
55 Packetized version of this definition:
56 The sequence of packets of sizes s_i served at moments t_i
57 obeys TBF, if for any i<=k:
58
59 s_i+....+s_k <= B + R*(t_k - t_i)
60
61 Algorithm.
62 ----------
63
64 Let N(t_i) be B/R initially and N(t) grow continuously with time as:
65
66 N(t+delta) = min{B/R, N(t) + delta}
67
68 If the first packet in queue has length S, it may be
69 transmited only at the time t_* when S/R <= N(t_*),
70 and in this case N(t) jumps:
71
72 N(t_* + 0) = N(t_* - 0) - S/R.
73
74
75
76 Actually, QoS requires two TBF to be applied to a data stream.
77 One of them controls steady state burst size, another
78 one with rate P (peak rate) and depth M (equal to link MTU)
79 limits bursts at a smaller time scale.
80
81 It is easy to see that P>R, and B>M. If P is infinity, this double
82 TBF is equivalent to a single one.
83
84 When TBF works in reshaping mode, latency is estimated as:
85
86 lat = max ((L-B)/R, (L-M)/P)
87
88
89 NOTES.
90 ------
91
92 If TBF throttles, it starts a watchdog timer, which will wake it up
93 when it is ready to transmit.
94 Note that the minimal timer resolution is 1/HZ.
95 If no new packets arrive during this period,
96 or if the device is not awaken by EOI for some previous packet,
97 TBF can stop its activity for 1/HZ.
98
99
100 This means, that with depth B, the maximal rate is
101
102 R_crit = B*HZ
103
104 F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
105
106 Note that the peak rate TBF is much more tough: with MTU 1500
107 P_crit = 150Kbytes/sec. So, if you need greater peak
108 rates, use alpha with HZ=1000 :-)
109 */
110
111 struct tbf_sched_data
112 {
113 /* Parameters */
114 u32 limit; /* Maximal length of backlog: bytes */
115 u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
116 u32 mtu;
117 u32 max_size;
118 struct qdisc_rate_table *R_tab;
119 struct qdisc_rate_table *P_tab;
120
121 /* Variables */
122 long tokens; /* Current number of B tokens */
123 long ptokens; /* Current number of P tokens */
124 psched_time_t t_c; /* Time check-point */
125 struct timer_list wd_timer; /* Watchdog timer */
126 };
127
128 #define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
129 #define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])
130
131 static int
132 tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
133 {
134 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
135
136 if (skb->len > q->max_size)
137 goto drop;
138 __skb_queue_tail(&sch->q, skb);
139 if ((sch->stats.backlog += skb->len) <= q->limit) {
140 sch->stats.bytes += skb->len;
141 sch->stats.packets++;
142 return 0;
143 }
144
145 /* Drop action: undo the things that we just did,
146 * i.e. make tail drop
147 */
148
149 __skb_unlink(skb, &sch->q);
150 sch->stats.backlog -= skb->len;
151
152 drop:
153 sch->stats.drops++;
154 #ifdef CONFIG_NET_CLS_POLICE
155 if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch))
156 #endif
157 kfree_skb(skb);
158 return NET_XMIT_DROP;
159 }
160
161 static int
162 tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
163 {
164 __skb_queue_head(&sch->q, skb);
165 sch->stats.backlog += skb->len;
166 return 0;
167 }
168
169 static int
170 tbf_drop(struct Qdisc* sch)
171 {
172 struct sk_buff *skb;
173
174 skb = __skb_dequeue_tail(&sch->q);
175 if (skb) {
176 sch->stats.backlog -= skb->len;
177 sch->stats.drops++;
178 kfree_skb(skb);
179 return 1;
180 }
181 return 0;
182 }
183
184 static void tbf_watchdog(unsigned long arg)
185 {
186 struct Qdisc *sch = (struct Qdisc*)arg;
187
188 sch->flags &= ~TCQ_F_THROTTLED;
189 netif_schedule(sch->dev);
190 }
191
192 static struct sk_buff *
193 tbf_dequeue(struct Qdisc* sch)
194 {
195 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
196 struct sk_buff *skb;
197
198 skb = __skb_dequeue(&sch->q);
199
200 if (skb) {
201 psched_time_t now;
202 long toks;
203 long ptoks = 0;
204
205 PSCHED_GET_TIME(now);
206
207 toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer, 0);
208
209 if (q->P_tab) {
210 ptoks = toks + q->ptokens;
211 if (ptoks > (long)q->mtu)
212 ptoks = q->mtu;
213 ptoks -= L2T_P(q, skb->len);
214 }
215 toks += q->tokens;
216 if (toks > (long)q->buffer)
217 toks = q->buffer;
218 toks -= L2T(q, skb->len);
219
220 if ((toks|ptoks) >= 0) {
221 q->t_c = now;
222 q->tokens = toks;
223 q->ptokens = ptoks;
224 sch->stats.backlog -= skb->len;
225 sch->flags &= ~TCQ_F_THROTTLED;
226 return skb;
227 }
228
229 if (!netif_queue_stopped(sch->dev)) {
230 long delay = PSCHED_US2JIFFIE(max(-toks, -ptoks));
231
232 if (delay == 0)
233 delay = 1;
234
235 mod_timer(&q->wd_timer, jiffies+delay);
236 }
237
238 /* Maybe we have a shorter packet in the queue,
239 which can be sent now. It sounds cool,
240 but, however, this is wrong in principle.
241 We MUST NOT reorder packets under these circumstances.
242
243 Really, if we split the flow into independent
244 subflows, it would be a very good solution.
245 This is the main idea of all FQ algorithms
246 (cf. CSZ, HPFQ, HFSC)
247 */
248 __skb_queue_head(&sch->q, skb);
249
250 sch->flags |= TCQ_F_THROTTLED;
251 sch->stats.overlimits++;
252 }
253 return NULL;
254 }
255
256
257 static void
258 tbf_reset(struct Qdisc* sch)
259 {
260 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
261
262 skb_queue_purge(&sch->q);
263 sch->stats.backlog = 0;
264 PSCHED_GET_TIME(q->t_c);
265 q->tokens = q->buffer;
266 q->ptokens = q->mtu;
267 sch->flags &= ~TCQ_F_THROTTLED;
268 del_timer(&q->wd_timer);
269 }
270
271 static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
272 {
273 int err = -EINVAL;
274 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
275 struct rtattr *tb[TCA_TBF_PTAB];
276 struct tc_tbf_qopt *qopt;
277 struct qdisc_rate_table *rtab = NULL;
278 struct qdisc_rate_table *ptab = NULL;
279 int max_size;
280
281 if (rtattr_parse(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)) ||
282 tb[TCA_TBF_PARMS-1] == NULL ||
283 RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
284 goto done;
285
286 qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
287 rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
288 if (rtab == NULL)
289 goto done;
290
291 if (qopt->peakrate.rate) {
292 if (qopt->peakrate.rate > qopt->rate.rate)
293 ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
294 if (ptab == NULL)
295 goto done;
296 }
297
298 max_size = psched_mtu(sch->dev);
299 if (ptab) {
300 int n = max_size>>qopt->peakrate.cell_log;
301 while (n>0 && ptab->data[n-1] > qopt->mtu) {
302 max_size -= (1<<qopt->peakrate.cell_log);
303 n--;
304 }
305 }
306 if (rtab->data[max_size>>qopt->rate.cell_log] > qopt->buffer)
307 goto done;
308
309 sch_tree_lock(sch);
310 q->limit = qopt->limit;
311 q->mtu = qopt->mtu;
312 q->max_size = max_size;
313 q->buffer = qopt->buffer;
314 q->tokens = q->buffer;
315 q->ptokens = q->mtu;
316 rtab = xchg(&q->R_tab, rtab);
317 ptab = xchg(&q->P_tab, ptab);
318 sch_tree_unlock(sch);
319 err = 0;
320 done:
321 if (rtab)
322 qdisc_put_rtab(rtab);
323 if (ptab)
324 qdisc_put_rtab(ptab);
325 return err;
326 }
327
328 static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
329 {
330 int err;
331 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
332
333 if (opt == NULL)
334 return -EINVAL;
335
336 MOD_INC_USE_COUNT;
337
338 PSCHED_GET_TIME(q->t_c);
339 init_timer(&q->wd_timer);
340 q->wd_timer.function = tbf_watchdog;
341 q->wd_timer.data = (unsigned long)sch;
342
343 if ((err = tbf_change(sch, opt)) != 0) {
344 MOD_DEC_USE_COUNT;
345 }
346 return err;
347 }
348
349 static void tbf_destroy(struct Qdisc *sch)
350 {
351 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
352
353 del_timer(&q->wd_timer);
354
355 if (q->P_tab)
356 qdisc_put_rtab(q->P_tab);
357 if (q->R_tab)
358 qdisc_put_rtab(q->R_tab);
359
360 MOD_DEC_USE_COUNT;
361 }
362
363 #ifdef CONFIG_RTNETLINK
364 static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
365 {
366 struct tbf_sched_data *q = (struct tbf_sched_data *)sch->data;
367 unsigned char *b = skb->tail;
368 struct rtattr *rta;
369 struct tc_tbf_qopt opt;
370
371 rta = (struct rtattr*)b;
372 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
373
374 opt.limit = q->limit;
375 opt.rate = q->R_tab->rate;
376 if (q->P_tab)
377 opt.peakrate = q->P_tab->rate;
378 else
379 memset(&opt.peakrate, 0, sizeof(opt.peakrate));
380 opt.mtu = q->mtu;
381 opt.buffer = q->buffer;
382 RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
383 rta->rta_len = skb->tail - b;
384
385 return skb->len;
386
387 rtattr_failure:
388 skb_trim(skb, b - skb->data);
389 return -1;
390 }
391 #endif
392
393 struct Qdisc_ops tbf_qdisc_ops =
394 {
395 NULL,
396 NULL,
397 "tbf",
398 sizeof(struct tbf_sched_data),
399
400 tbf_enqueue,
401 tbf_dequeue,
402 tbf_requeue,
403 tbf_drop,
404
405 tbf_init,
406 tbf_reset,
407 tbf_destroy,
408 tbf_change,
409
410 #ifdef CONFIG_RTNETLINK
411 tbf_dump,
412 #endif
413 };
414
415
416 #ifdef MODULE
417 int init_module(void)
418 {
419 return register_qdisc(&tbf_qdisc_ops);
420 }
421
422 void cleanup_module(void)
423 {
424 unregister_qdisc(&tbf_qdisc_ops);
425 }
426 #endif
427
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.