1 /*
2 * include/asm-i386/xor.h
3 *
4 * Optimized RAID-5 checksumming functions for MMX and SSE.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2, or (at your option)
9 * any later version.
10 *
11 * You should have received a copy of the GNU General Public License
12 * (for example /usr/src/linux/COPYING); if not, write to the Free
13 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
14 */
15
16 /*
17 * High-speed RAID5 checksumming functions utilizing MMX instructions.
18 * Copyright (C) 1998 Ingo Molnar.
19 */
20
/*
 * FPU_SAVE / FPU_RESTORE bracket the MMX loops below.
 *
 * Both expect the enclosing function to declare `char fpu_save[108]`.
 * FPU_SAVE executes clts when the current task does not have PF_USEDFPU
 * set, then dumps the FPU/MMX state into fpu_save with fsave.
 * FPU_RESTORE reloads that state with frstor and, under the same
 * PF_USEDFPU condition, calls stts().
 *
 * fsave/frstor access the whole save area, but the asm operand only names
 * fpu_save[0]; the "memory" clobber tells the compiler that more than that
 * single byte is written/read.
 */
#define FPU_SAVE \
	do { \
		if (!(current->flags & PF_USEDFPU)) \
			__asm__ __volatile__ (" clts;\n"); \
		__asm__ __volatile__ ("fsave %0; fwait" \
				      : "=m" (fpu_save[0]) \
				      : \
				      : "memory"); \
	} while (0)

#define FPU_RESTORE \
	do { \
		__asm__ __volatile__ ("frstor %0" \
				      : \
				      : "m" (fpu_save[0]) \
				      : "memory"); \
		if (!(current->flags & PF_USEDFPU)) \
			stts(); \
	} while (0)
34
/*
 * Assembly-text fragments used to build the pII MMX loops below.
 * x is a quadword index (the emitted displacement is 8*x bytes) and
 * y is the MMX register number.
 *
 *   LD(x,y)   load quadword x of asm operand %1 into mm<y>
 *   ST(x,y)   store mm<y> to quadword x of asm operand %1
 *   XO1..XO4  XOR quadword x of asm operand %2..%5 into mm<y>
 */
35 #define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
36 #define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
37 #define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
38 #define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
39 #define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
40 #define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
41
42
43 static void
44 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
45 {
46 unsigned long lines = bytes >> 7;
47 char fpu_save[108];
48
49 FPU_SAVE;
50
51 __asm__ __volatile__ (
52 #undef BLOCK
53 #define BLOCK(i) \
54 LD(i,0) \
55 LD(i+1,1) \
56 LD(i+2,2) \
57 LD(i+3,3) \
58 XO1(i,0) \
59 ST(i,0) \
60 XO1(i+1,1) \
61 ST(i+1,1) \
62 XO1(i+2,2) \
63 ST(i+2,2) \
64 XO1(i+3,3) \
65 ST(i+3,3)
66
67 " .align 32 ;\n"
68 " 1: ;\n"
69
70 BLOCK(0)
71 BLOCK(4)
72 BLOCK(8)
73 BLOCK(12)
74
75 " addl $128, %1 ;\n"
76 " addl $128, %2 ;\n"
77 " decl %0 ;\n"
78 " jnz 1b ;\n"
79 :
80 : "r" (lines),
81 "r" (p1), "r" (p2)
82 : "memory");
83
84 FPU_RESTORE;
85 }
86
87 static void
88 xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
89 unsigned long *p3)
90 {
91 unsigned long lines = bytes >> 7;
92 char fpu_save[108];
93
94 FPU_SAVE;
95
96 __asm__ __volatile__ (
97 #undef BLOCK
98 #define BLOCK(i) \
99 LD(i,0) \
100 LD(i+1,1) \
101 LD(i+2,2) \
102 LD(i+3,3) \
103 XO1(i,0) \
104 XO1(i+1,1) \
105 XO1(i+2,2) \
106 XO1(i+3,3) \
107 XO2(i,0) \
108 ST(i,0) \
109 XO2(i+1,1) \
110 ST(i+1,1) \
111 XO2(i+2,2) \
112 ST(i+2,2) \
113 XO2(i+3,3) \
114 ST(i+3,3)
115
116 " .align 32 ;\n"
117 " 1: ;\n"
118
119 BLOCK(0)
120 BLOCK(4)
121 BLOCK(8)
122 BLOCK(12)
123
124 " addl $128, %1 ;\n"
125 " addl $128, %2 ;\n"
126 " addl $128, %3 ;\n"
127 " decl %0 ;\n"
128 " jnz 1b ;\n"
129 :
130 : "r" (lines),
131 "r" (p1), "r" (p2), "r" (p3)
132 : "memory");
133
134 FPU_RESTORE;
135 }
136
137 static void
138 xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
139 unsigned long *p3, unsigned long *p4)
140 {
141 unsigned long lines = bytes >> 7;
142 char fpu_save[108];
143
144 FPU_SAVE;
145
146 __asm__ __volatile__ (
147 #undef BLOCK
148 #define BLOCK(i) \
149 LD(i,0) \
150 LD(i+1,1) \
151 LD(i+2,2) \
152 LD(i+3,3) \
153 XO1(i,0) \
154 XO1(i+1,1) \
155 XO1(i+2,2) \
156 XO1(i+3,3) \
157 XO2(i,0) \
158 XO2(i+1,1) \
159 XO2(i+2,2) \
160 XO2(i+3,3) \
161 XO3(i,0) \
162 ST(i,0) \
163 XO3(i+1,1) \
164 ST(i+1,1) \
165 XO3(i+2,2) \
166 ST(i+2,2) \
167 XO3(i+3,3) \
168 ST(i+3,3)
169
170 " .align 32 ;\n"
171 " 1: ;\n"
172
173 BLOCK(0)
174 BLOCK(4)
175 BLOCK(8)
176 BLOCK(12)
177
178 " addl $128, %1 ;\n"
179 " addl $128, %2 ;\n"
180 " addl $128, %3 ;\n"
181 " addl $128, %4 ;\n"
182 " decl %0 ;\n"
183 " jnz 1b ;\n"
184 :
185 : "r" (lines),
186 "r" (p1), "r" (p2), "r" (p3), "r" (p4)
187 : "memory");
188
189 FPU_RESTORE;
190 }
191
192 static void
193 xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
194 unsigned long *p3, unsigned long *p4, unsigned long *p5)
195 {
196 unsigned long lines = bytes >> 7;
197 char fpu_save[108];
198
199 FPU_SAVE;
200
201 __asm__ __volatile__ (
202 #undef BLOCK
203 #define BLOCK(i) \
204 LD(i,0) \
205 LD(i+1,1) \
206 LD(i+2,2) \
207 LD(i+3,3) \
208 XO1(i,0) \
209 XO1(i+1,1) \
210 XO1(i+2,2) \
211 XO1(i+3,3) \
212 XO2(i,0) \
213 XO2(i+1,1) \
214 XO2(i+2,2) \
215 XO2(i+3,3) \
216 XO3(i,0) \
217 XO3(i+1,1) \
218 XO3(i+2,2) \
219 XO3(i+3,3) \
220 XO4(i,0) \
221 ST(i,0) \
222 XO4(i+1,1) \
223 ST(i+1,1) \
224 XO4(i+2,2) \
225 ST(i+2,2) \
226 XO4(i+3,3) \
227 ST(i+3,3)
228
229 " .align 32 ;\n"
230 " 1: ;\n"
231
232 BLOCK(0)
233 BLOCK(4)
234 BLOCK(8)
235 BLOCK(12)
236
237 " addl $128, %1 ;\n"
238 " addl $128, %2 ;\n"
239 " addl $128, %3 ;\n"
240 " addl $128, %4 ;\n"
241 " addl $128, %5 ;\n"
242 " decl %0 ;\n"
243 " jnz 1b ;\n"
244 :
245 : "g" (lines),
246 "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
247 : "memory");
248
249 FPU_RESTORE;
250 }
251
252 #undef LD
253 #undef XO1
254 #undef XO2
255 #undef XO3
256 #undef XO4
257 #undef ST
258 #undef BLOCK
259
260 static void
261 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
262 {
263 unsigned long lines = bytes >> 6;
264 char fpu_save[108];
265
266 FPU_SAVE;
267
268 __asm__ __volatile__ (
269 " .align 32 ;\n"
270 " 1: ;\n"
271 " movq (%1), %%mm0 ;\n"
272 " movq 8(%1), %%mm1 ;\n"
273 " pxor (%2), %%mm0 ;\n"
274 " movq 16(%1), %%mm2 ;\n"
275 " movq %%mm0, (%1) ;\n"
276 " pxor 8(%2), %%mm1 ;\n"
277 " movq 24(%1), %%mm3 ;\n"
278 " movq %%mm1, 8(%1) ;\n"
279 " pxor 16(%2), %%mm2 ;\n"
280 " movq 32(%1), %%mm4 ;\n"
281 " movq %%mm2, 16(%1) ;\n"
282 " pxor 24(%2), %%mm3 ;\n"
283 " movq 40(%1), %%mm5 ;\n"
284 " movq %%mm3, 24(%1) ;\n"
285 " pxor 32(%2), %%mm4 ;\n"
286 " movq 48(%1), %%mm6 ;\n"
287 " movq %%mm4, 32(%1) ;\n"
288 " pxor 40(%2), %%mm5 ;\n"
289 " movq 56(%1), %%mm7 ;\n"
290 " movq %%mm5, 40(%1) ;\n"
291 " pxor 48(%2), %%mm6 ;\n"
292 " pxor 56(%2), %%mm7 ;\n"
293 " movq %%mm6, 48(%1) ;\n"
294 " movq %%mm7, 56(%1) ;\n"
295
296 " addl $64, %1 ;\n"
297 " addl $64, %2 ;\n"
298 " decl %0 ;\n"
299 " jnz 1b ;\n"
300 :
301 : "r" (lines),
302 "r" (p1), "r" (p2)
303 : "memory");
304
305 FPU_RESTORE;
306 }
307
308 static void
309 xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
310 unsigned long *p3)
311 {
312 unsigned long lines = bytes >> 6;
313 char fpu_save[108];
314
315 FPU_SAVE;
316
317 __asm__ __volatile__ (
318 " .align 32,0x90 ;\n"
319 " 1: ;\n"
320 " movq (%1), %%mm0 ;\n"
321 " movq 8(%1), %%mm1 ;\n"
322 " pxor (%2), %%mm0 ;\n"
323 " movq 16(%1), %%mm2 ;\n"
324 " pxor 8(%2), %%mm1 ;\n"
325 " pxor (%3), %%mm0 ;\n"
326 " pxor 16(%2), %%mm2 ;\n"
327 " movq %%mm0, (%1) ;\n"
328 " pxor 8(%3), %%mm1 ;\n"
329 " pxor 16(%3), %%mm2 ;\n"
330 " movq 24(%1), %%mm3 ;\n"
331 " movq %%mm1, 8(%1) ;\n"
332 " movq 32(%1), %%mm4 ;\n"
333 " movq 40(%1), %%mm5 ;\n"
334 " pxor 24(%2), %%mm3 ;\n"
335 " movq %%mm2, 16(%1) ;\n"
336 " pxor 32(%2), %%mm4 ;\n"
337 " pxor 24(%3), %%mm3 ;\n"
338 " pxor 40(%2), %%mm5 ;\n"
339 " movq %%mm3, 24(%1) ;\n"
340 " pxor 32(%3), %%mm4 ;\n"
341 " pxor 40(%3), %%mm5 ;\n"
342 " movq 48(%1), %%mm6 ;\n"
343 " movq %%mm4, 32(%1) ;\n"
344 " movq 56(%1), %%mm7 ;\n"
345 " pxor 48(%2), %%mm6 ;\n"
346 " movq %%mm5, 40(%1) ;\n"
347 " pxor 56(%2), %%mm7 ;\n"
348 " pxor 48(%3), %%mm6 ;\n"
349 " pxor 56(%3), %%mm7 ;\n"
350 " movq %%mm6, 48(%1) ;\n"
351 " movq %%mm7, 56(%1) ;\n"
352
353 " addl $64, %1 ;\n"
354 " addl $64, %2 ;\n"
355 " addl $64, %3 ;\n"
356 " decl %0 ;\n"
357 " jnz 1b ;\n"
358 :
359 : "r" (lines),
360 "r" (p1), "r" (p2), "r" (p3)
361 : "memory" );
362
363 FPU_RESTORE;
364 }
365
366 static void
367 xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
368 unsigned long *p3, unsigned long *p4)
369 {
370 unsigned long lines = bytes >> 6;
371 char fpu_save[108];
372
373 FPU_SAVE;
374
375 __asm__ __volatile__ (
376 " .align 32,0x90 ;\n"
377 " 1: ;\n"
378 " movq (%1), %%mm0 ;\n"
379 " movq 8(%1), %%mm1 ;\n"
380 " pxor (%2), %%mm0 ;\n"
381 " movq 16(%1), %%mm2 ;\n"
382 " pxor 8(%2), %%mm1 ;\n"
383 " pxor (%3), %%mm0 ;\n"
384 " pxor 16(%2), %%mm2 ;\n"
385 " pxor 8(%3), %%mm1 ;\n"
386 " pxor (%4), %%mm0 ;\n"
387 " movq 24(%1), %%mm3 ;\n"
388 " pxor 16(%3), %%mm2 ;\n"
389 " pxor 8(%4), %%mm1 ;\n"
390 " movq %%mm0, (%1) ;\n"
391 " movq 32(%1), %%mm4 ;\n"
392 " pxor 24(%2), %%mm3 ;\n"
393 " pxor 16(%4), %%mm2 ;\n"
394 " movq %%mm1, 8(%1) ;\n"
395 " movq 40(%1), %%mm5 ;\n"
396 " pxor 32(%2), %%mm4 ;\n"
397 " pxor 24(%3), %%mm3 ;\n"
398 " movq %%mm2, 16(%1) ;\n"
399 " pxor 40(%2), %%mm5 ;\n"
400 " pxor 32(%3), %%mm4 ;\n"
401 " pxor 24(%4), %%mm3 ;\n"
402 " movq %%mm3, 24(%1) ;\n"
403 " movq 56(%1), %%mm7 ;\n"
404 " movq 48(%1), %%mm6 ;\n"
405 " pxor 40(%3), %%mm5 ;\n"
406 " pxor 32(%4), %%mm4 ;\n"
407 " pxor 48(%2), %%mm6 ;\n"
408 " movq %%mm4, 32(%1) ;\n"
409 " pxor 56(%2), %%mm7 ;\n"
410 " pxor 40(%4), %%mm5 ;\n"
411 " pxor 48(%3), %%mm6 ;\n"
412 " pxor 56(%3), %%mm7 ;\n"
413 " movq %%mm5, 40(%1) ;\n"
414 " pxor 48(%4), %%mm6 ;\n"
415 " pxor 56(%4), %%mm7 ;\n"
416 " movq %%mm6, 48(%1) ;\n"
417 " movq %%mm7, 56(%1) ;\n"
418
419 " addl $64, %1 ;\n"
420 " addl $64, %2 ;\n"
421 " addl $64, %3 ;\n"
422 " addl $64, %4 ;\n"
423 " decl %0 ;\n"
424 " jnz 1b ;\n"
425 :
426 : "r" (lines),
427 "r" (p1), "r" (p2), "r" (p3), "r" (p4)
428 : "memory");
429
430 FPU_RESTORE;
431 }
432
433 static void
434 xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
435 unsigned long *p3, unsigned long *p4, unsigned long *p5)
436 {
437 unsigned long lines = bytes >> 6;
438 char fpu_save[108];
439
440 FPU_SAVE;
441
442 __asm__ __volatile__ (
443 " .align 32,0x90 ;\n"
444 " 1: ;\n"
445 " movq (%1), %%mm0 ;\n"
446 " movq 8(%1), %%mm1 ;\n"
447 " pxor (%2), %%mm0 ;\n"
448 " pxor 8(%2), %%mm1 ;\n"
449 " movq 16(%1), %%mm2 ;\n"
450 " pxor (%3), %%mm0 ;\n"
451 " pxor 8(%3), %%mm1 ;\n"
452 " pxor 16(%2), %%mm2 ;\n"
453 " pxor (%4), %%mm0 ;\n"
454 " pxor 8(%4), %%mm1 ;\n"
455 " pxor 16(%3), %%mm2 ;\n"
456 " movq 24(%1), %%mm3 ;\n"
457 " pxor (%5), %%mm0 ;\n"
458 " pxor 8(%5), %%mm1 ;\n"
459 " movq %%mm0, (%1) ;\n"
460 " pxor 16(%4), %%mm2 ;\n"
461 " pxor 24(%2), %%mm3 ;\n"
462 " movq %%mm1, 8(%1) ;\n"
463 " pxor 16(%5), %%mm2 ;\n"
464 " pxor 24(%3), %%mm3 ;\n"
465 " movq 32(%1), %%mm4 ;\n"
466 " movq %%mm2, 16(%1) ;\n"
467 " pxor 24(%4), %%mm3 ;\n"
468 " pxor 32(%2), %%mm4 ;\n"
469 " movq 40(%1), %%mm5 ;\n"
470 " pxor 24(%5), %%mm3 ;\n"
471 " pxor 32(%3), %%mm4 ;\n"
472 " pxor 40(%2), %%mm5 ;\n"
473 " movq %%mm3, 24(%1) ;\n"
474 " pxor 32(%4), %%mm4 ;\n"
475 " pxor 40(%3), %%mm5 ;\n"
476 " movq 48(%1), %%mm6 ;\n"
477 " movq 56(%1), %%mm7 ;\n"
478 " pxor 32(%5), %%mm4 ;\n"
479 " pxor 40(%4), %%mm5 ;\n"
480 " pxor 48(%2), %%mm6 ;\n"
481 " pxor 56(%2), %%mm7 ;\n"
482 " movq %%mm4, 32(%1) ;\n"
483 " pxor 48(%3), %%mm6 ;\n"
484 " pxor 56(%3), %%mm7 ;\n"
485 " pxor 40(%5), %%mm5 ;\n"
486 " pxor 48(%4), %%mm6 ;\n"
487 " pxor 56(%4), %%mm7 ;\n"
488 " movq %%mm5, 40(%1) ;\n"
489 " pxor 48(%5), %%mm6 ;\n"
490 " pxor 56(%5), %%mm7 ;\n"
491 " movq %%mm6, 48(%1) ;\n"
492 " movq %%mm7, 56(%1) ;\n"
493
494 " addl $64, %1 ;\n"
495 " addl $64, %2 ;\n"
496 " addl $64, %3 ;\n"
497 " addl $64, %4 ;\n"
498 " addl $64, %5 ;\n"
499 " decl %0 ;\n"
500 " jnz 1b ;\n"
501 :
502 : "g" (lines),
503 "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
504 : "memory");
505
506 FPU_RESTORE;
507 }
508
509 static struct xor_block_template xor_block_pII_mmx = {
510 name: "pII_mmx",
511 do_2: xor_pII_mmx_2,
512 do_3: xor_pII_mmx_3,
513 do_4: xor_pII_mmx_4,
514 do_5: xor_pII_mmx_5,
515 };
516
517 static struct xor_block_template xor_block_p5_mmx = {
518 name: "p5_mmx",
519 do_2: xor_p5_mmx_2,
520 do_3: xor_p5_mmx_3,
521 do_4: xor_p5_mmx_4,
522 do_5: xor_p5_mmx_5,
523 };
524
525 #undef FPU_SAVE
526 #undef FPU_RESTORE
527
528 /*
529 * Cache avoiding checksumming functions utilizing KNI instructions
530 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
531 */
532
/*
 * XMMS_SAVE / XMMS_RESTORE bracket the SSE loops below.
 *
 * Both expect the enclosing function to declare `char xmm_save[16*4]`
 * and `int cr0`.
 *
 * XMMS_SAVE copies %cr0 into cr0, executes clts, and stores xmm0-xmm3
 * into xmm_save with (unaligned) movups.  XMMS_RESTORE issues sfence,
 * reloads xmm0-xmm3 from xmm_save and writes the saved value back to
 * %cr0.
 *
 * The cr0 output is written by the first instruction while the
 * xmm_save address is still needed by the following ones, so it must be
 * an earlyclobber ("=&r").  Without the '&' the compiler may place both
 * operands in the same register, and the saves would then go to the
 * address "cr0 value".
 */
#define XMMS_SAVE \
	__asm__ __volatile__ ( \
		"movl %%cr0,%0 ;\n\t" \
		"clts ;\n\t" \
		"movups %%xmm0,(%1) ;\n\t" \
		"movups %%xmm1,0x10(%1) ;\n\t" \
		"movups %%xmm2,0x20(%1) ;\n\t" \
		"movups %%xmm3,0x30(%1) ;\n\t" \
		: "=&r" (cr0) \
		: "r" (xmm_save) \
		: "memory")

#define XMMS_RESTORE \
	__asm__ __volatile__ ( \
		"sfence ;\n\t" \
		"movups (%1),%%xmm0 ;\n\t" \
		"movups 0x10(%1),%%xmm1 ;\n\t" \
		"movups 0x20(%1),%%xmm2 ;\n\t" \
		"movups 0x30(%1),%%xmm3 ;\n\t" \
		"movl %0,%%cr0 ;\n\t" \
		: \
		: "r" (cr0), "r" (xmm_save) \
		: "memory")
556
/*
 * Assembly-text fragments used to build the SSE loops below.
 * x is a 16-byte slot index (OFFS(x) emits the displacement 16*x) and
 * y is the XMM register number.
 *
 *   PF0(x)    prefetcht0 of slot x of asm operand %1
 *   PF1..PF5  prefetchnta of slot x of asm operand %2..%6
 *   LD(x,y)   movaps load of slot x of operand %1 into xmm<y>
 *   ST(x,y)   movaps store of xmm<y> to slot x of operand %1
 *   XO1..XO5  xorps of slot x of operand %2..%6 into xmm<y>
 *
 * PF5 and XO5 refer to operand %6, which none of the functions visible
 * in this file supplies.
 */
557 #define OFFS(x) "16*("#x")"
558 #define PF0(x) " prefetcht0 "OFFS(x)"(%1) ;\n"
559 #define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
560 #define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
561 #define PF1(x) " prefetchnta "OFFS(x)"(%2) ;\n"
562 #define PF2(x) " prefetchnta "OFFS(x)"(%3) ;\n"
563 #define PF3(x) " prefetchnta "OFFS(x)"(%4) ;\n"
564 #define PF4(x) " prefetchnta "OFFS(x)"(%5) ;\n"
565 #define PF5(x) " prefetchnta "OFFS(x)"(%6) ;\n"
566 #define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
567 #define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
568 #define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
569 #define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
570 #define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
571
572
573 static void
574 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
575 {
576 unsigned long lines = bytes >> 8;
577 char xmm_save[16*4];
578 int cr0;
579
580 XMMS_SAVE;
581
582 __asm__ __volatile__ (
583 #undef BLOCK
584 #define BLOCK(i) \
585 LD(i,0) \
586 LD(i+1,1) \
587 PF1(i) \
588 PF1(i+2) \
589 LD(i+2,2) \
590 LD(i+3,3) \
591 PF0(i+4) \
592 PF0(i+6) \
593 XO1(i,0) \
594 XO1(i+1,1) \
595 XO1(i+2,2) \
596 XO1(i+3,3) \
597 ST(i,0) \
598 ST(i+1,1) \
599 ST(i+2,2) \
600 ST(i+3,3) \
601
602
603 PF0(0)
604 PF0(2)
605
606 " .align 32 ;\n"
607 " 1: ;\n"
608
609 BLOCK(0)
610 BLOCK(4)
611 BLOCK(8)
612 BLOCK(12)
613
614 " addl $256, %1 ;\n"
615 " addl $256, %2 ;\n"
616 " decl %0 ;\n"
617 " jnz 1b ;\n"
618 :
619 : "r" (lines),
620 "r" (p1), "r" (p2)
621 : "memory");
622
623 XMMS_RESTORE;
624 }
625
626 static void
627 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
628 unsigned long *p3)
629 {
630 unsigned long lines = bytes >> 8;
631 char xmm_save[16*4];
632 int cr0;
633
634 XMMS_SAVE;
635
636 __asm__ __volatile__ (
637 #undef BLOCK
638 #define BLOCK(i) \
639 PF1(i) \
640 PF1(i+2) \
641 LD(i,0) \
642 LD(i+1,1) \
643 LD(i+2,2) \
644 LD(i+3,3) \
645 PF2(i) \
646 PF2(i+2) \
647 PF0(i+4) \
648 PF0(i+6) \
649 XO1(i,0) \
650 XO1(i+1,1) \
651 XO1(i+2,2) \
652 XO1(i+3,3) \
653 XO2(i,0) \
654 XO2(i+1,1) \
655 XO2(i+2,2) \
656 XO2(i+3,3) \
657 ST(i,0) \
658 ST(i+1,1) \
659 ST(i+2,2) \
660 ST(i+3,3) \
661
662
663 PF0(0)
664 PF0(2)
665
666 " .align 32 ;\n"
667 " 1: ;\n"
668
669 BLOCK(0)
670 BLOCK(4)
671 BLOCK(8)
672 BLOCK(12)
673
674 " addl $256, %1 ;\n"
675 " addl $256, %2 ;\n"
676 " addl $256, %3 ;\n"
677 " decl %0 ;\n"
678 " jnz 1b ;\n"
679 :
680 : "r" (lines),
681 "r" (p1), "r"(p2), "r"(p3)
682 : "memory" );
683
684 XMMS_RESTORE;
685 }
686
687 static void
688 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
689 unsigned long *p3, unsigned long *p4)
690 {
691 unsigned long lines = bytes >> 8;
692 char xmm_save[16*4];
693 int cr0;
694
695 XMMS_SAVE;
696
697 __asm__ __volatile__ (
698 #undef BLOCK
699 #define BLOCK(i) \
700 PF1(i) \
701 PF1(i+2) \
702 LD(i,0) \
703 LD(i+1,1) \
704 LD(i+2,2) \
705 LD(i+3,3) \
706 PF2(i) \
707 PF2(i+2) \
708 XO1(i,0) \
709 XO1(i+1,1) \
710 XO1(i+2,2) \
711 XO1(i+3,3) \
712 PF3(i) \
713 PF3(i+2) \
714 PF0(i+4) \
715 PF0(i+6) \
716 XO2(i,0) \
717 XO2(i+1,1) \
718 XO2(i+2,2) \
719 XO2(i+3,3) \
720 XO3(i,0) \
721 XO3(i+1,1) \
722 XO3(i+2,2) \
723 XO3(i+3,3) \
724 ST(i,0) \
725 ST(i+1,1) \
726 ST(i+2,2) \
727 ST(i+3,3) \
728
729
730 PF0(0)
731 PF0(2)
732
733 " .align 32 ;\n"
734 " 1: ;\n"
735
736 BLOCK(0)
737 BLOCK(4)
738 BLOCK(8)
739 BLOCK(12)
740
741 " addl $256, %1 ;\n"
742 " addl $256, %2 ;\n"
743 " addl $256, %3 ;\n"
744 " addl $256, %4 ;\n"
745 " decl %0 ;\n"
746 " jnz 1b ;\n"
747 :
748 : "r" (lines),
749 "r" (p1), "r" (p2), "r" (p3), "r" (p4)
750 : "memory" );
751
752 XMMS_RESTORE;
753 }
754
755 static void
756 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
757 unsigned long *p3, unsigned long *p4, unsigned long *p5)
758 {
759 unsigned long lines = bytes >> 8;
760 char xmm_save[16*4];
761 int cr0;
762
763 XMMS_SAVE;
764
765 __asm__ __volatile__ (
766 #undef BLOCK
767 #define BLOCK(i) \
768 PF1(i) \
769 PF1(i+2) \
770 LD(i,0) \
771 LD(i+1,1) \
772 LD(i+2,2) \
773 LD(i+3,3) \
774 PF2(i) \
775 PF2(i+2) \
776 XO1(i,0) \
777 XO1(i+1,1) \
778 XO1(i+2,2) \
779 XO1(i+3,3) \
780 PF3(i) \
781 PF3(i+2) \
782 XO2(i,0) \
783 XO2(i+1,1) \
784 XO2(i+2,2) \
785 XO2(i+3,3) \
786 PF4(i) \
787 PF4(i+2) \
788 PF0(i+4) \
789 PF0(i+6) \
790 XO3(i,0) \
791 XO3(i+1,1) \
792 XO3(i+2,2) \
793 XO3(i+3,3) \
794 XO4(i,0) \
795 XO4(i+1,1) \
796 XO4(i+2,2) \
797 XO4(i+3,3) \
798 ST(i,0) \
799 ST(i+1,1) \
800 ST(i+2,2) \
801 ST(i+3,3) \
802
803
804 PF0(0)
805 PF0(2)
806
807 " .align 32 ;\n"
808 " 1: ;\n"
809
810 BLOCK(0)
811 BLOCK(4)
812 BLOCK(8)
813 BLOCK(12)
814
815 " addl $256, %1 ;\n"
816 " addl $256, %2 ;\n"
817 " addl $256, %3 ;\n"
818 " addl $256, %4 ;\n"
819 " addl $256, %5 ;\n"
820 " decl %0 ;\n"
821 " jnz 1b ;\n"
822 :
823 : "r" (lines),
824 "r" (p1), "r" (p2), "r" (p3), "r" (p4), "r" (p5)
825 : "memory");
826
827 XMMS_RESTORE;
828 }
829
830 static struct xor_block_template xor_block_pIII_sse = {
831 name: "pIII_sse",
832 do_2: xor_sse_2,
833 do_3: xor_sse_3,
834 do_4: xor_sse_4,
835 do_5: xor_sse_5,
836 };
837
838 /* Also try the generic routines. */
839 #include <asm-generic/xor.h>
840
/*
 * Replace any earlier XOR_TRY_TEMPLATES definition with one that passes
 * every candidate template to xor_speed(): the two generic ones
 * (8regs, 32regs) always, the SSE template only when cpu_has_xmm is
 * true, and both MMX templates only when md_cpu_has_mmx() is true.
 * NOTE(review): xor_speed() is defined elsewhere; presumably it
 * benchmarks the template - confirm against its definition.
 */
841 #undef XOR_TRY_TEMPLATES
842 #define XOR_TRY_TEMPLATES \
843 do { \
844 xor_speed(&xor_block_8regs); \
845 xor_speed(&xor_block_32regs); \
846 if (cpu_has_xmm) \
847 xor_speed(&xor_block_pIII_sse); \
848 if (md_cpu_has_mmx()) { \
849 xor_speed(&xor_block_pII_mmx); \
850 xor_speed(&xor_block_p5_mmx); \
851 } \
852 } while (0)
853
854 /* We force the use of the SSE xor block because it can write around L2.
855 We may also be able to load into the L1 only depending on how the cpu
856 deals with a load to a line that is being prefetched. */
/* Evaluates to the SSE template when cpu_has_xmm is true, else to FASTEST. */
857 #define XOR_SELECT_TEMPLATE(FASTEST) \
858 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
859
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more
information.