rt.compiler.gdc.rt.arrayfloat source code

1 /***************************
2  * D programming language http://www.digitalmars.com/d/
3  * Runtime support for float array operations.
4  * Based on code originally written by Burton Radons.
5  * Placed in public domain.
6  */
7 
8 module rt.compiler.gdc.rt.arrayfloat;
9 
10 import CPUid = rt.compiler.util.cpuid;
11 
12 debug(UnitTest)
13 {
14     private extern(C) int printf(char*,...);
15     /* This is so unit tests will test every CPU variant
16      */
17     int cpuid;
18     const int CPUID_MAX = 5;
19     bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
20     bool sse()      { return cpuid == 2 && CPUid.sse(); }
21     bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
22     bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
23 }
24 else
25 {
26     alias CPUid.mmx mmx;
27     alias CPUid.sse sse;
28     alias CPUid.sse2 sse2;
29     alias CPUid.amd3dnow amd3dnow;
30 }
31 
32 //version = log;
33 
34 bool disjoint(T)(T[] a, T[] b)
35 {
36     return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
37 }
38 
39 alias float T;
40 
41 extern (C):
42 
43 /* ======================================================================== */
44 /* ======================================================================== */
45 
46 /* template for the case
47  *   a[] = b[] ? c[]
48  * with some binary operator ?
49  */
50 private template CodeGenSliceSliceOp(char[] opD, char[] opSSE, char[] op3DNow)
51 {
52     const char[] CodeGenSliceSliceOp = `
53     auto aptr = a.ptr;
54     auto aend = aptr + a.length;
55     auto bptr = b.ptr;
56     auto cptr = c.ptr;
57 
58     version (D_InlineAsm_X86)
59     {
60         // SSE version is 834% faster
61         if (sse() && b.length >= 16)
62         {
63             auto n = aptr + (b.length & ~15);
64 
65             // Unaligned case
66             asm
67             {
68                 mov EAX, bptr; // left operand
69                 mov ECX, cptr; // right operand
70                 mov ESI, aptr; // destination operand
71                 mov EDI, n;    // end comparison
72 
73                 align 8;
74             startsseloopb:
75                 movups XMM0, [EAX];
76                 movups XMM1, [EAX+16];
77                 movups XMM2, [EAX+32];
78                 movups XMM3, [EAX+48];
79                 add EAX, 64;
80                 movups XMM4, [ECX];
81                 movups XMM5, [ECX+16];
82                 movups XMM6, [ECX+32];
83                 movups XMM7, [ECX+48];
84                 add ESI, 64;
85                 ` ~ opSSE ~ ` XMM0, XMM4;
86                 ` ~ opSSE ~ ` XMM1, XMM5;
87                 ` ~ opSSE ~ ` XMM2, XMM6;
88                 ` ~ opSSE ~ ` XMM3, XMM7;
89                 add ECX, 64;
90                 movups [ESI+ 0-64], XMM0;
91                 movups [ESI+16-64], XMM1;
92                 movups [ESI+32-64], XMM2;
93                 movups [ESI+48-64], XMM3;
94                 cmp ESI, EDI;
95                 jb startsseloopb;
96 
97                 mov aptr, ESI;
98                 mov bptr, EAX;
99                 mov cptr, ECX;
100             }
101         }
102         else
103         // 3DNow! version is only 13% faster
104         if (amd3dnow() && b.length >= 8)
105         {
106             auto n = aptr + (b.length & ~7);
107 
108             asm
109             {
110                 mov ESI, aptr; // destination operand
111                 mov EDI, n;    // end comparison
112                 mov EAX, bptr; // left operand
113                 mov ECX, cptr; // right operand
114 
115                 align 4;
116             start3dnow:
117                 movq MM0, [EAX];
118                 movq MM1, [EAX+8];
119                 movq MM2, [EAX+16];
120                 movq MM3, [EAX+24];
121                 ` ~ op3DNow ~ ` MM0, [ECX];
122                 ` ~ op3DNow ~ ` MM1, [ECX+8];
123                 ` ~ op3DNow ~ ` MM2, [ECX+16];
124                 ` ~ op3DNow ~ ` MM3, [ECX+24];
125                 movq [ESI], MM0;
126                 movq [ESI+8], MM1;
127                 movq [ESI+16], MM2;
128                 movq [ESI+24], MM3;
129                 add ECX, 32;
130                 add ESI, 32;
131                 add EAX, 32;
132                 cmp ESI, EDI;
133                 jb start3dnow;
134 
135                 emms;
136                 mov aptr, ESI;
137                 mov bptr, EAX;
138                 mov cptr, ECX;
139             }
140         }
141     }
142 
143     // Handle remainder
144     while (aptr < aend)
145         *aptr++ = *bptr++ ` ~ opD ~ ` *cptr++;
146 
147     return a;`;
148 }
149 
150 /* ======================================================================== */
151 
152 /***********************
153  * Computes:
154  *      a[] = b[] + c[]
155  */
156 
157 T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
158 in
159 {
160         assert(a.length == b.length && b.length == c.length);
161         assert(disjoint(a, b));
162         assert(disjoint(a, c));
163         assert(disjoint(b, c));
164 }
165 body
166 {
167     mixin(CodeGenSliceSliceOp!("+", "addps", "pfadd"));
168 }
169 
170 
171 unittest
172 {
173     printf("_arraySliceSliceAddSliceAssign_f unittest\n");
174     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
175     {
176         version (log) printf("    cpuid %d\n", cpuid);
177 
178         for (int j = 0; j < 2; j++)
179         {
180             const int dim = 67;
181             T[] a = new T[dim + j];     // aligned on 16 byte boundary
182             a = a[j .. dim + j];        // misalign for second iteration
183             T[] b = new T[dim + j];
184             b = b[j .. dim + j];
185             T[] c = new T[dim + j];
186             c = c[j .. dim + j];
187 
188             for (int i = 0; i < dim; i++)
189             {   a[i] = cast(T)i;
190                 b[i] = cast(T)(i + 7);
191                 c[i] = cast(T)(i * 2);
192             }
193 
194             c[] = a[] + b[];
195 
196             for (int i = 0; i < dim; i++)
197             {
198                 if (c[i] != cast(T)(a[i] + b[i]))
199                 {
200                     printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
201                     assert(0);
202                 }
203             }
204         }
205     }
206 }
207 
208 /* ======================================================================== */
209 
210 /***********************
211  * Computes:
212  *      a[] = b[] - c[]
213  */
214 
215 T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
216 in
217 {
218         assert(a.length == b.length && b.length == c.length);
219         assert(disjoint(a, b));
220         assert(disjoint(a, c));
221         assert(disjoint(b, c));
222 }
223 body
224 {
225     mixin(CodeGenSliceSliceOp!("-", "subps", "pfsub"));
226 }
227 
228 
229 unittest
230 {
231     printf("_arraySliceSliceMinSliceAssign_f unittest\n");
232     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
233     {
234         version (log) printf("    cpuid %d\n", cpuid);
235 
236         for (int j = 0; j < 2; j++)
237         {
238             const int dim = 67;
239             T[] a = new T[dim + j];     // aligned on 16 byte boundary
240             a = a[j .. dim + j];        // misalign for second iteration
241             T[] b = new T[dim + j];
242             b = b[j .. dim + j];
243             T[] c = new T[dim + j];
244             c = c[j .. dim + j];
245 
246             for (int i = 0; i < dim; i++)
247             {   a[i] = cast(T)i;
248                 b[i] = cast(T)(i + 7);
249                 c[i] = cast(T)(i * 2);
250             }
251 
252             c[] = a[] - b[];
253 
254             for (int i = 0; i < dim; i++)
255             {
256                 if (c[i] != cast(T)(a[i] - b[i]))
257                 {
258                     printf("[%d]: %g != %gd - %g\n", i, c[i], a[i], b[i]);
259                     assert(0);
260                 }
261             }
262         }
263     }
264 }
265 
266 /* ======================================================================== */
267 
268 /***********************
269  * Computes:
270  *      a[] = b[] * c[]
271  */
272 
273 T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
274 in
275 {
276         assert(a.length == b.length && b.length == c.length);
277         assert(disjoint(a, b));
278         assert(disjoint(a, c));
279         assert(disjoint(b, c));
280 }
281 body
282 {
283     mixin(CodeGenSliceSliceOp!("*", "mulps", "pfmul"));
284 }
285 
286 unittest
287 {
288     printf("_arraySliceSliceMulSliceAssign_f unittest\n");
289     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
290     {
291         version (log) printf("    cpuid %d\n", cpuid);
292 
293         for (int j = 0; j < 2; j++)
294         {
295             const int dim = 67;
296             T[] a = new T[dim + j];     // aligned on 16 byte boundary
297             a = a[j .. dim + j];        // misalign for second iteration
298             T[] b = new T[dim + j];
299             b = b[j .. dim + j];
300             T[] c = new T[dim + j];
301             c = c[j .. dim + j];
302 
303             for (int i = 0; i < dim; i++)
304             {   a[i] = cast(T)i;
305                 b[i] = cast(T)(i + 7);
306                 c[i] = cast(T)(i * 2);
307             }
308 
309             c[] = a[] * b[];
310 
311             for (int i = 0; i < dim; i++)
312             {
313                 if (c[i] != cast(T)(a[i] * b[i]))
314                 {
315                     printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
316                     assert(0);
317                 }
318             }
319         }
320     }
321 }
322 
323 /* ======================================================================== */
324 /* ======================================================================== */
325 
326 /* template for the case
327  *   a[] ?= value
328  * with some binary operator ?
329  */
330 private template CodeGenExpSliceOpAssign(char[] opD, char[] opSSE, char[] op3DNow)
331 {
332     const char[] CodeGenExpSliceOpAssign = `
333     auto aptr = a.ptr;
334     auto aend = aptr + a.length;
335 
336     version (D_InlineAsm_X86)
337     {
338         if (sse() && a.length >= 16)
339         {
340             auto aabeg = cast(T*)((cast(uint)aptr + 15) & ~15); // beginning of paragraph-aligned slice of a
341             auto aaend = cast(T*)((cast(uint)aend) & ~15);      // end of paragraph-aligned slice of a
342 
343             int numAligned = cast(int)(aaend - aabeg);          // how many floats are in the aligned slice?
344 
345             // are there at least 16 floats in the paragraph-aligned slice?
346             // otherwise we can't do anything with SSE.
347             if (numAligned >= 16)
348             {
349                 aaend = aabeg + (numAligned & ~15);     // make sure the slice is actually a multiple of 16 floats long
350 
351                 // process values up to aligned slice one by one
352                 while (aptr < aabeg)
353                     *aptr++ ` ~ opD ~ ` value;
354 
355                 // process aligned slice with fast SSE operations
356                 asm
357                 {
358                     mov ESI, aabeg;
359                     mov EDI, aaend;
360                     movss XMM4, value;
361                     shufps XMM4, XMM4, 0;
362 
363                     align 8;
364                 startsseloopa:
365                     movaps XMM0, [ESI];
366                     movaps XMM1, [ESI+16];
367                     movaps XMM2, [ESI+32];
368                     movaps XMM3, [ESI+48];
369                     add ESI, 64;
370                     ` ~ opSSE ~ ` XMM0, XMM4;
371                     ` ~ opSSE ~ ` XMM1, XMM4;
372                     ` ~ opSSE ~ ` XMM2, XMM4;
373                     ` ~ opSSE ~ ` XMM3, XMM4;
374                     movaps [ESI+ 0-64], XMM0;
375                     movaps [ESI+16-64], XMM1;
376                     movaps [ESI+32-64], XMM2;
377                     movaps [ESI+48-64], XMM3;
378                     cmp ESI, EDI;
379                     jb startsseloopa;
380                 }
381                 aptr = aaend;
382             }
383         }
384         else
385         // 3DNow! version is 63% faster
386         if (amd3dnow() && a.length >= 8)
387         {
388             auto n = aptr + (a.length & ~7);
389 
390             ulong w = *cast(uint *) &value;
391             ulong v = w | (w << 32L);
392 
393             asm
394             {
395                 mov ESI, dword ptr [aptr];
396                 mov EDI, dword ptr [n];
397                 movq MM4, qword ptr [v];
398 
399                 align 8;
400             start:
401                 movq MM0, [ESI];
402                 movq MM1, [ESI+8];
403                 movq MM2, [ESI+16];
404                 movq MM3, [ESI+24];
405                 ` ~ op3DNow ~ ` MM0, MM4;
406                 ` ~ op3DNow ~ ` MM1, MM4;
407                 ` ~ op3DNow ~ ` MM2, MM4;
408                 ` ~ op3DNow ~ ` MM3, MM4;
409                 movq [ESI], MM0;
410                 movq [ESI+8], MM1;
411                 movq [ESI+16], MM2;
412                 movq [ESI+24], MM3;
413                 add ESI, 32;
414                 cmp ESI, EDI;
415                 jb start;
416 
417                 emms;
418                 mov dword ptr [aptr], ESI;
419             }
420         }
421     }
422 
423     while (aptr < aend)
424         *aptr++ ` ~ opD ~ ` value;
425 
426     return a;`;
427 }
428 
429 /* ======================================================================== */
430 
431 /***********************
432  * Computes:
433  *      a[] += value
434  */
435 
436 T[] _arrayExpSliceAddass_f(T[] a, T value)
437 {
438     mixin(CodeGenExpSliceOpAssign!("+=", "addps", "pfadd"));
439 }
440 
441 unittest
442 {
443     printf("_arrayExpSliceAddass_f unittest\n");
444     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
445     {
446         version (log) printf("    cpuid %d\n", cpuid);
447 
448         for (int j = 0; j < 2; j++)
449         {
450             const int dim = 67;
451             T[] a = new T[dim + j];     // aligned on 16 byte boundary
452             a = a[j .. dim + j];        // misalign for second iteration
453             T[] b = new T[dim + j];
454             b = b[j .. dim + j];
455             T[] c = new T[dim + j];
456             c = c[j .. dim + j];
457 
458             for (int i = 0; i < dim; i++)
459             {   a[i] = cast(T)i;
460                 b[i] = cast(T)(i + 7);
461                 c[i] = cast(T)(i * 2);
462             }
463 
464             a[] = c[];
465             c[] += 6;
466 
467             for (int i = 0; i < dim; i++)
468             {
469                 if (c[i] != cast(T)(a[i] + 6))
470                 {
471                     printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
472                     assert(0);
473                 }
474             }
475         }
476     }
477 }
478 
479 /* ======================================================================== */
480 
481 /***********************
482  * Computes:
483  *      a[] -= value
484  */
485 
486 T[] _arrayExpSliceMinass_f(T[] a, T value)
487 {
488     mixin(CodeGenExpSliceOpAssign!("-=", "subps", "pfsub"));
489 }
490 
491 unittest
492 {
493     printf("_arrayExpSliceminass_f unittest\n");
494     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
495     {
496         version (log) printf("    cpuid %d\n", cpuid);
497 
498         for (int j = 0; j < 2; j++)
499         {
500             const int dim = 67;
501             T[] a = new T[dim + j];     // aligned on 16 byte boundary
502             a = a[j .. dim + j];        // misalign for second iteration
503             T[] b = new T[dim + j];
504             b = b[j .. dim + j];
505             T[] c = new T[dim + j];
506             c = c[j .. dim + j];
507 
508             for (int i = 0; i < dim; i++)
509             {   a[i] = cast(T)i;
510                 b[i] = cast(T)(i + 7);
511                 c[i] = cast(T)(i * 2);
512             }
513 
514             a[] = c[];
515             c[] -= 6;
516 
517             for (int i = 0; i < dim; i++)
518             {
519                 if (c[i] != cast(T)(a[i] - 6))
520                 {
521                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
522                     assert(0);
523                 }
524             }
525         }
526     }
527 }
528 
529 /* ======================================================================== */
530 
531 /***********************
532  * Computes:
533  *      a[] *= value
534  */
535 
536 T[] _arrayExpSliceMulass_f(T[] a, T value)
537 {
538     mixin(CodeGenExpSliceOpAssign!("*=", "mulps", "pfmul"));
539 }
540 
541 unittest
542 {
543     printf("_arrayExpSliceMulass_f unittest\n");
544     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
545     {
546         version (log) printf("    cpuid %d\n", cpuid);
547 
548         for (int j = 0; j < 2; j++)
549         {
550             const int dim = 67;
551             T[] a = new T[dim + j];     // aligned on 16 byte boundary
552             a = a[j .. dim + j];        // misalign for second iteration
553             T[] b = new T[dim + j];
554             b = b[j .. dim + j];
555             T[] c = new T[dim + j];
556             c = c[j .. dim + j];
557 
558             for (int i = 0; i < dim; i++)
559             {   a[i] = cast(T)i;
560                 b[i] = cast(T)(i + 7);
561                 c[i] = cast(T)(i * 2);
562             }
563 
564             a[] = c[];
565             c[] *= 6;
566 
567             for (int i = 0; i < dim; i++)
568             {
569                 if (c[i] != cast(T)(a[i] * 6))
570                 {
571                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
572                     assert(0);
573                 }
574             }
575         }
576     }
577 }
578 
579 /* ======================================================================== */
580 
581 /***********************
582  * Computes:
583  *      a[] /= value
584  */
585 
586 T[] _arrayExpSliceDivass_f(T[] a, T value)
587 {
588     return _arrayExpSliceMulass_f(a, 1f / value);
589 }
590 
591 unittest
592 {
593     printf("_arrayExpSliceDivass_f unittest\n");
594     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
595     {
596         version (log) printf("    cpuid %d\n", cpuid);
597 
598         for (int j = 0; j < 2; j++)
599         {
600             const int dim = 67;
601             T[] a = new T[dim + j];     // aligned on 16 byte boundary
602             a = a[j .. dim + j];        // misalign for second iteration
603             T[] b = new T[dim + j];
604             b = b[j .. dim + j];
605             T[] c = new T[dim + j];
606             c = c[j .. dim + j];
607 
608             for (int i = 0; i < dim; i++)
609             {   a[i] = cast(T)i;
610                 b[i] = cast(T)(i + 7);
611                 c[i] = cast(T)(i * 2);
612             }
613 
614             a[] = c[];
615             c[] /= 8;
616 
617             for (int i = 0; i < dim; i++)
618             {
619                 if (c[i] != cast(T)(a[i] / 8))
620                 {
621                     printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
622                     assert(0);
623                 }
624             }
625         }
626     }
627 }
628 
629 
630 /* ======================================================================== */
631 /* ======================================================================== */
632 
633 /* template for the case
634  *   a[] = b[] ? value
635  * with some binary operator ?
636  */
637 private template CodeGenSliceExpOp(char[] opD, char[] opSSE, char[] op3DNow)
638 {
639     const char[] CodeGenSliceExpOp = `
640     auto aptr = a.ptr;
641     auto aend = aptr + a.length;
642     auto bptr = b.ptr;
643 
644     version (D_InlineAsm_X86)
645     {
646         // SSE version is 665% faster
647         if (sse() && a.length >= 16)
648         {
649             auto n = aptr + (a.length & ~15);
650 
651             // Unaligned case
652             asm
653             {
654                 mov EAX, bptr;
655                 mov ESI, aptr;
656                 mov EDI, n;
657                 movss XMM4, value;
658                 shufps XMM4, XMM4, 0;
659 
660                 align 8;
661             startsseloop:
662                 add ESI, 64;
663                 movups XMM0, [EAX];
664                 movups XMM1, [EAX+16];
665                 movups XMM2, [EAX+32];
666                 movups XMM3, [EAX+48];
667                 add EAX, 64;
668                 ` ~ opSSE ~ ` XMM0, XMM4;
669                 ` ~ opSSE ~ ` XMM1, XMM4;
670                 ` ~ opSSE ~ ` XMM2, XMM4;
671                 ` ~ opSSE ~ ` XMM3, XMM4;
672                 movups [ESI+ 0-64], XMM0;
673                 movups [ESI+16-64], XMM1;
674                 movups [ESI+32-64], XMM2;
675                 movups [ESI+48-64], XMM3;
676                 cmp ESI, EDI;
677                 jb startsseloop;
678 
679                 mov aptr, ESI;
680                 mov bptr, EAX;
681             }
682         }
683         else
684         // 3DNow! version is 69% faster
685         if (amd3dnow() && a.length >= 8)
686         {
687             auto n = aptr + (a.length & ~7);
688 
689             ulong w = *cast(uint *) &value;
690             ulong v = w | (w << 32L);
691 
692             asm
693             {
694                 mov ESI, aptr;
695                 mov EDI, n;
696                 mov EAX, bptr;
697                 movq MM4, qword ptr [v];
698 
699                 align 8;
700             start3dnow:
701                 movq MM0, [EAX];
702                 movq MM1, [EAX+8];
703                 movq MM2, [EAX+16];
704                 movq MM3, [EAX+24];
705                 ` ~ op3DNow ~ ` MM0, MM4;
706                 ` ~ op3DNow ~ ` MM1, MM4;
707                 ` ~ op3DNow ~ ` MM2, MM4;
708                 ` ~ op3DNow ~ ` MM3, MM4;
709                 movq [ESI],    MM0;
710                 movq [ESI+8],  MM1;
711                 movq [ESI+16], MM2;
712                 movq [ESI+24], MM3;
713                 add ESI, 32;
714                 add EAX, 32;
715                 cmp ESI, EDI;
716                 jb start3dnow;
717 
718                 emms;
719                 mov aptr, ESI;
720                 mov bptr, EAX;
721             }
722         }
723     }
724 
725     while (aptr < aend)
726         *aptr++ = *bptr++ ` ~ opD ~ ` value;
727 
728     return a;`;
729 }
730 
731 /* ======================================================================== */
732 
733 /***********************
734  * Computes:
735  *      a[] = b[] + value
736  */
737 
738 T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
739 in
740 {
741     assert(a.length == b.length);
742     assert(disjoint(a, b));
743 }
744 body
745 {
746     mixin(CodeGenSliceExpOp!("+", "addps", "pfadd"));
747 }
748 
749 unittest
750 {
751     printf("_arraySliceExpAddSliceAssign_f unittest\n");
752     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
753     {
754         version (log) printf("    cpuid %d\n", cpuid);
755 
756         for (int j = 0; j < 2; j++)
757         {
758             const int dim = 67;
759             T[] a = new T[dim + j];     // aligned on 16 byte boundary
760             a = a[j .. dim + j];        // misalign for second iteration
761             T[] b = new T[dim + j];
762             b = b[j .. dim + j];
763             T[] c = new T[dim + j];
764             c = c[j .. dim + j];
765 
766             for (int i = 0; i < dim; i++)
767             {   a[i] = cast(T)i;
768                 b[i] = cast(T)(i + 7);
769                 c[i] = cast(T)(i * 2);
770             }
771 
772             c[] = a[] + 6;
773 
774             for (int i = 0; i < dim; i++)
775             {
776                 if (c[i] != cast(T)(a[i] + 6))
777                 {
778                     printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
779                     assert(0);
780                 }
781             }
782         }
783     }
784 }
785 
786 /* ======================================================================== */
787 
788 /***********************
789  * Computes:
790  *      a[] = b[] - value
791  */
792 
793 T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
794 in
795 {
796     assert (a.length == b.length);
797     assert (disjoint(a, b));
798 }
799 body
800 {
801     mixin(CodeGenSliceExpOp!("-", "subps", "pfsub"));
802 }
803 
804 unittest
805 {
806     printf("_arraySliceExpMinSliceAssign_f unittest\n");
807     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
808     {
809         version (log) printf("    cpuid %d\n", cpuid);
810 
811         for (int j = 0; j < 2; j++)
812         {
813             const int dim = 67;
814             T[] a = new T[dim + j];     // aligned on 16 byte boundary
815             a = a[j .. dim + j];        // misalign for second iteration
816             T[] b = new T[dim + j];
817             b = b[j .. dim + j];
818             T[] c = new T[dim + j];
819             c = c[j .. dim + j];
820 
821             for (int i = 0; i < dim; i++)
822             {   a[i] = cast(T)i;
823                 b[i] = cast(T)(i + 7);
824                 c[i] = cast(T)(i * 2);
825             }
826 
827             c[] = a[] - 6;
828 
829             for (int i = 0; i < dim; i++)
830             {
831                 if (c[i] != cast(T)(a[i] - 6))
832                 {
833                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
834                     assert(0);
835                 }
836             }
837         }
838     }
839 }
840 
841 /* ======================================================================== */
842 
843 /***********************
844  * Computes:
845  *      a[] = b[] * value
846  */
847 
848 T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
849 in
850 {
851     assert(a.length == b.length);
852     assert(disjoint(a, b));
853 }
854 body
855 {
856     mixin(CodeGenSliceExpOp!("*", "mulps", "pfmul"));
857 }
858 
859 unittest
860 {
861     printf("_arraySliceExpMulSliceAssign_f unittest\n");
862     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
863     {
864         version (log) printf("    cpuid %d\n", cpuid);
865 
866         for (int j = 0; j < 2; j++)
867         {
868             const int dim = 67;
869             T[] a = new T[dim + j];     // aligned on 16 byte boundary
870             a = a[j .. dim + j];        // misalign for second iteration
871             T[] b = new T[dim + j];
872             b = b[j .. dim + j];
873             T[] c = new T[dim + j];
874             c = c[j .. dim + j];
875 
876             for (int i = 0; i < dim; i++)
877             {   a[i] = cast(T)i;
878                 b[i] = cast(T)(i + 7);
879                 c[i] = cast(T)(i * 2);
880             }
881 
882             c[] = a[] * 6;
883 
884             for (int i = 0; i < dim; i++)
885             {
886                 if (c[i] != cast(T)(a[i] * 6))
887                 {
888                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
889                     assert(0);
890                 }
891             }
892         }
893     }
894 }
895 
896 /* ======================================================================== */
897 
898 /***********************
899  * Computes:
900  *      a[] = b[] / value
901  */
902 
903 T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
904 {
905     return _arraySliceExpMulSliceAssign_f(a, 1f/value, b);
906 }
907 
908 unittest
909 {
910     printf("_arraySliceExpDivSliceAssign_f unittest\n");
911     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
912     {
913         version (log) printf("    cpuid %d\n", cpuid);
914 
915         for (int j = 0; j < 2; j++)
916         {
917             const int dim = 67;
918             T[] a = new T[dim + j];     // aligned on 16 byte boundary
919             a = a[j .. dim + j];        // misalign for second iteration
920             T[] b = new T[dim + j];
921             b = b[j .. dim + j];
922             T[] c = new T[dim + j];
923             c = c[j .. dim + j];
924 
925             for (int i = 0; i < dim; i++)
926             {   a[i] = cast(T)i;
927                 b[i] = cast(T)(i + 7);
928                 c[i] = cast(T)(i * 2);
929             }
930 
931             c[] = a[] / 8;
932 
933             for (int i = 0; i < dim; i++)
934             {
935                 if (c[i] != cast(T)(a[i] / 8))
936                 {
937                     printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
938                     assert(0);
939                 }
940             }
941         }
942     }
943 }
944 
945 /* ======================================================================== */
946 /* ======================================================================== */
947 
948 private template CodeGenSliceOpAssign(char[] opD, char[] opSSE, char[] op3DNow)
949 {
950     const char[] CodeGenSliceOpAssign = `
951     auto aptr = a.ptr;
952     auto aend = aptr + a.length;
953     auto bptr = b.ptr;
954 
955     version (D_InlineAsm_X86)
956     {
957         // SSE version is 468% faster
958         if (sse() && a.length >= 16)
959         {
960             auto n = aptr + (a.length & ~15);
961 
962             // Unaligned case
963             asm
964             {
965                 mov ECX, bptr; // right operand
966                 mov ESI, aptr; // destination operand
967                 mov EDI, n; // end comparison
968 
969                 align 8;
970             startsseloopb:
971                 movups XMM0, [ESI];
972                 movups XMM1, [ESI+16];
973                 movups XMM2, [ESI+32];
974                 movups XMM3, [ESI+48];
975                 add ESI, 64;
976                 movups XMM4, [ECX];
977                 movups XMM5, [ECX+16];
978                 movups XMM6, [ECX+32];
979                 movups XMM7, [ECX+48];
980                 add ECX, 64;
981                 ` ~ opSSE ~ ` XMM0, XMM4;
982                 ` ~ opSSE ~ ` XMM1, XMM5;
983                 ` ~ opSSE ~ ` XMM2, XMM6;
984                 ` ~ opSSE ~ ` XMM3, XMM7;
985                 movups [ESI+ 0-64], XMM0;
986                 movups [ESI+16-64], XMM1;
987                 movups [ESI+32-64], XMM2;
988                 movups [ESI+48-64], XMM3;
989                 cmp ESI, EDI;
990                 jb startsseloopb;
991 
992                 mov aptr, ESI;
993                 mov bptr, ECX;
994             }
995         }
996         else
997         // 3DNow! version is 57% faster
998         if (amd3dnow() && a.length >= 8)
999         {
1000             auto n = aptr + (a.length & ~7);
1001 
1002             asm
1003             {
1004                 mov ESI, dword ptr [aptr]; // destination operand
1005                 mov EDI, dword ptr [n];    // end comparison
1006                 mov ECX, dword ptr [bptr]; // right operand
1007 
1008                 align 4;
1009             start3dnow:
1010                 movq MM0, [ESI];
1011                 movq MM1, [ESI+8];
1012                 movq MM2, [ESI+16];
1013                 movq MM3, [ESI+24];
1014                 ` ~ op3DNow ~ ` MM0, [ECX];
1015                 ` ~ op3DNow ~ ` MM1, [ECX+8];
1016                 ` ~ op3DNow ~ ` MM2, [ECX+16];
1017                 ` ~ op3DNow ~ ` MM3, [ECX+24];
1018                 movq [ESI], MM0;
1019                 movq [ESI+8], MM1;
1020                 movq [ESI+16], MM2;
1021                 movq [ESI+24], MM3;
1022                 add ESI, 32;
1023                 add ECX, 32;
1024                 cmp ESI, EDI;
1025                 jb start3dnow;
1026 
1027                 emms;
1028                 mov dword ptr [aptr], ESI;
1029                 mov dword ptr [bptr], ECX;
1030             }
1031         }
1032     }
1033 
1034     while (aptr < aend)
1035         *aptr++ ` ~ opD ~ ` *bptr++;
1036 
1037     return a;`;
1038 }
1039 
1040 /* ======================================================================== */
1041 
1042 /***********************
1043  * Computes:
1044  *      a[] += b[]
1045  */
1046 
1047 T[] _arraySliceSliceAddass_f(T[] a, T[] b)
1048 in
1049 {
1050     assert (a.length == b.length);
1051     assert (disjoint(a, b));
1052 }
1053 body
1054 {
1055     mixin(CodeGenSliceOpAssign!("+=", "addps", "pfadd"));
1056 }
1057 
1058 unittest
1059 {
1060     printf("_arraySliceSliceAddass_f unittest\n");
1061     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1062     {
1063         version (log) printf("    cpuid %d\n", cpuid);
1064 
1065         for (int j = 0; j < 2; j++)
1066         {
1067             const int dim = 67;
1068             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1069             a = a[j .. dim + j];        // misalign for second iteration
1070             T[] b = new T[dim + j];
1071             b = b[j .. dim + j];
1072             T[] c = new T[dim + j];
1073             c = c[j .. dim + j];
1074 
1075             for (int i = 0; i < dim; i++)
1076             {   a[i] = cast(T)i;
1077                 b[i] = cast(T)(i + 7);
1078                 c[i] = cast(T)(i * 2);
1079             }
1080 
1081             a[] = c[];
1082             c[] += b[];
1083 
1084             for (int i = 0; i < dim; i++)
1085             {
1086                 if (c[i] != cast(T)(a[i] + b[i]))
1087                 {
1088                     printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
1089                     assert(0);
1090                 }
1091             }
1092         }
1093     }
1094 }
1095 
1096 /* ======================================================================== */
1097 
1098 /***********************
1099  * Computes:
1100  *      a[] -= b[]
1101  */
1102 
1103 T[] _arraySliceSliceMinass_f(T[] a, T[] b)
1104 in
1105 {
1106     assert (a.length == b.length);
1107     assert (disjoint(a, b));
1108 }
1109 body
1110 {
1111     mixin(CodeGenSliceOpAssign!("-=", "subps", "pfsub"));
1112 }
1113 
1114 unittest
1115 {
1116     printf("_arrayExpSliceMinass_f unittest\n");
1117     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1118     {
1119         version (log) printf("    cpuid %d\n", cpuid);
1120 
1121         for (int j = 0; j < 2; j++)
1122         {
1123             const int dim = 67;
1124             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1125             a = a[j .. dim + j];        // misalign for second iteration
1126             T[] b = new T[dim + j];
1127             b = b[j .. dim + j];
1128             T[] c = new T[dim + j];
1129             c = c[j .. dim + j];
1130 
1131             for (int i = 0; i < dim; i++)
1132             {   a[i] = cast(T)i;
1133                 b[i] = cast(T)(i + 7);
1134                 c[i] = cast(T)(i * 2);
1135             }
1136 
1137             a[] = c[];
1138             c[] -= 6;
1139 
1140             for (int i = 0; i < dim; i++)
1141             {
1142                 if (c[i] != cast(T)(a[i] - 6))
1143                 {
1144                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
1145                     assert(0);
1146                 }
1147             }
1148         }
1149     }
1150 }
1151 
1152 /* ======================================================================== */
1153 
1154 /***********************
1155  * Computes:
1156  *      a[] *= b[]
1157  */
1158 
1159 T[] _arraySliceSliceMulass_f(T[] a, T[] b)
1160 in
1161 {
1162     assert (a.length == b.length);
1163     assert (disjoint(a, b));
1164 }
1165 body
1166 {
1167     mixin(CodeGenSliceOpAssign!("*=", "mulps", "pfmul"));
1168 }
1169 
1170 unittest
1171 {
1172     printf("_arrayExpSliceMulass_f unittest\n");
1173     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1174     {
1175         version (log) printf("    cpuid %d\n", cpuid);
1176 
1177         for (int j = 0; j < 2; j++)
1178         {
1179             const int dim = 67;
1180             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1181             a = a[j .. dim + j];        // misalign for second iteration
1182             T[] b = new T[dim + j];
1183             b = b[j .. dim + j];
1184             T[] c = new T[dim + j];
1185             c = c[j .. dim + j];
1186 
1187             for (int i = 0; i < dim; i++)
1188             {   a[i] = cast(T)i;
1189                 b[i] = cast(T)(i + 7);
1190                 c[i] = cast(T)(i * 2);
1191             }
1192 
1193             a[] = c[];
1194             c[] *= 6;
1195 
1196             for (int i = 0; i < dim; i++)
1197             {
1198                 if (c[i] != cast(T)(a[i] * 6))
1199                 {
1200                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1201                     assert(0);
1202                 }
1203             }
1204         }
1205     }
1206 }
1207 
1208 /* ======================================================================== */
1209 /* ======================================================================== */
1210 
1211 /***********************
1212  * Computes:
1213  *      a[] = value - b[]
1214  */
1215 
1216 T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
1217 in
1218 {
1219     assert (a.length == b.length);
1220     assert (disjoint(a, b));
1221 }
1222 body
1223 {
1224     //printf("_arrayExpSliceMinSliceAssign_f()\n");
1225     auto aptr = a.ptr;
1226     auto aend = aptr + a.length;
1227     auto bptr = b.ptr;
1228 
1229     version (D_InlineAsm_X86)
1230     {
1231         // SSE version is 690% faster
1232         if (sse() && a.length >= 16)
1233         {
1234             auto n = aptr + (a.length & ~15);
1235 
1236             // Unaligned case
1237             asm
1238             {
1239                 mov EAX, bptr;
1240                 mov ESI, aptr;
1241                 mov EDI, n;
1242                 movss XMM4, value;
1243                 shufps XMM4, XMM4, 0;
1244 
1245                 align 8;
1246             startsseloop:
1247                 add ESI, 64;
1248                 movaps XMM5, XMM4;
1249                 movaps XMM6, XMM4;
1250                 movups XMM0, [EAX];
1251                 movups XMM1, [EAX+16];
1252                 movups XMM2, [EAX+32];
1253                 movups XMM3, [EAX+48];
1254                 add EAX, 64;
1255                 subps XMM5, XMM0;
1256                 subps XMM6, XMM1;
1257                 movups [ESI+ 0-64], XMM5;
1258                 movups [ESI+16-64], XMM6;
1259                 movaps XMM5, XMM4;
1260                 movaps XMM6, XMM4;
1261                 subps XMM5, XMM2;
1262                 subps XMM6, XMM3;
1263                 movups [ESI+32-64], XMM5;
1264                 movups [ESI+48-64], XMM6;
1265                 cmp ESI, EDI;
1266                 jb startsseloop;
1267 
1268                 mov aptr, ESI;
1269                 mov bptr, EAX;
1270             }
1271         }
1272         else
1273         // 3DNow! version is 67% faster
1274         if (amd3dnow() && a.length >= 8)
1275         {
1276             auto n = aptr + (a.length & ~7);
1277 
1278             ulong w = *cast(uint *) &value;
1279             ulong v = w | (w << 32L);
1280 
1281             asm
1282             {
1283                 mov ESI, aptr;
1284                 mov EDI, n;
1285                 mov EAX, bptr;
1286                 movq MM4, qword ptr [v];
1287 
1288                 align 8;
1289             start3dnow:
1290                 movq MM0, [EAX];
1291                 movq MM1, [EAX+8];
1292                 movq MM2, [EAX+16];
1293                 movq MM3, [EAX+24];
1294                 pfsubr MM0, MM4;
1295                 pfsubr MM1, MM4;
1296                 pfsubr MM2, MM4;
1297                 pfsubr MM3, MM4;
1298                 movq [ESI], MM0;
1299                 movq [ESI+8], MM1;
1300                 movq [ESI+16], MM2;
1301                 movq [ESI+24], MM3;
1302                 add ESI, 32;
1303                 add EAX, 32;
1304                 cmp ESI, EDI;
1305                 jb start3dnow;
1306 
1307                 emms;
1308                 mov aptr, ESI;
1309                 mov bptr, EAX;
1310             }
1311         }
1312     }
1313 
1314     while (aptr < aend)
1315         *aptr++ = value - *bptr++;
1316 
1317     return a;
1318 }
1319 
1320 unittest
1321 {
1322     printf("_arrayExpSliceMinSliceAssign_f unittest\n");
1323     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1324     {
1325         version (log) printf("    cpuid %d\n", cpuid);
1326 
1327         for (int j = 0; j < 2; j++)
1328         {
1329             const int dim = 67;
1330             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1331             a = a[j .. dim + j];        // misalign for second iteration
1332             T[] b = new T[dim + j];
1333             b = b[j .. dim + j];
1334             T[] c = new T[dim + j];
1335             c = c[j .. dim + j];
1336 
1337             for (int i = 0; i < dim; i++)
1338             {   a[i] = cast(T)i;
1339                 b[i] = cast(T)(i + 7);
1340                 c[i] = cast(T)(i * 2);
1341             }
1342 
1343             c[] = 6 - a[];
1344 
1345             for (int i = 0; i < dim; i++)
1346             {
1347                 if (c[i] != cast(T)(6 - a[i]))
1348                 {
1349                     printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
1350                     assert(0);
1351                 }
1352             }
1353         }
1354     }
1355 }
1356 
1357 /* ======================================================================== */
1358 
1359 /***********************
1360  * Computes:
1361  *      a[] -= b[] * value
1362  */
1363 
1364 T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
1365 {
1366     return _arraySliceExpMulSliceAddass_f(a, -value, b);
1367 }
1368 
1369 /***********************
1370  * Computes:
1371  *      a[] += b[] * value
1372  */
1373 
1374 T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
1375 in
1376 {
1377         assert(a.length == b.length);
1378         assert(disjoint(a, b));
1379 }
1380 body
1381 {
1382     auto aptr = a.ptr;
1383     auto aend = aptr + a.length;
1384     auto bptr = b.ptr;
1385 
1386     // Handle remainder
1387     while (aptr < aend)
1388         *aptr++ += *bptr++ * value;
1389 
1390     return a;
1391 }
1392 
1393 unittest
1394 {
1395     printf("_arraySliceExpMulSliceAddass_f unittest\n");
1396 
1397     cpuid = 1;
1398     {
1399         version (log) printf("    cpuid %d\n", cpuid);
1400 
1401         for (int j = 0; j < 1; j++)
1402         {
1403             const int dim = 67;
1404             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1405             a = a[j .. dim + j];        // misalign for second iteration
1406             T[] b = new T[dim + j];
1407             b = b[j .. dim + j];
1408             T[] c = new T[dim + j];
1409             c = c[j .. dim + j];
1410 
1411             for (int i = 0; i < dim; i++)
1412             {   a[i] = cast(T)i;
1413                 b[i] = cast(T)(i + 7);
1414                 c[i] = cast(T)(i * 2);
1415             }
1416 
1417             b[] = c[];
1418             c[] += a[] * 6;
1419 
1420             for (int i = 0; i < dim; i++)
1421             {
1422                 //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1423                 if (c[i] != cast(T)(b[i] + a[i] * 6))
1424                 {
1425                     printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1426                     assert(0);
1427                 }
1428             }
1429         }
1430     }
1431 }