1 /***************************
2  * D programming language http://www.digitalmars.com/d/
3  * Runtime support for double array operations.
4  * Based on code originally written by Burton Radons.
5  * Placed in public domain.
6  */
7 
8 module rt.compiler.dmd.rt.arraydouble;
9 
10 import CPUid = rt.compiler.util.cpuid;
11 
12 debug(UnitTest)
13 {
14     private extern(C) int printf(char*,...);
15     /* This is so unit tests will test every CPU variant
16      */
17     int cpuid;
18     const int CPUID_MAX = 5;
19     bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
20     bool sse()      { return cpuid == 2 && CPUid.sse(); }
21     bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
22     bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
23 }
24 else
25 {
26     alias CPUid.mmx mmx;
27     alias CPUid.sse sse;
28     alias CPUid.sse2 sse2;
29     alias CPUid.amd3dnow amd3dnow;
30 }
31 
32 //version = log;
33 
34 bool disjoint(T)(T[] a, T[] b)
35 {
36     return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
37 }
38 
39 /* Performance figures measured by Burton Radons
40  */
41 
42 alias double T;
43 
44 extern (C):
45 
46 /* ======================================================================== */
47 
48 /***********************
49  * Computes:
50  *      a[] = b[] + c[]
51  */
52 
53 T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
54 in
55 {
56         assert(a.length == b.length && b.length == c.length);
57         assert(disjoint(a, b));
58         assert(disjoint(a, c));
59         assert(disjoint(b, c));
60 }
61 body
62 {
63     auto aptr = a.ptr;
64     auto aend = aptr + a.length;
65     auto bptr = b.ptr;
66     auto cptr = c.ptr;
67 
68     version (D_InlineAsm_X86)
69     {
70         // SSE2 version is 333% faster
71         if (sse2() && b.length >= 16)
72         {
73             auto n = aptr + (b.length & ~15);
74 
75             // Unaligned case
76             asm
77             {
78                 mov EAX, bptr; // left operand
79                 mov ECX, cptr; // right operand
80                 mov ESI, aptr; // destination operand
81                 mov EDI, n;    // end comparison
82 
83                 align 8;
84             startsseloopb:
85                 movupd XMM0, [EAX];
86                 movupd XMM1, [EAX+16];
87                 movupd XMM2, [EAX+32];
88                 movupd XMM3, [EAX+48];
89                 add EAX, 64;
90                 movupd XMM4, [ECX];
91                 movupd XMM5, [ECX+16];
92                 movupd XMM6, [ECX+32];
93                 movupd XMM7, [ECX+48];
94                 add ESI, 64;
95                 addpd XMM0, XMM4;
96                 addpd XMM1, XMM5;
97                 addpd XMM2, XMM6;
98                 addpd XMM3, XMM7;
99                 add ECX, 64;
100                 movupd [ESI+ 0-64], XMM0;
101                 movupd [ESI+16-64], XMM1;
102                 movupd [ESI+32-64], XMM2;
103                 movupd [ESI+48-64], XMM3;
104                 cmp ESI, EDI;
105                 jb startsseloopb;
106 
107                 mov aptr, ESI;
108                 mov bptr, EAX;
109                 mov cptr, ECX;
110             }
111         }
112     }
113 
114     // Handle remainder
115     while (aptr < aend)
116         *aptr++ = *bptr++ + *cptr++;
117 
118     return a;
119 }
120 
121 
122 unittest
123 {
124     printf("_arraySliceSliceAddSliceAssign_d unittest\n");
125     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
126     {
127         version (log) printf("    cpuid %d\n", cpuid);
128 
129         for (int j = 0; j < 2; j++)
130         {
131             const int dim = 67;
132             T[] a = new T[dim + j];     // aligned on 16 byte boundary
133             a = a[j .. dim + j];        // misalign for second iteration
134             T[] b = new T[dim + j];
135             b = b[j .. dim + j];
136             T[] c = new T[dim + j];
137             c = c[j .. dim + j];
138 
139             for (int i = 0; i < dim; i++)
140             {   a[i] = cast(T)i;
141                 b[i] = cast(T)(i + 7);
142                 c[i] = cast(T)(i * 2);
143             }
144 
145             c[] = a[] + b[];
146 
147             for (int i = 0; i < dim; i++)
148             {
149                 if (c[i] != cast(T)(a[i] + b[i]))
150                 {
151                     printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
152                     assert(0);
153                 }
154             }
155         }
156     }
157 }
158 
159 /* ======================================================================== */
160 
161 /***********************
162  * Computes:
163  *      a[] = b[] - c[]
164  */
165 
166 T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
167 in
168 {
169         assert(a.length == b.length && b.length == c.length);
170         assert(disjoint(a, b));
171         assert(disjoint(a, c));
172         assert(disjoint(b, c));
173 }
174 body
175 {
176     auto aptr = a.ptr;
177     auto aend = aptr + a.length;
178     auto bptr = b.ptr;
179     auto cptr = c.ptr;
180 
181     version (D_InlineAsm_X86)
182     {
183         // SSE2 version is 324% faster
184         if (sse2() && b.length >= 8)
185         {
186             auto n = aptr + (b.length & ~7);
187 
188             // Unaligned case
189             asm
190             {
191                 mov EAX, bptr; // left operand
192                 mov ECX, cptr; // right operand
193                 mov ESI, aptr; // destination operand
194                 mov EDI, n;    // end comparison
195 
196                 align 8;
197             startsseloopb:
198                 movupd XMM0, [EAX];
199                 movupd XMM1, [EAX+16];
200                 movupd XMM2, [EAX+32];
201                 movupd XMM3, [EAX+48];
202                 add EAX, 64;
203                 movupd XMM4, [ECX];
204                 movupd XMM5, [ECX+16];
205                 movupd XMM6, [ECX+32];
206                 movupd XMM7, [ECX+48];
207                 add ESI, 64;
208                 subpd XMM0, XMM4;
209                 subpd XMM1, XMM5;
210                 subpd XMM2, XMM6;
211                 subpd XMM3, XMM7;
212                 add ECX, 64;
213                 movupd [ESI+ 0-64], XMM0;
214                 movupd [ESI+16-64], XMM1;
215                 movupd [ESI+32-64], XMM2;
216                 movupd [ESI+48-64], XMM3;
217                 cmp ESI, EDI;
218                 jb startsseloopb;
219 
220                 mov aptr, ESI;
221                 mov bptr, EAX;
222                 mov cptr, ECX;
223             }
224         }
225     }
226 
227     // Handle remainder
228     while (aptr < aend)
229         *aptr++ = *bptr++ - *cptr++;
230 
231     return a;
232 }
233 
234 
235 unittest
236 {
237     printf("_arraySliceSliceMinSliceAssign_d unittest\n");
238     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
239     {
240         version (log) printf("    cpuid %d\n", cpuid);
241 
242         for (int j = 0; j < 2; j++)
243         {
244             const int dim = 67;
245             T[] a = new T[dim + j];     // aligned on 16 byte boundary
246             a = a[j .. dim + j];        // misalign for second iteration
247             T[] b = new T[dim + j];
248             b = b[j .. dim + j];
249             T[] c = new T[dim + j];
250             c = c[j .. dim + j];
251 
252             for (int i = 0; i < dim; i++)
253             {   a[i] = cast(T)i;
254                 b[i] = cast(T)(i + 7);
255                 c[i] = cast(T)(i * 2);
256             }
257 
258             c[] = a[] - b[];
259 
260             for (int i = 0; i < dim; i++)
261             {
262                 if (c[i] != cast(T)(a[i] - b[i]))
263                 {
264                     printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
265                     assert(0);
266                 }
267             }
268         }
269     }
270 }
271 
272 
273 /* ======================================================================== */
274 
275 /***********************
276  * Computes:
277  *      a[] = b[] + value
278  */
279 
280 T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
281 in
282 {
283     assert(a.length == b.length);
284     assert(disjoint(a, b));
285 }
286 body
287 {
288     //printf("_arraySliceExpAddSliceAssign_d()\n");
289     auto aptr = a.ptr;
290     auto aend = aptr + a.length;
291     auto bptr = b.ptr;
292 
293     version (D_InlineAsm_X86)
294     {
295         // SSE2 version is 305% faster
296         if (sse2() && a.length >= 8)
297         {
298             auto n = aptr + (a.length & ~7);
299 
300             // Unaligned case
301             asm
302             {
303                 mov EAX, bptr;
304                 mov ESI, aptr;
305                 mov EDI, n;
306                 movsd XMM4, value;
307                 shufpd XMM4, XMM4, 0;
308 
309                 align 8;
310             startsseloop:
311                 add ESI, 64;
312                 movupd XMM0, [EAX];
313                 movupd XMM1, [EAX+16];
314                 movupd XMM2, [EAX+32];
315                 movupd XMM3, [EAX+48];
316                 add EAX, 64;
317                 addpd XMM0, XMM4;
318                 addpd XMM1, XMM4;
319                 addpd XMM2, XMM4;
320                 addpd XMM3, XMM4;
321                 movupd [ESI+ 0-64], XMM0;
322                 movupd [ESI+16-64], XMM1;
323                 movupd [ESI+32-64], XMM2;
324                 movupd [ESI+48-64], XMM3;
325                 cmp ESI, EDI;
326                 jb startsseloop;
327 
328                 mov aptr, ESI;
329                 mov bptr, EAX;
330             }
331         }
332     }
333 
334     while (aptr < aend)
335         *aptr++ = *bptr++ + value;
336 
337     return a;
338 }
339 
340 unittest
341 {
342     printf("_arraySliceExpAddSliceAssign_d unittest\n");
343     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
344     {
345         version (log) printf("    cpuid %d\n", cpuid);
346 
347         for (int j = 0; j < 2; j++)
348         {
349             const int dim = 67;
350             T[] a = new T[dim + j];     // aligned on 16 byte boundary
351             a = a[j .. dim + j];        // misalign for second iteration
352             T[] b = new T[dim + j];
353             b = b[j .. dim + j];
354             T[] c = new T[dim + j];
355             c = c[j .. dim + j];
356 
357             for (int i = 0; i < dim; i++)
358             {   a[i] = cast(T)i;
359                 b[i] = cast(T)(i + 7);
360                 c[i] = cast(T)(i * 2);
361             }
362 
363             c[] = a[] + 6;
364 
365             for (int i = 0; i < dim; i++)
366             {
367                 if (c[i] != cast(T)(a[i] + 6))
368                 {
369                     printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
370                     assert(0);
371                 }
372             }
373         }
374     }
375 }
376 
377 /* ======================================================================== */
378 
379 /***********************
380  * Computes:
381  *      a[] += value
382  */
383 
384 T[] _arrayExpSliceAddass_d(T[] a, T value)
385 {
386     //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
387     auto aptr = a.ptr;
388     auto aend = aptr + a.length;
389 
390     version (D_InlineAsm_X86)
391     {
392         // SSE2 version is 114% faster
393         if (sse2() && a.length >= 8)
394         {
395             auto n = cast(T*)((cast(uint)aend) & ~7);
396             if (aptr < n)
397 
398             // Unaligned case
399             asm
400             {
401                 mov ESI, aptr;
402                 mov EDI, n;
403                 movsd XMM4, value;
404                 shufpd XMM4, XMM4, 0;
405 
406                 align 8;
407             startsseloopa:
408                 movupd XMM0, [ESI];
409                 movupd XMM1, [ESI+16];
410                 movupd XMM2, [ESI+32];
411                 movupd XMM3, [ESI+48];
412                 add ESI, 64;
413                 addpd XMM0, XMM4;
414                 addpd XMM1, XMM4;
415                 addpd XMM2, XMM4;
416                 addpd XMM3, XMM4;
417                 movupd [ESI+ 0-64], XMM0;
418                 movupd [ESI+16-64], XMM1;
419                 movupd [ESI+32-64], XMM2;
420                 movupd [ESI+48-64], XMM3;
421                 cmp ESI, EDI;
422                 jb startsseloopa;
423 
424                 mov aptr, ESI;
425             }
426         }
427     }
428 
429     while (aptr < aend)
430         *aptr++ += value;
431 
432     return a;
433 }
434 
435 unittest
436 {
437     printf("_arrayExpSliceAddass_d unittest\n");
438     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
439     {
440         version (log) printf("    cpuid %d\n", cpuid);
441 
442         for (int j = 0; j < 2; j++)
443         {
444             const int dim = 67;
445             T[] a = new T[dim + j];     // aligned on 16 byte boundary
446             a = a[j .. dim + j];        // misalign for second iteration
447             T[] b = new T[dim + j];
448             b = b[j .. dim + j];
449             T[] c = new T[dim + j];
450             c = c[j .. dim + j];
451 
452             for (int i = 0; i < dim; i++)
453             {   a[i] = cast(T)i;
454                 b[i] = cast(T)(i + 7);
455                 c[i] = cast(T)(i * 2);
456             }
457 
458             a[] = c[];
459             c[] += 6;
460 
461             for (int i = 0; i < dim; i++)
462             {
463                 if (c[i] != cast(T)(a[i] + 6))
464                 {
465                     printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
466                     assert(0);
467                 }
468             }
469         }
470     }
471 }
472 
473 /* ======================================================================== */
474 
475 /***********************
476  * Computes:
477  *      a[] += b[]
478  */
479 
480 T[] _arraySliceSliceAddass_d(T[] a, T[] b)
481 in
482 {
483     assert (a.length == b.length);
484     assert (disjoint(a, b));
485 }
486 body
487 {
488     //printf("_arraySliceSliceAddass_d()\n");
489     auto aptr = a.ptr;
490     auto aend = aptr + a.length;
491     auto bptr = b.ptr;
492 
493     version (D_InlineAsm_X86)
494     {
495         // SSE2 version is 183% faster
496         if (sse2() && a.length >= 8)
497         {
498             auto n = aptr + (a.length & ~7);
499 
500             // Unaligned case
501             asm
502             {
503                 mov ECX, bptr; // right operand
504                 mov ESI, aptr; // destination operand
505                 mov EDI, n; // end comparison
506 
507                 align 8;
508             startsseloopb:
509                 movupd XMM0, [ESI];
510                 movupd XMM1, [ESI+16];
511                 movupd XMM2, [ESI+32];
512                 movupd XMM3, [ESI+48];
513                 add ESI, 64;
514                 movupd XMM4, [ECX];
515                 movupd XMM5, [ECX+16];
516                 movupd XMM6, [ECX+32];
517                 movupd XMM7, [ECX+48];
518                 add ECX, 64;
519                 addpd XMM0, XMM4;
520                 addpd XMM1, XMM5;
521                 addpd XMM2, XMM6;
522                 addpd XMM3, XMM7;
523                 movupd [ESI+ 0-64], XMM0;
524                 movupd [ESI+16-64], XMM1;
525                 movupd [ESI+32-64], XMM2;
526                 movupd [ESI+48-64], XMM3;
527                 cmp ESI, EDI;
528                 jb startsseloopb;
529 
530                 mov aptr, ESI;
531                 mov bptr, ECX;
532             }
533         }
534     }
535 
536     while (aptr < aend)
537         *aptr++ += *bptr++;
538 
539     return a;
540 }
541 
542 unittest
543 {
544     printf("_arraySliceSliceAddass_d unittest\n");
545     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
546     {
547         version (log) printf("    cpuid %d\n", cpuid);
548 
549         for (int j = 0; j < 2; j++)
550         {
551             const int dim = 67;
552             T[] a = new T[dim + j];     // aligned on 16 byte boundary
553             a = a[j .. dim + j];        // misalign for second iteration
554             T[] b = new T[dim + j];
555             b = b[j .. dim + j];
556             T[] c = new T[dim + j];
557             c = c[j .. dim + j];
558 
559             for (int i = 0; i < dim; i++)
560             {   a[i] = cast(T)i;
561                 b[i] = cast(T)(i + 7);
562                 c[i] = cast(T)(i * 2);
563             }
564 
565             a[] = c[];
566             c[] += b[];
567 
568             for (int i = 0; i < dim; i++)
569             {
570                 if (c[i] != cast(T)(a[i] + b[i]))
571                 {
572                     printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
573                     assert(0);
574                 }
575             }
576         }
577     }
578 }
579 
580 /* ======================================================================== */
581 
582 /***********************
583  * Computes:
584  *      a[] = b[] - value
585  */
586 
587 T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
588 in
589 {
590     assert (a.length == b.length);
591     assert (disjoint(a, b));
592 }
593 body
594 {
595     //printf("_arraySliceExpMinSliceAssign_d()\n");
596     auto aptr = a.ptr;
597     auto aend = aptr + a.length;
598     auto bptr = b.ptr;
599 
600     version (D_InlineAsm_X86)
601     {
602         // SSE2 version is 305% faster
603         if (sse2() && a.length >= 8)
604         {
605             auto n = aptr + (a.length & ~7);
606 
607             // Unaligned case
608             asm
609             {
610                 mov EAX, bptr;
611                 mov ESI, aptr;
612                 mov EDI, n;
613                 movsd XMM4, value;
614                 shufpd XMM4, XMM4, 0;
615 
616                 align 8;
617             startsseloop:
618                 add ESI, 64;
619                 movupd XMM0, [EAX];
620                 movupd XMM1, [EAX+16];
621                 movupd XMM2, [EAX+32];
622                 movupd XMM3, [EAX+48];
623                 add EAX, 64;
624                 subpd XMM0, XMM4;
625                 subpd XMM1, XMM4;
626                 subpd XMM2, XMM4;
627                 subpd XMM3, XMM4;
628                 movupd [ESI+ 0-64], XMM0;
629                 movupd [ESI+16-64], XMM1;
630                 movupd [ESI+32-64], XMM2;
631                 movupd [ESI+48-64], XMM3;
632                 cmp ESI, EDI;
633                 jb startsseloop;
634 
635                 mov aptr, ESI;
636                 mov bptr, EAX;
637             }
638         }
639     }
640 
641     while (aptr < aend)
642         *aptr++ = *bptr++ - value;
643 
644     return a;
645 }
646 
647 unittest
648 {
649     printf("_arraySliceExpMinSliceAssign_d unittest\n");
650     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
651     {
652         version (log) printf("    cpuid %d\n", cpuid);
653 
654         for (int j = 0; j < 2; j++)
655         {
656             const int dim = 67;
657             T[] a = new T[dim + j];     // aligned on 16 byte boundary
658             a = a[j .. dim + j];        // misalign for second iteration
659             T[] b = new T[dim + j];
660             b = b[j .. dim + j];
661             T[] c = new T[dim + j];
662             c = c[j .. dim + j];
663 
664             for (int i = 0; i < dim; i++)
665             {   a[i] = cast(T)i;
666                 b[i] = cast(T)(i + 7);
667                 c[i] = cast(T)(i * 2);
668             }
669 
670             c[] = a[] - 6;
671 
672             for (int i = 0; i < dim; i++)
673             {
674                 if (c[i] != cast(T)(a[i] - 6))
675                 {
676                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
677                     assert(0);
678                 }
679             }
680         }
681     }
682 }
683 
684 /* ======================================================================== */
685 
686 /***********************
687  * Computes:
688  *      a[] = value - b[]
689  */
690 
691 T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
692 in
693 {
694     assert (a.length == b.length);
695     assert (disjoint(a, b));
696 }
697 body
698 {
699     //printf("_arrayExpSliceMinSliceAssign_d()\n");
700     auto aptr = a.ptr;
701     auto aend = aptr + a.length;
702     auto bptr = b.ptr;
703 
704     version (D_InlineAsm_X86)
705     {
706         // SSE2 version is 66% faster
707         if (sse2() && a.length >= 8)
708         {
709             auto n = aptr + (a.length & ~7);
710 
711             // Unaligned case
712             asm
713             {
714                 mov EAX, bptr;
715                 mov ESI, aptr;
716                 mov EDI, n;
717                 movsd XMM4, value;
718                 shufpd XMM4, XMM4, 0;
719 
720                 align 8;
721             startsseloop:
722                 add ESI, 64;
723                 movapd XMM5, XMM4;
724                 movapd XMM6, XMM4;
725                 movupd XMM0, [EAX];
726                 movupd XMM1, [EAX+16];
727                 movupd XMM2, [EAX+32];
728                 movupd XMM3, [EAX+48];
729                 add EAX, 64;
730                 subpd XMM5, XMM0;
731                 subpd XMM6, XMM1;
732                 movupd [ESI+ 0-64], XMM5;
733                 movupd [ESI+16-64], XMM6;
734                 movapd XMM5, XMM4;
735                 movapd XMM6, XMM4;
736                 subpd XMM5, XMM2;
737                 subpd XMM6, XMM3;
738                 movupd [ESI+32-64], XMM5;
739                 movupd [ESI+48-64], XMM6;
740                 cmp ESI, EDI;
741                 jb startsseloop;
742 
743                 mov aptr, ESI;
744                 mov bptr, EAX;
745             }
746         }
747     }
748 
749     while (aptr < aend)
750         *aptr++ = value - *bptr++;
751 
752     return a;
753 }
754 
755 unittest
756 {
757     printf("_arrayExpSliceMinSliceAssign_d unittest\n");
758     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
759     {
760         version (log) printf("    cpuid %d\n", cpuid);
761 
762         for (int j = 0; j < 2; j++)
763         {
764             const int dim = 67;
765             T[] a = new T[dim + j];     // aligned on 16 byte boundary
766             a = a[j .. dim + j];        // misalign for second iteration
767             T[] b = new T[dim + j];
768             b = b[j .. dim + j];
769             T[] c = new T[dim + j];
770             c = c[j .. dim + j];
771 
772             for (int i = 0; i < dim; i++)
773             {   a[i] = cast(T)i;
774                 b[i] = cast(T)(i + 7);
775                 c[i] = cast(T)(i * 2);
776             }
777 
778             c[] = 6 - a[];
779 
780             for (int i = 0; i < dim; i++)
781             {
782                 if (c[i] != cast(T)(6 - a[i]))
783                 {
784                     printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
785                     assert(0);
786                 }
787             }
788         }
789     }
790 }
791 
792 /* ======================================================================== */
793 
794 /***********************
795  * Computes:
796  *      a[] -= value
797  */
798 
799 T[] _arrayExpSliceMinass_d(T[] a, T value)
800 {
801     //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
802     auto aptr = a.ptr;
803     auto aend = aptr + a.length;
804 
805     version (D_InlineAsm_X86)
806     {
807         // SSE2 version is 115% faster
808         if (sse2() && a.length >= 8)
809         {
810             auto n = cast(T*)((cast(uint)aend) & ~7);
811             if (aptr < n)
812 
813             // Unaligned case
814             asm
815             {
816                 mov ESI, aptr;
817                 mov EDI, n;
818                 movsd XMM4, value;
819                 shufpd XMM4, XMM4, 0;
820 
821                 align 8;
822             startsseloopa:
823                 movupd XMM0, [ESI];
824                 movupd XMM1, [ESI+16];
825                 movupd XMM2, [ESI+32];
826                 movupd XMM3, [ESI+48];
827                 add ESI, 64;
828                 subpd XMM0, XMM4;
829                 subpd XMM1, XMM4;
830                 subpd XMM2, XMM4;
831                 subpd XMM3, XMM4;
832                 movupd [ESI+ 0-64], XMM0;
833                 movupd [ESI+16-64], XMM1;
834                 movupd [ESI+32-64], XMM2;
835                 movupd [ESI+48-64], XMM3;
836                 cmp ESI, EDI;
837                 jb startsseloopa;
838 
839                 mov aptr, ESI;
840             }
841         }
842     }
843 
844     while (aptr < aend)
845         *aptr++ -= value;
846 
847     return a;
848 }
849 
850 unittest
851 {
852     printf("_arrayExpSliceMinass_d unittest\n");
853     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
854     {
855         version (log) printf("    cpuid %d\n", cpuid);
856 
857         for (int j = 0; j < 2; j++)
858         {
859             const int dim = 67;
860             T[] a = new T[dim + j];     // aligned on 16 byte boundary
861             a = a[j .. dim + j];        // misalign for second iteration
862             T[] b = new T[dim + j];
863             b = b[j .. dim + j];
864             T[] c = new T[dim + j];
865             c = c[j .. dim + j];
866 
867             for (int i = 0; i < dim; i++)
868             {   a[i] = cast(T)i;
869                 b[i] = cast(T)(i + 7);
870                 c[i] = cast(T)(i * 2);
871             }
872 
873             a[] = c[];
874             c[] -= 6;
875 
876             for (int i = 0; i < dim; i++)
877             {
878                 if (c[i] != cast(T)(a[i] - 6))
879                 {
880                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
881                     assert(0);
882                 }
883             }
884         }
885     }
886 }
887 
888 /* ======================================================================== */
889 
890 /***********************
891  * Computes:
892  *      a[] -= b[]
893  */
894 
895 T[] _arraySliceSliceMinass_d(T[] a, T[] b)
896 in
897 {
898     assert (a.length == b.length);
899     assert (disjoint(a, b));
900 }
901 body
902 {
903     //printf("_arraySliceSliceMinass_d()\n");
904     auto aptr = a.ptr;
905     auto aend = aptr + a.length;
906     auto bptr = b.ptr;
907 
908     version (D_InlineAsm_X86)
909     {
910         // SSE2 version is 183% faster
911         if (sse2() && a.length >= 8)
912         {
913             auto n = aptr + (a.length & ~7);
914 
915             // Unaligned case
916             asm
917             {
918                 mov ECX, bptr; // right operand
919                 mov ESI, aptr; // destination operand
920                 mov EDI, n; // end comparison
921 
922                 align 8;
923             startsseloopb:
924                 movupd XMM0, [ESI];
925                 movupd XMM1, [ESI+16];
926                 movupd XMM2, [ESI+32];
927                 movupd XMM3, [ESI+48];
928                 add ESI, 64;
929                 movupd XMM4, [ECX];
930                 movupd XMM5, [ECX+16];
931                 movupd XMM6, [ECX+32];
932                 movupd XMM7, [ECX+48];
933                 add ECX, 64;
934                 subpd XMM0, XMM4;
935                 subpd XMM1, XMM5;
936                 subpd XMM2, XMM6;
937                 subpd XMM3, XMM7;
938                 movupd [ESI+ 0-64], XMM0;
939                 movupd [ESI+16-64], XMM1;
940                 movupd [ESI+32-64], XMM2;
941                 movupd [ESI+48-64], XMM3;
942                 cmp ESI, EDI;
943                 jb startsseloopb;
944 
945                 mov aptr, ESI;
946                 mov bptr, ECX;
947             }
948         }
949     }
950 
951     while (aptr < aend)
952         *aptr++ -= *bptr++;
953 
954     return a;
955 }
956 
957 unittest
958 {
959     printf("_arrayExpSliceMinass_d unittest\n");
960     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
961     {
962         version (log) printf("    cpuid %d\n", cpuid);
963 
964         for (int j = 0; j < 2; j++)
965         {
966             const int dim = 67;
967             T[] a = new T[dim + j];     // aligned on 16 byte boundary
968             a = a[j .. dim + j];        // misalign for second iteration
969             T[] b = new T[dim + j];
970             b = b[j .. dim + j];
971             T[] c = new T[dim + j];
972             c = c[j .. dim + j];
973 
974             for (int i = 0; i < dim; i++)
975             {   a[i] = cast(T)i;
976                 b[i] = cast(T)(i + 7);
977                 c[i] = cast(T)(i * 2);
978             }
979 
980             a[] = c[];
981             c[] -= 6;
982 
983             for (int i = 0; i < dim; i++)
984             {
985                 if (c[i] != cast(T)(a[i] - 6))
986                 {
987                     printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
988                     assert(0);
989                 }
990             }
991         }
992     }
993 }
994 
995 /* ======================================================================== */
996 
997 /***********************
998  * Computes:
999  *      a[] = b[] * value
1000  */
1001 
1002 T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
1003 in
1004 {
1005     assert(a.length == b.length);
1006     assert(disjoint(a, b));
1007 }
1008 body
1009 {
1010     //printf("_arraySliceExpMulSliceAssign_d()\n");
1011     auto aptr = a.ptr;
1012     auto aend = aptr + a.length;
1013     auto bptr = b.ptr;
1014 
1015     version (D_InlineAsm_X86)
1016     {
1017         // SSE2 version is 304% faster
1018         if (sse2() && a.length >= 8)
1019         {
1020             auto n = aptr + (a.length & ~7);
1021 
1022             // Unaligned case
1023             asm
1024             {
1025                 mov EAX, bptr;
1026                 mov ESI, aptr;
1027                 mov EDI, n;
1028                 movsd XMM4, value;
1029                 shufpd XMM4, XMM4, 0;
1030 
1031                 align 8;
1032             startsseloop:
1033                 add ESI, 64;
1034                 movupd XMM0, [EAX];
1035                 movupd XMM1, [EAX+16];
1036                 movupd XMM2, [EAX+32];
1037                 movupd XMM3, [EAX+48];
1038                 add EAX, 64;
1039                 mulpd XMM0, XMM4;
1040                 mulpd XMM1, XMM4;
1041                 mulpd XMM2, XMM4;
1042                 mulpd XMM3, XMM4;
1043                 movupd [ESI+ 0-64], XMM0;
1044                 movupd [ESI+16-64], XMM1;
1045                 movupd [ESI+32-64], XMM2;
1046                 movupd [ESI+48-64], XMM3;
1047                 cmp ESI, EDI;
1048                 jb startsseloop;
1049 
1050                 mov aptr, ESI;
1051                 mov bptr, EAX;
1052             }
1053         }
1054     }
1055 
1056     while (aptr < aend)
1057         *aptr++ = *bptr++ * value;
1058 
1059     return a;
1060 }
1061 
1062 unittest
1063 {
1064     printf("_arraySliceExpMulSliceAssign_d unittest\n");
1065     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1066     {
1067         version (log) printf("    cpuid %d\n", cpuid);
1068 
1069         for (int j = 0; j < 2; j++)
1070         {
1071             const int dim = 67;
1072             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1073             a = a[j .. dim + j];        // misalign for second iteration
1074             T[] b = new T[dim + j];
1075             b = b[j .. dim + j];
1076             T[] c = new T[dim + j];
1077             c = c[j .. dim + j];
1078 
1079             for (int i = 0; i < dim; i++)
1080             {   a[i] = cast(T)i;
1081                 b[i] = cast(T)(i + 7);
1082                 c[i] = cast(T)(i * 2);
1083             }
1084 
1085             c[] = a[] * 6;
1086 
1087             for (int i = 0; i < dim; i++)
1088             {
1089                 if (c[i] != cast(T)(a[i] * 6))
1090                 {
1091                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1092                     assert(0);
1093                 }
1094             }
1095         }
1096     }
1097 }
1098 
1099 /* ======================================================================== */
1100 
1101 /***********************
1102  * Computes:
1103  *      a[] = b[] * c[]
1104  */
1105 
1106 T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
1107 in
1108 {
1109         assert(a.length == b.length && b.length == c.length);
1110         assert(disjoint(a, b));
1111         assert(disjoint(a, c));
1112         assert(disjoint(b, c));
1113 }
1114 body
1115 {
1116     //printf("_arraySliceSliceMulSliceAssign_d()\n");
1117     auto aptr = a.ptr;
1118     auto aend = aptr + a.length;
1119     auto bptr = b.ptr;
1120     auto cptr = c.ptr;
1121 
1122     version (D_InlineAsm_X86)
1123     {
1124         // SSE2 version is 329% faster
1125         if (sse2() && a.length >= 8)
1126         {
1127             auto n = aptr + (a.length & ~7);
1128 
1129             // Unaligned case
1130             asm
1131             {
1132                 mov EAX, bptr; // left operand
1133                 mov ECX, cptr; // right operand
1134                 mov ESI, aptr; // destination operand
1135                 mov EDI, n; // end comparison
1136 
1137                 align 8;
1138             startsseloopb:
1139                 movupd XMM0, [EAX];
1140                 movupd XMM1, [EAX+16];
1141                 movupd XMM2, [EAX+32];
1142                 movupd XMM3, [EAX+48];
1143                 add ESI, 64;
1144                 movupd XMM4, [ECX];
1145                 movupd XMM5, [ECX+16];
1146                 movupd XMM6, [ECX+32];
1147                 movupd XMM7, [ECX+48];
1148                 add EAX, 64;
1149                 mulpd XMM0, XMM4;
1150                 mulpd XMM1, XMM5;
1151                 mulpd XMM2, XMM6;
1152                 mulpd XMM3, XMM7;
1153                 add ECX, 64;
1154                 movupd [ESI+ 0-64], XMM0;
1155                 movupd [ESI+16-64], XMM1;
1156                 movupd [ESI+32-64], XMM2;
1157                 movupd [ESI+48-64], XMM3;
1158                 cmp ESI, EDI;
1159                 jb startsseloopb;
1160 
1161                 mov aptr, ESI;
1162                 mov bptr, EAX;
1163                 mov cptr, ECX;
1164             }
1165         }
1166     }
1167 
1168     while (aptr < aend)
1169         *aptr++ = *bptr++ * *cptr++;
1170 
1171     return a;
1172 }
1173 
1174 unittest
1175 {
1176     printf("_arraySliceSliceMulSliceAssign_d unittest\n");
1177     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1178     {
1179         version (log) printf("    cpuid %d\n", cpuid);
1180 
1181         for (int j = 0; j < 2; j++)
1182         {
1183             const int dim = 67;
1184             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1185             a = a[j .. dim + j];        // misalign for second iteration
1186             T[] b = new T[dim + j];
1187             b = b[j .. dim + j];
1188             T[] c = new T[dim + j];
1189             c = c[j .. dim + j];
1190 
1191             for (int i = 0; i < dim; i++)
1192             {   a[i] = cast(T)i;
1193                 b[i] = cast(T)(i + 7);
1194                 c[i] = cast(T)(i * 2);
1195             }
1196 
1197             c[] = a[] * b[];
1198 
1199             for (int i = 0; i < dim; i++)
1200             {
1201                 if (c[i] != cast(T)(a[i] * b[i]))
1202                 {
1203                     printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
1204                     assert(0);
1205                 }
1206             }
1207         }
1208     }
1209 }
1210 
1211 /* ======================================================================== */
1212 
1213 /***********************
1214  * Computes:
1215  *      a[] *= value
1216  */
1217 
1218 T[] _arrayExpSliceMulass_d(T[] a, T value)
1219 {
1220     //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1221     auto aptr = a.ptr;
1222     auto aend = aptr + a.length;
1223 
1224     version (D_InlineAsm_X86)
1225     {
1226         // SSE2 version is 109% faster
1227         if (sse2() && a.length >= 8)
1228         {
1229             auto n = cast(T*)((cast(uint)aend) & ~7);
1230             if (aptr < n)
1231 
1232             // Unaligned case
1233             asm
1234             {
1235                 mov ESI, aptr;
1236                 mov EDI, n;
1237                 movsd XMM4, value;
1238                 shufpd XMM4, XMM4, 0;
1239 
1240                 align 8;
1241             startsseloopa:
1242                 movupd XMM0, [ESI];
1243                 movupd XMM1, [ESI+16];
1244                 movupd XMM2, [ESI+32];
1245                 movupd XMM3, [ESI+48];
1246                 add ESI, 64;
1247                 mulpd XMM0, XMM4;
1248                 mulpd XMM1, XMM4;
1249                 mulpd XMM2, XMM4;
1250                 mulpd XMM3, XMM4;
1251                 movupd [ESI+ 0-64], XMM0;
1252                 movupd [ESI+16-64], XMM1;
1253                 movupd [ESI+32-64], XMM2;
1254                 movupd [ESI+48-64], XMM3;
1255                 cmp ESI, EDI;
1256                 jb startsseloopa;
1257 
1258                 mov aptr, ESI;
1259             }
1260         }
1261     }
1262 
1263     while (aptr < aend)
1264         *aptr++ *= value;
1265 
1266     return a;
1267 }
1268 
1269 unittest
1270 {
1271     printf("_arrayExpSliceMulass_d unittest\n");
1272     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1273     {
1274         version (log) printf("    cpuid %d\n", cpuid);
1275 
1276         for (int j = 0; j < 2; j++)
1277         {
1278             const int dim = 67;
1279             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1280             a = a[j .. dim + j];        // misalign for second iteration
1281             T[] b = new T[dim + j];
1282             b = b[j .. dim + j];
1283             T[] c = new T[dim + j];
1284             c = c[j .. dim + j];
1285 
1286             for (int i = 0; i < dim; i++)
1287             {   a[i] = cast(T)i;
1288                 b[i] = cast(T)(i + 7);
1289                 c[i] = cast(T)(i * 2);
1290             }
1291 
1292             a[] = c[];
1293             c[] *= 6;
1294 
1295             for (int i = 0; i < dim; i++)
1296             {
1297                 if (c[i] != cast(T)(a[i] * 6))
1298                 {
1299                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1300                     assert(0);
1301                 }
1302             }
1303         }
1304     }
1305 }
1306 
1307 /* ======================================================================== */
1308 
1309 /***********************
1310  * Computes:
1311  *      a[] *= b[]
1312  */
1313 
1314 T[] _arraySliceSliceMulass_d(T[] a, T[] b)
1315 in
1316 {
1317     assert (a.length == b.length);
1318     assert (disjoint(a, b));
1319 }
1320 body
1321 {
1322     //printf("_arraySliceSliceMulass_d()\n");
1323     auto aptr = a.ptr;
1324     auto aend = aptr + a.length;
1325     auto bptr = b.ptr;
1326 
1327     version (D_InlineAsm_X86)
1328     {
1329         // SSE2 version is 205% faster
1330         if (sse2() && a.length >= 8)
1331         {
1332             auto n = aptr + (a.length & ~7);
1333 
1334             // Unaligned case
1335             asm
1336             {
1337                 mov ECX, bptr; // right operand
1338                 mov ESI, aptr; // destination operand
1339                 mov EDI, n; // end comparison
1340 
1341                 align 8;
1342             startsseloopb:
1343                 movupd XMM0, [ESI];
1344                 movupd XMM1, [ESI+16];
1345                 movupd XMM2, [ESI+32];
1346                 movupd XMM3, [ESI+48];
1347                 add ESI, 64;
1348                 movupd XMM4, [ECX];
1349                 movupd XMM5, [ECX+16];
1350                 movupd XMM6, [ECX+32];
1351                 movupd XMM7, [ECX+48];
1352                 add ECX, 64;
1353                 mulpd XMM0, XMM4;
1354                 mulpd XMM1, XMM5;
1355                 mulpd XMM2, XMM6;
1356                 mulpd XMM3, XMM7;
1357                 movupd [ESI+ 0-64], XMM0;
1358                 movupd [ESI+16-64], XMM1;
1359                 movupd [ESI+32-64], XMM2;
1360                 movupd [ESI+48-64], XMM3;
1361                 cmp ESI, EDI;
1362                 jb startsseloopb;
1363 
1364                 mov aptr, ESI;
1365                 mov bptr, ECX;
1366             }
1367         }
1368     }
1369 
1370     while (aptr < aend)
1371         *aptr++ *= *bptr++;
1372 
1373     return a;
1374 }
1375 
1376 unittest
1377 {
1378     printf("_arrayExpSliceMulass_d unittest\n");
1379     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1380     {
1381         version (log) printf("    cpuid %d\n", cpuid);
1382 
1383         for (int j = 0; j < 2; j++)
1384         {
1385             const int dim = 67;
1386             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1387             a = a[j .. dim + j];        // misalign for second iteration
1388             T[] b = new T[dim + j];
1389             b = b[j .. dim + j];
1390             T[] c = new T[dim + j];
1391             c = c[j .. dim + j];
1392 
1393             for (int i = 0; i < dim; i++)
1394             {   a[i] = cast(T)i;
1395                 b[i] = cast(T)(i + 7);
1396                 c[i] = cast(T)(i * 2);
1397             }
1398 
1399             a[] = c[];
1400             c[] *= 6;
1401 
1402             for (int i = 0; i < dim; i++)
1403             {
1404                 if (c[i] != cast(T)(a[i] * 6))
1405                 {
1406                     printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
1407                     assert(0);
1408                 }
1409             }
1410         }
1411     }
1412 }
1413 
1414 /* ======================================================================== */
1415 
1416 /***********************
1417  * Computes:
1418  *      a[] = b[] / value
1419  */
1420 
1421 T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
1422 in
1423 {
1424     assert(a.length == b.length);
1425     assert(disjoint(a, b));
1426 }
1427 body
1428 {
1429     //printf("_arraySliceExpDivSliceAssign_d()\n");
1430     auto aptr = a.ptr;
1431     auto aend = aptr + a.length;
1432     auto bptr = b.ptr;
1433 
1434     /* Multiplying by the reciprocal is faster, but does
1435      * not produce as accurate an answer.
1436      */
1437     T recip = cast(T)1 / value;
1438 
1439     version (D_InlineAsm_X86)
1440     {
1441         // SSE2 version is 299% faster
1442         if (sse2() && a.length >= 8)
1443         {
1444             auto n = aptr + (a.length & ~7);
1445 
1446             // Unaligned case
1447             asm
1448             {
1449                 mov EAX, bptr;
1450                 mov ESI, aptr;
1451                 mov EDI, n;
1452                 movsd XMM4, recip;
1453                 //movsd XMM4, value
1454                 //rcpsd XMM4, XMM4
1455                 shufpd XMM4, XMM4, 0;
1456 
1457                 align 8;
1458             startsseloop:
1459                 add ESI, 64;
1460                 movupd XMM0, [EAX];
1461                 movupd XMM1, [EAX+16];
1462                 movupd XMM2, [EAX+32];
1463                 movupd XMM3, [EAX+48];
1464                 add EAX, 64;
1465                 mulpd XMM0, XMM4;
1466                 mulpd XMM1, XMM4;
1467                 mulpd XMM2, XMM4;
1468                 mulpd XMM3, XMM4;
1469                 //divpd XMM0, XMM4;
1470                 //divpd XMM1, XMM4;
1471                 //divpd XMM2, XMM4;
1472                 //divpd XMM3, XMM4;
1473                 movupd [ESI+ 0-64], XMM0;
1474                 movupd [ESI+16-64], XMM1;
1475                 movupd [ESI+32-64], XMM2;
1476                 movupd [ESI+48-64], XMM3;
1477                 cmp ESI, EDI;
1478                 jb startsseloop;
1479 
1480                 mov aptr, ESI;
1481                 mov bptr, EAX;
1482             }
1483         }
1484     }
1485 
1486     while (aptr < aend)
1487     {
1488         *aptr++ = *bptr++ / value;
1489         //*aptr++ = *bptr++ * recip;
1490     }
1491 
1492     return a;
1493 }
1494 
1495 unittest
1496 {
1497     printf("_arraySliceExpDivSliceAssign_d unittest\n");
1498     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1499     {
1500         version (log) printf("    cpuid %d\n", cpuid);
1501 
1502         for (int j = 0; j < 2; j++)
1503         {
1504             const int dim = 67;
1505             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1506             a = a[j .. dim + j];        // misalign for second iteration
1507             T[] b = new T[dim + j];
1508             b = b[j .. dim + j];
1509             T[] c = new T[dim + j];
1510             c = c[j .. dim + j];
1511 
1512             for (int i = 0; i < dim; i++)
1513             {   a[i] = cast(T)i;
1514                 b[i] = cast(T)(i + 7);
1515                 c[i] = cast(T)(i * 2);
1516             }
1517 
1518             c[] = a[] / 8;
1519 
1520             for (int i = 0; i < dim; i++)
1521             {
1522                 //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
1523                 if (c[i] != cast(T)(a[i] / 8))
1524                 {
1525                     printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
1526                     assert(0);
1527                 }
1528             }
1529         }
1530     }
1531 }
1532 
1533 /* ======================================================================== */
1534 
1535 /***********************
1536  * Computes:
1537  *      a[] /= value
1538  */
1539 
1540 T[] _arrayExpSliceDivass_d(T[] a, T value)
1541 {
1542     //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1543     auto aptr = a.ptr;
1544     auto aend = aptr + a.length;
1545 
1546     /* Multiplying by the reciprocal is faster, but does
1547      * not produce as accurate an answer.
1548      */
1549     T recip = cast(T)1 / value;
1550 
1551     version (D_InlineAsm_X86)
1552     {
1553         // SSE2 version is 65% faster
1554         if (sse2() && a.length >= 8)
1555         {
1556             auto n = aptr + (a.length & ~7);
1557 
1558             // Unaligned case
1559             asm
1560             {
1561                 mov ESI, aptr;
1562                 mov EDI, n;
1563                 movsd XMM4, recip;
1564                 //movsd XMM4, value
1565                 //rcpsd XMM4, XMM4
1566                 shufpd XMM4, XMM4, 0;
1567 
1568                 align 8;
1569             startsseloopa:
1570                 movupd XMM0, [ESI];
1571                 movupd XMM1, [ESI+16];
1572                 movupd XMM2, [ESI+32];
1573                 movupd XMM3, [ESI+48];
1574                 add ESI, 64;
1575                 mulpd XMM0, XMM4;
1576                 mulpd XMM1, XMM4;
1577                 mulpd XMM2, XMM4;
1578                 mulpd XMM3, XMM4;
1579                 //divpd XMM0, XMM4;
1580                 //divpd XMM1, XMM4;
1581                 //divpd XMM2, XMM4;
1582                 //divpd XMM3, XMM4;
1583                 movupd [ESI+ 0-64], XMM0;
1584                 movupd [ESI+16-64], XMM1;
1585                 movupd [ESI+32-64], XMM2;
1586                 movupd [ESI+48-64], XMM3;
1587                 cmp ESI, EDI;
1588                 jb startsseloopa;
1589 
1590                 mov aptr, ESI;
1591             }
1592         }
1593     }
1594 
1595     while (aptr < aend)
1596         *aptr++ *= recip;
1597 
1598     return a;
1599 }
1600 
1601 
1602 unittest
1603 {
1604     printf("_arrayExpSliceDivass_d unittest\n");
1605     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1606     {
1607         version (log) printf("    cpuid %d\n", cpuid);
1608 
1609         for (int j = 0; j < 2; j++)
1610         {
1611             const int dim = 67;
1612             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1613             a = a[j .. dim + j];        // misalign for second iteration
1614             T[] b = new T[dim + j];
1615             b = b[j .. dim + j];
1616             T[] c = new T[dim + j];
1617             c = c[j .. dim + j];
1618 
1619             for (int i = 0; i < dim; i++)
1620             {   a[i] = cast(T)i;
1621                 b[i] = cast(T)(i + 7);
1622                 c[i] = cast(T)(i * 2);
1623             }
1624 
1625             a[] = c[];
1626             c[] /= 8;
1627 
1628             for (int i = 0; i < dim; i++)
1629             {
1630                 if (c[i] != cast(T)(a[i] / 8))
1631                 {
1632                     printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
1633                     assert(0);
1634                 }
1635             }
1636         }
1637     }
1638 }
1639 
1640 
1641 /* ======================================================================== */
1642 
1643 /***********************
1644  * Computes:
1645  *      a[] -= b[] * value
1646  */
1647 
1648 T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
1649 {
1650     return _arraySliceExpMulSliceAddass_d(a, -value, b);
1651 }
1652 
1653 /***********************
1654  * Computes:
1655  *      a[] += b[] * value
1656  */
1657 
1658 T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
1659 in
1660 {
1661         assert(a.length == b.length);
1662         assert(disjoint(a, b));
1663 }
1664 body
1665 {
1666     auto aptr = a.ptr;
1667     auto aend = aptr + a.length;
1668     auto bptr = b.ptr;
1669 
1670     // Handle remainder
1671     while (aptr < aend)
1672         *aptr++ += *bptr++ * value;
1673 
1674     return a;
1675 }
1676 
1677 unittest
1678 {
1679     printf("_arraySliceExpMulSliceAddass_d unittest\n");
1680 
1681     cpuid = 1;
1682     {
1683         version (log) printf("    cpuid %d\n", cpuid);
1684 
1685         for (int j = 0; j < 1; j++)
1686         {
1687             const int dim = 67;
1688             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1689             a = a[j .. dim + j];        // misalign for second iteration
1690             T[] b = new T[dim + j];
1691             b = b[j .. dim + j];
1692             T[] c = new T[dim + j];
1693             c = c[j .. dim + j];
1694 
1695             for (int i = 0; i < dim; i++)
1696             {   a[i] = cast(T)i;
1697                 b[i] = cast(T)(i + 7);
1698                 c[i] = cast(T)(i * 2);
1699             }
1700 
1701             b[] = c[];
1702             c[] += a[] * 6;
1703 
1704             for (int i = 0; i < dim; i++)
1705             {
1706                 //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1707                 if (c[i] != cast(T)(b[i] + a[i] * 6))
1708                 {
1709                     printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
1710                     assert(0);
1711                 }
1712             }
1713         }
1714     }
1715 }