1 /***************************
2  * D programming language http://www.digitalmars.com/d/
3  * Runtime support for byte array operations.
4  * Based on code originally written by Burton Radons.
5  * Placed in public domain.
6  */
7 
8 /* Contains MMX versions of certain operations for dchar, int,
9  * and uint ('w', 'i' and 'k' suffixes).
10  */
11 
12 module rt.compiler.gdc.rt.arrayint;
13 
14 private import CPUid = rt.compiler.util.cpuid;
15 
16 debug(UnitTest)
17 {
18     private extern(C) int printf(char*,...);
19     /* This is so unit tests will test every CPU variant
20      */
21     int cpuid;
22     const int CPUID_MAX = 4;
23     bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
24     bool sse()      { return cpuid == 2 && CPUid.sse(); }
25     bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
26     bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
27 }
28 else
29 {
30     alias CPUid.mmx mmx;
31     alias CPUid.sse sse;
32     alias CPUid.sse2 sse2;
33     alias CPUid.amd3dnow amd3dnow;
34 }
35 
36 //version = log;
37 
38 bool disjoint(T)(T[] a, T[] b)
39 {
40     return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
41 }
42 
43 alias int T;
44 
45 extern (C):
46 
47 /* ======================================================================== */
48 
49 /***********************
50  * Computes:
51  *      a[] = b[] + value
52  */
53 
54 T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
55 {
56     return _arraySliceExpAddSliceAssign_i(a, value, b);
57 }
58 
59 T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
60 {
61     return _arraySliceExpAddSliceAssign_i(a, value, b);
62 }
63 
64 T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
65 in
66 {
67     assert(a.length == b.length);
68     assert(disjoint(a, b));
69 }
70 body
71 {
72     //printf("_arraySliceExpAddSliceAssign_i()\n");
73     auto aptr = a.ptr;
74     auto aend = aptr + a.length;
75     auto bptr = b.ptr;
76 
77     version (D_InlineAsm_X86)
78     {
79         // SSE2 aligned version is 380% faster
80         if (sse2() && a.length >= 8)
81         {
82             auto n = aptr + (a.length & ~7);
83 
84             uint l = value;
85 
86             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
87             {
88                 asm // unaligned case
89                 {
90                     mov ESI, aptr;
91                     mov EDI, n;
92                     mov EAX, bptr;
93                     movd XMM2, l;
94                     pshufd XMM2, XMM2, 0;
95 
96                     align 4;
97                 startaddsse2u:
98                     add ESI, 32;
99                     movdqu XMM0, [EAX];
100                     movdqu XMM1, [EAX+16];
101                     add EAX, 32;
102                     paddd XMM0, XMM2;
103                     paddd XMM1, XMM2;
104                     movdqu [ESI   -32], XMM0;
105                     movdqu [ESI+16-32], XMM1;
106                     cmp ESI, EDI;
107                     jb startaddsse2u;
108 
109                     mov aptr, ESI;
110                     mov bptr, EAX;
111                 }
112             }
113             else
114             {
115                 asm // aligned case
116                 {
117                     mov ESI, aptr;
118                     mov EDI, n;
119                     mov EAX, bptr;
120                     movd XMM2, l;
121                     pshufd XMM2, XMM2, 0;
122 
123                     align 4;
124                 startaddsse2a:
125                     add ESI, 32;
126                     movdqa XMM0, [EAX];
127                     movdqa XMM1, [EAX+16];
128                     add EAX, 32;
129                     paddd XMM0, XMM2;
130                     paddd XMM1, XMM2;
131                     movdqa [ESI   -32], XMM0;
132                     movdqa [ESI+16-32], XMM1;
133                     cmp ESI, EDI;
134                     jb startaddsse2a;
135 
136                     mov aptr, ESI;
137                     mov bptr, EAX;
138                 }
139             }
140         }
141         else
142         // MMX version is 298% faster
143         if (mmx() && a.length >= 4)
144         {
145             auto n = aptr + (a.length & ~3);
146 
147             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
148 
149             asm
150             {
151                 mov ESI, aptr;
152                 mov EDI, n;
153                 mov EAX, bptr;
154                 movq MM2, l;
155 
156                 align 4;
157             startmmx:
158                 add ESI, 16;
159                 movq MM0, [EAX];
160                 movq MM1, [EAX+8];
161                 add EAX, 16;
162                 paddd MM0, MM2;
163                 paddd MM1, MM2;
164                 movq [ESI  -16], MM0;
165                 movq [ESI+8-16], MM1;
166                 cmp ESI, EDI;
167                 jb startmmx;
168 
169                 emms;
170                 mov aptr, ESI;
171                 mov bptr, EAX;
172             }
173         }
174         else
175         if (a.length >= 2)
176         {
177             auto n = aptr + (a.length & ~1);
178 
179             asm
180             {
181                 mov ESI, aptr;
182                 mov EDI, n;
183                 mov EAX, bptr;
184                 mov EDX, value;
185 
186                 align 4;
187             start386:
188                 add ESI, 8;
189                 mov EBX, [EAX];
190                 mov ECX, [EAX+4];
191                 add EAX, 8;
192                 add EBX, EDX;
193                 add ECX, EDX;
194                 mov [ESI  -8], EBX;
195                 mov [ESI+4-8], ECX;
196                 cmp ESI, EDI;
197                 jb start386;
198 
199                 mov aptr, ESI;
200                 mov bptr, EAX;
201             }
202         }
203     }
204 
205     while (aptr < aend)
206         *aptr++ = *bptr++ + value;
207 
208     return a;
209 }
210 
211 unittest
212 {
213     printf("_arraySliceExpAddSliceAssign_i unittest\n");
214 
215     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
216     {
217         version (log) printf("    cpuid %d\n", cpuid);
218 
219         for (int j = 0; j < 2; j++)
220         {
221             const int dim = 67;
222             T[] a = new T[dim + j];     // aligned on 16 byte boundary
223             a = a[j .. dim + j];        // misalign for second iteration
224             T[] b = new T[dim + j];
225             b = b[j .. dim + j];
226             T[] c = new T[dim + j];
227             c = c[j .. dim + j];
228 
229             for (int i = 0; i < dim; i++)
230             {   a[i] = cast(T)i;
231                 b[i] = cast(T)(i + 7);
232                 c[i] = cast(T)(i * 2);
233             }
234 
235             c[] = a[] + 6;
236 
237             for (int i = 0; i < dim; i++)
238             {
239                 if (c[i] != cast(T)(a[i] + 6))
240                 {
241                     printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
242                     assert(0);
243                 }
244             }
245         }
246     }
247 }
248 
249 
250 /* ======================================================================== */
251 
252 /***********************
253  * Computes:
254  *      a[] = b[] + c[]
255  */
256 
257 T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
258 {
259     return _arraySliceSliceAddSliceAssign_i(a, c, b);
260 }
261 
262 T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
263 {
264     return _arraySliceSliceAddSliceAssign_i(a, c, b);
265 }
266 
267 T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
268 in
269 {
270         assert(a.length == b.length && b.length == c.length);
271         assert(disjoint(a, b));
272         assert(disjoint(a, c));
273         assert(disjoint(b, c));
274 }
275 body
276 {
277     //printf("_arraySliceSliceAddSliceAssign_i()\n");
278     auto aptr = a.ptr;
279     auto aend = aptr + a.length;
280     auto bptr = b.ptr;
281     auto cptr = c.ptr;
282 
283     version (D_InlineAsm_X86)
284     {
285         // SSE2 aligned version is 1710% faster
286         if (sse2() && a.length >= 8)
287         {
288             auto n = aptr + (a.length & ~7);
289 
290             if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
291             {
292                 asm // unaligned case
293                 {
294                     mov ESI, aptr;
295                     mov EDI, n;
296                     mov EAX, bptr;
297                     mov ECX, cptr;
298 
299                     align 4;
300                 startsse2u:
301                     add ESI, 32;
302                     movdqu XMM0, [EAX];
303                     movdqu XMM2, [ECX];
304                     movdqu XMM1, [EAX+16];
305                     movdqu XMM3, [ECX+16];
306                     add EAX, 32;
307                     add ECX, 32;
308                     paddd XMM0, XMM2;
309                     paddd XMM1, XMM3;
310                     movdqu [ESI   -32], XMM0;
311                     movdqu [ESI+16-32], XMM1;
312                     cmp ESI, EDI;
313                     jb startsse2u;
314 
315                     mov aptr, ESI;
316                     mov bptr, EAX;
317                     mov cptr, ECX;
318                 }
319             }
320             else
321             {
322                 asm // aligned case
323                 {
324                     mov ESI, aptr;
325                     mov EDI, n;
326                     mov EAX, bptr;
327                     mov ECX, cptr;
328 
329                     align 4;
330                 startsse2a:
331                     add ESI, 32;
332                     movdqa XMM0, [EAX];
333                     movdqa XMM2, [ECX];
334                     movdqa XMM1, [EAX+16];
335                     movdqa XMM3, [ECX+16];
336                     add EAX, 32;
337                     add ECX, 32;
338                     paddd XMM0, XMM2;
339                     paddd XMM1, XMM3;
340                     movdqa [ESI   -32], XMM0;
341                     movdqa [ESI+16-32], XMM1;
342                     cmp ESI, EDI;
343                     jb startsse2a;
344 
345                     mov aptr, ESI;
346                     mov bptr, EAX;
347                     mov cptr, ECX;
348                 }
349             }
350         }
351         else
352         // MMX version is 995% faster
353         if (mmx() && a.length >= 4)
354         {
355             auto n = aptr + (a.length & ~3);
356 
357             asm
358             {
359                 mov ESI, aptr;
360                 mov EDI, n;
361                 mov EAX, bptr;
362                 mov ECX, cptr;
363 
364                 align 4;
365             startmmx:
366                 add ESI, 16;
367                 movq MM0, [EAX];
368                 movq MM2, [ECX];
369                 movq MM1, [EAX+8];
370                 movq MM3, [ECX+8];
371                 add EAX, 16;
372                 add ECX, 16;
373                 paddd MM0, MM2;
374                 paddd MM1, MM3;
375                 movq [ESI  -16], MM0;
376                 movq [ESI+8-16], MM1;
377                 cmp ESI, EDI;
378                 jb startmmx;
379 
380                 emms;
381                 mov aptr, ESI;
382                 mov bptr, EAX;
383                 mov cptr, ECX;
384             }
385         }
386     }
387 
388 normal:
389     while (aptr < aend)
390         *aptr++ = *bptr++ + *cptr++;
391 
392     return a;
393 }
394 
395 unittest
396 {
397     printf("_arraySliceSliceAddSliceAssign_i unittest\n");
398 
399     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
400     {
401         version (log) printf("    cpuid %d\n", cpuid);
402 
403         for (int j = 0; j < 2; j++)
404         {
405             const int dim = 67;
406             T[] a = new T[dim + j];     // aligned on 16 byte boundary
407             a = a[j .. dim + j];        // misalign for second iteration
408             T[] b = new T[dim + j];
409             b = b[j .. dim + j];
410             T[] c = new T[dim + j];
411             c = c[j .. dim + j];
412 
413             for (int i = 0; i < dim; i++)
414             {   a[i] = cast(T)i;
415                 b[i] = cast(T)(i + 7);
416                 c[i] = cast(T)(i * 2);
417             }
418 
419             c[] = a[] + b[];
420 
421             for (int i = 0; i < dim; i++)
422             {
423                 if (c[i] != cast(T)(a[i] + b[i]))
424                 {
425                     printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
426                     assert(0);
427                 }
428             }
429         }
430     }
431 }
432 
433 
434 /* ======================================================================== */
435 
436 /***********************
437  * Computes:
438  *      a[] += value
439  */
440 
441 T[] _arrayExpSliceAddass_w(T[] a, T value)
442 {
443     return _arrayExpSliceAddass_i(a, value);
444 }
445 
446 T[] _arrayExpSliceAddass_k(T[] a, T value)
447 {
448     return _arrayExpSliceAddass_i(a, value);
449 }
450 
451 T[] _arrayExpSliceAddass_i(T[] a, T value)
452 {
453     //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
454     auto aptr = a.ptr;
455     auto aend = aptr + a.length;
456 
457     version (D_InlineAsm_X86)
458     {
459         // SSE2 aligned version is 83% faster
460         if (sse2() && a.length >= 8)
461         {
462             auto n = aptr + (a.length & ~7);
463 
464             uint l = value;
465 
466             if (((cast(uint) aptr) & 15) != 0)
467             {
468                 asm // unaligned case
469                 {
470                     mov ESI, aptr;
471                     mov EDI, n;
472                     movd XMM2, l;
473                     pshufd XMM2, XMM2, 0;
474 
475                     align 4;
476                 startaddsse2u:
477                     movdqu XMM0, [ESI];
478                     movdqu XMM1, [ESI+16];
479                     add ESI, 32;
480                     paddd XMM0, XMM2;
481                     paddd XMM1, XMM2;
482                     movdqu [ESI   -32], XMM0;
483                     movdqu [ESI+16-32], XMM1;
484                     cmp ESI, EDI;
485                     jb startaddsse2u;
486 
487                     mov aptr, ESI;
488                 }
489             }
490             else
491             {
492                 asm // aligned case
493                 {
494                     mov ESI, aptr;
495                     mov EDI, n;
496                     movd XMM2, l;
497                     pshufd XMM2, XMM2, 0;
498 
499                     align 4;
500                 startaddsse2a:
501                     movdqa XMM0, [ESI];
502                     movdqa XMM1, [ESI+16];
503                     add ESI, 32;
504                     paddd XMM0, XMM2;
505                     paddd XMM1, XMM2;
506                     movdqa [ESI   -32], XMM0;
507                     movdqa [ESI+16-32], XMM1;
508                     cmp ESI, EDI;
509                     jb startaddsse2a;
510 
511                     mov aptr, ESI;
512                 }
513             }
514         }
515         else
516         // MMX version is 81% faster
517         if (mmx() && a.length >= 4)
518         {
519             auto n = aptr + (a.length & ~3);
520 
521             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
522 
523             asm
524             {
525                 mov ESI, aptr;
526                 mov EDI, n;
527                 movq MM2, l;
528 
529                 align 4;
530             startmmx:
531                 movq MM0, [ESI];
532                 movq MM1, [ESI+8];
533                 add ESI, 16;
534                 paddd MM0, MM2;
535                 paddd MM1, MM2;
536                 movq [ESI  -16], MM0;
537                 movq [ESI+8-16], MM1;
538                 cmp ESI, EDI;
539                 jb startmmx;
540 
541                 emms;
542                 mov aptr, ESI;
543             }
544         }
545         else
546         if (a.length >= 2)
547         {
548             auto n = aptr + (a.length & ~1);
549 
550             asm
551             {
552                 mov ESI, aptr;
553                 mov EDI, n;
554                 mov EDX, value;
555 
556                 align 4;
557             start386:
558                 mov EBX, [ESI];
559                 mov ECX, [ESI+4];
560                 add ESI, 8;
561                 add EBX, EDX;
562                 add ECX, EDX;
563                 mov [ESI  -8], EBX;
564                 mov [ESI+4-8], ECX;
565                 cmp ESI, EDI;
566                 jb start386;
567 
568                 mov aptr, ESI;
569             }
570         }
571     }
572 
573     while (aptr < aend)
574         *aptr++ += value;
575 
576     return a;
577 }
578 
579 unittest
580 {
581     printf("_arrayExpSliceAddass_i unittest\n");
582 
583     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
584     {
585         version (log) printf("    cpuid %d\n", cpuid);
586 
587         for (int j = 0; j < 2; j++)
588         {
589             const int dim = 67;
590             T[] a = new T[dim + j];     // aligned on 16 byte boundary
591             a = a[j .. dim + j];        // misalign for second iteration
592             T[] b = new T[dim + j];
593             b = b[j .. dim + j];
594             T[] c = new T[dim + j];
595             c = c[j .. dim + j];
596 
597             for (int i = 0; i < dim; i++)
598             {   a[i] = cast(T)i;
599                 b[i] = cast(T)(i + 7);
600                 c[i] = cast(T)(i * 2);
601             }
602 
603             a[] = c[];
604             a[] += 6;
605 
606             for (int i = 0; i < dim; i++)
607             {
608                 if (a[i] != cast(T)(c[i] + 6))
609                 {
610                     printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
611                     assert(0);
612                 }
613             }
614         }
615     }
616 }
617 
618 
619 /* ======================================================================== */
620 
621 /***********************
622  * Computes:
623  *      a[] += b[]
624  */
625 
626 T[] _arraySliceSliceAddass_w(T[] a, T[] b)
627 {
628     return _arraySliceSliceAddass_i(a, b);
629 }
630 
631 T[] _arraySliceSliceAddass_k(T[] a, T[] b)
632 {
633     return _arraySliceSliceAddass_i(a, b);
634 }
635 
636 T[] _arraySliceSliceAddass_i(T[] a, T[] b)
637 in
638 {
639     assert (a.length == b.length);
640     assert (disjoint(a, b));
641 }
642 body
643 {
644     //printf("_arraySliceSliceAddass_i()\n");
645     auto aptr = a.ptr;
646     auto aend = aptr + a.length;
647     auto bptr = b.ptr;
648 
649     version (D_InlineAsm_X86)
650     {
651         // SSE2 aligned version is 695% faster
652         if (sse2() && a.length >= 8)
653         {
654             auto n = aptr + (a.length & ~7);
655 
656             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
657             {
658                 asm // unaligned case
659                 {
660                     mov ESI, aptr;
661                     mov EDI, n;
662                     mov ECX, bptr;
663 
664                     align 4;
665                 startsse2u:
666                     movdqu XMM0, [ESI];
667                     movdqu XMM2, [ECX];
668                     movdqu XMM1, [ESI+16];
669                     movdqu XMM3, [ECX+16];
670                     add ESI, 32;
671                     add ECX, 32;
672                     paddd XMM0, XMM2;
673                     paddd XMM1, XMM3;
674                     movdqu [ESI   -32], XMM0;
675                     movdqu [ESI+16-32], XMM1;
676                     cmp ESI, EDI;
677                     jb startsse2u;
678 
679                     mov aptr, ESI;
680                     mov bptr, ECX;
681                 }
682             }
683             else
684             {
685                 asm // aligned case
686                 {
687                     mov ESI, aptr;
688                     mov EDI, n;
689                     mov ECX, bptr;
690 
691                     align 4;
692                 startsse2a:
693                     movdqa XMM0, [ESI];
694                     movdqa XMM2, [ECX];
695                     movdqa XMM1, [ESI+16];
696                     movdqa XMM3, [ECX+16];
697                     add ESI, 32;
698                     add ECX, 32;
699                     paddd XMM0, XMM2;
700                     paddd XMM1, XMM3;
701                     movdqa [ESI   -32], XMM0;
702                     movdqa [ESI+16-32], XMM1;
703                     cmp ESI, EDI;
704                     jb startsse2a;
705 
706                     mov aptr, ESI;
707                     mov bptr, ECX;
708                 }
709             }
710         }
711         else
712         // MMX version is 471% faster
713         if (mmx() && a.length >= 4)
714         {
715             auto n = aptr + (a.length & ~3);
716 
717             asm
718             {
719                 mov ESI, aptr;
720                 mov EDI, n;
721                 mov ECX, bptr;
722 
723                 align 4;
724             startmmx:
725                 movq MM0, [ESI];
726                 movq MM2, [ECX];
727                 movq MM1, [ESI+8];
728                 movq MM3, [ECX+8];
729                 add ESI, 16;
730                 add ECX, 16;
731                 paddd MM0, MM2;
732                 paddd MM1, MM3;
733                 movq [ESI  -16], MM0;
734                 movq [ESI+8-16], MM1;
735                 cmp ESI, EDI;
736                 jb startmmx;
737 
738                 emms;
739                 mov aptr, ESI;
740                 mov bptr, ECX;
741             }
742         }
743     }
744 
745 normal:
746     while (aptr < aend)
747         *aptr++ += *bptr++;
748 
749     return a;
750 }
751 
752 unittest
753 {
754     printf("_arraySliceSliceAddass_i unittest\n");
755 
756     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
757     {
758         version (log) printf("    cpuid %d\n", cpuid);
759 
760         for (int j = 0; j < 2; j++)
761         {
762             const int dim = 67;
763             T[] a = new T[dim + j];     // aligned on 16 byte boundary
764             a = a[j .. dim + j];        // misalign for second iteration
765             T[] b = new T[dim + j];
766             b = b[j .. dim + j];
767             T[] c = new T[dim + j];
768             c = c[j .. dim + j];
769 
770             for (int i = 0; i < dim; i++)
771             {   a[i] = cast(T)i;
772                 b[i] = cast(T)(i + 7);
773                 c[i] = cast(T)(i * 2);
774             }
775 
776             b[] = c[];
777             c[] += a[];
778 
779             for (int i = 0; i < dim; i++)
780             {
781                 if (c[i] != cast(T)(b[i] + a[i]))
782                 {
783                     printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
784                     assert(0);
785                 }
786             }
787         }
788     }
789 }
790 
791 
792 /* ======================================================================== */
793 
794 /***********************
795  * Computes:
796  *      a[] = b[] - value
797  */
798 
799 T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
800 {
801     return _arraySliceExpMinSliceAssign_i(a, value, b);
802 }
803 
804 T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
805 {
806     return _arraySliceExpMinSliceAssign_i(a, value, b);
807 }
808 
809 T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
810 in
811 {
812     assert(a.length == b.length);
813     assert(disjoint(a, b));
814 }
815 body
816 {
817     //printf("_arraySliceExpMinSliceAssign_i()\n");
818     auto aptr = a.ptr;
819     auto aend = aptr + a.length;
820     auto bptr = b.ptr;
821 
822     version (D_InlineAsm_X86)
823     {
824         // SSE2 aligned version is 400% faster
825         if (sse2() && a.length >= 8)
826         {
827             auto n = aptr + (a.length & ~7);
828 
829             uint l = value;
830 
831             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
832             {
833                 asm // unaligned case
834                 {
835                     mov ESI, aptr;
836                     mov EDI, n;
837                     mov EAX, bptr;
838                     movd XMM2, l;
839                     pshufd XMM2, XMM2, 0;
840 
841                     align 4;
842                 startaddsse2u:
843                     add ESI, 32;
844                     movdqu XMM0, [EAX];
845                     movdqu XMM1, [EAX+16];
846                     add EAX, 32;
847                     psubd XMM0, XMM2;
848                     psubd XMM1, XMM2;
849                     movdqu [ESI   -32], XMM0;
850                     movdqu [ESI+16-32], XMM1;
851                     cmp ESI, EDI;
852                     jb startaddsse2u;
853 
854                     mov aptr, ESI;
855                     mov bptr, EAX;
856                 }
857             }
858             else
859             {
860                 asm // aligned case
861                 {
862                     mov ESI, aptr;
863                     mov EDI, n;
864                     mov EAX, bptr;
865                     movd XMM2, l;
866                     pshufd XMM2, XMM2, 0;
867 
868                     align 4;
869                 startaddsse2a:
870                     add ESI, 32;
871                     movdqa XMM0, [EAX];
872                     movdqa XMM1, [EAX+16];
873                     add EAX, 32;
874                     psubd XMM0, XMM2;
875                     psubd XMM1, XMM2;
876                     movdqa [ESI   -32], XMM0;
877                     movdqa [ESI+16-32], XMM1;
878                     cmp ESI, EDI;
879                     jb startaddsse2a;
880 
881                     mov aptr, ESI;
882                     mov bptr, EAX;
883                 }
884             }
885         }
886         else
887         // MMX version is 315% faster
888         if (mmx() && a.length >= 4)
889         {
890             auto n = aptr + (a.length & ~3);
891 
892             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
893 
894             asm
895             {
896                 mov ESI, aptr;
897                 mov EDI, n;
898                 mov EAX, bptr;
899                 movq MM2, l;
900 
901                 align 4;
902             startmmx:
903                 add ESI, 16;
904                 movq MM0, [EAX];
905                 movq MM1, [EAX+8];
906                 add EAX, 16;
907                 psubd MM0, MM2;
908                 psubd MM1, MM2;
909                 movq [ESI  -16], MM0;
910                 movq [ESI+8-16], MM1;
911                 cmp ESI, EDI;
912                 jb startmmx;
913 
914                 emms;
915                 mov aptr, ESI;
916                 mov bptr, EAX;
917             }
918         }
919         else
920         if (a.length >= 2)
921         {
922             auto n = aptr + (a.length & ~1);
923 
924             asm
925             {
926                 mov ESI, aptr;
927                 mov EDI, n;
928                 mov EAX, bptr;
929                 mov EDX, value;
930 
931                 align 4;
932             start386:
933                 add ESI, 8;
934                 mov EBX, [EAX];
935                 mov ECX, [EAX+4];
936                 add EAX, 8;
937                 sub EBX, EDX;
938                 sub ECX, EDX;
939                 mov [ESI  -8], EBX;
940                 mov [ESI+4-8], ECX;
941                 cmp ESI, EDI;
942                 jb start386;
943 
944                 mov aptr, ESI;
945                 mov bptr, EAX;
946             }
947         }
948     }
949 
950     while (aptr < aend)
951         *aptr++ = *bptr++ - value;
952 
953     return a;
954 }
955 
956 unittest
957 {
958     printf("_arraySliceExpMinSliceAssign_i unittest\n");
959 
960     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
961     {
962         version (log) printf("    cpuid %d\n", cpuid);
963 
964         for (int j = 0; j < 2; j++)
965         {
966             const int dim = 67;
967             T[] a = new T[dim + j];     // aligned on 16 byte boundary
968             a = a[j .. dim + j];        // misalign for second iteration
969             T[] b = new T[dim + j];
970             b = b[j .. dim + j];
971             T[] c = new T[dim + j];
972             c = c[j .. dim + j];
973 
974             for (int i = 0; i < dim; i++)
975             {   a[i] = cast(T)i;
976                 b[i] = cast(T)(i + 7);
977                 c[i] = cast(T)(i * 2);
978             }
979 
980             c[] = a[] - 6;
981 
982             for (int i = 0; i < dim; i++)
983             {
984                 if (c[i] != cast(T)(a[i] - 6))
985                 {
986                     printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
987                     assert(0);
988                 }
989             }
990         }
991     }
992 }
993 
994 
995 /* ======================================================================== */
996 
997 /***********************
998  * Computes:
999  *      a[] = value - b[]
1000  */
1001 
1002 T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
1003 {
1004     return _arrayExpSliceMinSliceAssign_i(a, b, value);
1005 }
1006 
1007 T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
1008 {
1009     return _arrayExpSliceMinSliceAssign_i(a, b, value);
1010 }
1011 
1012 T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
1013 in
1014 {
1015     assert(a.length == b.length);
1016     assert(disjoint(a, b));
1017 }
1018 body
1019 {
1020     //printf("_arrayExpSliceMinSliceAssign_i()\n");
1021     auto aptr = a.ptr;
1022     auto aend = aptr + a.length;
1023     auto bptr = b.ptr;
1024 
1025     version (D_InlineAsm_X86)
1026     {
1027         // SSE2 aligned version is 1812% faster
1028         if (sse2() && a.length >= 8)
1029         {
1030             auto n = aptr + (a.length & ~7);
1031 
1032             uint l = value;
1033 
1034             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1035             {
1036                 asm // unaligned case
1037                 {
1038                     mov ESI, aptr;
1039                     mov EDI, n;
1040                     mov EAX, bptr;
1041                     movd XMM4, l;
1042                     pshufd XMM4, XMM4, 0;
1043 
1044                     align 4;
1045                 startaddsse2u:
1046                     add ESI, 32;
1047                     movdqu XMM2, [EAX];
1048                     movdqu XMM3, [EAX+16];
1049                     movdqa XMM0, XMM4;
1050                     movdqa XMM1, XMM4;
1051                     add EAX, 32;
1052                     psubd XMM0, XMM2;
1053                     psubd XMM1, XMM3;
1054                     movdqu [ESI   -32], XMM0;
1055                     movdqu [ESI+16-32], XMM1;
1056                     cmp ESI, EDI;
1057                     jb startaddsse2u;
1058 
1059                     mov aptr, ESI;
1060                     mov bptr, EAX;
1061                 }
1062             }
1063             else
1064             {
1065                 asm // aligned case
1066                 {
1067                     mov ESI, aptr;
1068                     mov EDI, n;
1069                     mov EAX, bptr;
1070                     movd XMM4, l;
1071                     pshufd XMM4, XMM4, 0;
1072 
1073                     align 4;
1074                 startaddsse2a:
1075                     add ESI, 32;
1076                     movdqa XMM2, [EAX];
1077                     movdqa XMM3, [EAX+16];
1078                     movdqa XMM0, XMM4;
1079                     movdqa XMM1, XMM4;
1080                     add EAX, 32;
1081                     psubd XMM0, XMM2;
1082                     psubd XMM1, XMM3;
1083                     movdqa [ESI   -32], XMM0;
1084                     movdqa [ESI+16-32], XMM1;
1085                     cmp ESI, EDI;
1086                     jb startaddsse2a;
1087 
1088                     mov aptr, ESI;
1089                     mov bptr, EAX;
1090                 }
1091             }
1092         }
1093         else
1094         // MMX version is 1077% faster
1095         if (mmx() && a.length >= 4)
1096         {
1097             auto n = aptr + (a.length & ~3);
1098 
1099             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1100 
1101             asm
1102             {
1103                 mov ESI, aptr;
1104                 mov EDI, n;
1105                 mov EAX, bptr;
1106                 movq MM4, l;
1107 
1108                 align 4;
1109             startmmx:
1110                 add ESI, 16;
1111                 movq MM2, [EAX];
1112                 movq MM3, [EAX+8];
1113                 movq MM0, MM4;
1114                 movq MM1, MM4;
1115                 add EAX, 16;
1116                 psubd MM0, MM2;
1117                 psubd MM1, MM3;
1118                 movq [ESI  -16], MM0;
1119                 movq [ESI+8-16], MM1;
1120                 cmp ESI, EDI;
1121                 jb startmmx;
1122 
1123                 emms;
1124                 mov aptr, ESI;
1125                 mov bptr, EAX;
1126             }
1127         }
1128     }
1129 
1130     while (aptr < aend)
1131         *aptr++ = value - *bptr++;
1132 
1133     return a;
1134 }
1135 
1136 unittest
1137 {
1138     printf("_arrayExpSliceMinSliceAssign_i unittest\n");
1139 
1140     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1141     {
1142         version (log) printf("    cpuid %d\n", cpuid);
1143 
1144         for (int j = 0; j < 2; j++)
1145         {
1146             const int dim = 67;
1147             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1148             a = a[j .. dim + j];        // misalign for second iteration
1149             T[] b = new T[dim + j];
1150             b = b[j .. dim + j];
1151             T[] c = new T[dim + j];
1152             c = c[j .. dim + j];
1153 
1154             for (int i = 0; i < dim; i++)
1155             {   a[i] = cast(T)i;
1156                 b[i] = cast(T)(i + 7);
1157                 c[i] = cast(T)(i * 2);
1158             }
1159 
1160             c[] = 6 - a[];
1161 
1162             for (int i = 0; i < dim; i++)
1163             {
1164                 if (c[i] != cast(T)(6 - a[i]))
1165                 {
1166                     printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
1167                     assert(0);
1168                 }
1169             }
1170         }
1171     }
1172 }
1173 
1174 
1175 /* ======================================================================== */
1176 
1177 /***********************
1178  * Computes:
1179  *      a[] = b[] - c[]
1180  */
1181 
1182 T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
1183 {
1184     return _arraySliceSliceMinSliceAssign_i(a, c, b);
1185 }
1186 
1187 T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
1188 {
1189     return _arraySliceSliceMinSliceAssign_i(a, c, b);
1190 }
1191 
1192 T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
1193 in
1194 {
1195         assert(a.length == b.length && b.length == c.length);
1196         assert(disjoint(a, b));
1197         assert(disjoint(a, c));
1198         assert(disjoint(b, c));
1199 }
1200 body
1201 {
1202     auto aptr = a.ptr;
1203     auto aend = aptr + a.length;
1204     auto bptr = b.ptr;
1205     auto cptr = c.ptr;
1206 
1207     version (D_InlineAsm_X86)
1208     {
1209         // SSE2 aligned version is 1721% faster
1210         if (sse2() && a.length >= 8)
1211         {
1212             auto n = aptr + (a.length & ~7);
1213 
1214             if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1215             {
1216                 asm // unaligned case
1217                 {
1218                     mov ESI, aptr;
1219                     mov EDI, n;
1220                     mov EAX, bptr;
1221                     mov ECX, cptr;
1222 
1223                     align 4;
1224                 startsse2u:
1225                     add ESI, 32;
1226                     movdqu XMM0, [EAX];
1227                     movdqu XMM2, [ECX];
1228                     movdqu XMM1, [EAX+16];
1229                     movdqu XMM3, [ECX+16];
1230                     add EAX, 32;
1231                     add ECX, 32;
1232                     psubd XMM0, XMM2;
1233                     psubd XMM1, XMM3;
1234                     movdqu [ESI   -32], XMM0;
1235                     movdqu [ESI+16-32], XMM1;
1236                     cmp ESI, EDI;
1237                     jb startsse2u;
1238 
1239                     mov aptr, ESI;
1240                     mov bptr, EAX;
1241                     mov cptr, ECX;
1242                 }
1243             }
1244             else
1245             {
1246                 asm // aligned case
1247                 {
1248                     mov ESI, aptr;
1249                     mov EDI, n;
1250                     mov EAX, bptr;
1251                     mov ECX, cptr;
1252 
1253                     align 4;
1254                 startsse2a:
1255                     add ESI, 32;
1256                     movdqa XMM0, [EAX];
1257                     movdqa XMM2, [ECX];
1258                     movdqa XMM1, [EAX+16];
1259                     movdqa XMM3, [ECX+16];
1260                     add EAX, 32;
1261                     add ECX, 32;
1262                     psubd XMM0, XMM2;
1263                     psubd XMM1, XMM3;
1264                     movdqa [ESI   -32], XMM0;
1265                     movdqa [ESI+16-32], XMM1;
1266                     cmp ESI, EDI;
1267                     jb startsse2a;
1268 
1269                     mov aptr, ESI;
1270                     mov bptr, EAX;
1271                     mov cptr, ECX;
1272                 }
1273             }
1274         }
1275         else
1276         // MMX version is 1002% faster
1277         if (mmx() && a.length >= 4)
1278         {
1279             auto n = aptr + (a.length & ~3);
1280 
1281             asm
1282             {
1283                 mov ESI, aptr;
1284                 mov EDI, n;
1285                 mov EAX, bptr;
1286                 mov ECX, cptr;
1287 
1288                 align 4;
1289             startmmx:
1290                 add ESI, 16;
1291                 movq MM0, [EAX];
1292                 movq MM2, [ECX];
1293                 movq MM1, [EAX+8];
1294                 movq MM3, [ECX+8];
1295                 add EAX, 16;
1296                 add ECX, 16;
1297                 psubd MM0, MM2;
1298                 psubd MM1, MM3;
1299                 movq [ESI  -16], MM0;
1300                 movq [ESI+8-16], MM1;
1301                 cmp ESI, EDI;
1302                 jb startmmx;
1303 
1304                 emms;
1305                 mov aptr, ESI;
1306                 mov bptr, EAX;
1307                 mov cptr, ECX;
1308             }
1309         }
1310     }
1311 
1312     while (aptr < aend)
1313         *aptr++ = *bptr++ - *cptr++;
1314 
1315     return a;
1316 }
1317 
1318 unittest
1319 {
1320     printf("_arraySliceSliceMinSliceAssign_i unittest\n");
1321 
1322     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1323     {
1324         version (log) printf("    cpuid %d\n", cpuid);
1325 
1326         for (int j = 0; j < 2; j++)
1327         {
1328             const int dim = 67;
1329             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1330             a = a[j .. dim + j];        // misalign for second iteration
1331             T[] b = new T[dim + j];
1332             b = b[j .. dim + j];
1333             T[] c = new T[dim + j];
1334             c = c[j .. dim + j];
1335 
1336             for (int i = 0; i < dim; i++)
1337             {   a[i] = cast(T)i;
1338                 b[i] = cast(T)(i + 7);
1339                 c[i] = cast(T)(i * 2);
1340             }
1341 
1342             c[] = a[] - b[];
1343 
1344             for (int i = 0; i < dim; i++)
1345             {
1346                 if (c[i] != cast(T)(a[i] - b[i]))
1347                 {
1348                     printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1349                     assert(0);
1350                 }
1351             }
1352         }
1353     }
1354 }
1355 
1356 
1357 /* ======================================================================== */
1358 
1359 /***********************
1360  * Computes:
1361  *      a[] -= value
1362  */
1363 
1364 T[] _arrayExpSliceMinass_w(T[] a, T value)
1365 {
1366     return _arrayExpSliceMinass_i(a, value);
1367 }
1368 
1369 T[] _arrayExpSliceMinass_k(T[] a, T value)
1370 {
1371     return _arrayExpSliceMinass_i(a, value);
1372 }
1373 
1374 T[] _arrayExpSliceMinass_i(T[] a, T value)
1375 {
1376     //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1377     auto aptr = a.ptr;
1378     auto aend = aptr + a.length;
1379 
1380     version (D_InlineAsm_X86)
1381     {
1382         // SSE2 aligned version is 81% faster
1383         if (sse2() && a.length >= 8)
1384         {
1385             auto n = aptr + (a.length & ~7);
1386 
1387             uint l = value;
1388 
1389             if (((cast(uint) aptr) & 15) != 0)
1390             {
1391                 asm // unaligned case
1392                 {
1393                     mov ESI, aptr;
1394                     mov EDI, n;
1395                     movd XMM2, l;
1396                     pshufd XMM2, XMM2, 0;
1397 
1398                     align 4;
1399                 startaddsse2u:
1400                     movdqu XMM0, [ESI];
1401                     movdqu XMM1, [ESI+16];
1402                     add ESI, 32;
1403                     psubd XMM0, XMM2;
1404                     psubd XMM1, XMM2;
1405                     movdqu [ESI   -32], XMM0;
1406                     movdqu [ESI+16-32], XMM1;
1407                     cmp ESI, EDI;
1408                     jb startaddsse2u;
1409 
1410                     mov aptr, ESI;
1411                 }
1412             }
1413             else
1414             {
1415                 asm // aligned case
1416                 {
1417                     mov ESI, aptr;
1418                     mov EDI, n;
1419                     movd XMM2, l;
1420                     pshufd XMM2, XMM2, 0;
1421 
1422                     align 4;
1423                 startaddsse2a:
1424                     movdqa XMM0, [ESI];
1425                     movdqa XMM1, [ESI+16];
1426                     add ESI, 32;
1427                     psubd XMM0, XMM2;
1428                     psubd XMM1, XMM2;
1429                     movdqa [ESI   -32], XMM0;
1430                     movdqa [ESI+16-32], XMM1;
1431                     cmp ESI, EDI;
1432                     jb startaddsse2a;
1433 
1434                     mov aptr, ESI;
1435                 }
1436             }
1437         }
1438         else
1439         // MMX version is 81% faster
1440         if (mmx() && a.length >= 4)
1441         {
1442             auto n = aptr + (a.length & ~3);
1443 
1444             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1445 
1446             asm
1447             {
1448                 mov ESI, aptr;
1449                 mov EDI, n;
1450                 movq MM2, l;
1451 
1452                 align 4;
1453             startmmx:
1454                 movq MM0, [ESI];
1455                 movq MM1, [ESI+8];
1456                 add ESI, 16;
1457                 psubd MM0, MM2;
1458                 psubd MM1, MM2;
1459                 movq [ESI  -16], MM0;
1460                 movq [ESI+8-16], MM1;
1461                 cmp ESI, EDI;
1462                 jb startmmx;
1463 
1464                 emms;
1465                 mov aptr, ESI;
1466             }
1467         }
1468         else
1469         if (a.length >= 2)
1470         {
1471             auto n = aptr + (a.length & ~1);
1472 
1473             asm
1474             {
1475                 mov ESI, aptr;
1476                 mov EDI, n;
1477                 mov EDX, value;
1478 
1479                 align 4;
1480             start386:
1481                 mov EBX, [ESI];
1482                 mov ECX, [ESI+4];
1483                 add ESI, 8;
1484                 sub EBX, EDX;
1485                 sub ECX, EDX;
1486                 mov [ESI  -8], EBX;
1487                 mov [ESI+4-8], ECX;
1488                 cmp ESI, EDI;
1489                 jb start386;
1490 
1491                 mov aptr, ESI;
1492             }
1493         }
1494     }
1495 
1496     while (aptr < aend)
1497         *aptr++ -= value;
1498 
1499     return a;
1500 }
1501 
1502 unittest
1503 {
1504     printf("_arrayExpSliceMinass_i unittest\n");
1505 
1506     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1507     {
1508         version (log) printf("    cpuid %d\n", cpuid);
1509 
1510         for (int j = 0; j < 2; j++)
1511         {
1512             const int dim = 67;
1513             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1514             a = a[j .. dim + j];        // misalign for second iteration
1515             T[] b = new T[dim + j];
1516             b = b[j .. dim + j];
1517             T[] c = new T[dim + j];
1518             c = c[j .. dim + j];
1519 
1520             for (int i = 0; i < dim; i++)
1521             {   a[i] = cast(T)i;
1522                 b[i] = cast(T)(i + 7);
1523                 c[i] = cast(T)(i * 2);
1524             }
1525 
1526             a[] = c[];
1527             a[] -= 6;
1528 
1529             for (int i = 0; i < dim; i++)
1530             {
1531                 if (a[i] != cast(T)(c[i] - 6))
1532                 {
1533                     printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
1534                     assert(0);
1535                 }
1536             }
1537         }
1538     }
1539 }
1540 
1541 
1542 /* ======================================================================== */
1543 
1544 /***********************
1545  * Computes:
1546  *      a[] -= b[]
1547  */
1548 
1549 T[] _arraySliceSliceMinass_w(T[] a, T[] b)
1550 {
1551     return _arraySliceSliceMinass_i(a, b);
1552 }
1553 
1554 T[] _arraySliceSliceMinass_k(T[] a, T[] b)
1555 {
1556     return _arraySliceSliceMinass_i(a, b);
1557 }
1558 
1559 T[] _arraySliceSliceMinass_i(T[] a, T[] b)
1560 in
1561 {
1562     assert (a.length == b.length);
1563     assert (disjoint(a, b));
1564 }
1565 body
1566 {
1567     //printf("_arraySliceSliceMinass_i()\n");
1568     auto aptr = a.ptr;
1569     auto aend = aptr + a.length;
1570     auto bptr = b.ptr;
1571 
1572     version (D_InlineAsm_X86)
1573     {
1574         // SSE2 aligned version is 731% faster
1575         if (sse2() && a.length >= 8)
1576         {
1577             auto n = aptr + (a.length & ~7);
1578 
1579             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1580             {
1581                 asm // unaligned case
1582                 {
1583                     mov ESI, aptr;
1584                     mov EDI, n;
1585                     mov ECX, bptr;
1586 
1587                     align 4;
1588                 startsse2u:
1589                     movdqu XMM0, [ESI];
1590                     movdqu XMM2, [ECX];
1591                     movdqu XMM1, [ESI+16];
1592                     movdqu XMM3, [ECX+16];
1593                     add ESI, 32;
1594                     add ECX, 32;
1595                     psubd XMM0, XMM2;
1596                     psubd XMM1, XMM3;
1597                     movdqu [ESI   -32], XMM0;
1598                     movdqu [ESI+16-32], XMM1;
1599                     cmp ESI, EDI;
1600                     jb startsse2u;
1601 
1602                     mov aptr, ESI;
1603                     mov bptr, ECX;
1604                 }
1605             }
1606             else
1607             {
1608                 asm // aligned case
1609                 {
1610                     mov ESI, aptr;
1611                     mov EDI, n;
1612                     mov ECX, bptr;
1613 
1614                     align 4;
1615                 startsse2a:
1616                     movdqa XMM0, [ESI];
1617                     movdqa XMM2, [ECX];
1618                     movdqa XMM1, [ESI+16];
1619                     movdqa XMM3, [ECX+16];
1620                     add ESI, 32;
1621                     add ECX, 32;
1622                     psubd XMM0, XMM2;
1623                     psubd XMM1, XMM3;
1624                     movdqa [ESI   -32], XMM0;
1625                     movdqa [ESI+16-32], XMM1;
1626                     cmp ESI, EDI;
1627                     jb startsse2a;
1628 
1629                     mov aptr, ESI;
1630                     mov bptr, ECX;
1631                 }
1632             }
1633         }
1634         else
1635         // MMX version is 441% faster
1636         if (mmx() && a.length >= 4)
1637         {
1638             auto n = aptr + (a.length & ~3);
1639 
1640             asm
1641             {
1642                 mov ESI, aptr;
1643                 mov EDI, n;
1644                 mov ECX, bptr;
1645 
1646                 align 4;
1647             startmmx:
1648                 movq MM0, [ESI];
1649                 movq MM2, [ECX];
1650                 movq MM1, [ESI+8];
1651                 movq MM3, [ECX+8];
1652                 add ESI, 16;
1653                 add ECX, 16;
1654                 psubd MM0, MM2;
1655                 psubd MM1, MM3;
1656                 movq [ESI  -16], MM0;
1657                 movq [ESI+8-16], MM1;
1658                 cmp ESI, EDI;
1659                 jb startmmx;
1660 
1661                 emms;
1662                 mov aptr, ESI;
1663                 mov bptr, ECX;
1664             }
1665         }
1666     }
1667 
1668     while (aptr < aend)
1669         *aptr++ -= *bptr++;
1670 
1671     return a;
1672 }
1673 
1674 unittest
1675 {
1676     printf("_arraySliceSliceMinass_i unittest\n");
1677 
1678     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1679     {
1680         version (log) printf("    cpuid %d\n", cpuid);
1681 
1682         for (int j = 0; j < 2; j++)
1683         {
1684             const int dim = 67;
1685             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1686             a = a[j .. dim + j];        // misalign for second iteration
1687             T[] b = new T[dim + j];
1688             b = b[j .. dim + j];
1689             T[] c = new T[dim + j];
1690             c = c[j .. dim + j];
1691 
1692             for (int i = 0; i < dim; i++)
1693             {   a[i] = cast(T)i;
1694                 b[i] = cast(T)(i + 7);
1695                 c[i] = cast(T)(i * 2);
1696             }
1697 
1698             b[] = c[];
1699             c[] -= a[];
1700 
1701             for (int i = 0; i < dim; i++)
1702             {
1703                 if (c[i] != cast(T)(b[i] - a[i]))
1704                 {
1705                     printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
1706                     assert(0);
1707                 }
1708             }
1709         }
1710     }
1711 }
1712 
1713 
1714 /* ======================================================================== */
1715 
1716 /***********************
1717  * Computes:
1718  *      a[] = b[] * value
1719  */
1720 
1721 T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
1722 {
1723     return _arraySliceExpMulSliceAssign_i(a, value, b);
1724 }
1725 
1726 T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
1727 {
1728     return _arraySliceExpMulSliceAssign_i(a, value, b);
1729 }
1730 
1731 T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
1732 in
1733 {
1734     assert(a.length == b.length);
1735     assert(disjoint(a, b));
1736 }
1737 body
1738 {
1739     //printf("_arraySliceExpMulSliceAssign_i()\n");
1740     auto aptr = a.ptr;
1741     auto aend = aptr + a.length;
1742     auto bptr = b.ptr;
1743 
1744   version (none)        // multiplying a pair is not supported by MMX
1745   {
1746     version (D_InlineAsm_X86)
1747     {
1748         // SSE2 aligned version is 1380% faster
1749         if (sse2() && a.length >= 8)
1750         {
1751             auto n = aptr + (a.length & ~7);
1752 
1753             uint l = value;
1754 
1755             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1756             {
1757                 asm
1758                 {
1759                     mov ESI, aptr;
1760                     mov EDI, n;
1761                     mov EAX, bptr;
1762                     movd XMM2, l;
1763                     pshufd XMM2, XMM2, 0;
1764 
1765                     align 4;
1766                 startsse2u:
1767                     add ESI, 32;
1768                     movdqu XMM0, [EAX];
1769                     movdqu XMM1, [EAX+16];
1770                     add EAX, 32;
1771                     pmuludq XMM0, XMM2;
1772                     pmuludq XMM1, XMM2;
1773                     movdqu [ESI   -32], XMM0;
1774                     movdqu [ESI+16-32], XMM1;
1775                     cmp ESI, EDI;
1776                     jb startsse2u;
1777 
1778                     mov aptr, ESI;
1779                     mov bptr, EAX;
1780                 }
1781             }
1782             else
1783             {
1784                 asm
1785                 {
1786                     mov ESI, aptr;
1787                     mov EDI, n;
1788                     mov EAX, bptr;
1789                     movd XMM2, l;
1790                     pshufd XMM2, XMM2, 0;
1791 
1792                     align 4;
1793                 startsse2a:
1794                     add ESI, 32;
1795                     movdqa XMM0, [EAX];
1796                     movdqa XMM1, [EAX+16];
1797                     add EAX, 32;
1798                     pmuludq XMM0, XMM2;
1799                     pmuludq XMM1, XMM2;
1800                     movdqa [ESI   -32], XMM0;
1801                     movdqa [ESI+16-32], XMM1;
1802                     cmp ESI, EDI;
1803                     jb startsse2a;
1804 
1805                     mov aptr, ESI;
1806                     mov bptr, EAX;
1807                 }
1808             }
1809         }
1810         else
1811         {
1812         // MMX version is 1380% faster
1813         if (mmx() && a.length >= 4)
1814         {
1815             auto n = aptr + (a.length & ~3);
1816 
1817             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
1818 
1819             asm
1820             {
1821                 mov ESI, aptr;
1822                 mov EDI, n;
1823                 mov EAX, bptr;
1824                 movq MM2, l;
1825 
1826                 align 4;
1827             startmmx:
1828                 add ESI, 16;
1829                 movq MM0, [EAX];
1830                 movq MM1, [EAX+8];
1831                 add EAX, 16;
1832                 pmuludq MM0, MM2;       // only multiplies low 32 bits
1833                 pmuludq MM1, MM2;
1834                 movq [ESI  -16], MM0;
1835                 movq [ESI+8-16], MM1;
1836                 cmp ESI, EDI;
1837                 jb startmmx;
1838 
1839                 emms;
1840                 mov aptr, ESI;
1841                 mov bptr, EAX;
1842             }
1843         }
1844     }
1845         }
1846   }
1847 
1848     while (aptr < aend)
1849         *aptr++ = *bptr++ * value;
1850 
1851     return a;
1852 }
1853 
1854 unittest
1855 {
1856     printf("_arraySliceExpMulSliceAssign_s unittest\n");
1857 
1858     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1859     {
1860         version (log) printf("    cpuid %d\n", cpuid);
1861 
1862         for (int j = 0; j < 2; j++)
1863         {
1864             const int dim = 67;
1865             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1866             a = a[j .. dim + j];        // misalign for second iteration
1867             T[] b = new T[dim + j];
1868             b = b[j .. dim + j];
1869             T[] c = new T[dim + j];
1870             c = c[j .. dim + j];
1871 
1872             for (int i = 0; i < dim; i++)
1873             {   a[i] = cast(T)i;
1874                 b[i] = cast(T)(i + 7);
1875                 c[i] = cast(T)(i * 2);
1876             }
1877 
1878             c[] = a[] * 6;
1879 
1880             for (int i = 0; i < dim; i++)
1881             {
1882                 //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
1883                 if (c[i] != cast(T)(a[i] * 6))
1884                 {
1885                     printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
1886                     assert(0);
1887                 }
1888             }
1889         }
1890     }
1891 }
1892 
1893 
1894 /* ======================================================================== */
1895 
1896 /***********************
1897  * Computes:
1898  *      a[] = b[] * c[]
1899  */
1900 
1901 T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
1902 {
1903     return _arraySliceSliceMulSliceAssign_i(a, c, b);
1904 }
1905 
1906 T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
1907 {
1908     return _arraySliceSliceMulSliceAssign_i(a, c, b);
1909 }
1910 
1911 T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
1912 in
1913 {
1914         assert(a.length == b.length && b.length == c.length);
1915         assert(disjoint(a, b));
1916         assert(disjoint(a, c));
1917         assert(disjoint(b, c));
1918 }
1919 body
1920 {
1921     //printf("_arraySliceSliceMulSliceAssign_i()\n");
1922     auto aptr = a.ptr;
1923     auto aend = aptr + a.length;
1924     auto bptr = b.ptr;
1925     auto cptr = c.ptr;
1926 
1927   version (none)
1928   {
1929     version (D_InlineAsm_X86)
1930     {
1931         // SSE2 aligned version is 1407% faster
1932         if (sse2() && a.length >= 8)
1933         {
1934             auto n = aptr + (a.length & ~7);
1935 
1936             if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1937             {
1938                 asm
1939                 {
1940                     mov ESI, aptr;
1941                     mov EDI, n;
1942                     mov EAX, bptr;
1943                     mov ECX, cptr;
1944 
1945                     align 4;
1946                 startsse2u:
1947                     add ESI, 32;
1948                     movdqu XMM0, [EAX];
1949                     movdqu XMM2, [ECX];
1950                     movdqu XMM1, [EAX+16];
1951                     movdqu XMM3, [ECX+16];
1952                     add EAX, 32;
1953                     add ECX, 32;
1954                     pmuludq XMM0, XMM2;
1955                     pmuludq XMM1, XMM3;
1956                     movdqu [ESI   -32], XMM0;
1957                     movdqu [ESI+16-32], XMM1;
1958                     cmp ESI, EDI;
1959                     jb startsse2u;
1960 
1961                     mov aptr, ESI;
1962                     mov bptr, EAX;
1963                     mov cptr, ECX;
1964                 }
1965             }
1966             else
1967             {
1968                 asm
1969                 {
1970                     mov ESI, aptr;
1971                     mov EDI, n;
1972                     mov EAX, bptr;
1973                     mov ECX, cptr;
1974 
1975                     align 4;
1976                 startsse2a:
1977                     add ESI, 32;
1978                     movdqa XMM0, [EAX];
1979                     movdqa XMM2, [ECX];
1980                     movdqa XMM1, [EAX+16];
1981                     movdqa XMM3, [ECX+16];
1982                     add EAX, 32;
1983                     add ECX, 32;
1984                     pmuludq XMM0, XMM2;
1985                     pmuludq XMM1, XMM3;
1986                     movdqa [ESI   -32], XMM0;
1987                     movdqa [ESI+16-32], XMM1;
1988                     cmp ESI, EDI;
1989                     jb startsse2a;
1990 
1991                     mov aptr, ESI;
1992                     mov bptr, EAX;
1993                     mov cptr, ECX;
1994                }
1995             }
1996         }
1997         else
1998         // MMX version is 1029% faster
1999         if (mmx() && a.length >= 4)
2000         {
2001             auto n = aptr + (a.length & ~3);
2002 
2003             asm
2004             {
2005                 mov ESI, aptr;
2006                 mov EDI, n;
2007                 mov EAX, bptr;
2008                 mov ECX, cptr;
2009 
2010                 align 4;
2011             startmmx:
2012                 add ESI, 16;
2013                 movq MM0, [EAX];
2014                 movq MM2, [ECX];
2015                 movq MM1, [EAX+8];
2016                 movq MM3, [ECX+8];
2017                 add EAX, 16;
2018                 add ECX, 16;
2019                 pmuludq MM0, MM2;
2020                 pmuludq MM1, MM3;
2021                 movq [ESI  -16], MM0;
2022                 movq [ESI+8-16], MM1;
2023                 cmp ESI, EDI;
2024                 jb startmmx;
2025 
2026                 emms;
2027                 mov aptr, ESI;
2028                 mov bptr, EAX;
2029                 mov cptr, ECX;
2030             }
2031         }
2032     }
2033   }
2034 
2035     while (aptr < aend)
2036         *aptr++ = *bptr++ * *cptr++;
2037 
2038     return a;
2039 }
2040 
2041 unittest
2042 {
2043     printf("_arraySliceSliceMulSliceAssign_i unittest\n");
2044 
2045     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2046     {
2047         version (log) printf("    cpuid %d\n", cpuid);
2048 
2049         for (int j = 0; j < 2; j++)
2050         {
2051             const int dim = 67;
2052             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2053             a = a[j .. dim + j];        // misalign for second iteration
2054             T[] b = new T[dim + j];
2055             b = b[j .. dim + j];
2056             T[] c = new T[dim + j];
2057             c = c[j .. dim + j];
2058 
2059             for (int i = 0; i < dim; i++)
2060             {   a[i] = cast(T)i;
2061                 b[i] = cast(T)(i + 7);
2062                 c[i] = cast(T)(i * 2);
2063             }
2064 
2065             c[] = a[] * b[];
2066 
2067             for (int i = 0; i < dim; i++)
2068             {
2069                 if (c[i] != cast(T)(a[i] * b[i]))
2070                 {
2071                     printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
2072                     assert(0);
2073                 }
2074             }
2075         }
2076     }
2077 }
2078 
2079 
2080 /* ======================================================================== */
2081 
2082 /***********************
2083  * Computes:
2084  *      a[] *= value
2085  */
2086 
2087 T[] _arrayExpSliceMulass_w(T[] a, T value)
2088 {
2089     return _arrayExpSliceMulass_i(a, value);
2090 }
2091 
2092 T[] _arrayExpSliceMulass_k(T[] a, T value)
2093 {
2094     return _arrayExpSliceMulass_i(a, value);
2095 }
2096 
2097 T[] _arrayExpSliceMulass_i(T[] a, T value)
2098 {
2099     //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
2100     auto aptr = a.ptr;
2101     auto aend = aptr + a.length;
2102 
2103   version (none)
2104   {
2105     version (D_InlineAsm_X86)
2106     {
2107         // SSE2 aligned version is 400% faster
2108         if (sse2() && a.length >= 8)
2109         {
2110             auto n = aptr + (a.length & ~7);
2111 
2112             uint l = value;
2113 
2114             if (((cast(uint) aptr) & 15) != 0)
2115             {
2116                 asm
2117                 {
2118                     mov ESI, aptr;
2119                     mov EDI, n;
2120                     movd XMM2, l;
2121                     pshufd XMM2, XMM2, 0;
2122 
2123                     align 4;
2124                 startsse2u:
2125                     movdqu XMM0, [ESI];
2126                     movdqu XMM1, [ESI+16];
2127                     add ESI, 32;
2128                     pmuludq XMM0, XMM2;
2129                     pmuludq XMM1, XMM2;
2130                     movdqu [ESI   -32], XMM0;
2131                     movdqu [ESI+16-32], XMM1;
2132                     cmp ESI, EDI;
2133                     jb startsse2u;
2134 
2135                     mov aptr, ESI;
2136                 }
2137             }
2138             else
2139             {
2140                 asm
2141                 {
2142                     mov ESI, aptr;
2143                     mov EDI, n;
2144                     movd XMM2, l;
2145                     pshufd XMM2, XMM2, 0;
2146 
2147                     align 4;
2148                 startsse2a:
2149                     movdqa XMM0, [ESI];
2150                     movdqa XMM1, [ESI+16];
2151                     add ESI, 32;
2152                     pmuludq XMM0, XMM2;
2153                     pmuludq XMM1, XMM2;
2154                     movdqa [ESI   -32], XMM0;
2155                     movdqa [ESI+16-32], XMM1;
2156                     cmp ESI, EDI;
2157                     jb startsse2a;
2158 
2159                     mov aptr, ESI;
2160                 }
2161             }
2162         }
2163         else
2164         // MMX version is 402% faster
2165         if (mmx() && a.length >= 4)
2166         {
2167             auto n = aptr + (a.length & ~3);
2168 
2169             ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);
2170 
2171             asm
2172             {
2173                 mov ESI, aptr;
2174                 mov EDI, n;
2175                 movq MM2, l;
2176 
2177                 align 4;
2178             startmmx:
2179                 movq MM0, [ESI];
2180                 movq MM1, [ESI+8];
2181                 add ESI, 16;
2182                 pmuludq MM0, MM2;
2183                 pmuludq MM1, MM2;
2184                 movq [ESI  -16], MM0;
2185                 movq [ESI+8-16], MM1;
2186                 cmp ESI, EDI;
2187                 jb startmmx;
2188 
2189                 emms;
2190                 mov aptr, ESI;
2191             }
2192         }
2193     }
2194   }
2195 
2196     while (aptr < aend)
2197         *aptr++ *= value;
2198 
2199     return a;
2200 }
2201 
2202 unittest
2203 {
2204     printf("_arrayExpSliceMulass_i unittest\n");
2205 
2206     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2207     {
2208         version (log) printf("    cpuid %d\n", cpuid);
2209 
2210         for (int j = 0; j < 2; j++)
2211         {
2212             const int dim = 67;
2213             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2214             a = a[j .. dim + j];        // misalign for second iteration
2215             T[] b = new T[dim + j];
2216             b = b[j .. dim + j];
2217             T[] c = new T[dim + j];
2218             c = c[j .. dim + j];
2219 
2220             for (int i = 0; i < dim; i++)
2221             {   a[i] = cast(T)i;
2222                 b[i] = cast(T)(i + 7);
2223                 c[i] = cast(T)(i * 2);
2224             }
2225 
2226             b[] = a[];
2227             a[] *= 6;
2228 
2229             for (int i = 0; i < dim; i++)
2230             {
2231                 if (a[i] != cast(T)(b[i] * 6))
2232                 {
2233                     printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
2234                     assert(0);
2235                 }
2236             }
2237         }
2238     }
2239 }
2240 
2241 
2242 /* ======================================================================== */
2243 
2244 /***********************
2245  * Computes:
2246  *      a[] *= b[]
2247  */
2248 
2249 T[] _arraySliceSliceMulass_w(T[] a, T[] b)
2250 {
2251     return _arraySliceSliceMulass_i(a, b);
2252 }
2253 
2254 T[] _arraySliceSliceMulass_k(T[] a, T[] b)
2255 {
2256     return _arraySliceSliceMulass_i(a, b);
2257 }
2258 
2259 T[] _arraySliceSliceMulass_i(T[] a, T[] b)
2260 in
2261 {
2262     assert (a.length == b.length);
2263     assert (disjoint(a, b));
2264 }
2265 body
2266 {
2267     //printf("_arraySliceSliceMulass_i()\n");
2268     auto aptr = a.ptr;
2269     auto aend = aptr + a.length;
2270     auto bptr = b.ptr;
2271 
2272   version (none)
2273   {
2274     version (D_InlineAsm_X86)
2275     {
2276         // SSE2 aligned version is 873% faster
2277         if (sse2() && a.length >= 8)
2278         {
2279             auto n = aptr + (a.length & ~7);
2280 
2281             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
2282             {
2283                 asm
2284                 {
2285                     mov ESI, aptr;
2286                     mov EDI, n;
2287                     mov ECX, bptr;
2288 
2289                     align 4;
2290                 startsse2u:
2291                     movdqu XMM0, [ESI];
2292                     movdqu XMM2, [ECX];
2293                     movdqu XMM1, [ESI+16];
2294                     movdqu XMM3, [ECX+16];
2295                     add ESI, 32;
2296                     add ECX, 32;
2297                     pmuludq XMM0, XMM2;
2298                     pmuludq XMM1, XMM3;
2299                     movdqu [ESI   -32], XMM0;
2300                     movdqu [ESI+16-32], XMM1;
2301                     cmp ESI, EDI;
2302                     jb startsse2u;
2303 
2304                     mov aptr, ESI;
2305                     mov bptr, ECX;
2306                 }
2307             }
2308             else
2309             {
2310                 asm
2311                 {
2312                     mov ESI, aptr;
2313                     mov EDI, n;
2314                     mov ECX, bptr;
2315 
2316                     align 4;
2317                 startsse2a:
2318                     movdqa XMM0, [ESI];
2319                     movdqa XMM2, [ECX];
2320                     movdqa XMM1, [ESI+16];
2321                     movdqa XMM3, [ECX+16];
2322                     add ESI, 32;
2323                     add ECX, 32;
2324                     pmuludq XMM0, XMM2;
2325                     pmuludq XMM1, XMM3;
2326                     movdqa [ESI   -32], XMM0;
2327                     movdqa [ESI+16-32], XMM1;
2328                     cmp ESI, EDI;
2329                     jb startsse2a;
2330 
2331                     mov aptr, ESI;
2332                     mov bptr, ECX;
2333                }
2334             }
2335         }
2336 /+ BUG: comment out this section until we figure out what is going
2337    wrong with the invalid pshufd instructions.
2338 
2339         else
2340         // MMX version is 573% faster
2341         if (mmx() && a.length >= 4)
2342         {
2343             auto n = aptr + (a.length & ~3);
2344 
2345             asm
2346             {
2347                 mov ESI, aptr;
2348                 mov EDI, n;
2349                 mov ECX, bptr;
2350 
2351                 align 4;
2352             startmmx:
2353                 movq MM0, [ESI];
2354                 movq MM2, [ECX];
2355                 movq MM1, [ESI+8];
2356                 movq MM3, [ECX+8];
2357                 pxor MM4, MM4;
2358                 pxor MM5, MM5;
2359                 punpckldq MM4, MM0;
2360                 punpckldq MM5, MM2;
2361                 add ESI, 16;
2362                 add ECX, 16;
2363                 pmuludq MM4, MM5;
2364                 pshufd MM4, MM4, 8;     // ?
2365                 movq [ESI  -16], MM4;
2366                 pxor MM4, MM4;
2367                 pxor MM5, MM5;
2368                 punpckldq MM4, MM1;
2369                 punpckldq MM5, MM3;
2370                 pmuludq MM4, MM5;
2371                 pshufd MM4, MM4, 8;     // ?
2372                 movq [ESI+8-16], MM4;
2373                 cmp ESI, EDI;
2374                 jb startmmx;
2375 
2376                 emms;
2377                 mov aptr, ESI;
2378                 mov bptr, ECX;
2379             }
2380         }
2381 +/
2382     }
2383   }
2384 
2385     while (aptr < aend)
2386         *aptr++ *= *bptr++;
2387 
2388     return a;
2389 }
2390 
2391 unittest
2392 {
2393     printf("_arraySliceSliceMulass_i unittest\n");
2394 
2395     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2396     {
2397         version (log) printf("    cpuid %d\n", cpuid);
2398 
2399         for (int j = 0; j < 2; j++)
2400         {
2401             const int dim = 67;
2402             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2403             a = a[j .. dim + j];        // misalign for second iteration
2404             T[] b = new T[dim + j];
2405             b = b[j .. dim + j];
2406             T[] c = new T[dim + j];
2407             c = c[j .. dim + j];
2408 
2409             for (int i = 0; i < dim; i++)
2410             {   a[i] = cast(T)i;
2411                 b[i] = cast(T)(i + 7);
2412                 c[i] = cast(T)(i * 2);
2413             }
2414 
2415             b[] = a[];
2416             a[] *= c[];
2417 
2418             for (int i = 0; i < dim; i++)
2419             {
2420                 if (a[i] != cast(T)(b[i] * c[i]))
2421                 {
2422                     printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
2423                     assert(0);
2424                 }
2425             }
2426         }
2427     }
2428 }