1 /***************************
2  * D programming language http://www.digitalmars.com/d/
3  * Runtime support for byte array operations.
4  * Based on code originally written by Burton Radons.
5  * Placed in public domain.
6  */
7 
8 /* Contains SSE2 and MMX versions of certain operations for wchar, short,
9  * and ushort ('u', 's' and 't' suffixes).
10  */
11 
12 module rt.compiler.gdc.rt.arrayshort;
13 
14 private import CPUid = rt.compiler.util.cpuid;
15 
16 debug(UnitTest)
17 {
18     private extern(C) int printf(char*,...);
19     /* This is so unit tests will test every CPU variant
20      */
21     int cpuid;
22     const int CPUID_MAX = 4;
23     bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
24     bool sse()      { return cpuid == 2 && CPUid.sse(); }
25     bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
26     bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
27 }
28 else
29 {
30     alias CPUid.mmx mmx;
31     alias CPUid.sse sse;
32     alias CPUid.sse2 sse2;
33     alias CPUid.sse2 sse2;
34 }
35 
36 //version = log;
37 
38 bool disjoint(T)(T[] a, T[] b)
39 {
40     return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
41 }
42 
43 alias short T;
44 
45 extern (C):
46 
47 /* ======================================================================== */
48 
49 /***********************
50  * Computes:
51  *      a[] = b[] + value
52  */
53 
54 T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
55 {
56     return _arraySliceExpAddSliceAssign_s(a, value, b);
57 }
58 
59 T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
60 {
61     return _arraySliceExpAddSliceAssign_s(a, value, b);
62 }
63 
64 T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
65 in
66 {
67     assert(a.length == b.length);
68     assert(disjoint(a, b));
69 }
70 body
71 {
72     //printf("_arraySliceExpAddSliceAssign_s()\n");
73     auto aptr = a.ptr;
74     auto aend = aptr + a.length;
75     auto bptr = b.ptr;
76 
77     version (D_InlineAsm_X86)
78     {
79         // SSE2 aligned version is 3343% faster
80         if (sse2() && a.length >= 16)
81         {
82             auto n = aptr + (a.length & ~15);
83 
84             uint l = cast(ushort) value;
85             l |= (l << 16);
86 
87             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
88             {
89                 asm // unaligned case
90                 {
91                     mov ESI, aptr;
92                     mov EDI, n;
93                     mov EAX, bptr;
94                     movd XMM2, l;
95                     pshufd XMM2, XMM2, 0;
96 
97                     align 4;
98                 startaddsse2u:
99                     add ESI, 32;
100                     movdqu XMM0, [EAX];
101                     movdqu XMM1, [EAX+16];
102                     add EAX, 32;
103                     paddw XMM0, XMM2;
104                     paddw XMM1, XMM2;
105                     movdqu [ESI   -32], XMM0;
106                     movdqu [ESI+16-32], XMM1;
107                     cmp ESI, EDI;
108                     jb startaddsse2u;
109 
110                     mov aptr, ESI;
111                     mov bptr, EAX;
112                 }
113             }
114             else
115             {
116                 asm // aligned case
117                 {
118                     mov ESI, aptr;
119                     mov EDI, n;
120                     mov EAX, bptr;
121                     movd XMM2, l;
122                     pshufd XMM2, XMM2, 0;
123 
124                     align 4;
125                 startaddsse2a:
126                     add ESI, 32;
127                     movdqa XMM0, [EAX];
128                     movdqa XMM1, [EAX+16];
129                     add EAX, 32;
130                     paddw XMM0, XMM2;
131                     paddw XMM1, XMM2;
132                     movdqa [ESI   -32], XMM0;
133                     movdqa [ESI+16-32], XMM1;
134                     cmp ESI, EDI;
135                     jb startaddsse2a;
136 
137                     mov aptr, ESI;
138                     mov bptr, EAX;
139                 }
140             }
141         }
142         else
143         // MMX version is 3343% faster
144         if (mmx() && a.length >= 8)
145         {
146             auto n = aptr + (a.length & ~7);
147 
148             uint l = cast(ushort) value;
149 
150             asm
151             {
152                 mov ESI, aptr;
153                 mov EDI, n;
154                 mov EAX, bptr;
155                 movd MM2, l;
156                 pshufw MM2, MM2, 0;
157 
158                 align 4;
159             startmmx:
160                 add ESI, 16;
161                 movq MM0, [EAX];
162                 movq MM1, [EAX+8];
163                 add EAX, 16;
164                 paddw MM0, MM2;
165                 paddw MM1, MM2;
166                 movq [ESI  -16], MM0;
167                 movq [ESI+8-16], MM1;
168                 cmp ESI, EDI;
169                 jb startmmx;
170 
171                 emms;
172                 mov aptr, ESI;
173                 mov bptr, EAX;
174             }
175         }
176     }
177 
178     while (aptr < aend)
179         *aptr++ = cast(T)(*bptr++ + value);
180 
181     return a;
182 }
183 
184 unittest
185 {
186     printf("_arraySliceExpAddSliceAssign_s unittest\n");
187 
188     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
189     {
190         version (log) printf("    cpuid %d\n", cpuid);
191 
192         for (int j = 0; j < 2; j++)
193         {
194             const int dim = 67;
195             T[] a = new T[dim + j];     // aligned on 16 byte boundary
196             a = a[j .. dim + j];        // misalign for second iteration
197             T[] b = new T[dim + j];
198             b = b[j .. dim + j];
199             T[] c = new T[dim + j];
200             c = c[j .. dim + j];
201 
202             for (int i = 0; i < dim; i++)
203             {   a[i] = cast(T)i;
204                 b[i] = cast(T)(i + 7);
205                 c[i] = cast(T)(i * 2);
206             }
207 
208             c[] = a[] + 6;
209 
210             for (int i = 0; i < dim; i++)
211             {
212                 if (c[i] != cast(T)(a[i] + 6))
213                 {
214                     printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
215                     assert(0);
216                 }
217             }
218         }
219     }
220 }
221 
222 
223 /* ======================================================================== */
224 
225 /***********************
226  * Computes:
227  *      a[] = b[] + c[]
228  */
229 
230 T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
231 {
232     return _arraySliceSliceAddSliceAssign_s(a, c, b);
233 }
234 
235 T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
236 {
237     return _arraySliceSliceAddSliceAssign_s(a, c, b);
238 }
239 
240 T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
241 in
242 {
243         assert(a.length == b.length && b.length == c.length);
244         assert(disjoint(a, b));
245         assert(disjoint(a, c));
246         assert(disjoint(b, c));
247 }
248 body
249 {
250     //printf("_arraySliceSliceAddSliceAssign_s()\n");
251     auto aptr = a.ptr;
252     auto aend = aptr + a.length;
253     auto bptr = b.ptr;
254     auto cptr = c.ptr;
255 
256     version (D_InlineAsm_X86)
257     {
258         // SSE2 aligned version is 3777% faster
259         if (sse2() && a.length >= 16)
260         {
261             auto n = aptr + (a.length & ~15);
262 
263             if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
264             {
265                 asm // unaligned case
266                 {
267                     mov ESI, aptr;
268                     mov EDI, n;
269                     mov EAX, bptr;
270                     mov ECX, cptr;
271 
272                     align 4;
273                 startsse2u:
274                     add ESI, 32;
275                     movdqu XMM0, [EAX];
276                     movdqu XMM1, [EAX+16];
277                     add EAX, 32;
278                     movdqu XMM2, [ECX];
279                     movdqu XMM3, [ECX+16];
280                     add ECX, 32;
281                     paddw XMM0, XMM2;
282                     paddw XMM1, XMM3;
283                     movdqu [ESI   -32], XMM0;
284                     movdqu [ESI+16-32], XMM1;
285                     cmp ESI, EDI;
286                     jb startsse2u;
287 
288                     mov aptr, ESI;
289                     mov bptr, EAX;
290                     mov cptr, ECX;
291                 }
292             }
293             else
294             {
295                 asm // aligned case
296                 {
297                     mov ESI, aptr;
298                     mov EDI, n;
299                     mov EAX, bptr;
300                     mov ECX, cptr;
301 
302                     align 4;
303                 startsse2a:
304                     add ESI, 32;
305                     movdqa XMM0, [EAX];
306                     movdqa XMM1, [EAX+16];
307                     add EAX, 32;
308                     movdqa XMM2, [ECX];
309                     movdqa XMM3, [ECX+16];
310                     add ECX, 32;
311                     paddw XMM0, XMM2;
312                     paddw XMM1, XMM3;
313                     movdqa [ESI   -32], XMM0;
314                     movdqa [ESI+16-32], XMM1;
315                     cmp ESI, EDI;
316                     jb startsse2a;
317 
318                     mov aptr, ESI;
319                     mov bptr, EAX;
320                     mov cptr, ECX;
321                 }
322             }
323         }
324         else
325         // MMX version is 2068% faster
326         if (mmx() && a.length >= 8)
327         {
328             auto n = aptr + (a.length & ~7);
329 
330             asm
331             {
332                 mov ESI, aptr;
333                 mov EDI, n;
334                 mov EAX, bptr;
335                 mov ECX, cptr;
336 
337                 align 4;
338             startmmx:
339                 add ESI, 16;
340                 movq MM0, [EAX];
341                 movq MM1, [EAX+8];
342                 add EAX, 16;
343                 movq MM2, [ECX];
344                 movq MM3, [ECX+8];
345                 add ECX, 16;
346                 paddw MM0, MM2;
347                 paddw MM1, MM3;
348                 movq [ESI  -16], MM0;
349                 movq [ESI+8-16], MM1;
350                 cmp ESI, EDI;
351                 jb startmmx;
352 
353                 emms;
354                 mov aptr, ESI;
355                 mov bptr, EAX;
356                 mov cptr, ECX;
357             }
358         }
359     }
360 
361     while (aptr < aend)
362         *aptr++ = cast(T)(*bptr++ + *cptr++);
363 
364     return a;
365 }
366 
367 unittest
368 {
369     printf("_arraySliceSliceAddSliceAssign_s unittest\n");
370 
371     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
372     {
373         version (log) printf("    cpuid %d\n", cpuid);
374 
375         for (int j = 0; j < 2; j++)
376         {
377             const int dim = 67;
378             T[] a = new T[dim + j];     // aligned on 16 byte boundary
379             a = a[j .. dim + j];        // misalign for second iteration
380             T[] b = new T[dim + j];
381             b = b[j .. dim + j];
382             T[] c = new T[dim + j];
383             c = c[j .. dim + j];
384 
385             for (int i = 0; i < dim; i++)
386             {   a[i] = cast(T)i;
387                 b[i] = cast(T)(i + 7);
388                 c[i] = cast(T)(i * 2);
389             }
390 
391             c[] = a[] + b[];
392 
393             for (int i = 0; i < dim; i++)
394             {
395                 if (c[i] != cast(T)(a[i] + b[i]))
396                 {
397                     printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
398                     assert(0);
399                 }
400             }
401         }
402     }
403 }
404 
405 
406 /* ======================================================================== */
407 
408 /***********************
409  * Computes:
410  *      a[] += value
411  */
412 
413 T[] _arrayExpSliceAddass_u(T[] a, T value)
414 {
415     return _arrayExpSliceAddass_s(a, value);
416 }
417 
418 T[] _arrayExpSliceAddass_t(T[] a, T value)
419 {
420     return _arrayExpSliceAddass_s(a, value);
421 }
422 
423 T[] _arrayExpSliceAddass_s(T[] a, T value)
424 {
425     //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
426     auto aptr = a.ptr;
427     auto aend = aptr + a.length;
428 
429     version (D_InlineAsm_X86)
430     {
431         // SSE2 aligned version is 832% faster
432         if (sse2() && a.length >= 16)
433         {
434             auto n = aptr + (a.length & ~15);
435 
436             uint l = cast(ushort) value;
437             l |= (l << 16);
438 
439             if (((cast(uint) aptr) & 15) != 0)
440             {
441                 asm // unaligned case
442                 {
443                     mov ESI, aptr;
444                     mov EDI, n;
445                     movd XMM2, l;
446                     pshufd XMM2, XMM2, 0;
447 
448                     align 4;
449                 startaddsse2u:
450                     movdqu XMM0, [ESI];
451                     movdqu XMM1, [ESI+16];
452                     add ESI, 32;
453                     paddw XMM0, XMM2;
454                     paddw XMM1, XMM2;
455                     movdqu [ESI   -32], XMM0;
456                     movdqu [ESI+16-32], XMM1;
457                     cmp ESI, EDI;
458                     jb startaddsse2u;
459 
460                     mov aptr, ESI;
461                 }
462             }
463             else
464             {
465                 asm // aligned case
466                 {
467                     mov ESI, aptr;
468                     mov EDI, n;
469                     movd XMM2, l;
470                     pshufd XMM2, XMM2, 0;
471 
472                     align 4;
473                 startaddsse2a:
474                     movdqa XMM0, [ESI];
475                     movdqa XMM1, [ESI+16];
476                     add ESI, 32;
477                     paddw XMM0, XMM2;
478                     paddw XMM1, XMM2;
479                     movdqa [ESI   -32], XMM0;
480                     movdqa [ESI+16-32], XMM1;
481                     cmp ESI, EDI;
482                     jb startaddsse2a;
483 
484                     mov aptr, ESI;
485                 }
486             }
487         }
488         else
489         // MMX version is 826% faster
490         if (mmx() && a.length >= 8)
491         {
492             auto n = aptr + (a.length & ~7);
493 
494             uint l = cast(ushort) value;
495 
496             asm
497             {
498                 mov ESI, aptr;
499                 mov EDI, n;
500                 movd MM2, l;
501                 pshufw MM2, MM2, 0;
502 
503                 align 4;
504             startmmx:
505                 movq MM0, [ESI];
506                 movq MM1, [ESI+8];
507                 add ESI, 16;
508                 paddw MM0, MM2;
509                 paddw MM1, MM2;
510                 movq [ESI  -16], MM0;
511                 movq [ESI+8-16], MM1;
512                 cmp ESI, EDI;
513                 jb startmmx;
514 
515                 emms;
516                 mov aptr, ESI;
517             }
518         }
519     }
520 
521     while (aptr < aend)
522         *aptr++ += value;
523 
524     return a;
525 }
526 
527 unittest
528 {
529     printf("_arrayExpSliceAddass_s unittest\n");
530 
531     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
532     {
533         version (log) printf("    cpuid %d\n", cpuid);
534 
535         for (int j = 0; j < 2; j++)
536         {
537             const int dim = 67;
538             T[] a = new T[dim + j];     // aligned on 16 byte boundary
539             a = a[j .. dim + j];        // misalign for second iteration
540             T[] b = new T[dim + j];
541             b = b[j .. dim + j];
542             T[] c = new T[dim + j];
543             c = c[j .. dim + j];
544 
545             for (int i = 0; i < dim; i++)
546             {   a[i] = cast(T)i;
547                 b[i] = cast(T)(i + 7);
548                 c[i] = cast(T)(i * 2);
549             }
550 
551             a[] = c[];
552             a[] += 6;
553 
554             for (int i = 0; i < dim; i++)
555             {
556                 if (a[i] != cast(T)(c[i] + 6))
557                 {
558                     printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
559                     assert(0);
560                 }
561             }
562         }
563     }
564 }
565 
566 
567 /* ======================================================================== */
568 
569 /***********************
570  * Computes:
571  *      a[] += b[]
572  */
573 
574 T[] _arraySliceSliceAddass_u(T[] a, T[] b)
575 {
576     return _arraySliceSliceAddass_s(a, b);
577 }
578 
579 T[] _arraySliceSliceAddass_t(T[] a, T[] b)
580 {
581     return _arraySliceSliceAddass_s(a, b);
582 }
583 
584 T[] _arraySliceSliceAddass_s(T[] a, T[] b)
585 in
586 {
587     assert (a.length == b.length);
588     assert (disjoint(a, b));
589 }
590 body
591 {
592     //printf("_arraySliceSliceAddass_s()\n");
593     auto aptr = a.ptr;
594     auto aend = aptr + a.length;
595     auto bptr = b.ptr;
596 
597     version (D_InlineAsm_X86)
598     {
599         // SSE2 aligned version is 2085% faster
600         if (sse2() && a.length >= 16)
601         {
602             auto n = aptr + (a.length & ~15);
603 
604             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
605             {
606                 asm // unaligned case
607                 {
608                     mov ESI, aptr;
609                     mov EDI, n;
610                     mov ECX, bptr;
611 
612                     align 4;
613                 startsse2u:
614                     movdqu XMM0, [ESI];
615                     movdqu XMM1, [ESI+16];
616                     add ESI, 32;
617                     movdqu XMM2, [ECX];
618                     movdqu XMM3, [ECX+16];
619                     add ECX, 32;
620                     paddw XMM0, XMM2;
621                     paddw XMM1, XMM3;
622                     movdqu [ESI   -32], XMM0;
623                     movdqu [ESI+16-32], XMM1;
624                     cmp ESI, EDI;
625                     jb startsse2u;
626 
627                     mov aptr, ESI;
628                     mov bptr, ECX;
629                 }
630             }
631             else
632             {
633                 asm // aligned case
634                 {
635                     mov ESI, aptr;
636                     mov EDI, n;
637                     mov ECX, bptr;
638 
639                     align 4;
640                 startsse2a:
641                     movdqa XMM0, [ESI];
642                     movdqa XMM1, [ESI+16];
643                     add ESI, 32;
644                     movdqa XMM2, [ECX];
645                     movdqa XMM3, [ECX+16];
646                     add ECX, 32;
647                     paddw XMM0, XMM2;
648                     paddw XMM1, XMM3;
649                     movdqa [ESI   -32], XMM0;
650                     movdqa [ESI+16-32], XMM1;
651                     cmp ESI, EDI;
652                     jb startsse2a;
653 
654                     mov aptr, ESI;
655                     mov bptr, ECX;
656                 }
657             }
658         }
659         else
660         // MMX version is 1022% faster
661         if (mmx() && a.length >= 8)
662         {
663             auto n = aptr + (a.length & ~7);
664 
665             asm
666             {
667                 mov ESI, aptr;
668                 mov EDI, n;
669                 mov ECX, bptr;
670 
671                 align 4;
672             start:
673                 movq MM0, [ESI];
674                 movq MM1, [ESI+8];
675                 add ESI, 16;
676                 movq MM2, [ECX];
677                 movq MM3, [ECX+8];
678                 add ECX, 16;
679                 paddw MM0, MM2;
680                 paddw MM1, MM3;
681                 movq [ESI  -16], MM0;
682                 movq [ESI+8-16], MM1;
683                 cmp ESI, EDI;
684                 jb start;
685 
686                 emms;
687                 mov aptr, ESI;
688                 mov bptr, ECX;
689             }
690         }
691     }
692 
693     while (aptr < aend)
694         *aptr++ += *bptr++;
695 
696     return a;
697 }
698 
699 unittest
700 {
701     printf("_arraySliceSliceAddass_s unittest\n");
702 
703     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
704     {
705         version (log) printf("    cpuid %d\n", cpuid);
706 
707         for (int j = 0; j < 2; j++)
708         {
709             const int dim = 67;
710             T[] a = new T[dim + j];     // aligned on 16 byte boundary
711             a = a[j .. dim + j];        // misalign for second iteration
712             T[] b = new T[dim + j];
713             b = b[j .. dim + j];
714             T[] c = new T[dim + j];
715             c = c[j .. dim + j];
716 
717             for (int i = 0; i < dim; i++)
718             {   a[i] = cast(T)i;
719                 b[i] = cast(T)(i + 7);
720                 c[i] = cast(T)(i * 2);
721             }
722 
723             b[] = c[];
724             c[] += a[];
725 
726             for (int i = 0; i < dim; i++)
727             {
728                 if (c[i] != cast(T)(b[i] + a[i]))
729                 {
730                     printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
731                     assert(0);
732                 }
733             }
734         }
735     }
736 }
737 
738 
739 /* ======================================================================== */
740 
741 /***********************
742  * Computes:
743  *      a[] = b[] - value
744  */
745 
746 T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
747 {
748     return _arraySliceExpMinSliceAssign_s(a, value, b);
749 }
750 
751 T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
752 {
753     return _arraySliceExpMinSliceAssign_s(a, value, b);
754 }
755 
756 T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
757 in
758 {
759     assert(a.length == b.length);
760     assert(disjoint(a, b));
761 }
762 body
763 {
764     //printf("_arraySliceExpMinSliceAssign_s()\n");
765     auto aptr = a.ptr;
766     auto aend = aptr + a.length;
767     auto bptr = b.ptr;
768 
769     version (D_InlineAsm_X86)
770     {
771         // SSE2 aligned version is 3695% faster
772         if (sse2() && a.length >= 16)
773         {
774             auto n = aptr + (a.length & ~15);
775 
776             uint l = cast(ushort) value;
777             l |= (l << 16);
778 
779             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
780             {
781                 asm // unaligned case
782                 {
783                     mov ESI, aptr;
784                     mov EDI, n;
785                     mov EAX, bptr;
786                     movd XMM2, l;
787                     pshufd XMM2, XMM2, 0;
788 
789                     align 4;
790                 startaddsse2u:
791                     add ESI, 32;
792                     movdqu XMM0, [EAX];
793                     movdqu XMM1, [EAX+16];
794                     add EAX, 32;
795                     psubw XMM0, XMM2;
796                     psubw XMM1, XMM2;
797                     movdqu [ESI   -32], XMM0;
798                     movdqu [ESI+16-32], XMM1;
799                     cmp ESI, EDI;
800                     jb startaddsse2u;
801 
802                     mov aptr, ESI;
803                     mov bptr, EAX;
804                 }
805             }
806             else
807             {
808                 asm // aligned case
809                 {
810                     mov ESI, aptr;
811                     mov EDI, n;
812                     mov EAX, bptr;
813                     movd XMM2, l;
814                     pshufd XMM2, XMM2, 0;
815 
816                     align 4;
817                 startaddsse2a:
818                     add ESI, 32;
819                     movdqa XMM0, [EAX];
820                     movdqa XMM1, [EAX+16];
821                     add EAX, 32;
822                     psubw XMM0, XMM2;
823                     psubw XMM1, XMM2;
824                     movdqa [ESI   -32], XMM0;
825                     movdqa [ESI+16-32], XMM1;
826                     cmp ESI, EDI;
827                     jb startaddsse2a;
828 
829                     mov aptr, ESI;
830                     mov bptr, EAX;
831                 }
832             }
833         }
834         else
835         // MMX version is 3049% faster
836         if (mmx() && a.length >= 8)
837         {
838             auto n = aptr + (a.length & ~7);
839 
840             uint l = cast(ushort) value;
841 
842             asm
843             {
844                 mov ESI, aptr;
845                 mov EDI, n;
846                 mov EAX, bptr;
847                 movd MM2, l;
848                 pshufw MM2, MM2, 0;
849 
850                 align 4;
851             startmmx:
852                 add ESI, 16;
853                 movq MM0, [EAX];
854                 movq MM1, [EAX+8];
855                 add EAX, 16;
856                 psubw MM0, MM2;
857                 psubw MM1, MM2;
858                 movq [ESI  -16], MM0;
859                 movq [ESI+8-16], MM1;
860                 cmp ESI, EDI;
861                 jb startmmx;
862 
863                 emms;
864                 mov aptr, ESI;
865                 mov bptr, EAX;
866             }
867         }
868     }
869 
870     while (aptr < aend)
871         *aptr++ = cast(T)(*bptr++ - value);
872 
873     return a;
874 }
875 
876 unittest
877 {
878     printf("_arraySliceExpMinSliceAssign_s unittest\n");
879 
880     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
881     {
882         version (log) printf("    cpuid %d\n", cpuid);
883 
884         for (int j = 0; j < 2; j++)
885         {
886             const int dim = 67;
887             T[] a = new T[dim + j];     // aligned on 16 byte boundary
888             a = a[j .. dim + j];        // misalign for second iteration
889             T[] b = new T[dim + j];
890             b = b[j .. dim + j];
891             T[] c = new T[dim + j];
892             c = c[j .. dim + j];
893 
894             for (int i = 0; i < dim; i++)
895             {   a[i] = cast(T)i;
896                 b[i] = cast(T)(i + 7);
897                 c[i] = cast(T)(i * 2);
898             }
899 
900             c[] = a[] - 6;
901 
902             for (int i = 0; i < dim; i++)
903             {
904                 if (c[i] != cast(T)(a[i] - 6))
905                 {
906                     printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
907                     assert(0);
908                 }
909             }
910         }
911     }
912 }
913 
914 
915 /* ======================================================================== */
916 
917 /***********************
918  * Computes:
919  *      a[] = value - b[]
920  */
921 
922 T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
923 {
924     return _arrayExpSliceMinSliceAssign_s(a, b, value);
925 }
926 
927 T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
928 {
929     return _arrayExpSliceMinSliceAssign_s(a, b, value);
930 }
931 
932 T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
933 in
934 {
935     assert(a.length == b.length);
936     assert(disjoint(a, b));
937 }
938 body
939 {
940     //printf("_arrayExpSliceMinSliceAssign_s()\n");
941     auto aptr = a.ptr;
942     auto aend = aptr + a.length;
943     auto bptr = b.ptr;
944 
945     version (D_InlineAsm_X86)
946     {
947         // SSE2 aligned version is 4995% faster
948         if (sse2() && a.length >= 16)
949         {
950             auto n = aptr + (a.length & ~15);
951 
952             uint l = cast(ushort) value;
953             l |= (l << 16);
954 
955             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
956             {
957                 asm // unaligned case
958                 {
959                     mov ESI, aptr;
960                     mov EDI, n;
961                     mov EAX, bptr;
962 
963                     align 4;
964                 startaddsse2u:
965                     movd XMM2, l;
966                     pshufd XMM2, XMM2, 0;
967                     movd XMM3, l;
968                     pshufd XMM3, XMM3, 0;
969                     add ESI, 32;
970                     movdqu XMM0, [EAX];
971                     movdqu XMM1, [EAX+16];
972                     add EAX, 32;
973                     psubw XMM2, XMM0;
974                     psubw XMM3, XMM1;
975                     movdqu [ESI   -32], XMM2;
976                     movdqu [ESI+16-32], XMM3;
977                     cmp ESI, EDI;
978                     jb startaddsse2u;
979 
980                     mov aptr, ESI;
981                     mov bptr, EAX;
982                 }
983             }
984             else
985             {
986                 asm // aligned case
987                 {
988                     mov ESI, aptr;
989                     mov EDI, n;
990                     mov EAX, bptr;
991 
992                     align 4;
993                 startaddsse2a:
994                     movd XMM2, l;
995                     pshufd XMM2, XMM2, 0;
996                     movd XMM3, l;
997                     pshufd XMM3, XMM3, 0;
998                     add ESI, 32;
999                     movdqa XMM0, [EAX];
1000                     movdqa XMM1, [EAX+16];
1001                     add EAX, 32;
1002                     psubw XMM2, XMM0;
1003                     psubw XMM3, XMM1;
1004                     movdqa [ESI   -32], XMM2;
1005                     movdqa [ESI+16-32], XMM3;
1006                     cmp ESI, EDI;
1007                     jb startaddsse2a;
1008 
1009                     mov aptr, ESI;
1010                     mov bptr, EAX;
1011                 }
1012             }
1013         }
1014         else
1015         // MMX version is 4562% faster
1016         if (mmx() && a.length >= 8)
1017         {
1018             auto n = aptr + (a.length & ~7);
1019 
1020             uint l = cast(ushort) value;
1021 
1022             asm
1023             {
1024                 mov ESI, aptr;
1025                 mov EDI, n;
1026                 mov EAX, bptr;
1027                 movd MM4, l;
1028                 pshufw MM4, MM4, 0;
1029 
1030                 align 4;
1031             startmmx:
1032                 add ESI, 16;
1033                 movq MM2, [EAX];
1034                 movq MM3, [EAX+8];
1035                 movq MM0, MM4;
1036                 movq MM1, MM4;
1037                 add EAX, 16;
1038                 psubw MM0, MM2;
1039                 psubw MM1, MM3;
1040                 movq [ESI  -16], MM0;
1041                 movq [ESI+8-16], MM1;
1042                 cmp ESI, EDI;
1043                 jb startmmx;
1044 
1045                 emms;
1046                 mov aptr, ESI;
1047                 mov bptr, EAX;
1048             }
1049         }
1050     }
1051 
1052     while (aptr < aend)
1053         *aptr++ = cast(T)(value - *bptr++);
1054 
1055     return a;
1056 }
1057 
1058 unittest
1059 {
1060     printf("_arrayExpSliceMinSliceAssign_s unittest\n");
1061 
1062     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1063     {
1064         version (log) printf("    cpuid %d\n", cpuid);
1065 
1066         for (int j = 0; j < 2; j++)
1067         {
1068             const int dim = 67;
1069             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1070             a = a[j .. dim + j];        // misalign for second iteration
1071             T[] b = new T[dim + j];
1072             b = b[j .. dim + j];
1073             T[] c = new T[dim + j];
1074             c = c[j .. dim + j];
1075 
1076             for (int i = 0; i < dim; i++)
1077             {   a[i] = cast(T)i;
1078                 b[i] = cast(T)(i + 7);
1079                 c[i] = cast(T)(i * 2);
1080             }
1081 
1082             c[] = 6 - a[];
1083 
1084             for (int i = 0; i < dim; i++)
1085             {
1086                 if (c[i] != cast(T)(6 - a[i]))
1087                 {
1088                     printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
1089                     assert(0);
1090                 }
1091             }
1092         }
1093     }
1094 }
1095 
1096 
1097 /* ======================================================================== */
1098 
1099 /***********************
1100  * Computes:
1101  *      a[] = b[] - c[]
1102  */
1103 
1104 T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
1105 {
1106     return _arraySliceSliceMinSliceAssign_s(a, c, b);
1107 }
1108 
1109 T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
1110 {
1111     return _arraySliceSliceMinSliceAssign_s(a, c, b);
1112 }
1113 
1114 T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
1115 in
1116 {
1117         assert(a.length == b.length && b.length == c.length);
1118         assert(disjoint(a, b));
1119         assert(disjoint(a, c));
1120         assert(disjoint(b, c));
1121 }
1122 body
1123 {
1124     auto aptr = a.ptr;
1125     auto aend = aptr + a.length;
1126     auto bptr = b.ptr;
1127     auto cptr = c.ptr;
1128 
1129     version (D_InlineAsm_X86)
1130     {
1131         // SSE2 aligned version is 4129% faster
1132         if (sse2() && a.length >= 16)
1133         {
1134             auto n = aptr + (a.length & ~15);
1135 
1136             if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1137             {
1138                 asm // unaligned case
1139                 {
1140                     mov ESI, aptr;
1141                     mov EDI, n;
1142                     mov EAX, bptr;
1143                     mov ECX, cptr;
1144 
1145                     align 4;
1146                 startsse2u:
1147                     add ESI, 32;
1148                     movdqu XMM0, [EAX];
1149                     movdqu XMM1, [EAX+16];
1150                     add EAX, 32;
1151                     movdqu XMM2, [ECX];
1152                     movdqu XMM3, [ECX+16];
1153                     add ECX, 32;
1154                     psubw XMM0, XMM2;
1155                     psubw XMM1, XMM3;
1156                     movdqu [ESI   -32], XMM0;
1157                     movdqu [ESI+16-32], XMM1;
1158                     cmp ESI, EDI;
1159                     jb startsse2u;
1160 
1161                     mov aptr, ESI;
1162                     mov bptr, EAX;
1163                     mov cptr, ECX;
1164                 }
1165             }
1166             else
1167             {
1168                 asm // aligned case
1169                 {
1170                     mov ESI, aptr;
1171                     mov EDI, n;
1172                     mov EAX, bptr;
1173                     mov ECX, cptr;
1174 
1175                     align 4;
1176                 startsse2a:
1177                     add ESI, 32;
1178                     movdqa XMM0, [EAX];
1179                     movdqa XMM1, [EAX+16];
1180                     add EAX, 32;
1181                     movdqa XMM2, [ECX];
1182                     movdqa XMM3, [ECX+16];
1183                     add ECX, 32;
1184                     psubw XMM0, XMM2;
1185                     psubw XMM1, XMM3;
1186                     movdqa [ESI   -32], XMM0;
1187                     movdqa [ESI+16-32], XMM1;
1188                     cmp ESI, EDI;
1189                     jb startsse2a;
1190 
1191                     mov aptr, ESI;
1192                     mov bptr, EAX;
1193                     mov cptr, ECX;
1194                 }
1195             }
1196         }
1197         else
1198         // MMX version is 2018% faster
1199         if (mmx() && a.length >= 8)
1200         {
1201             auto n = aptr + (a.length & ~7);
1202 
1203             asm
1204             {
1205                 mov ESI, aptr;
1206                 mov EDI, n;
1207                 mov EAX, bptr;
1208                 mov ECX, cptr;
1209 
1210                 align 4;
1211             startmmx:
1212                 add ESI, 16;
1213                 movq MM0, [EAX];
1214                 movq MM1, [EAX+8];
1215                 add EAX, 16;
1216                 movq MM2, [ECX];
1217                 movq MM3, [ECX+8];
1218                 add ECX, 16;
1219                 psubw MM0, MM2;
1220                 psubw MM1, MM3;
1221                 movq [ESI  -16], MM0;
1222                 movq [ESI+8-16], MM1;
1223                 cmp ESI, EDI;
1224                 jb startmmx;
1225 
1226                 emms;
1227                 mov aptr, ESI;
1228                 mov bptr, EAX;
1229                 mov cptr, ECX;
1230             }
1231         }
1232     }
1233 
1234     while (aptr < aend)
1235         *aptr++ = cast(T)(*bptr++ - *cptr++);
1236 
1237     return a;
1238 }
1239 
1240 unittest
1241 {
1242     printf("_arraySliceSliceMinSliceAssign_s unittest\n");
1243 
1244     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1245     {
1246         version (log) printf("    cpuid %d\n", cpuid);
1247 
1248         for (int j = 0; j < 2; j++)
1249         {
1250             const int dim = 67;
1251             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1252             a = a[j .. dim + j];        // misalign for second iteration
1253             T[] b = new T[dim + j];
1254             b = b[j .. dim + j];
1255             T[] c = new T[dim + j];
1256             c = c[j .. dim + j];
1257 
1258             for (int i = 0; i < dim; i++)
1259             {   a[i] = cast(T)i;
1260                 b[i] = cast(T)(i + 7);
1261                 c[i] = cast(T)(i * 2);
1262             }
1263 
1264             c[] = a[] - b[];
1265 
1266             for (int i = 0; i < dim; i++)
1267             {
1268                 if (c[i] != cast(T)(a[i] - b[i]))
1269                 {
1270                     printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
1271                     assert(0);
1272                 }
1273             }
1274         }
1275     }
1276 }
1277 
1278 
1279 /* ======================================================================== */
1280 
1281 /***********************
1282  * Computes:
1283  *      a[] -= value
1284  */
1285 
1286 T[] _arrayExpSliceMinass_u(T[] a, T value)
1287 {
1288     return _arrayExpSliceMinass_s(a, value);
1289 }
1290 
1291 T[] _arrayExpSliceMinass_t(T[] a, T value)
1292 {
1293     return _arrayExpSliceMinass_s(a, value);
1294 }
1295 
1296 T[] _arrayExpSliceMinass_s(T[] a, T value)
1297 {
1298     //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1299     auto aptr = a.ptr;
1300     auto aend = aptr + a.length;
1301 
1302     version (D_InlineAsm_X86)
1303     {
1304         // SSE2 aligned version is 835% faster
1305         if (sse2() && a.length >= 16)
1306         {
1307             auto n = aptr + (a.length & ~15);
1308 
1309             uint l = cast(ushort) value;
1310             l |= (l << 16);
1311 
1312             if (((cast(uint) aptr) & 15) != 0)
1313             {
1314                 asm // unaligned case
1315                 {
1316                     mov ESI, aptr;
1317                     mov EDI, n;
1318                     movd XMM2, l;
1319                     pshufd XMM2, XMM2, 0;
1320 
1321                     align 4;
1322                 startaddsse2u:
1323                     movdqu XMM0, [ESI];
1324                     movdqu XMM1, [ESI+16];
1325                     add ESI, 32;
1326                     psubw XMM0, XMM2;
1327                     psubw XMM1, XMM2;
1328                     movdqu [ESI   -32], XMM0;
1329                     movdqu [ESI+16-32], XMM1;
1330                     cmp ESI, EDI;
1331                     jb startaddsse2u;
1332 
1333                     mov aptr, ESI;
1334                 }
1335             }
1336             else
1337             {
1338                 asm // aligned case
1339                 {
1340                     mov ESI, aptr;
1341                     mov EDI, n;
1342                     movd XMM2, l;
1343                     pshufd XMM2, XMM2, 0;
1344 
1345                     align 4;
1346                 startaddsse2a:
1347                     movdqa XMM0, [ESI];
1348                     movdqa XMM1, [ESI+16];
1349                     add ESI, 32;
1350                     psubw XMM0, XMM2;
1351                     psubw XMM1, XMM2;
1352                     movdqa [ESI   -32], XMM0;
1353                     movdqa [ESI+16-32], XMM1;
1354                     cmp ESI, EDI;
1355                     jb startaddsse2a;
1356 
1357                     mov aptr, ESI;
1358                 }
1359             }
1360         }
1361         else
1362         // MMX version is 835% faster
1363         if (mmx() && a.length >= 8)
1364         {
1365             auto n = aptr + (a.length & ~7);
1366 
1367             uint l = cast(ushort) value;
1368 
1369             asm
1370             {
1371                 mov ESI, aptr;
1372                 mov EDI, n;
1373                 movd MM2, l;
1374                 pshufw MM2, MM2, 0;
1375 
1376                 align 4;
1377             startmmx:
1378                 movq MM0, [ESI];
1379                 movq MM1, [ESI+8];
1380                 add ESI, 16;
1381                 psubw MM0, MM2;
1382                 psubw MM1, MM2;
1383                 movq [ESI  -16], MM0;
1384                 movq [ESI+8-16], MM1;
1385                 cmp ESI, EDI;
1386                 jb startmmx;
1387 
1388                 emms;
1389                 mov aptr, ESI;
1390             }
1391         }
1392     }
1393 
1394     while (aptr < aend)
1395         *aptr++ -= value;
1396 
1397     return a;
1398 }
1399 
1400 unittest
1401 {
1402     printf("_arrayExpSliceMinass_s unittest\n");
1403 
1404     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1405     {
1406         version (log) printf("    cpuid %d\n", cpuid);
1407 
1408         for (int j = 0; j < 2; j++)
1409         {
1410             const int dim = 67;
1411             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1412             a = a[j .. dim + j];        // misalign for second iteration
1413             T[] b = new T[dim + j];
1414             b = b[j .. dim + j];
1415             T[] c = new T[dim + j];
1416             c = c[j .. dim + j];
1417 
1418             for (int i = 0; i < dim; i++)
1419             {   a[i] = cast(T)i;
1420                 b[i] = cast(T)(i + 7);
1421                 c[i] = cast(T)(i * 2);
1422             }
1423 
1424             a[] = c[];
1425             a[] -= 6;
1426 
1427             for (int i = 0; i < dim; i++)
1428             {
1429                 if (a[i] != cast(T)(c[i] - 6))
1430                 {
1431                     printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
1432                     assert(0);
1433                 }
1434             }
1435         }
1436     }
1437 }
1438 
1439 
1440 /* ======================================================================== */
1441 
1442 /***********************
1443  * Computes:
1444  *      a[] -= b[]
1445  */
1446 
1447 T[] _arraySliceSliceMinass_u(T[] a, T[] b)
1448 {
1449     return _arraySliceSliceMinass_s(a, b);
1450 }
1451 
1452 T[] _arraySliceSliceMinass_t(T[] a, T[] b)
1453 {
1454     return _arraySliceSliceMinass_s(a, b);
1455 }
1456 
1457 T[] _arraySliceSliceMinass_s(T[] a, T[] b)
1458 in
1459 {
1460     assert (a.length == b.length);
1461     assert (disjoint(a, b));
1462 }
1463 body
1464 {
1465     //printf("_arraySliceSliceMinass_s()\n");
1466     auto aptr = a.ptr;
1467     auto aend = aptr + a.length;
1468     auto bptr = b.ptr;
1469 
1470     version (D_InlineAsm_X86)
1471     {
1472         // SSE2 aligned version is 2121% faster
1473         if (sse2() && a.length >= 16)
1474         {
1475             auto n = aptr + (a.length & ~15);
1476 
1477             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1478             {
1479                 asm // unaligned case
1480                 {
1481                     mov ESI, aptr;
1482                     mov EDI, n;
1483                     mov ECX, bptr;
1484 
1485                     align 4;
1486                 startsse2u:
1487                     movdqu XMM0, [ESI];
1488                     movdqu XMM1, [ESI+16];
1489                     add ESI, 32;
1490                     movdqu XMM2, [ECX];
1491                     movdqu XMM3, [ECX+16];
1492                     add ECX, 32;
1493                     psubw XMM0, XMM2;
1494                     psubw XMM1, XMM3;
1495                     movdqu [ESI   -32], XMM0;
1496                     movdqu [ESI+16-32], XMM1;
1497                     cmp ESI, EDI;
1498                     jb startsse2u;
1499 
1500                     mov aptr, ESI;
1501                     mov bptr, ECX;
1502                 }
1503             }
1504             else
1505             {
1506                 asm // aligned case
1507                 {
1508                     mov ESI, aptr;
1509                     mov EDI, n;
1510                     mov ECX, bptr;
1511 
1512                     align 4;
1513                 startsse2a:
1514                     movdqa XMM0, [ESI];
1515                     movdqa XMM1, [ESI+16];
1516                     add ESI, 32;
1517                     movdqa XMM2, [ECX];
1518                     movdqa XMM3, [ECX+16];
1519                     add ECX, 32;
1520                     psubw XMM0, XMM2;
1521                     psubw XMM1, XMM3;
1522                     movdqa [ESI   -32], XMM0;
1523                     movdqa [ESI+16-32], XMM1;
1524                     cmp ESI, EDI;
1525                     jb startsse2a;
1526 
1527                     mov aptr, ESI;
1528                     mov bptr, ECX;
1529                 }
1530             }
1531         }
1532         else
1533         // MMX version is 1116% faster
1534         if (mmx() && a.length >= 8)
1535         {
1536             auto n = aptr + (a.length & ~7);
1537 
1538             asm
1539             {
1540                 mov ESI, aptr;
1541                 mov EDI, n;
1542                 mov ECX, bptr;
1543 
1544                 align 4;
1545             start:
1546                 movq MM0, [ESI];
1547                 movq MM1, [ESI+8];
1548                 add ESI, 16;
1549                 movq MM2, [ECX];
1550                 movq MM3, [ECX+8];
1551                 add ECX, 16;
1552                 psubw MM0, MM2;
1553                 psubw MM1, MM3;
1554                 movq [ESI  -16], MM0;
1555                 movq [ESI+8-16], MM1;
1556                 cmp ESI, EDI;
1557                 jb start;
1558 
1559                 emms;
1560                 mov aptr, ESI;
1561                 mov bptr, ECX;
1562             }
1563         }
1564     }
1565 
1566     while (aptr < aend)
1567         *aptr++ -= *bptr++;
1568 
1569     return a;
1570 }
1571 
1572 unittest
1573 {
1574     printf("_arraySliceSliceMinass_s unittest\n");
1575 
1576     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1577     {
1578         version (log) printf("    cpuid %d\n", cpuid);
1579 
1580         for (int j = 0; j < 2; j++)
1581         {
1582             const int dim = 67;
1583             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1584             a = a[j .. dim + j];        // misalign for second iteration
1585             T[] b = new T[dim + j];
1586             b = b[j .. dim + j];
1587             T[] c = new T[dim + j];
1588             c = c[j .. dim + j];
1589 
1590             for (int i = 0; i < dim; i++)
1591             {   a[i] = cast(T)i;
1592                 b[i] = cast(T)(i + 7);
1593                 c[i] = cast(T)(i * 2);
1594             }
1595 
1596             b[] = c[];
1597             c[] -= a[];
1598 
1599             for (int i = 0; i < dim; i++)
1600             {
1601                 if (c[i] != cast(T)(b[i] - a[i]))
1602                 {
1603                     printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
1604                     assert(0);
1605                 }
1606             }
1607         }
1608     }
1609 }
1610 
1611 
1612 /* ======================================================================== */
1613 
1614 /***********************
1615  * Computes:
1616  *      a[] = b[] * value
1617  */
1618 
1619 T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
1620 {
1621     return _arraySliceExpMulSliceAssign_s(a, value, b);
1622 }
1623 
1624 T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
1625 {
1626     return _arraySliceExpMulSliceAssign_s(a, value, b);
1627 }
1628 
1629 T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
1630 in
1631 {
1632     assert(a.length == b.length);
1633     assert(disjoint(a, b));
1634 }
1635 body
1636 {
1637     //printf("_arraySliceExpMulSliceAssign_s()\n");
1638     auto aptr = a.ptr;
1639     auto aend = aptr + a.length;
1640     auto bptr = b.ptr;
1641 
1642     version (D_InlineAsm_X86)
1643     {
1644         // SSE2 aligned version is 3733% faster
1645         if (sse2() && a.length >= 16)
1646         {
1647             auto n = aptr + (a.length & ~15);
1648 
1649             uint l = cast(ushort) value;
1650             l |= l << 16;
1651 
1652             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
1653             {
1654                 asm
1655                 {
1656                     mov ESI, aptr;
1657                     mov EDI, n;
1658                     mov EAX, bptr;
1659                     movd XMM2, l;
1660                     pshufd XMM2, XMM2, 0;
1661 
1662                     align 4;
1663                 startsse2u:
1664                     add ESI, 32;
1665                     movdqu XMM0, [EAX];
1666                     movdqu XMM1, [EAX+16];
1667                     add EAX, 32;
1668                     pmullw XMM0, XMM2;
1669                     pmullw XMM1, XMM2;
1670                     movdqu [ESI   -32], XMM0;
1671                     movdqu [ESI+16-32], XMM1;
1672                     cmp ESI, EDI;
1673                     jb startsse2u;
1674 
1675                     mov aptr, ESI;
1676                     mov bptr, EAX;
1677                 }
1678             }
1679             else
1680             {
1681                 asm
1682                 {
1683                     mov ESI, aptr;
1684                     mov EDI, n;
1685                     mov EAX, bptr;
1686                     movd XMM2, l;
1687                     pshufd XMM2, XMM2, 0;
1688 
1689                     align 4;
1690                 startsse2a:
1691                     add ESI, 32;
1692                     movdqa XMM0, [EAX];
1693                     movdqa XMM1, [EAX+16];
1694                     add EAX, 32;
1695                     pmullw XMM0, XMM2;
1696                     pmullw XMM1, XMM2;
1697                     movdqa [ESI   -32], XMM0;
1698                     movdqa [ESI+16-32], XMM1;
1699                     cmp ESI, EDI;
1700                     jb startsse2a;
1701 
1702                     mov aptr, ESI;
1703                     mov bptr, EAX;
1704                 }
1705             }
1706         }
1707         else
1708         // MMX version is 3733% faster
1709         if (mmx() && a.length >= 8)
1710         {
1711             auto n = aptr + (a.length & ~7);
1712 
1713             uint l = cast(ushort) value;
1714 
1715             asm
1716             {
1717                 mov ESI, aptr;
1718                 mov EDI, n;
1719                 mov EAX, bptr;
1720                 movd MM2, l;
1721                 pshufw MM2, MM2, 0;
1722 
1723                 align 4;
1724             startmmx:
1725                 add ESI, 16;
1726                 movq MM0, [EAX];
1727                 movq MM1, [EAX+8];
1728                 add EAX, 16;
1729                 pmullw MM0, MM2;
1730                 pmullw MM1, MM2;
1731                 movq [ESI  -16], MM0;
1732                 movq [ESI+8-16], MM1;
1733                 cmp ESI, EDI;
1734                 jb startmmx;
1735 
1736                 emms;
1737                 mov aptr, ESI;
1738                 mov bptr, EAX;
1739             }
1740         }
1741     }
1742 
1743     while (aptr < aend)
1744         *aptr++ = cast(T)(*bptr++ * value);
1745 
1746     return a;
1747 }
1748 
1749 unittest
1750 {
1751     printf("_arraySliceExpMulSliceAssign_s unittest\n");
1752 
1753     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1754     {
1755         version (log) printf("    cpuid %d\n", cpuid);
1756 
1757         for (int j = 0; j < 2; j++)
1758         {
1759             const int dim = 67;
1760             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1761             a = a[j .. dim + j];        // misalign for second iteration
1762             T[] b = new T[dim + j];
1763             b = b[j .. dim + j];
1764             T[] c = new T[dim + j];
1765             c = c[j .. dim + j];
1766 
1767             for (int i = 0; i < dim; i++)
1768             {   a[i] = cast(T)i;
1769                 b[i] = cast(T)(i + 7);
1770                 c[i] = cast(T)(i * 2);
1771             }
1772 
1773             c[] = a[] * 6;
1774 
1775             for (int i = 0; i < dim; i++)
1776             {
1777                 if (c[i] != cast(T)(a[i] * 6))
1778                 {
1779                     printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
1780                     assert(0);
1781                 }
1782             }
1783         }
1784     }
1785 }
1786 
1787 
1788 /* ======================================================================== */
1789 
1790 /***********************
1791  * Computes:
1792  *      a[] = b[] * c[]
1793  */
1794 
1795 T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
1796 {
1797     return _arraySliceSliceMulSliceAssign_s(a, c, b);
1798 }
1799 
1800 T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
1801 {
1802     return _arraySliceSliceMulSliceAssign_s(a, c, b);
1803 }
1804 
1805 T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
1806 in
1807 {
1808         assert(a.length == b.length && b.length == c.length);
1809         assert(disjoint(a, b));
1810         assert(disjoint(a, c));
1811         assert(disjoint(b, c));
1812 }
1813 body
1814 {
1815     //printf("_arraySliceSliceMulSliceAssign_s()\n");
1816     auto aptr = a.ptr;
1817     auto aend = aptr + a.length;
1818     auto bptr = b.ptr;
1819     auto cptr = c.ptr;
1820 
1821     version (D_InlineAsm_X86)
1822     {
1823         // SSE2 aligned version is 2515% faster
1824         if (sse2() && a.length >= 16)
1825         {
1826             auto n = aptr + (a.length & ~15);
1827 
1828             if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
1829             {
1830                 asm
1831                 {
1832                     mov ESI, aptr;
1833                     mov EDI, n;
1834                     mov EAX, bptr;
1835                     mov ECX, cptr;
1836 
1837                     align 4;
1838                 startsse2u:
1839                     add ESI, 32;
1840                     movdqu XMM0, [EAX];
1841                     movdqu XMM2, [ECX];
1842                     movdqu XMM1, [EAX+16];
1843                     movdqu XMM3, [ECX+16];
1844                     add EAX, 32;
1845                     add ECX, 32;
1846                     pmullw XMM0, XMM2;
1847                     pmullw XMM1, XMM3;
1848                     movdqu [ESI   -32], XMM0;
1849                     movdqu [ESI+16-32], XMM1;
1850                     cmp ESI, EDI;
1851                     jb startsse2u;
1852 
1853                     mov aptr, ESI;
1854                     mov bptr, EAX;
1855                     mov cptr, ECX;
1856                 }
1857             }
1858             else
1859             {
1860                 asm
1861                 {
1862                     mov ESI, aptr;
1863                     mov EDI, n;
1864                     mov EAX, bptr;
1865                     mov ECX, cptr;
1866 
1867                     align 4;
1868                 startsse2a:
1869                     add ESI, 32;
1870                     movdqa XMM0, [EAX];
1871                     movdqa XMM2, [ECX];
1872                     movdqa XMM1, [EAX+16];
1873                     movdqa XMM3, [ECX+16];
1874                     add EAX, 32;
1875                     add ECX, 32;
1876                     pmullw XMM0, XMM2;
1877                     pmullw XMM1, XMM3;
1878                     movdqa [ESI   -32], XMM0;
1879                     movdqa [ESI+16-32], XMM1;
1880                     cmp ESI, EDI;
1881                     jb startsse2a;
1882 
1883                     mov aptr, ESI;
1884                     mov bptr, EAX;
1885                     mov cptr, ECX;
1886                }
1887             }
1888         }
1889         else
1890         // MMX version is 2515% faster
1891         if (mmx() && a.length >= 8)
1892         {
1893             auto n = aptr + (a.length & ~7);
1894 
1895             asm
1896             {
1897                 mov ESI, aptr;
1898                 mov EDI, n;
1899                 mov EAX, bptr;
1900                 mov ECX, cptr;
1901 
1902                 align 4;
1903             startmmx:
1904                 add ESI, 16;
1905                 movq MM0, [EAX];
1906                 movq MM2, [ECX];
1907                 movq MM1, [EAX+8];
1908                 movq MM3, [ECX+8];
1909                 add EAX, 16;
1910                 add ECX, 16;
1911                 pmullw MM0, MM2;
1912                 pmullw MM1, MM3;
1913                 movq [ESI  -16], MM0;
1914                 movq [ESI+8-16], MM1;
1915                 cmp ESI, EDI;
1916                 jb startmmx;
1917 
1918                 emms;
1919                 mov aptr, ESI;
1920                 mov bptr, EAX;
1921                 mov cptr, ECX;
1922             }
1923         }
1924     }
1925 
1926     while (aptr < aend)
1927         *aptr++ = cast(T)(*bptr++ * *cptr++);
1928 
1929     return a;
1930 }
1931 
1932 unittest
1933 {
1934     printf("_arraySliceSliceMulSliceAssign_s unittest\n");
1935 
1936     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
1937     {
1938         version (log) printf("    cpuid %d\n", cpuid);
1939 
1940         for (int j = 0; j < 2; j++)
1941         {
1942             const int dim = 67;
1943             T[] a = new T[dim + j];     // aligned on 16 byte boundary
1944             a = a[j .. dim + j];        // misalign for second iteration
1945             T[] b = new T[dim + j];
1946             b = b[j .. dim + j];
1947             T[] c = new T[dim + j];
1948             c = c[j .. dim + j];
1949 
1950             for (int i = 0; i < dim; i++)
1951             {   a[i] = cast(T)i;
1952                 b[i] = cast(T)(i + 7);
1953                 c[i] = cast(T)(i * 2);
1954             }
1955 
1956             c[] = a[] * b[];
1957 
1958             for (int i = 0; i < dim; i++)
1959             {
1960                 if (c[i] != cast(T)(a[i] * b[i]))
1961                 {
1962                     printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
1963                     assert(0);
1964                 }
1965             }
1966         }
1967     }
1968 }
1969 
1970 
1971 /* ======================================================================== */
1972 
1973 /***********************
1974  * Computes:
1975  *      a[] *= value
1976  */
1977 
1978 T[] _arrayExpSliceMulass_u(T[] a, T value)
1979 {
1980     return _arrayExpSliceMulass_s(a, value);
1981 }
1982 
1983 T[] _arrayExpSliceMulass_t(T[] a, T value)
1984 {
1985     return _arrayExpSliceMulass_s(a, value);
1986 }
1987 
1988 T[] _arrayExpSliceMulass_s(T[] a, T value)
1989 {
1990     //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
1991     auto aptr = a.ptr;
1992     auto aend = aptr + a.length;
1993 
1994     version (D_InlineAsm_X86)
1995     {
1996         // SSE2 aligned version is 2044% faster
1997         if (sse2() && a.length >= 16)
1998         {
1999             auto n = aptr + (a.length & ~15);
2000 
2001             uint l = cast(ushort) value;
2002             l |= l << 16;
2003 
2004             if (((cast(uint) aptr) & 15) != 0)
2005             {
2006                 asm
2007                 {
2008                     mov ESI, aptr;
2009                     mov EDI, n;
2010                     movd XMM2, l;
2011                     pshufd XMM2, XMM2, 0;
2012 
2013                     align 4;
2014                 startsse2u:
2015                     movdqu XMM0, [ESI];
2016                     movdqu XMM1, [ESI+16];
2017                     add ESI, 32;
2018                     pmullw XMM0, XMM2;
2019                     pmullw XMM1, XMM2;
2020                     movdqu [ESI   -32], XMM0;
2021                     movdqu [ESI+16-32], XMM1;
2022                     cmp ESI, EDI;
2023                     jb startsse2u;
2024 
2025                     mov aptr, ESI;
2026                 }
2027             }
2028             else
2029             {
2030                 asm
2031                 {
2032                     mov ESI, aptr;
2033                     mov EDI, n;
2034                     movd XMM2, l;
2035                     pshufd XMM2, XMM2, 0;
2036 
2037                     align 4;
2038                 startsse2a:
2039                     movdqa XMM0, [ESI];
2040                     movdqa XMM1, [ESI+16];
2041                     add ESI, 32;
2042                     pmullw XMM0, XMM2;
2043                     pmullw XMM1, XMM2;
2044                     movdqa [ESI   -32], XMM0;
2045                     movdqa [ESI+16-32], XMM1;
2046                     cmp ESI, EDI;
2047                     jb startsse2a;
2048 
2049                     mov aptr, ESI;
2050                 }
2051             }
2052         }
2053         else
2054         // MMX version is 2056% faster
2055         if (mmx() && a.length >= 8)
2056         {
2057             auto n = aptr + (a.length & ~7);
2058 
2059             uint l = cast(ushort) value;
2060 
2061             asm
2062             {
2063                 mov ESI, aptr;
2064                 mov EDI, n;
2065                 movd MM2, l;
2066                 pshufw MM2, MM2, 0;
2067 
2068                 align 4;
2069             startmmx:
2070                 movq MM0, [ESI];
2071                 movq MM1, [ESI+8];
2072                 add ESI, 16;
2073                 pmullw MM0, MM2;
2074                 pmullw MM1, MM2;
2075                 movq [ESI  -16], MM0;
2076                 movq [ESI+8-16], MM1;
2077                 cmp ESI, EDI;
2078                 jb startmmx;
2079 
2080                 emms;
2081                 mov aptr, ESI;
2082             }
2083         }
2084     }
2085 
2086     while (aptr < aend)
2087         *aptr++ *= value;
2088 
2089     return a;
2090 }
2091 
2092 unittest
2093 {
2094     printf("_arrayExpSliceMulass_s unittest\n");
2095 
2096     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2097     {
2098         version (log) printf("    cpuid %d\n", cpuid);
2099 
2100         for (int j = 0; j < 2; j++)
2101         {
2102             const int dim = 67;
2103             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2104             a = a[j .. dim + j];        // misalign for second iteration
2105             T[] b = new T[dim + j];
2106             b = b[j .. dim + j];
2107             T[] c = new T[dim + j];
2108             c = c[j .. dim + j];
2109 
2110             for (int i = 0; i < dim; i++)
2111             {   a[i] = cast(T)i;
2112                 b[i] = cast(T)(i + 7);
2113                 c[i] = cast(T)(i * 2);
2114             }
2115 
2116             b[] = a[];
2117             a[] *= 6;
2118 
2119             for (int i = 0; i < dim; i++)
2120             {
2121                 if (a[i] != cast(T)(b[i] * 6))
2122                 {
2123                     printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
2124                     assert(0);
2125                 }
2126             }
2127         }
2128     }
2129 }
2130 
2131 
2132 /* ======================================================================== */
2133 
2134 /***********************
2135  * Computes:
2136  *      a[] *= b[]
2137  */
2138 
2139 T[] _arraySliceSliceMulass_u(T[] a, T[] b)
2140 {
2141     return _arraySliceSliceMulass_s(a, b);
2142 }
2143 
2144 T[] _arraySliceSliceMulass_t(T[] a, T[] b)
2145 {
2146     return _arraySliceSliceMulass_s(a, b);
2147 }
2148 
2149 T[] _arraySliceSliceMulass_s(T[] a, T[] b)
2150 in
2151 {
2152     assert (a.length == b.length);
2153     assert (disjoint(a, b));
2154 }
2155 body
2156 {
2157     //printf("_arraySliceSliceMulass_s()\n");
2158     auto aptr = a.ptr;
2159     auto aend = aptr + a.length;
2160     auto bptr = b.ptr;
2161 
2162     version (D_InlineAsm_X86)
2163     {
2164         // SSE2 aligned version is 2519% faster
2165         if (sse2() && a.length >= 16)
2166         {
2167             auto n = aptr + (a.length & ~15);
2168 
2169             if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
2170             {
2171                 asm
2172                 {
2173                     mov ESI, aptr;
2174                     mov EDI, n;
2175                     mov ECX, bptr;
2176 
2177                     align 4;
2178                 startsse2u:
2179                     movdqu XMM0, [ESI];
2180                     movdqu XMM2, [ECX];
2181                     movdqu XMM1, [ESI+16];
2182                     movdqu XMM3, [ECX+16];
2183                     add ESI, 32;
2184                     add ECX, 32;
2185                     pmullw XMM0, XMM2;
2186                     pmullw XMM1, XMM3;
2187                     movdqu [ESI   -32], XMM0;
2188                     movdqu [ESI+16-32], XMM1;
2189                     cmp ESI, EDI;
2190                     jb startsse2u;
2191 
2192                     mov aptr, ESI;
2193                     mov bptr, ECX;
2194                 }
2195             }
2196             else
2197             {
2198                 asm
2199                 {
2200                     mov ESI, aptr;
2201                     mov EDI, n;
2202                     mov ECX, bptr;
2203 
2204                     align 4;
2205                 startsse2a:
2206                     movdqa XMM0, [ESI];
2207                     movdqa XMM2, [ECX];
2208                     movdqa XMM1, [ESI+16];
2209                     movdqa XMM3, [ECX+16];
2210                     add ESI, 32;
2211                     add ECX, 32;
2212                     pmullw XMM0, XMM2;
2213                     pmullw XMM1, XMM3;
2214                     movdqa [ESI   -32], XMM0;
2215                     movdqa [ESI+16-32], XMM1;
2216                     cmp ESI, EDI;
2217                     jb startsse2a;
2218 
2219                     mov aptr, ESI;
2220                     mov bptr, ECX;
2221                }
2222             }
2223         }
2224         else
2225         // MMX version is 1712% faster
2226         if (mmx() && a.length >= 8)
2227         {
2228             auto n = aptr + (a.length & ~7);
2229 
2230             asm
2231             {
2232                 mov ESI, aptr;
2233                 mov EDI, n;
2234                 mov ECX, bptr;
2235 
2236                 align 4;
2237             startmmx:
2238                 movq MM0, [ESI];
2239                 movq MM2, [ECX];
2240                 movq MM1, [ESI+8];
2241                 movq MM3, [ECX+8];
2242                 add ESI, 16;
2243                 add ECX, 16;
2244                 pmullw MM0, MM2;
2245                 pmullw MM1, MM3;
2246                 movq [ESI  -16], MM0;
2247                 movq [ESI+8-16], MM1;
2248                 cmp ESI, EDI;
2249                 jb startmmx;
2250 
2251                 emms;
2252                 mov aptr, ESI;
2253                 mov bptr, ECX;
2254             }
2255         }
2256     }
2257 
2258     while (aptr < aend)
2259         *aptr++ *= *bptr++;
2260 
2261     return a;
2262 }
2263 
2264 unittest
2265 {
2266     printf("_arraySliceSliceMulass_s unittest\n");
2267 
2268     for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
2269     {
2270         version (log) printf("    cpuid %d\n", cpuid);
2271 
2272         for (int j = 0; j < 2; j++)
2273         {
2274             const int dim = 67;
2275             T[] a = new T[dim + j];     // aligned on 16 byte boundary
2276             a = a[j .. dim + j];        // misalign for second iteration
2277             T[] b = new T[dim + j];
2278             b = b[j .. dim + j];
2279             T[] c = new T[dim + j];
2280             c = c[j .. dim + j];
2281 
2282             for (int i = 0; i < dim; i++)
2283             {   a[i] = cast(T)i;
2284                 b[i] = cast(T)(i + 7);
2285                 c[i] = cast(T)(i * 2);
2286             }
2287 
2288             b[] = a[];
2289             a[] *= c[];
2290 
2291             for (int i = 0; i < dim; i++)
2292             {
2293                 if (a[i] != cast(T)(b[i] * c[i]))
2294                 {
2295                     printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
2296                     assert(0);
2297                 }
2298             }
2299         }
2300     }
2301 }