/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for byte array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for char, byte,
 * and ubyte ('a', 'g' and 'h' suffixes).
 */

module rt.compiler.gdc.rt.arraybyte;

import CPUid = rt.compiler.util.cpuid;

debug(UnitTest)
{
    private extern(C) int printf(char*,...);
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
    bool sse()      { return cpuid == 2 && CPUid.sse(); }
    bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
    // Note: the unittest loops run cpuid 0 .. CPUID_MAX-1, so the amd3dnow
    // variant is never selected; this module has no 3DNow! code paths anyway.
}
else
{
    alias CPUid.mmx mmx;
    alias CPUid.sse sse;
    alias CPUid.sse2 sse2;
    alias CPUid.amd3dnow amd3dnow;
}

//version = log;

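/**********************************************
 * Returns true when slices a and b do not overlap in memory.  The
 * in-contracts below use it to check that these routines only ever see
 * disjoint operands.  A quick illustration (hypothetical values, not part
 * of the module):
 * ---
 * int[] x = new int[8];
 * assert(disjoint(x[0 .. 4], x[4 .. 8]));  // adjacent, but no overlap
 * assert(!disjoint(x[0 .. 5], x[4 .. 8])); // both contain x[4]
 * ---
 */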
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias byte T;

extern (C):

/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] + value
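 *
 * The compiler lowers byte-array vector expressions to these hooks
 * ('a' = char, 'g' = byte, 'h' = ubyte).  A sketch of the lowering for
 * the byte variant (illustrative only):
 * ---
 * byte[] a = new byte[67], b = new byte[67];
 * a[] = b[] + 3;   // roughly _arraySliceExpAddSliceAssign_g(a, 3, b)
 * ---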
 */

T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);   // l now holds value in each of its four bytes

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;   // splat value into all 16 bytes of XMM4

                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);   // value in both bytes of the low word

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;   // splat value into all 8 bytes of MM4

                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There's not enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
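 *
 * Lowered form for the byte variant (sketch; note the argument order of
 * the signature below):
 * ---
 * byte[] r = new byte[67], x = new byte[67], y = new byte[67];
 * r[] = x[] + y[];   // roughly _arraySliceSliceAddSliceAssign_g(r, y, x)
 * ---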
 */

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
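 *
 * Lowered form for the byte variant (sketch):
 * ---
 * byte[] a = new byte[67];
 * a[] += 6;   // roughly _arrayExpSliceAddass_g(a, 6)
 * ---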
 */

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
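 *
 * Lowered form for the byte variant (sketch):
 * ---
 * byte[] a = new byte[67], b = new byte[67];
 * a[] += b[];   // roughly _arraySliceSliceAddass_g(a, b)
 * ---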
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] - value
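 *
 * Lowered form for the byte variant (sketch):
 * ---
 * byte[] a = new byte[67], b = new byte[67];
 * a[] = b[] - 6;   // roughly _arraySliceExpMinSliceAssign_g(a, 6, b)
 * ---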
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat normal 32-bit cpu the same way as we do
         * the SIMD units, with unrolled asm.  There's not enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
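 *
 * Lowered form for the byte variant (sketch; note that value is the last
 * parameter of the signature below):
 * ---
 * byte[] a = new byte[67], b = new byte[67];
 * a[] = 6 - b[];   // roughly _arrayExpSliceMinSliceAssign_g(a, b, 6)
 * ---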
 */

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;   // psubb overwrites its destination, so subtract from fresh copies of the splatted value
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI   -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI   -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI   -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
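 *
 * Lowered form for the byte variant (sketch; note the argument order of
 * the signature below):
 * ---
 * byte[] r = new byte[67], x = new byte[67], y = new byte[67];
 * r[] = x[] - y[];   // roughly _arraySliceSliceMinSliceAssign_g(r, y, x)
 * ---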
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
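 *
 * Lowered form for the byte variant (sketch):
 * ---
 * byte[] a = new byte[67];
 * a[] -= 6;   // roughly _arrayExpSliceMinass_g(a, 6)
 * ---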
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
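 *
 * Lowered form for the byte variant (sketch):
 * ---
 * byte[] a = new byte[67], b = new byte[67];
 * a[] -= b[];   // roughly _arraySliceSliceMinass_g(a, b)
 * ---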
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startsubasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}