/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for short array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for wchar, short,
 * and ushort ('u', 's' and 't' suffixes).
 */

module rt.compiler.gdc.rt.arrayshort;

private import CPUid = rt.compiler.util.cpuid;

debug(UnitTest)
{
    private extern(C) int printf(char*,...);
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
    bool sse()      { return cpuid == 2 && CPUid.sse(); }
    bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
}
else
{
    alias CPUid.mmx mmx;
    alias CPUid.sse sse;
    alias CPUid.sse2 sse2;
    alias CPUid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias short T;

extern (C):
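
// Each operation below comes in three variants whose suffix follows the
// D name mangling for the element type: '_s' = short, '_t' = ushort,
// '_u' = wchar.  Addition, subtraction and multiplication produce the
// same 16-bit result regardless of signedness, so the '_t' and '_u'
// entry points simply forward to the '_s' implementation.
//
// For orientation (a sketch of the lowering, not part of this module):
// an array expression such as
//
//      short[] a, b;
//      a[] = b[] + 3;
//
// is rewritten by the compiler into a call along the lines of
//
//      _arraySliceExpAddSliceAssign_s(a, 3, b);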

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_s(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3343% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3343% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
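
// Implementation note: the scalar operand is replicated across all 16-bit
// lanes before the loop -- for SSE2 by doubling it into a dword
// (l |= l << 16) and splatting with movd/pshufd, for MMX directly with
// pshufw.  The vector loops then process 16 (SSE2) or 8 (MMX) shorts per
// iteration; `n` marks the end of the vectorisable prefix and the scalar
// while-loop mops up the remainder.  (pshufw itself arrived with the SSE
// extensions to MMX, so the MMX path assumes a CPU of that generation.)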

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3777% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2068% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
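
// Implementation note: the movdqa path is taken only when all three
// slices are 16-byte aligned (the OR of the three pointers has no low
// bits set); a single misaligned operand selects the movdqu loop.  Also
// note the parameter order: the function computes a[] = b[] + c[] even
// though the slice for b arrives last.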

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_u(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_t(T[] a, T value)
{
    return _arrayExpSliceAddass_s(a, value);
}

T[] _arrayExpSliceAddass_s(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 832% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM2;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 826% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddw MM0, MM2;
                paddw MM1, MM2;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}
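
// Implementation note: the in-place scalar-operand forms test only `aptr`
// for alignment, since the destination is the sole array touched.  ESI is
// advanced before the stores, which therefore address backwards from the
// updated pointer ([ESI-32], [ESI+16-32]).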

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_u(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_t(T[] a, T[] b)
{
    return _arraySliceSliceAddass_s(a, b);
}

T[] _arraySliceSliceAddass_s(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2085% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    paddw XMM0, XMM2;
                    paddw XMM1, XMM3;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1022% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                paddw MM0, MM2;
                paddw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
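
// Implementation note: every MMX loop in this module ends with `emms`,
// which clears the MMX state so that subsequent x87 floating-point code
// sees a clean register stack; the SSE2 paths need no such epilogue.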

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3695% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3049% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
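
// Implementation note: paddw/psubw wrap modulo 2^16, which matches the
// cast(T) truncation in the scalar fallback, so the vector and scalar
// paths agree even when the arithmetic overflows.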

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_u(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_t(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_s(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_s(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4995% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;

                    align 4;
                startaddsse2u:
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;
                    movd XMM3, l;
                    pshufd XMM3, XMM3, 0;
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqu [ESI-32], XMM2;
                    movdqu [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;

                    align 4;
                startaddsse2a:
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;
                    movd XMM3, l;
                    pshufd XMM3, XMM3, 0;
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubw XMM2, XMM0;
                    psubw XMM3, XMM1;
                    movdqa [ESI-32], XMM2;
                    movdqa [ESI+16-32], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 4562% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;
                movq MM1, MM4;
                add EAX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
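
// Implementation note: psubw computes destination minus source, so for
// `value - b[]` the broadcast constant must sit in the destination
// registers.  The SSE2 loops therefore re-broadcast `l` into XMM2/XMM3
// on every iteration (the subtraction clobbers them), while the MMX loop
// keeps a pristine copy in MM4 and copies it into MM0/MM1 instead.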

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4129% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2018% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
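
// Implementation note: the in-contracts require all operand slices to be
// pairwise disjoint; D array operations do not define a result for
// overlapping slices, and the block-at-a-time stores above would make any
// overlap observable.  The contracts catch violations in debug builds.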

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_u(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_t(T[] a, T value)
{
    return _arrayExpSliceMinass_s(a, value);
}

T[] _arrayExpSliceMinass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 835% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM2;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 835% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                psubw MM0, MM2;
                psubw MM1, MM2;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_u(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_t(T[] a, T[] b)
{
    return _arraySliceSliceMinass_s(a, b);
}

T[] _arraySliceSliceMinass_s(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2121% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    movdqu XMM2, [ECX];
                    movdqu XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    movdqa XMM2, [ECX];
                    movdqa XMM3, [ECX+16];
                    add ECX, 32;
                    psubw XMM0, XMM2;
                    psubw XMM1, XMM3;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1116% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                movq MM2, [ECX];
                movq MM3, [ECX+8];
                add ECX, 16;
                psubw MM0, MM2;
                psubw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb start;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] -= a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_u(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_t(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_s(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_s(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 3733% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 3733% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * value);

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}
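
// Implementation note: pmullw keeps only the low 16 bits of each
// product -- exactly the truncation that cast(T)(*bptr++ * value)
// performs in the scalar loop, so the two paths agree on overflow.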

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_u(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_t(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_s(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_s(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2515% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 2515% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ * *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
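
// Implementation note: unlike the addition loops, the multiply loops
// interleave the loads from both operands ahead of the pointer updates,
// presumably to overlap the loads with the higher-latency pmullw.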

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_u(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_t(T[] a, T value)
{
    return _arrayExpSliceMulass_s(a, value);
}

T[] _arrayExpSliceMulass_s(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_s(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2044% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            uint l = cast(ushort) value;
            l |= l << 16;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM2;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 2056% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = cast(ushort) value;

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM2, l;
                pshufw MM2, MM2, 0;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM2;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_u(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_t(T[] a, T[] b)
{
    return _arraySliceSliceMulass_s(a, b);
}

T[] _arraySliceSliceMulass_s(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_s()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 2519% faster
        if (sse2() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqu [ESI-32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    pmullw XMM0, XMM2;
                    pmullw XMM1, XMM3;
                    movdqa [ESI-32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 1712% faster
        if (mmx() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                pmullw MM0, MM2;
                pmullw MM1, MM3;
                movq [ESI-16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulass_s unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {
                a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}