/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for int array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for dchar, int,
 * and uint ('w', 'i' and 'k' suffixes).
 */

module rt.compiler.gdc.rt.arrayint;

private import CPUid = rt.compiler.util.cpuid;

debug(UnitTest)
{
    private extern(C) int printf(char*,...);
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
    bool sse()      { return cpuid == 2 && CPUid.sse(); }
    bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
}
else
{
    alias CPUid.mmx mmx;
    alias CPUid.sse sse;
    alias CPUid.sse2 sse2;
    alias CPUid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias int T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_i(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_i(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 380% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 298% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM2, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                paddd MM0, MM2;
                paddd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
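        // Generic x86 fallback for short arrays or CPUs without MMX/SSE2:
        // two ints per iteration through EBX/ECX; the scalar loop at the
        // end of the function finishes any odd trailing element.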
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov EDX, value;

                align 4;
            start386:
                add ESI, 8;
                mov EBX, [EAX];
                mov ECX, [EAX+4];
                add EAX, 8;
                add EBX, EDX;
                add ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1710% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 995% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
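            // Four ints per iteration: two movq loads from each source,
            // pairwise paddd, then emms so the x87 tag word is usable
            // again by any following floating point code.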
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                paddd MM0, MM2;
                paddd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

normal:
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_w(T[] a, T value)
{
    return _arrayExpSliceAddass_i(a, value);
}

T[] _arrayExpSliceAddass_k(T[] a, T value)
{
    return _arrayExpSliceAddass_i(a, value);
}

T[] _arrayExpSliceAddass_i(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 83% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 81% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movq MM2, l;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
                paddd MM0, MM2;
                paddd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
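        // In-place x86 fallback: ESI is both source and destination, so
        // each pair is loaded from [ESI] before the pointer advances and
        // written back through the negative displacements.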
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EDX, value;

                align 4;
            start386:
                mov EBX, [ESI];
                mov ECX, [ESI+4];
                add ESI, 8;
                add EBX, EDX;
                add ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_w(T[] a, T[] b)
{
    return _arraySliceSliceAddass_i(a, b);
}

T[] _arraySliceSliceAddass_k(T[] a, T[] b)
{
    return _arraySliceSliceAddass_i(a, b);
}

T[] _arraySliceSliceAddass_i(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 695% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    paddd XMM0, XMM2;
                    paddd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 471% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                paddd MM0, MM2;
                paddd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }
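    // aptr/bptr were advanced past the vectorized portion inside the asm
    // blocks above; the loop below finishes whatever tail the unrolled
    // loops did not cover.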
normal:
    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] += a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] + a[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 400% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
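        // psubd (like paddd) wraps on overflow, matching D's
        // two's-complement int arithmetic, so the SIMD paths and the
        // scalar cleanup loop always produce identical results.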
        else
        // MMX version is 315% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM2, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                add EAX, 16;
                psubd MM0, MM2;
                psubd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov EDX, value;

                align 4;
            start386:
                add ESI, 8;
                mov EBX, [EAX];
                mov ECX, [EAX+4];
                add EAX, 8;
                sub EBX, EDX;
                sub ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_w(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_i(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_k(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_i(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_i(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1812% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2u:
                    add ESI, 32;
                    movdqu XMM2, [EAX];
                    movdqu XMM3, [EAX+16];
                    movdqa XMM0, XMM4;
                    movdqa XMM1, XMM4;
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 4;
                startaddsse2a:
                    add ESI, 32;
                    movdqa XMM2, [EAX];
                    movdqa XMM3, [EAX+16];
                    movdqa XMM0, XMM4;
                    movdqa XMM1, XMM4;
                    add EAX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1077% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
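                // Reverse subtraction (value - b[]): MM4 holds two copies
                // of the scalar; each iteration copies MM4 into MM0/MM1
                // and subtracts the loaded elements from those copies.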
                movq MM4, l;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM2, [EAX];
                movq MM3, [EAX+8];
                movq MM0, MM4;
                movq MM1, MM4;
                add EAX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1721% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2u:
                    add ESI, 32;
                    movdqu XMM0, [EAX];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startsse2a:
                    add ESI, 32;
                    movdqa XMM0, [EAX];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM3, [ECX+16];
                    add EAX, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 1002% faster
        if (mmx() && a.length >= 4)
        {
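            // Round the length down to a multiple of four; the scalar
            // loop after the version block handles the remainder.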
            auto n = aptr + (a.length & ~3);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startmmx:
                add ESI, 16;
                movq MM0, [EAX];
                movq MM2, [ECX];
                movq MM1, [EAX+8];
                movq MM3, [ECX+8];
                add EAX, 16;
                add ECX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_w(T[] a, T value)
{
    return _arrayExpSliceMinass_i(a, value);
}

T[] _arrayExpSliceMinass_k(T[] a, T value)
{
    return _arrayExpSliceMinass_i(a, value);
}

T[] _arrayExpSliceMinass_i(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 81% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            uint l = value;

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    add ESI, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM2, l;
                    pshufd XMM2, XMM2, 0;

                    align 4;
                startaddsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    add ESI, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM2;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 81% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);

            ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movq MM2, l;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                add ESI, 16;
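                // ESI has already been advanced, so the stores below use
                // negative displacements to write back over the two
                // quadwords that were just loaded.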
                psubd MM0, MM2;
                psubd MM1, MM2;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
            }
        }
        else
        if (a.length >= 2)
        {
            auto n = aptr + (a.length & ~1);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EDX, value;

                align 4;
            start386:
                mov EBX, [ESI];
                mov ECX, [ESI+4];
                add ESI, 8;
                sub EBX, EDX;
                sub ECX, EDX;
                mov [ESI  -8], EBX;
                mov [ESI+4-8], ECX;
                cmp ESI, EDI;
                jb start386;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            a[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(c[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, a[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_w(T[] a, T[] b)
{
    return _arraySliceSliceMinass_i(a, b);
}

T[] _arraySliceSliceMinass_k(T[] a, T[] b)
{
    return _arraySliceSliceMinass_i(a, b);
}

T[] _arraySliceSliceMinass_i(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 731% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM2, [ECX];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqu [ESI   -32], XMM0;
                    movdqu [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM2, [ECX];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM3, [ECX+16];
                    add ESI, 32;
                    add ECX, 32;
                    psubd XMM0, XMM2;
                    psubd XMM1, XMM3;
                    movdqa [ESI   -32], XMM0;
                    movdqa [ESI+16-32], XMM1;
                    cmp ESI, EDI;
                    jb startsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 441% faster
        if (mmx() && a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
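            // Mirror image of the a[] += b[] kernel above, with psubd in
            // place of paddd; a[] is updated in place through ESI.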
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 4;
            startmmx:
                movq MM0, [ESI];
                movq MM2, [ECX];
                movq MM1, [ESI+8];
                movq MM3, [ECX+8];
                add ESI, 16;
                add ECX, 16;
                psubd MM0, MM2;
                psubd MM1, MM3;
                movq [ESI  -16], MM0;
                movq [ESI+8-16], MM1;
                cmp ESI, EDI;
                jb startmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];
            c[] -= a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - a[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_w(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_k(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_i(a, value, b);
}

T[] _arraySliceExpMulSliceAssign_i(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (none) // multiplying a pair is not supported by MMX
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 1380% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                uint l = value;

                if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2u:
                        add ESI, 32;
                        movdqu XMM0, [EAX];
                        movdqu XMM1, [EAX+16];
                        add EAX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2a:
                        add ESI, 32;
                        movdqa XMM0, [EAX];
                        movdqa XMM1, [EAX+16];
                        add EAX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
            }
            else
            {
                // MMX version is 1380% faster
                if (mmx() && a.length >= 4)
                {
                    auto n = aptr + (a.length & ~3);
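                    // Duplicate the 32-bit value into both halves of a
                    // 64-bit quantity so one movq fills MM2 with two
                    // copies. Note that pmuludq, even on MM registers,
                    // is an SSE2 instruction, one more reason this
                    // branch stays under version (none).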
                    ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        movq MM2, l;

                        align 4;
                    startmmx:
                        add ESI, 16;
                        movq MM0, [EAX];
                        movq MM1, [EAX+8];
                        add EAX, 16;
                        pmuludq MM0, MM2;   // only multiplies low 32 bits
                        pmuludq MM1, MM2;
                        movq [ESI  -16], MM0;
                        movq [ESI+8-16], MM1;
                        cmp ESI, EDI;
                        jb startmmx;

                        emms;
                        mov aptr, ESI;
                        mov bptr, EAX;
                    }
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %d ?= %d * 6\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_w(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_k(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMulSliceAssign_i(a, c, b);
}

T[] _arraySliceSliceMulSliceAssign_i(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 1407% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        mov ECX, cptr;

                        align 4;
                    startsse2u:
                        add ESI, 32;
                        movdqu XMM0, [EAX];
                        movdqu XMM2, [ECX];
                        movdqu XMM1, [EAX+16];
                        movdqu XMM3, [ECX+16];
                        add EAX, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, EAX;
                        mov cptr, ECX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov EAX, bptr;
                        mov ECX, cptr;

                        align 4;
                    startsse2a:
                        add ESI, 32;
                        movdqa XMM0, [EAX];
                        movdqa XMM2, [ECX];
                        movdqa XMM1, [EAX+16];
                        movdqa XMM3, [ECX+16];
                        add EAX, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, EAX;
                        mov cptr, ECX;
                    }
                }
            }
            else
            // MMX version is 1029% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 4;
                startmmx:
                    add ESI, 16;
                    movq MM0, [EAX];
                    movq MM2, [ECX];
                    movq MM1, [EAX+8];
                    movq MM3, [ECX+8];
                    add EAX, 16;
                    add ECX, 16;
                    pmuludq MM0, MM2;
                    pmuludq MM1, MM3;
                    movq [ESI  -16], MM0;
                    movq [ESI+8-16], MM1;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
    }
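    // With the SIMD paths disabled, the scalar loop below is the only
    // active code path. pmuludq widens lanes 0 and 2 to 64-bit products
    // rather than computing four packed 32-bit products; a packed 32-bit
    // multiply only arrived with SSE4.1's pmulld.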
    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
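/* Note on naming: these entry points encode the expression shape and the
 * element type in the symbol name. "Slice" and "Exp" describe the operand
 * kinds (array slice vs. scalar expression), the middle word is the
 * operator, and the suffix is the element type ('w' dchar, 'i' int,
 * 'k' uint), which is why the _w and _k variants simply forward to the
 * _i implementation.
 */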

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_w(T[] a, T value)
{
    return _arrayExpSliceMulass_i(a, value);
}

T[] _arrayExpSliceMulass_k(T[] a, T value)
{
    return _arrayExpSliceMulass_i(a, value);
}

T[] _arrayExpSliceMulass_i(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_i(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 400% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                uint l = value;

                if (((cast(uint) aptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2u:
                        movdqu XMM0, [ESI];
                        movdqu XMM1, [ESI+16];
                        add ESI, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        movd XMM2, l;
                        pshufd XMM2, XMM2, 0;

                        align 4;
                    startsse2a:
                        movdqa XMM0, [ESI];
                        movdqa XMM1, [ESI+16];
                        add ESI, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM2;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                    }
                }
            }
            else
            // MMX version is 402% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                ulong l = cast(uint) value | (cast(ulong)cast(uint) value << 32);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movq MM2, l;

                    align 4;
                startmmx:
                    movq MM0, [ESI];
                    movq MM1, [ESI+8];
                    add ESI, 16;
                    pmuludq MM0, MM2;
                    pmuludq MM1, MM2;
                    movq [ESI  -16], MM0;
                    movq [ESI+8-16], MM1;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                }
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * 6))
                {
                    printf("[%d]: %d != %d * 6\n", i, a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_w(T[] a, T[] b)
{
    return _arraySliceSliceMulass_i(a, b);
}

T[] _arraySliceSliceMulass_k(T[] a, T[] b)
{
    return _arraySliceSliceMulass_i(a, b);
}

T[] _arraySliceSliceMulass_i(T[] a, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_i()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (none)
    {
        version (D_InlineAsm_X86)
        {
            // SSE2 aligned version is 873% faster
            if (sse2() && a.length >= 8)
            {
                auto n = aptr + (a.length & ~7);

                if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov ECX, bptr;

                        align 4;
                    startsse2u:
                        movdqu XMM0, [ESI];
                        movdqu XMM2, [ECX];
                        movdqu XMM1, [ESI+16];
                        movdqu XMM3, [ECX+16];
                        add ESI, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqu [ESI   -32], XMM0;
                        movdqu [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2u;

                        mov aptr, ESI;
                        mov bptr, ECX;
                    }
                }
                else
                {
                    asm
                    {
                        mov ESI, aptr;
                        mov EDI, n;
                        mov ECX, bptr;

                        align 4;
                    startsse2a:
                        movdqa XMM0, [ESI];
                        movdqa XMM2, [ECX];
                        movdqa XMM1, [ESI+16];
                        movdqa XMM3, [ECX+16];
                        add ESI, 32;
                        add ECX, 32;
                        pmuludq XMM0, XMM2;
                        pmuludq XMM1, XMM3;
                        movdqa [ESI   -32], XMM0;
                        movdqa [ESI+16-32], XMM1;
                        cmp ESI, EDI;
                        jb startsse2a;

                        mov aptr, ESI;
                        mov bptr, ECX;
                    }
                }
            }
/+ BUG: comment out this section until we figure out what is going
   wrong with the invalid pshufd instructions.
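   (Likely cause: pshufd is an SSE2 instruction that only accepts XMM
   registers -- the MMX shuffle is pshufw -- so the two
   "pshufd MM4, MM4, 8" lines below can never assemble.)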

            else
            // MMX version is 573% faster
            if (mmx() && a.length >= 4)
            {
                auto n = aptr + (a.length & ~3);

                asm
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 4;
                startmmx:
                    movq MM0, [ESI];
                    movq MM2, [ECX];
                    movq MM1, [ESI+8];
                    movq MM3, [ECX+8];
                    pxor MM4, MM4;
                    pxor MM5, MM5;
                    punpckldq MM4, MM0;
                    punpckldq MM5, MM2;
                    add ESI, 16;
                    add ECX, 16;
                    pmuludq MM4, MM5;
                    pshufd MM4, MM4, 8;     // ?
                    movq [ESI  -16], MM4;
                    pxor MM4, MM4;
                    pxor MM5, MM5;
                    punpckldq MM4, MM1;
                    punpckldq MM5, MM3;
                    pmuludq MM4, MM5;
                    pshufd MM4, MM4, 8;     // ?
                    movq [ESI+8-16], MM4;
                    cmp ESI, EDI;
                    jb startmmx;

                    emms;
                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
+/
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulass_i unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = a[];
            a[] *= c[];

            for (int i = 0; i < dim; i++)
            {
                if (a[i] != cast(T)(b[i] * c[i]))
                {
                    printf("[%d]: %d != %d * %d\n", i, a[i], b[i], c[i]);
                    assert(0);
                }
            }
        }
    }
}