/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for byte array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

/* Contains SSE2 and MMX versions of certain operations for char, byte,
 * and ubyte ('a', 'g' and 'h' suffixes).
 */

module rt.compiler.gdc.rt.arraybyte;

import CPUid = rt.compiler.util.cpuid;

debug(UnitTest)
{
    private extern(C) int printf(char*,...);
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 4;
    bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
    bool sse()      { return cpuid == 2 && CPUid.sse(); }
    bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
}
else
{
    alias CPUid.mmx mmx;
    alias CPUid.sse sse;
    alias CPUid.sse2 sse2;
    alias CPUid.amd3dnow amd3dnow;
}

//version = log;

bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

alias byte T;

extern (C):

/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpAddSliceAssign_g(a, value, b);
}

T[] _arraySliceExpAddSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1088% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

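            // l now holds value replicated into all four bytes (e.g. 0x05 becomes
            // 0x05050505); movd + pshufd below broadcast it so every byte lane of
            // XMM4 contains value.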
            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1000% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startaddmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat a normal 32-bit CPU the same way as we do
         * the SIMD units, with unrolled asm.  There aren't enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {

            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startadd386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                add BL, CL;
                add BH, CL;
                add DL, CL;
                add DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startadd386;

                mov aptr, ESI;
                mov bptr, EAX;
            }

        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + value);

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceAddSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceAddSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceAddSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5739% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

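            // OR-ing the pointers lets one test check the 16-byte alignment of all
            // operands at once; only when every pointer is aligned is the movdqa
            // (aligned load/store) path safe.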
            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                version (log) printf("\tsse2 unaligned\n");
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                version (log) printf("\tsse2 aligned\n");
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startaddlsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddlsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            version (log) printf("\tmmx\n");
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 4;
            startaddlmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddlmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

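    // Scalar tail: finish any elements left over after the last full SIMD block.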
    version (log) if (aptr < aend) printf("\tbase\n");
    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ + *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceAddSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_a(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_h(T[] a, T value)
{
    return _arrayExpSliceAddass_g(a, value);
}

T[] _arrayExpSliceAddass_g(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1578% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startaddasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM4;
                    paddb XMM2, XMM4;
                    paddb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1721% faster
        if (mmx() && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startaddassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                paddb MM0, MM4;
                paddb MM1, MM4;
                paddb MM2, MM4;
                paddb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddassmmx;

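                // emms clears the MMX state so that any x87 floating-point code
                // executed afterwards sees a clean FPU register stack.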
                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %d != %d + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_a(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_h(T[] a, T[] b)
{
    return _arraySliceSliceAddass_g(a, b);
}

T[] _arraySliceSliceAddass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4727% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startaddasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    paddb XMM0, XMM4;
                    paddb XMM1, XMM5;
                    paddb XMM2, XMM6;
                    paddb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startaddasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3059% faster
        if (mmx() && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startaddasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                paddb MM0, MM4;
                paddb MM1, MM5;
                paddb MM2, MM6;
                paddb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startaddasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

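// cpuid is stepped through every value so each implementation path (scalar, MMX,
// SSE, SSE2) gets exercised; the j loop repeats the test with deliberately
// misaligned slices to cover the unaligned code paths.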
unittest
{
    printf("_arraySliceSliceAddass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %d != %d + %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */


/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_a(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_h(T[] a, T value, T[] b)
{
    return _arraySliceExpMinSliceAssign_g(a, value, b);
}

T[] _arraySliceExpMinSliceAssign_g(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1189% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 1079% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        /* trying to be fair and treat a normal 32-bit CPU the same way as we do
         * the SIMD units, with unrolled asm.  There aren't enough registers,
         * really.
         */
        else
        if (a.length >= 4)
        {
            auto n = aptr + (a.length & ~3);
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov CL, value;

                align 4;
            startsub386:
                add ESI, 4;
                mov DX, [EAX];
                mov BX, [EAX+2];
                add EAX, 4;
                sub BL, CL;
                sub BH, CL;
                sub DL, CL;
                sub DH, CL;
                mov [ESI   -4], DX;
                mov [ESI+2 -4], BX;
                cmp ESI, EDI;
                jb startsub386;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - value);

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = b[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(b[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_a(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_h(T[] a, T[] b, T value)
{
    return _arrayExpSliceMinSliceAssign_g(a, b, value);
}

T[] _arrayExpSliceMinSliceAssign_g(T[] a, T[] b, T value)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 8748% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

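                    // psubb is destructive (dest -= src), so the splatted value in
                    // XMM4 is copied into XMM5/XMM6 each round and the loaded bytes
                    // are subtracted from those copies, giving value - b[].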
                    align 8;
                startsubrsse2u:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqu [ESI   -64], XMM5;
                    movdqu [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqu [ESI+32-64], XMM5;
                    movdqu [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubrsse2a:
                    add ESI, 64;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    psubb XMM5, XMM0;
                    psubb XMM6, XMM1;
                    movdqa [ESI   -64], XMM5;
                    movdqa [ESI+16-64], XMM6;
                    movdqa XMM5, XMM4;
                    movdqa XMM6, XMM4;
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    psubb XMM5, XMM2;
                    psubb XMM6, XMM3;
                    movdqa [ESI+32-64], XMM5;
                    movdqa [ESI+48-64], XMM6;
                    cmp ESI, EDI;
                    jb startsubrsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                }
            }
        }
        else
        // MMX version is 7397% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 4;
            startsubrmmx:
                add ESI, 32;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                psubb MM5, MM0;
                psubb MM6, MM1;
                movq [ESI   -32], MM5;
                movq [ESI+8 -32], MM6;
                movq MM5, MM4;
                movq MM6, MM4;
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                psubb MM5, MM2;
                psubb MM6, MM3;
                movq [ESI+16-32], MM5;
                movq [ESI+24-32], MM6;
                cmp ESI, EDI;
                jb startsubrmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }

    }

    while (aptr < aend)
        *aptr++ = cast(T)(value - *bptr++);

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] = 6 - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - b[i]))
                {
                    printf("[%d]: %d != 6 - %d\n", i, c[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_a(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_h(T[] a, T[] c, T[] b)
{
    return _arraySliceSliceMinSliceAssign_g(a, c, b);
}

T[] _arraySliceSliceMinSliceAssign_g(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 5756% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr | cast(uint) cptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2u:
                    add ESI, 64;
                    movdqu XMM0, [EAX];
                    movdqu XMM1, [EAX+16];
                    movdqu XMM2, [EAX+32];
                    movdqu XMM3, [EAX+48];
                    add EAX, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2u;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov EAX, bptr;
                    mov ECX, cptr;

                    align 8;
                startsublsse2a:
                    add ESI, 64;
                    movdqa XMM0, [EAX];
                    movdqa XMM1, [EAX+16];
                    movdqa XMM2, [EAX+32];
                    movdqa XMM3, [EAX+48];
                    add EAX, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsublsse2a;

                    mov aptr, ESI;
                    mov bptr, EAX;
                    mov cptr, ECX;
                }
            }
        }
        else
        // MMX version is 4428% faster
        if (mmx() && a.length >= 32)
        {
            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                mov ECX, cptr;

                align 8;
            startsublmmx:
                add ESI, 32;
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                add EAX, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsublmmx;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = cast(T)(*bptr++ - *cptr++);

    return a;
}

unittest
{
    printf("_arraySliceSliceMinSliceAssign_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_a(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_h(T[] a, T value)
{
    return _arrayExpSliceMinass_g(a, value);
}

T[] _arrayExpSliceMinass_g(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_g(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 1577% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            uint l = cast(ubyte) value;
            l |= (l << 8);
            l |= (l << 16);

            if (((cast(uint) aptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2u;

                    mov aptr, ESI;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    movd XMM4, l;
                    pshufd XMM4, XMM4, 0;

                    align 8;
                startsubasssse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM4;
                    psubb XMM2, XMM4;
                    psubb XMM3, XMM4;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasssse2a;

                    mov aptr, ESI;
                }
            }
        }
        else
        // MMX version is 1577% faster
        if (mmx() && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            uint l = cast(ubyte) value;
            l |= (l << 8);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movd MM4, l;
                pshufw MM4, MM4, 0;

                align 8;
            startsubassmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                psubb MM0, MM4;
                psubb MM1, MM4;
                psubb MM2, MM4;
                psubb MM3, MM4;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubassmmx;

                emms;
                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %d != %d - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_a(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_h(T[] a, T[] b)
{
    return _arraySliceSliceMinass_g(a, b);
}

T[] _arraySliceSliceMinass_g(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_g()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 aligned version is 4800% faster
        if (sse2() && a.length >= 64)
        {
            auto n = aptr + (a.length & ~63);

            if (((cast(uint) aptr | cast(uint) bptr) & 15) != 0)
            {
                asm // unaligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2u:
                    movdqu XMM0, [ESI];
                    movdqu XMM1, [ESI+16];
                    movdqu XMM2, [ESI+32];
                    movdqu XMM3, [ESI+48];
                    add ESI, 64;
                    movdqu XMM4, [ECX];
                    movdqu XMM5, [ECX+16];
                    movdqu XMM6, [ECX+32];
                    movdqu XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqu [ESI   -64], XMM0;
                    movdqu [ESI+16-64], XMM1;
                    movdqu [ESI+32-64], XMM2;
                    movdqu [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2u;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
            else
            {
                asm // aligned case
                {
                    mov ESI, aptr;
                    mov EDI, n;
                    mov ECX, bptr;

                    align 8;
                startsubasslsse2a:
                    movdqa XMM0, [ESI];
                    movdqa XMM1, [ESI+16];
                    movdqa XMM2, [ESI+32];
                    movdqa XMM3, [ESI+48];
                    add ESI, 64;
                    movdqa XMM4, [ECX];
                    movdqa XMM5, [ECX+16];
                    movdqa XMM6, [ECX+32];
                    movdqa XMM7, [ECX+48];
                    add ECX, 64;
                    psubb XMM0, XMM4;
                    psubb XMM1, XMM5;
                    psubb XMM2, XMM6;
                    psubb XMM3, XMM7;
                    movdqa [ESI   -64], XMM0;
                    movdqa [ESI+16-64], XMM1;
                    movdqa [ESI+32-64], XMM2;
                    movdqa [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsubasslsse2a;

                    mov aptr, ESI;
                    mov bptr, ECX;
                }
            }
        }
        else
        // MMX version is 3107% faster
        if (mmx() && a.length >= 32)
        {

            auto n = aptr + (a.length & ~31);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov ECX, bptr;

                align 8;
            startsubasslmmx:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                add ESI, 32;
                movq MM4, [ECX];
                movq MM5, [ECX+8];
                movq MM6, [ECX+16];
                movq MM7, [ECX+24];
                add ECX, 32;
                psubb MM0, MM4;
                psubb MM1, MM5;
                psubb MM2, MM6;
                psubb MM3, MM7;
                movq [ESI   -32], MM0;
                movq [ESI+8 -32], MM1;
                movq [ESI+16-32], MM2;
                movq [ESI+24-32], MM3;
                cmp ESI, EDI;
                jb startsubasslmmx;

                emms;
                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMinass_g unittest\n");

    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf("    cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %d != %d - %d\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}
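
/* Illustrative note (not part of the original module): with T = byte, ordinary
 * array expressions are what end up invoking the routines above, e.g.
 *
 *      byte[] x = new byte[100], y = new byte[100];
 *      x[] = y[] + 3;      // a[] = b[] + value  -> _arraySliceExpAddSliceAssign_g
 *      x[] -= 5;           // a[] -= value       -> _arrayExpSliceMinass_g
 *
 * The exact lowering is the compiler's business; the mapping shown here simply
 * follows the "Computes:" comment documented above each function.
 */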