/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for float array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

module rt.compiler.dmd.rt.arrayfloat;

import CPUid = rt.compiler.util.cpuid;

debug(UnitTest)
{
    private extern(C) int printf(char*,...);

    /* The unit tests step `cpuid` from 0 up to CPUID_MAX - 1 so that each
     * CPU-specific code path is exercised in turn.  Each predicate below
     * answers true only when its own number is selected AND the real CPU
     * supports the feature; with cpuid == 0 none of them is true.
     */
    int cpuid;
    const int CPUID_MAX = 5;

    bool mmx()
    {
        if (cpuid != 1)
            return false;
        return CPUid.mmx();
    }

    bool sse()
    {
        if (cpuid != 2)
            return false;
        return CPUid.sse();
    }

    bool sse2()
    {
        if (cpuid != 3)
            return false;
        return CPUid.sse2();
    }

    bool amd3dnow()
    {
        if (cpuid != 4)
            return false;
        return CPUid.amd3dnow();
    }
}
else
{
    // Outside of unit testing the feature queries go straight to CPUid.
    alias CPUid.mmx mmx;
    alias CPUid.sse sse;
    alias CPUid.sse2 sse2;
    alias CPUid.amd3dnow amd3dnow;
}

//version = log;

/***********************
 * Returns true when the two slices do not overlap in memory, i.e. when
 * one of them ends at or before the point where the other begins.
 */
bool disjoint(T)(T[] a, T[] b)
{
    auto aStop = a.ptr + a.length;
    if (aStop <= b.ptr)
        return true;        // a lies entirely before b

    auto bStop = b.ptr + b.length;
    return bStop <= a.ptr;  // b lies entirely before a
}

alias float T;

extern (C):

/* ======================================================================== */
/* ======================================================================== */

/* template for the case
 *   a[] = b[] ? c[]
 * with some binary operator ?
*/
/* Yields, as a string to be mixed in, the body of a function computing
 *   a[] = b[] opD c[]
 * The mixin site must have slices `a`, `b` and `c` in scope.
 *
 *   opD     = the D operator used by the scalar remainder loop
 *   opSSE   = the packed SSE instruction used when sse() is true and
 *             b.length >= 16 (16 floats per iteration, unaligned loads/stores)
 *   op3DNow = the 3DNow! instruction used when amd3dnow() is true and
 *             b.length >= 8 (8 floats per iteration)
 *
 * Whatever the vector loops leave over is handled element by element.
 * The generated code ends with `return a;`.
 */
private template CodeGenSliceSliceOp(char[] opD, char[] opSSE, char[] op3DNow)
{
    const char[] CodeGenSliceSliceOp = `
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 834% faster
        if (sse() && b.length >= 16)
        {
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ESI, 64;
                ` ~ opSSE ~ ` XMM0, XMM4;
                ` ~ opSSE ~ ` XMM1, XMM5;
                ` ~ opSSE ~ ` XMM2, XMM6;
                ` ~ opSSE ~ ` XMM3, XMM7;
                add ECX, 64;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
        else
        // 3DNow! version is only 13% faster
        if (amd3dnow() && b.length >= 8)
        {
            auto n = aptr + (b.length & ~7);

            asm
            {
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand

                align 4;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                ` ~ op3DNow ~ ` MM0, [ECX];
                ` ~ op3DNow ~ ` MM1, [ECX+8];
                ` ~ op3DNow ~ ` MM2, [ECX+16];
                ` ~ op3DNow ~ ` MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ECX, 32;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ ` ~ opD ~ ` *cptr++;

    return a;`;
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 *
 * Note that the operands arrive in the order (a, c, b).
 * All three slices must have the same length and must not overlap each
 * other (checked by the in-contract).  Returns a.
 */

T[] _arraySliceSliceAddSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    mixin(CodeGenSliceSliceOp!("+", "addps", "pfadd"));
}


unittest
{
    printf("_arraySliceSliceAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 *
 * Note that the operands arrive in the order (a, c, b).
 * All three slices must have the same length and must not overlap each
 * other (checked by the in-contract).  Returns a.
 */

T[] _arraySliceSliceMinSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    mixin(CodeGenSliceSliceOp!("-", "subps", "pfsub"));
}


unittest
{
    printf("_arraySliceSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    // was "%gd": the stray 'd' was printed after the operand
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 *
 * Note that the operands arrive in the order (a, c, b).
 * All three slices must have the same length and must not overlap each
 * other (checked by the in-contract).  Returns a.
 */

T[] _arraySliceSliceMulSliceAssign_f(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    mixin(CodeGenSliceSliceOp!("*", "mulps", "pfmul"));
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */
/* ======================================================================== */

/* template for the case
 *   a[] ?= value
 * with some binary operator ?
*/
/* Yields, as a string to be mixed in, the body of a function computing
 *   a[] opD value          (opD is a compound assignment such as "+=")
 * The mixin site must have a slice `a` and a scalar `value` in scope.
 *
 *   opD     = the D compound-assignment operator used by the scalar loops
 *   opSSE   = the packed SSE instruction; used when sse() is true,
 *             a.length >= 16 and a contains at least 16 floats that start
 *             on a 16-byte boundary.  Elements in front of that boundary
 *             are processed one by one first, then the aligned run is
 *             processed 16 floats per iteration with aligned moves.
 *   op3DNow = the 3DNow! instruction; used when sse() is false or
 *             a.length < 16, and amd3dnow() is true with a.length >= 8
 *             (8 floats per iteration)
 *
 * Whatever is left over is handled element by element.
 * The generated code ends with `return a;`.
 */
private template CodeGenExpSliceOpAssign(char[] opD, char[] opSSE, char[] op3DNow)
{
    const char[] CodeGenExpSliceOpAssign = `
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        if (sse() && a.length >= 16)
        {
            auto aabeg = cast(T*)((cast(uint)aptr + 15) & ~15); // beginning of paragraph-aligned slice of a
            auto aaend = cast(T*)((cast(uint)aend) & ~15);      // end of paragraph-aligned slice of a

            int numAligned = cast(int)(aaend - aabeg);          // how many floats are in the aligned slice?

            // are there at least 16 floats in the paragraph-aligned slice?
            // otherwise we can't do anything with SSE.
            if (numAligned >= 16)
            {
                aaend = aabeg + (numAligned & ~15);     // make sure the slice is actually a multiple of 16 floats long

                // process values up to aligned slice one by one
                while (aptr < aabeg)
                    *aptr++ ` ~ opD ~ ` value;

                // process aligned slice with fast SSE operations
                asm
                {
                    mov ESI, aabeg;
                    mov EDI, aaend;
                    movss XMM4, value;
                    shufps XMM4, XMM4, 0;

                    align 8;
                startsseloopa:
                    movaps XMM0, [ESI];
                    movaps XMM1, [ESI+16];
                    movaps XMM2, [ESI+32];
                    movaps XMM3, [ESI+48];
                    add ESI, 64;
                    ` ~ opSSE ~ ` XMM0, XMM4;
                    ` ~ opSSE ~ ` XMM1, XMM4;
                    ` ~ opSSE ~ ` XMM2, XMM4;
                    ` ~ opSSE ~ ` XMM3, XMM4;
                    movaps [ESI+ 0-64], XMM0;
                    movaps [ESI+16-64], XMM1;
                    movaps [ESI+32-64], XMM2;
                    movaps [ESI+48-64], XMM3;
                    cmp ESI, EDI;
                    jb startsseloopa;
                }
                aptr = aaend;
            }
        }
        else
        // 3DNow! version is 63% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, dword ptr [aptr];
                mov EDI, dword ptr [n];
                movq MM4, qword ptr [v];

                align 8;
            start:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                ` ~ op3DNow ~ ` MM0, MM4;
                ` ~ op3DNow ~ ` MM1, MM4;
                ` ~ op3DNow ~ ` MM2, MM4;
                ` ~ op3DNow ~ ` MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                cmp ESI, EDI;
                jb start;

                emms;
                mov dword ptr [aptr], ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ ` ~ opD ~ ` value;

    return a;`;
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 *
 * Modifies a in place and returns it.
 */

T[] _arrayExpSliceAddass_f(T[] a, T value)
{
    mixin(CodeGenExpSliceOpAssign!("+=", "addps", "pfadd"));
}

unittest
{
    printf("_arrayExpSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 *
 * Modifies a in place and returns it.
 */

T[] _arrayExpSliceMinass_f(T[] a, T value)
{
    mixin(CodeGenExpSliceOpAssign!("-=", "subps", "pfsub"));
}

unittest
{
    // banner previously misspelled the function name as "_arrayExpSliceminass_f"
    printf("_arrayExpSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 *
 * Modifies a in place and returns it.
 */

T[] _arrayExpSliceMulass_f(T[] a, T value)
{
    mixin(CodeGenExpSliceOpAssign!("*=", "mulps", "pfmul"));
}

unittest
{
    printf("_arrayExpSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] /= value
 *
 * Implemented as a multiplication by (1f / value).
 * NOTE(review): for divisors whose reciprocal is not exactly representable
 * this may round differently from a true per-element division; the unit
 * test below only divides by 8.  Confirm this is acceptable to callers.
 */

T[] _arrayExpSliceDivass_f(T[] a, T value)
{
    return _arrayExpSliceMulass_f(a, 1f / value);
}

unittest
{
    printf("_arrayExpSliceDivass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] /= 8;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */
/* ======================================================================== */

/* template for the case
 *   a[] = b[] ? value
 * with some binary operator ?
*/
/* Yields, as a string to be mixed in, the body of a function computing
 *   a[] = b[] opD value
 * The mixin site must have slices `a`, `b` and a scalar `value` in scope.
 *
 *   opD     = the D operator used by the scalar remainder loop
 *   opSSE   = the packed SSE instruction used when sse() is true and
 *             a.length >= 16 (16 floats per iteration, unaligned loads/stores)
 *   op3DNow = the 3DNow! instruction used when amd3dnow() is true and
 *             a.length >= 8 (8 floats per iteration)
 *
 * Whatever the vector loops leave over is handled element by element.
 * The generated code ends with `return a;`.
 */
private template CodeGenSliceExpOp(char[] opD, char[] opSSE, char[] op3DNow)
{
    const char[] CodeGenSliceExpOp = `
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 665% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;

                align 8;
            startsseloop:
                add ESI, 64;
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                ` ~ opSSE ~ ` XMM0, XMM4;
                ` ~ opSSE ~ ` XMM1, XMM4;
                ` ~ opSSE ~ ` XMM2, XMM4;
                ` ~ opSSE ~ ` XMM3, XMM4;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 69% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                ` ~ op3DNow ~ ` MM0, MM4;
                ` ~ op3DNow ~ ` MM1, MM4;
                ` ~ op3DNow ~ ` MM2, MM4;
                ` ~ op3DNow ~ ` MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ ` ~ opD ~ ` value;

    return a;`;
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Returns a.
 */

T[] _arraySliceExpAddSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    mixin(CodeGenSliceExpOp!("+", "addps", "pfadd"));
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Returns a.
 */

T[] _arraySliceExpMinSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    mixin(CodeGenSliceExpOp!("-", "subps", "pfsub"));
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Returns a.
 */

T[] _arraySliceExpMulSliceAssign_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    mixin(CodeGenSliceExpOp!("*", "mulps", "pfmul"));
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] / value
 *
 * Implemented as a[] = b[] * (1f / value); the length and overlap checks
 * are those of _arraySliceExpMulSliceAssign_f.
 * NOTE(review): for divisors whose reciprocal is not exactly representable
 * this may round differently from a true per-element division; the unit
 * test below only divides by 8.  Confirm this is acceptable to callers.
 */

T[] _arraySliceExpDivSliceAssign_f(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAssign_f(a, 1f/value, b);
}

unittest
{
    printf("_arraySliceExpDivSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] / 8;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */
/* ======================================================================== */

/* Yields, as a string to be mixed in, the body of a function computing
 *   a[] opD b[]            (opD is a compound assignment such as "+=")
 * The mixin site must have slices `a` and `b` in scope.
 *
 *   opD     = the D compound-assignment operator used by the remainder loop
 *   opSSE   = the packed SSE instruction used when sse() is true and
 *             a.length >= 16 (16 floats per iteration, unaligned loads/stores)
 *   op3DNow = the 3DNow! instruction used when amd3dnow() is true and
 *             a.length >= 8 (8 floats per iteration)
 *
 * Whatever the vector loops leave over is handled element by element.
 * The generated code ends with `return a;`.
 */
private template CodeGenSliceOpAssign(char[] opD, char[] opSSE, char[] op3DNow)
{
    const char[] CodeGenSliceOpAssign = `
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 468% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n; // end comparison

                align 8;
            startsseloopb:
                movups XMM0, [ESI];
                movups XMM1, [ESI+16];
                movups XMM2, [ESI+32];
                movups XMM3, [ESI+48];
                add ESI, 64;
                movups XMM4, [ECX];
                movups XMM5, [ECX+16];
                movups XMM6, [ECX+32];
                movups XMM7, [ECX+48];
                add ECX, 64;
                ` ~ opSSE ~ ` XMM0, XMM4;
                ` ~ opSSE ~ ` XMM1, XMM5;
                ` ~ opSSE ~ ` XMM2, XMM6;
                ` ~ opSSE ~ ` XMM3, XMM7;
                movups [ESI+ 0-64], XMM0;
                movups [ESI+16-64], XMM1;
                movups [ESI+32-64], XMM2;
                movups [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
        else
        // 3DNow! version is 57% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            asm
            {
                mov ESI, dword ptr [aptr]; // destination operand
                mov EDI, dword ptr [n];    // end comparison
                mov ECX, dword ptr [bptr]; // right operand

                align 4;
            start3dnow:
                movq MM0, [ESI];
                movq MM1, [ESI+8];
                movq MM2, [ESI+16];
                movq MM3, [ESI+24];
                ` ~ op3DNow ~ ` MM0, [ECX];
                ` ~ op3DNow ~ ` MM1, [ECX+8];
                ` ~ op3DNow ~ ` MM2, [ECX+16];
                ` ~ op3DNow ~ ` MM3, [ECX+24];
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add ECX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov dword ptr [aptr], ESI;
                mov dword ptr [bptr], ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ ` ~ opD ~ ` *bptr++;

    return a;`;
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Modifies a in place and returns it.
 */

T[] _arraySliceSliceAddass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    mixin(CodeGenSliceOpAssign!("+=", "addps", "pfadd"));
}

unittest
{
    printf("_arraySliceSliceAddass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Modifies a in place and returns it.
 */

T[] _arraySliceSliceMinass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    mixin(CodeGenSliceOpAssign!("-=", "subps", "pfsub"));
}

unittest
{
    // This test used to be a copy of the scalar "c[] -= 6" test (wrong
    // banner included), so the slice-slice routine was never exercised.
    printf("_arraySliceSliceMinass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Modifies a in place and returns it.
 */

T[] _arraySliceSliceMulass_f(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    mixin(CodeGenSliceOpAssign!("*=", "mulps", "pfmul"));
}

unittest
{
    // This test used to be a copy of the scalar "c[] *= 6" test (wrong
    // banner included), so the slice-slice routine was never exercised.
    printf("_arraySliceSliceMulass_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];      // keep a copy of the input for the check below
            c[] *= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */
/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Returns a.
 *
 * Uses an SSE loop (16 floats per iteration) when sse() is true and
 * a.length >= 16, otherwise a 3DNow! loop (8 floats per iteration) when
 * amd3dnow() is true and a.length >= 8; the remainder is done one element
 * at a time.
 */

T[] _arrayExpSliceMinSliceAssign_f(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_f()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE version is 690% faster
        if (sse() && a.length >= 16)
        {
            auto n = aptr + (a.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movss XMM4, value;
                shufps XMM4, XMM4, 0;       // broadcast value to all four lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movaps XMM5, XMM4;          // subps overwrites its destination,
                movaps XMM6, XMM4;          // so work on copies of the broadcast value
                movups XMM0, [EAX];
                movups XMM1, [EAX+16];
                movups XMM2, [EAX+32];
                movups XMM3, [EAX+48];
                add EAX, 64;
                subps XMM5, XMM0;
                subps XMM6, XMM1;
                movups [ESI+ 0-64], XMM5;
                movups [ESI+16-64], XMM6;
                movaps XMM5, XMM4;
                movaps XMM6, XMM4;
                subps XMM5, XMM2;
                subps XMM6, XMM3;
                movups [ESI+32-64], XMM5;
                movups [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
        else
        // 3DNow! version is 67% faster
        if (amd3dnow() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // duplicate the 32-bit pattern of value into both halves of a qword
            ulong w = *cast(uint *) &value;
            ulong v = w | (w << 32L);

            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                mov EAX, bptr;
                movq MM4, qword ptr [v];

                align 8;
            start3dnow:
                movq MM0, [EAX];
                movq MM1, [EAX+8];
                movq MM2, [EAX+16];
                movq MM3, [EAX+24];
                pfsubr MM0, MM4;
                pfsubr MM1, MM4;
                pfsubr MM2, MM4;
                pfsubr MM3, MM4;
                movq [ESI], MM0;
                movq [ESI+8], MM1;
                movq [ESI+16], MM2;
                movq [ESI+24], MM3;
                add ESI, 32;
                add EAX, 32;
                cmp ESI, EDI;
                jb start3dnow;

                emms;
                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_f unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[] * value
 *
 * Implemented as a[] += b[] * (-value); the length and overlap checks are
 * those of _arraySliceExpMulSliceAddass_f.
 */

T[] _arraySliceExpMulSliceMinass_f(T[] a, T value, T[] b)
{
    return _arraySliceExpMulSliceAddass_f(a, -value, b);
}

/***********************
 * Computes:
 *      a[] += b[] * value
 *
 * a and b must have the same length and must not overlap (checked by the
 * in-contract).  Modifies a in place and returns it.  This routine has no
 * vectorised path; every element is processed by the scalar loop.
 */

T[] _arraySliceExpMulSliceAddass_f(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    // Handle remainder
    while (aptr < aend)
        *aptr++ += *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAddass_f unittest\n");

    cpuid = 1;
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 1; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j];     // aligned on 16 byte boundary
            a = a[j .. dim + j];        // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            b[] = c[];      // keep a copy of the input for the check below
            c[] += a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                if (c[i] != cast(T)(b[i] + a[i] * 6))
                {
                    printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}