/***************************
 * D programming language http://www.digitalmars.com/d/
 * Runtime support for double array operations.
 * Based on code originally written by Burton Radons.
 * Placed in public domain.
 */

module rt.compiler.gdc.rt.arraydouble;

import CPUid = rt.compiler.util.cpuid;

debug(UnitTest)
{
    private extern(C) int printf(char*,...);
    /* This is so unit tests will test every CPU variant
     */
    int cpuid;
    const int CPUID_MAX = 5;
    bool mmx()      { return cpuid == 1 && CPUid.mmx(); }
    bool sse()      { return cpuid == 2 && CPUid.sse(); }
    bool sse2()     { return cpuid == 3 && CPUid.sse2(); }
    bool amd3dnow() { return cpuid == 4 && CPUid.amd3dnow(); }
}
else
{
    alias CPUid.mmx mmx;
    alias CPUid.sse sse;
    alias CPUid.sse2 sse2;
    alias CPUid.amd3dnow amd3dnow;
}

//version = log;

/// Returns true when slices a and b do not overlap in memory.
bool disjoint(T)(T[] a, T[] b)
{
    return (a.ptr + a.length <= b.ptr || b.ptr + b.length <= a.ptr);
}

/* Performance figures measured by Burton Radons
 */

alias double T;

extern (C):

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + c[]
 */

T[] _arraySliceSliceAddSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 333% faster
        if (sse2() && b.length >= 16)
        {
            // Vectorize a multiple of 16 elements; the loop handles
            // 8 doubles (64 bytes) per iteration.
            auto n = aptr + (b.length & ~15);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ + *cptr++;

    return a;
}


unittest
{
    printf("_arraySliceSliceAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - c[]
 */

T[] _arraySliceSliceMinSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 324% faster
        if (sse2() && b.length >= 8)
        {
            // Vectorize a multiple of 8 elements (64 bytes per iteration).
            auto n = aptr + (b.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    // Handle remainder
    while (aptr < aend)
        *aptr++ = *bptr++ - *cptr++;

    return a;
}


unittest
{
    printf("_arraySliceSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] + value
 */

T[] _arraySliceExpAddSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpAddSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ + value;

    return a;
}

unittest
{
    printf("_arraySliceExpAddSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] + 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += value
 */

T[] _arrayExpSliceAddass_d(T[] a, T value)
{
    //printf("_arrayExpSliceAddass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 114% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: the original computed the loop bound by masking the
            // end ADDRESS down to 8 bytes (cast(T*)((cast(uint)aend) & ~7)),
            // but the loop below advances 64 bytes (8 doubles) per iteration,
            // so it could write up to 56 bytes past the array.  Round the
            // element count down to a multiple of 8 instead, exactly as the
            // other routines in this module do.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM4;
                addpd XMM2, XMM4;
                addpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += value;

    return a;
}

unittest
{
    printf("_arrayExpSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + 6))
                {
                    printf("[%d]: %g != %g + 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] += b[]
 */

T[] _arraySliceSliceAddass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceAddass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                addpd XMM0, XMM4;
                addpd XMM1, XMM5;
                addpd XMM2, XMM6;
                addpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ += *bptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceAddass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] += b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] + b[i]))
                {
                    printf("[%d]: %g != %g + %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] - value
 */

T[] _arraySliceExpMinSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 305% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ - value;

    return a;
}

unittest
{
    printf("_arraySliceExpMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] - 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = value - b[]
 */

T[] _arrayExpSliceMinSliceAssign_d(T[] a, T[] b, T value)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arrayExpSliceMinSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 66% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                // subtraction is not commutative, so copy the broadcast
                // value first and subtract the loaded data from the copies
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                subpd XMM5, XMM0;
                subpd XMM6, XMM1;
                movupd [ESI+ 0-64], XMM5;
                movupd [ESI+16-64], XMM6;
                movapd XMM5, XMM4;
                movapd XMM6, XMM4;
                subpd XMM5, XMM2;
                subpd XMM6, XMM3;
                movupd [ESI+32-64], XMM5;
                movupd [ESI+48-64], XMM6;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = value - *bptr++;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = 6 - a[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(6 - a[i]))
                {
                    printf("[%d]: %g != 6 - %g\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= value
 */

T[] _arrayExpSliceMinass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMinass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 115% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: same loop-bound defect as _arrayExpSliceAddass_d —
            // the original masked the end address to 8 bytes while the loop
            // strides 64 bytes, overrunning the array.  Round the element
            // count down to a multiple of 8 instead.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM4;
                subpd XMM2, XMM4;
                subpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - 6))
                {
                    printf("[%d]: %g != %g - 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[]
 */

T[] _arraySliceSliceMinass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMinass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 183% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                subpd XMM0, XMM4;
                subpd XMM1, XMM5;
                subpd XMM2, XMM6;
                subpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ -= *bptr++;

    return a;
}

unittest
{
    // BUG FIX: this unit test printed the wrong routine name and exercised
    // only the scalar form (c[] -= 6), never the slice-slice routine it
    // belongs to.  Test c[] -= b[] instead.
    printf("_arraySliceSliceMinass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] -= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] - b[i]))
                {
                    printf("[%d]: %g != %g - %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * value
 */

T[] _arraySliceExpMulSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 304% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] * c[]
 */

T[] _arraySliceSliceMulSliceAssign_d(T[] a, T[] c, T[] b)
in
{
    assert(a.length == b.length && b.length == c.length);
    assert(disjoint(a, b));
    assert(disjoint(a, c));
    assert(disjoint(b, c));
}
body
{
    //printf("_arraySliceSliceMulSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;
    auto cptr = c.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 329% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr; // left operand
                mov ECX, cptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                add ECX, 64;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, EAX;
                mov cptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ = *bptr++ * *cptr++;

    return a;
}

unittest
{
    printf("_arraySliceSliceMulSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] * b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= value
 */

T[] _arrayExpSliceMulass_d(T[] a, T value)
{
    //printf("_arrayExpSliceMulass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 109% faster
        if (sse2() && a.length >= 8)
        {
            // BUG FIX: same loop-bound defect as _arrayExpSliceAddass_d —
            // the original masked the end address to 8 bytes while the loop
            // strides 64 bytes, overrunning the array.  Round the element
            // count down to a multiple of 8 instead.
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, value;
                shufpd XMM4, XMM4, 0;   // broadcast value into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= value;

    return a;
}

unittest
{
    printf("_arrayExpSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] *= 6;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * 6))
                {
                    printf("[%d]: %g != %g * 6\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] *= b[]
 */

T[] _arraySliceSliceMulass_d(T[] a, T[] b)
in
{
    assert (a.length == b.length);
    assert (disjoint(a, b));
}
body
{
    //printf("_arraySliceSliceMulass_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 205% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ECX, bptr; // right operand
                mov ESI, aptr; // destination operand
                mov EDI, n;    // end comparison

                align 8;
            startsseloopb:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                movupd XMM4, [ECX];
                movupd XMM5, [ECX+16];
                movupd XMM6, [ECX+32];
                movupd XMM7, [ECX+48];
                add ECX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM5;
                mulpd XMM2, XMM6;
                mulpd XMM3, XMM7;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopb;

                mov aptr, ESI;
                mov bptr, ECX;
            }
        }
    }

    while (aptr < aend)
        *aptr++ *= *bptr++;

    return a;
}

unittest
{
    // BUG FIX: this unit test printed the wrong routine name and exercised
    // only the scalar form (c[] *= 6), never the slice-slice routine it
    // belongs to.  Test c[] *= b[] instead.
    printf("_arraySliceSliceMulass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] *= b[];

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] * b[i]))
                {
                    printf("[%d]: %g != %g * %g\n", i, c[i], a[i], b[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] = b[] / value
 */

T[] _arraySliceExpDivSliceAssign_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    //printf("_arraySliceExpDivSliceAssign_d()\n");
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 299% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov EAX, bptr;
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast reciprocal into both lanes

                align 8;
            startsseloop:
                add ESI, 64;
                movupd XMM0, [EAX];
                movupd XMM1, [EAX+16];
                movupd XMM2, [EAX+32];
                movupd XMM3, [EAX+48];
                add EAX, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloop;

                mov aptr, ESI;
                mov bptr, EAX;
            }
        }
    }

    while (aptr < aend)
    {
        *aptr++ = *bptr++ / value;
        //*aptr++ = *bptr++ * recip;
    }

    return a;
}

unittest
{
    printf("_arraySliceExpDivSliceAssign_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            c[] = a[] / 8;

            for (int i = 0; i < dim; i++)
            {
                //printf("[%d]: %g ?= %g / 8\n", i, c[i], a[i]);
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}

/* ======================================================================== */

/***********************
 * Computes:
 *      a[] /= value
 */

T[] _arrayExpSliceDivass_d(T[] a, T value)
{
    //printf("_arrayExpSliceDivass_d(a.length = %d, value = %Lg)\n", a.length, cast(real)value);
    auto aptr = a.ptr;
    auto aend = aptr + a.length;

    /* Multiplying by the reciprocal is faster, but does
     * not produce as accurate an answer.
     */
    T recip = cast(T)1 / value;

    version (D_InlineAsm_X86)
    {
        // SSE2 version is 65% faster
        if (sse2() && a.length >= 8)
        {
            auto n = aptr + (a.length & ~7);

            // Unaligned case
            asm
            {
                mov ESI, aptr;
                mov EDI, n;
                movsd XMM4, recip;
                //movsd XMM4, value
                //rcpsd XMM4, XMM4
                shufpd XMM4, XMM4, 0;   // broadcast reciprocal into both lanes

                align 8;
            startsseloopa:
                movupd XMM0, [ESI];
                movupd XMM1, [ESI+16];
                movupd XMM2, [ESI+32];
                movupd XMM3, [ESI+48];
                add ESI, 64;
                mulpd XMM0, XMM4;
                mulpd XMM1, XMM4;
                mulpd XMM2, XMM4;
                mulpd XMM3, XMM4;
                //divpd XMM0, XMM4;
                //divpd XMM1, XMM4;
                //divpd XMM2, XMM4;
                //divpd XMM3, XMM4;
                movupd [ESI+ 0-64], XMM0;
                movupd [ESI+16-64], XMM1;
                movupd [ESI+32-64], XMM2;
                movupd [ESI+48-64], XMM3;
                cmp ESI, EDI;
                jb startsseloopa;

                mov aptr, ESI;
            }
        }
    }

    // NOTE: the remainder intentionally multiplies by the reciprocal so it
    // matches the SSE2 path bit-for-bit (see accuracy comment above).
    while (aptr < aend)
        *aptr++ *= recip;

    return a;
}


unittest
{
    printf("_arrayExpSliceDivass_d unittest\n");
    for (cpuid = 0; cpuid < CPUID_MAX; cpuid++)
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 2; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j .. dim + j];

            for (int i = 0; i < dim; i++)
            {   a[i] = cast(T)i;
                b[i] = cast(T)(i + 7);
                c[i] = cast(T)(i * 2);
            }

            a[] = c[];
            c[] /= 8;

            for (int i = 0; i < dim; i++)
            {
                if (c[i] != cast(T)(a[i] / 8))
                {
                    printf("[%d]: %g != %g / 8\n", i, c[i], a[i]);
                    assert(0);
                }
            }
        }
    }
}


/* ======================================================================== */

/***********************
 * Computes:
 *      a[] -= b[] * value
 */

T[] _arraySliceExpMulSliceMinass_d(T[] a, T value, T[] b)
{
    // a[] -= b[] * value is a[] += b[] * (-value)
    return _arraySliceExpMulSliceAddass_d(a, -value, b);
}

/***********************
 * Computes:
 *      a[] += b[] * value
 */

T[] _arraySliceExpMulSliceAddass_d(T[] a, T value, T[] b)
in
{
    assert(a.length == b.length);
    assert(disjoint(a, b));
}
body
{
    auto aptr = a.ptr;
    auto aend = aptr + a.length;
    auto bptr = b.ptr;

    // Handle remainder
    while (aptr < aend)
        *aptr++ += *bptr++ * value;

    return a;
}

unittest
{
    printf("_arraySliceExpMulSliceAddass_d unittest\n");

    cpuid = 1;
    {
        version (log) printf(" cpuid %d\n", cpuid);

        for (int j = 0; j < 1; j++)
        {
            const int dim = 67;
            T[] a = new T[dim + j]; // aligned on 16 byte boundary
            a = a[j .. dim + j];    // misalign for second iteration
            T[] b = new T[dim + j];
            b = b[j .. dim + j];
            T[] c = new T[dim + j];
            c = c[j ..
dim + j]; 1694 1695 for (int i = 0; i < dim; i++) 1696 { a[i] = cast(T)i; 1697 b[i] = cast(T)(i + 7); 1698 c[i] = cast(T)(i * 2); 1699 } 1700 1701 b[] = c[]; 1702 c[] += a[] * 6; 1703 1704 for (int i = 0; i < dim; i++) 1705 { 1706 //printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); 1707 if (c[i] != cast(T)(b[i] + a[i] * 6)) 1708 { 1709 printf("[%d]: %g ?= %g + %g * 6\n", i, c[i], b[i], a[i]); 1710 assert(0); 1711 } 1712 } 1713 } 1714 } 1715 }