// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012.
 * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors: $(HTTP digitalmars.com, Walter Bright)
 */

module core.simd;

pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * Params:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *          short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *          For 256 bit vectors,
 *          one of double[4], float[8], void[32], byte[32], ubyte[32],
 *          short[16], ushort[16], int[8], uint[8], long[4], ulong[4].
 *          For 512 bit vectors,
 *          one of double[8], float[16], void[64], byte[64], ubyte[64],
 *          short[32], ushort[32], int[16], uint[16], long[8], ulong[8]
 */
template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias __vector(T) Vector;
}

/* Handy aliases
 */
static if (is(Vector!(void[8]))) alias Vector!(void[8]) void8;       ///
static if (is(Vector!(double[1]))) alias Vector!(double[1]) double1; ///
static if (is(Vector!(float[2]))) alias Vector!(float[2]) float2;    ///
static if (is(Vector!(byte[8]))) alias Vector!(byte[8]) byte8;       ///
static if (is(Vector!(ubyte[8]))) alias Vector!(ubyte[8]) ubyte8;    ///
static if (is(Vector!(short[4]))) alias Vector!(short[4]) short4;    ///
static if (is(Vector!(ushort[4]))) alias Vector!(ushort[4]) ushort4; ///
static if (is(Vector!(int[2]))) alias Vector!(int[2]) int2;          ///
static if (is(Vector!(uint[2]))) alias Vector!(uint[2]) uint2;       ///
static if (is(Vector!(long[1]))) alias Vector!(long[1]) long1;       ///
static if (is(Vector!(ulong[1]))) alias Vector!(ulong[1]) ulong1;    ///

static if (is(Vector!(void[16]))) alias Vector!(void[16]) void16;       ///
static if (is(Vector!(double[2]))) alias Vector!(double[2]) double2;    ///
static if (is(Vector!(float[4]))) alias Vector!(float[4]) float4;       ///
static if (is(Vector!(byte[16]))) alias Vector!(byte[16]) byte16;       ///
static if (is(Vector!(ubyte[16]))) alias Vector!(ubyte[16]) ubyte16;    ///
static if (is(Vector!(short[8]))) alias Vector!(short[8]) short8;       ///
static if (is(Vector!(ushort[8]))) alias Vector!(ushort[8]) ushort8;    ///
static if (is(Vector!(int[4]))) alias Vector!(int[4]) int4;             ///
static if (is(Vector!(uint[4]))) alias Vector!(uint[4]) uint4;          ///
static if (is(Vector!(long[2]))) alias Vector!(long[2]) long2;          ///
static if (is(Vector!(ulong[2]))) alias Vector!(ulong[2]) ulong2;       ///

static if (is(Vector!(void[32]))) alias Vector!(void[32]) void32;        ///
static if (is(Vector!(double[4]))) alias Vector!(double[4]) double4;     ///
static if (is(Vector!(float[8]))) alias Vector!(float[8]) float8;        ///
static if (is(Vector!(byte[32]))) alias Vector!(byte[32]) byte32;        ///
static if (is(Vector!(ubyte[32]))) alias Vector!(ubyte[32]) ubyte32;     ///
static if (is(Vector!(short[16]))) alias Vector!(short[16]) short16;     ///
static if (is(Vector!(ushort[16]))) alias Vector!(ushort[16]) ushort16;  ///
static if (is(Vector!(int[8]))) alias Vector!(int[8]) int8;              ///
static if (is(Vector!(uint[8]))) alias Vector!(uint[8]) uint8;           ///
static if (is(Vector!(long[4]))) alias Vector!(long[4]) long4;           ///
static if (is(Vector!(ulong[4]))) alias Vector!(ulong[4]) ulong4;        ///

static if (is(Vector!(void[64]))) alias Vector!(void[64]) void64;        ///
static if (is(Vector!(double[8]))) alias Vector!(double[8]) double8;     ///
static if (is(Vector!(float[16]))) alias Vector!(float[16]) float16;     ///
static if (is(Vector!(byte[64]))) alias Vector!(byte[64]) byte64;        ///
static if (is(Vector!(ubyte[64]))) alias Vector!(ubyte[64]) ubyte64;     ///
static if (is(Vector!(short[32]))) alias Vector!(short[32]) short32;     ///
static if (is(Vector!(ushort[32]))) alias Vector!(ushort[32]) ushort32;  ///
static if (is(Vector!(int[16]))) alias Vector!(int[16]) int16;           ///
static if (is(Vector!(uint[16]))) alias Vector!(uint[16]) uint16;        ///
static if (is(Vector!(long[8]))) alias Vector!(long[8]) long8;           ///
static if (is(Vector!(ulong[8]))) alias Vector!(ulong[8]) ulong8;        ///
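/* Example: vector types support element-wise arithmetic directly, and the
 * underlying static array can be read through the .array property.
 * A minimal sketch; the alias exists only when the target supports
 * 128-bit vectors.
 */
static if (is(float4))
unittest
{
    float4 a = 1.5f;        // a scalar initializer broadcasts to all lanes
    float4 b = 2.5f;
    float4 c = a + b;       // element-wise addition
    assert(c.array[0] == 4.0f);
    assert(c.array[3] == 4.0f);
}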
version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
     *
     *  opcode xmm1,xmm2/mem
     *
     * and do not have side effects (i.e. do not write to memory).
     */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND = 0x660FDB,
        POR = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        STOSS = 0xF30F11,
        STOSD = 0xF20F11,
        STOAPS = 0x000F29,
        STOAPD = 0x660F29,
        STODQA = 0x660F7F,
        STOD = 0x660F7E,        // MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ = 0x660FD6,

        LODSS = 0xF30F10,
        LODSD = 0xF20F10,
        LODAPS = 0x000F28,
        LODAPD = 0x660F28,
        LODDQA = 0x660F6F,
        LODD = 0x660F6E,        // MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ = 0xF30F7E,

        LODDQU = 0xF30F6F,      // MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU = 0xF30F7F,      // MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q = 0xF20FD6,     // MOVDQ2Q mmx, xmm          F2 0F D6 /r
        MOVHLPS = 0x0F12,       // MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD = 0x660F16,
        STOHPD = 0x660F17,      // MOVHPD mem64, xmm         66 0F 17 /r
        LODHPS = 0x0F16,
        STOHPS = 0x0F17,
        MOVLHPS = 0x0F16,
        LODLPD = 0x660F12,
        STOLPD = 0x660F13,
        LODLPS = 0x0F12,
        STOLPS = 0x0F13,
        MOVMSKPD = 0x660F50,
        MOVMSKPS = 0x0F50,
        MOVNTDQ = 0x660FE7,
        MOVNTI = 0x0FC3,
        MOVNTPD = 0x660F2B,
        MOVNTPS = 0x0F2B,
        MOVNTQ = 0x0FE7,
        MOVQ2DQ = 0xF30FD6,
        LODUPD = 0x660F10,
        STOUPD = 0x660F11,
        LODUPS = 0x0F10,
        STOUPS = 0x0F11,

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD = 0x660F7C,
        HADDPS = 0xF20F7C,
        HSUBPD = 0x660F7D,
        HSUBPS = 0xF20F7D,
        MOVDDUP = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU = 0xF20FF0,
        MONITOR = 0x0F01C8,
        MWAIT = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD = 0x660F3A0D,
        BLENDPS = 0x660F3A0C,
        BLENDVPD = 0x660F3815,
        BLENDVPS = 0x660F3814,
        DPPD = 0x660F3A41,
        DPPS = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS = 0x660F3A21,
        MPSADBW = 0x660F3A42,
        PBLENDVB = 0x660F3810,
        PBLENDW = 0x660F3A0E,
        PEXTRD = 0x660F3A16,
        PEXTRQ = 0x660F3A16,
        PINSRB = 0x660F3A20,
        PINSRD = 0x660F3A22,
        PINSRQ = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ = 0x660F3828,
        PMULLD = 0x660F3840,
        PTEST = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI = 0x660F3A61,
        PCMPESTRM = 0x660F3A60,
        PCMPISTRI = 0x660F3A63,
        PCMPISTRM = 0x660F3A62,
        PCMPGTQ = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT = 0xF30FB8,
        // LZCNT
    }
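    /* The enum values encode the raw instruction bytes: XMM.ADDPD is
     * 0x660F58, i.e. the 66 operand-size prefix followed by the 0F 58
     * opcode. A minimal sketch of driving one of these opcodes through
     * the simd!() wrapper defined below (assumes SSE2 support):
     */
    static if (is(double2))
    unittest
    {
        double2 a = 1.0;                // broadcasts to both lanes
        double2 b = 2.0;
        a = simd!(XMM.ADDPD)(a, b);     // packed double add: a[i] += b[i]
        assert(a.array[0] == 3.0);
        assert(a.array[1] == 3.0);
    }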
    /**
     * Generate two operand instruction with XMM 128 bit operands.
     *
     * This is a compiler magic function - it doesn't behave like
     * regular D functions.
     *
     * Params:
     *      opcode = any of the XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      op2 = second operand
     * Returns:
     *      result of opcode
     */
    pure @safe V1 simd(XMM opcode, V1, V2)(V1 op1, V2 op2)
        if (is(V1 == __vector) && is(V2 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, op1, op2);
    }

    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2); // intrinsic

    ///
    unittest
    {
        float4 a;
        a = simd!(XMM.PXOR)(a, a);
    }

    /**
     * Unary SIMD instructions.
     */
    pure @safe V1 simd(XMM opcode, V1)(V1 op1)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, op1);
    }

    ///
    pure @safe V1 simd(XMM opcode, V1)(double d)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, d);
    }

    ///
    pure @safe V1 simd(XMM opcode, V1)(float f)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, f);
    }

    pure @safe void16 __simd(XMM opcode, void16 op1);   // intrinsic
    pure @safe void16 __simd(XMM opcode, double d);     // intrinsic
    pure @safe void16 __simd(XMM opcode, float f);      // intrinsic

    ///
    unittest
    {
        float4 a;
        a = simd!(XMM.LODSS)(a);
    }
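    /* The scalar overloads move a float or double into the low element of
     * a vector. The vector type must be given explicitly, since it cannot
     * be inferred from a scalar argument. A minimal sketch:
     */
    static if (is(double2))
    unittest
    {
        // MOVSD: load the scalar into element 0
        double2 v = simd!(XMM.LODSD, double2)(1.0);
        assert(v.array[0] == 1.0);
    }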
    /****
     * For instructions:
     * CMPPD, CMPSS, CMPSD, CMPPS,
     * PSHUFD, PSHUFHW, PSHUFLW,
     * BLENDPD, BLENDPS, DPPD, DPPS,
     * MPSADBW, PBLENDW,
     * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
     * Params:
     *      opcode = any of the above XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      op2 = second operand
     *      imm8 = third operand; must be a compile time constant
     * Returns:
     *      result of opcode
     */
    pure @safe V1 simd(XMM opcode, ubyte imm8, V1, V2)(V1 op1, V2 op2)
        if (is(V1 == __vector) && is(V2 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, op1, op2, imm8);
    }

    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8); // intrinsic

    ///
    unittest
    {
        float4 a;
        a = simd!(XMM.CMPPD, 0x7A)(a, a);
    }

    /***
     * For instructions with the imm8 version:
     * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
     * PSRLDQ, PSLLDQ
     * Params:
     *      opcode = any of the XMM opcodes; it must be a compile time constant
     *      op1 = first operand
     *      imm8 = second operand; must be a compile time constant
     * Returns:
     *      result of opcode
     */
    pure @safe V1 simd(XMM opcode, ubyte imm8, V1)(V1 op1)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_ib(opcode, op1, imm8);
    }

    pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8); // intrinsic

    ///
    unittest
    {
        float4 a;
        a = simd!(XMM.PSRLQ, 0x7A)(a);
    }
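    /* PSHUFD fills each destination lane from the source lane selected by
     * the corresponding two bits of imm8, so 0x1B (0b00_01_10_11) reverses
     * the four lanes. A minimal sketch:
     */
    static if (is(int4))
    unittest
    {
        int4 v = [1, 2, 3, 4];
        int4 r = simd!(XMM.PSHUFD, 0x1B)(v, v);
        assert(r.array[0] == 4);
        assert(r.array[1] == 3);
        assert(r.array[2] == 2);
        assert(r.array[3] == 1);
    }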
    /*****
     * For "store" operations of the form:
     *      op1 op= op2
     * Returns:
     *      op2
     * These cannot be marked as pure, as semantic() doesn't check them.
     */
    @safe V1 simd_sto(XMM opcode, V1, V2)(V1 op1, V2 op2)
        if (is(V1 == __vector) && is(V2 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_sto(opcode, op1, op2);
    }

    ///
    @safe V1 simd_stod(XMM opcode, V1)(double op1, V1 op2)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_sto(opcode, op1, op2);
    }

    ///
    @safe V1 simd_stof(XMM opcode, V1)(float op1, V1 op2)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_sto(opcode, op1, op2);
    }

    @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2); // intrinsic
    @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); // intrinsic
    @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  // intrinsic

    ///
    unittest
    {
        void16 a;
        float f = 1;
        double d = 1;

        cast(void)simd_sto!(XMM.STOUPS)(a, a);
        //simd_sto!(XMM.STOUPS)(f, a);
        //simd_sto!(XMM.STOUPS)(d, a);
    }

    /* The following use overloading to ensure correct typing.
     * Compile with inlining on for best performance.
     */
    pure @safe short8 pcmpeq()(short8 v1, short8 v2)
    {
        return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
    }

    pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
    {
        return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
    }

    /*********************
     * Emit prefetch instruction.
     * Params:
     *    address = address to be prefetched
     *    writeFetch = true for write fetch, false for read fetch
     *    locality = 0..3 (0 meaning least local, 3 meaning most local)
     * Note:
     *    The Intel mappings are:
     *    $(TABLE
     *    $(THEAD writeFetch, locality, Instruction)
     *    $(TROW false, 0, prefetchnta)
     *    $(TROW false, 1, prefetcht2)
     *    $(TROW false, 2, prefetcht1)
     *    $(TROW false, 3, prefetcht0)
     *    $(TROW true, 0, prefetchw)
     *    $(TROW true, 1, prefetchw)
     *    $(TROW true, 2, prefetchw)
     *    $(TROW true, 3, prefetchw)
     *    )
     */
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        static if (writeFetch)
            __prefetch(address, 4);
        else static if (locality < 4)
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
    }

    private void __prefetch(const(void)* address, ubyte encoding);
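    /* Example: hint that a buffer is about to be read, with maximum
     * temporal locality (locality 3 maps to prefetcht0). The hint is
     * advisory only and has no architectural effect; a minimal sketch:
     */
    @system unittest
    {
        ubyte[64] buf;
        prefetch!(false, 3)(&buf[0]);   // read fetch, most local
    }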
    /*************************************
     * Load unaligned vector from address.
     * This is a compiler intrinsic.
     * Params:
     *      p = pointer to vector
     * Returns:
     *      vector
     */
    V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
        else static if (is(V == float4))
            return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
        else
            return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
    }

    @system
    unittest
    {
        // Memory to load into the vector:
        // Should have enough data to test all 16 byte offsets, and still
        // have room for a 16-byte vector
        ubyte[32] data;
        foreach (i; 0..data.length)
        {
            data[i] = cast(ubyte)i;
        }

        // test each of the 16 possible alignments (offsets 0 .. 15)
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                // load the data
                T v = loadUnaligned(cast(T*)d);

                // check that the data was loaded correctly
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }

    /*************************************
     * Store vector to unaligned address.
     * This is a compiler intrinsic.
     * Params:
     *      p = pointer to vector
     *      value = value to store
     * Returns:
     *      value
     */
    V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
        else static if (is(V == float4))
            return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
        else
            return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
    }

    @system
    unittest
    {
        // Memory to store the vector to:
        // Should have enough room to test all 16 byte offsets, and still
        // have room for a 16-byte vector
        ubyte[32] data;

        // test each of the 16 possible alignments (offsets 0 .. 15)
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                T v;

                // populate `v` with data
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    ptrToV[j] = cast(ubyte)j;
                }

                // store `v` to the location pointed to by `d`
                storeUnaligned(cast(T*)d, v);

                // check that the data was stored correctly
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }
}
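/* Example: round-trip a vector through unaligned memory. A minimal
 * sketch; @system because it reinterprets a byte buffer as a vector.
 */
version (D_SIMD)
{
    static if (is(int4))
    @system unittest
    {
        ubyte[20] buf;              // room for a 16-byte vector at offset 1
        int4 v = [10, 20, 30, 40];
        storeUnaligned(cast(int4*)&buf[1], v);              // store at an odd offset
        int4 w = loadUnaligned(cast(const(int4)*)&buf[1]);  // load it back
        assert(w.array[0] == 10);
        assert(w.array[3] == 40);
    }
}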