// Written in the D programming language.

/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 */

module core.simd;

pure:
nothrow:
@safe:
@nogc:

/*******************************
 * Create a vector type.
 *
 * Parameters:
 *      T = one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256-bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4].
 *      The 64-bit and 512-bit variants follow the same pattern (see the
 *      aliases below).
 */

template Vector(T)
{
    /* __vector is compiler magic, hide it behind a template.
     * The compiler will reject T's that don't work.
     */
    alias Vector = __vector(T);
}
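
/* A minimal usage sketch, assuming the target supports 128-bit vectors
 * (i.e. `is(Vector!(float[4]))` holds): vector types support broadcast
 * initialization from a scalar, element-wise arithmetic, and indexing of
 * individual lanes.
 */
static if (is(Vector!(float[4])))
unittest
{
    alias F4 = Vector!(float[4]);
    F4 a = 2.0f;          // the scalar is broadcast to all four lanes
    F4 b = 3.0f;
    F4 c = a + b;         // element-wise addition
    assert(c[1] == 5.0f); // lanes can be read by index
}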

/* Handy aliases
 */
static if (is(Vector!(void[8])))    alias void8 = Vector!(void[8]);          ///
static if (is(Vector!(double[1])))  alias double1 = Vector!(double[1]);      ///
static if (is(Vector!(float[2])))   alias float2 = Vector!(float[2]);        ///
static if (is(Vector!(byte[8])))    alias byte8 = Vector!(byte[8]);          ///
static if (is(Vector!(ubyte[8])))   alias ubyte8 = Vector!(ubyte[8]);        ///
static if (is(Vector!(short[4])))   alias short4 = Vector!(short[4]);        ///
static if (is(Vector!(ushort[4])))  alias ushort4 = Vector!(ushort[4]);      ///
static if (is(Vector!(int[2])))     alias int2 = Vector!(int[2]);            ///
static if (is(Vector!(uint[2])))    alias uint2 = Vector!(uint[2]);          ///
static if (is(Vector!(long[1])))    alias long1 = Vector!(long[1]);          ///
static if (is(Vector!(ulong[1])))   alias ulong1 = Vector!(ulong[1]);        ///

static if (is(Vector!(void[16])))   alias void16 = Vector!(void[16]);        ///
static if (is(Vector!(double[2])))  alias double2 = Vector!(double[2]);      ///
static if (is(Vector!(float[4])))   alias float4 = Vector!(float[4]);        ///
static if (is(Vector!(byte[16])))   alias byte16 = Vector!(byte[16]);        ///
static if (is(Vector!(ubyte[16])))  alias ubyte16 = Vector!(ubyte[16]);      ///
static if (is(Vector!(short[8])))   alias short8 = Vector!(short[8]);        ///
static if (is(Vector!(ushort[8])))  alias ushort8 = Vector!(ushort[8]);      ///
static if (is(Vector!(int[4])))     alias int4 = Vector!(int[4]);            ///
static if (is(Vector!(uint[4])))    alias uint4 = Vector!(uint[4]);          ///
static if (is(Vector!(long[2])))    alias long2 = Vector!(long[2]);          ///
static if (is(Vector!(ulong[2])))   alias ulong2 = Vector!(ulong[2]);        ///

static if (is(Vector!(void[32])))   alias void32 = Vector!(void[32]);        ///
static if (is(Vector!(double[4])))  alias double4 = Vector!(double[4]);      ///
static if (is(Vector!(float[8])))   alias float8 = Vector!(float[8]);        ///
static if (is(Vector!(byte[32])))   alias byte32 = Vector!(byte[32]);        ///
static if (is(Vector!(ubyte[32])))  alias ubyte32 = Vector!(ubyte[32]);      ///
static if (is(Vector!(short[16])))  alias short16 = Vector!(short[16]);      ///
static if (is(Vector!(ushort[16]))) alias ushort16 = Vector!(ushort[16]);    ///
static if (is(Vector!(int[8])))     alias int8 = Vector!(int[8]);            ///
static if (is(Vector!(uint[8])))    alias uint8 = Vector!(uint[8]);          ///
static if (is(Vector!(long[4])))    alias long4 = Vector!(long[4]);          ///
static if (is(Vector!(ulong[4])))   alias ulong4 = Vector!(ulong[4]);        ///

static if (is(Vector!(void[64])))   alias void64 = Vector!(void[64]);        ///
static if (is(Vector!(double[8])))  alias double8 = Vector!(double[8]);      ///
static if (is(Vector!(float[16])))  alias float16 = Vector!(float[16]);      ///
static if (is(Vector!(byte[64])))   alias byte64 = Vector!(byte[64]);        ///
static if (is(Vector!(ubyte[64])))  alias ubyte64 = Vector!(ubyte[64]);      ///
static if (is(Vector!(short[32])))  alias short32 = Vector!(short[32]);      ///
static if (is(Vector!(ushort[32]))) alias ushort32 = Vector!(ushort[32]);    ///
static if (is(Vector!(int[16])))    alias int16 = Vector!(int[16]);          ///
static if (is(Vector!(uint[16])))   alias uint16 = Vector!(uint[16]);        ///
static if (is(Vector!(long[8])))    alias long8 = Vector!(long[8]);          ///
static if (is(Vector!(ulong[8])))   alias ulong8 = Vector!(ulong[8]);        ///
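
/* A short sketch of the aliases in use, assuming 128-bit vectors are
 * available on the target: `.array` copies the lanes out as a static
 * array, and casts between same-sized vector types reinterpret the
 * underlying bytes.
 */
static if (is(Vector!(int[4])))
unittest
{
    int4 v = 7;                 // broadcast 7 into all four lanes
    int[4] arr = v.array;       // copy the lanes out as a static array
    assert(arr[0] == 7 && arr[3] == 7);
    void16 raw = cast(void16)v; // reinterpret as an untyped 16-byte vector
    int4 back = cast(int4)raw;  // same-size vector casts reinterpret the bytes
    assert(back.array == arr);
}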

version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
    *
    *  opcode xmm1,xmm2/mem
    *
    * and do not have side effects (i.e. do not write to memory).
    */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,

        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,

        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,

        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,

        PAND  = 0x660FDB,
        POR   = 0x660FEB,

        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,

        XORPS = 0x000F57,
        XORPD = 0x660F57,

        // Use STO and LOD instead of MOV to distinguish the direction
        STOSS  = 0xF30F11,
        STOSD  = 0xF20F11,
        STOAPS = 0x000F29,
        STOAPD = 0x660F29,
        STODQA = 0x660F7F,
        STOD   = 0x660F7E,        // MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,

        LODSS  = 0xF30F10,
        LODSD  = 0xF20F10,
        LODAPS = 0x000F28,
        LODAPD = 0x660F28,
        LODDQA = 0x660F6F,
        LODD   = 0x660F6E,        // MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,

        LODDQU   = 0xF30F6F,      // MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,      // MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,      // MOVDQ2Q mmx, xmm          F2 0F D6 /r
        MOVHLPS  = 0x0F12,        // MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD   = 0x660F16,
        STOHPD   = 0x660F17,      // MOVHPD mem64, xmm         66 0F 17 /r
        LODHPS   = 0x0F16,
        STOHPS   = 0x0F17,
        MOVLHPS  = 0x0F16,
        LODLPD   = 0x660F12,
        STOLPD   = 0x660F13,
        LODLPS   = 0x0F12,
        STOLPS   = 0x0F13,
        MOVMSKPD = 0x660F50,
        MOVMSKPS = 0x0F50,
        MOVNTDQ  = 0x660FE7,
        MOVNTI   = 0x0FC3,
        MOVNTPD  = 0x660F2B,
        MOVNTPS  = 0x0F2B,
        MOVNTQ   = 0x0FE7,
        MOVQ2DQ  = 0xF30FD6,
        LODUPD   = 0x660F10,
        STOUPD   = 0x660F11,
        LODUPS   = 0x0F10,
        STOUPS   = 0x0F11,

        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,

        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,

        //PREFETCH = 0x0F18,

        // SSE3 Pentium 4 (Prescott)

        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD   = 0x660F7C,
        HADDPS   = 0xF20F7C,
        HSUBPD   = 0x660F7D,
        HSUBPS   = 0xF20F7D,
        MOVDDUP  = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU    = 0xF20FF0,
        MONITOR  = 0x0F01C8,
        MWAIT    = 0x0F01C9,

        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,

        // SSE4.1

        BLENDPD   = 0x660F3A0D,
        BLENDPS   = 0x660F3A0C,
        BLENDVPD  = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,

        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ   = 0x660F3828,
        PMULLD   = 0x660F3840,
        PTEST    = 0x660F3817,

        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,

        // SSE4.2
        PCMPESTRI  = 0x660F3A61,
        PCMPESTRM  = 0x660F3A60,
        PCMPISTRI  = 0x660F3A63,
        PCMPISTRM  = 0x660F3A62,
        PCMPGTQ    = 0x660F3837,
        //CRC32

        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS

        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT     = 0xF30FB8,
        // LZCNT
    }
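
    /* A sketch of the encoding convention, derived from the values above:
     * for the common forms the low byte is the opcode, the next byte up is
     * the 0x0F escape (with 0x38/0x3A extension maps for SSSE3/SSE4), and
     * the highest byte, when present, is the mandatory prefix (66, F2, or
     * F3). PSLLDQ and PSRLDQ additionally carry their ModRM /reg digit in
     * the top byte.
     */
    static assert((XMM.ADDSD & 0xFF) == 0x58);        // opcode byte
    static assert(((XMM.ADDSD >> 8) & 0xFF) == 0x0F); // 0x0F escape byte
    static assert((XMM.ADDSD >> 16) == 0xF2);         // F2 prefix (scalar double)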

    /**
    * Generate a two-operand instruction with XMM 128-bit operands.
    *
    * This is a compiler magic function - it doesn't behave like
    * regular D functions.
    *
    * Parameters:
    *      opcode = any of the XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      op2    = second operand
    * Returns:
    *      result of opcode
    */
    pure @safe V1 simd(XMM opcode, V1, V2)(V1 op1, V2 op2)
        if (is(V1 == __vector) && is(V2 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, op1, op2);
    }

    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2); // intrinsic

    ///
    unittest
    {
        float4 a;
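        // XORing a register with itself is the usual idiom for zeroing it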
        a = simd!(XMM.PXOR)(a, a);
    }

    /**
    * Unary SIMD instructions.
    */
    pure @safe V1 simd(XMM opcode, V1)(V1 op1)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, op1);
    }

    ///
    pure @safe V1 simd(XMM opcode, V1)(double d)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, d);
    }

    ///
    pure @safe V1 simd(XMM opcode, V1)(float f)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, f);
    }

    pure @safe void16 __simd(XMM opcode, void16 op1); // intrinsic
    pure @safe void16 __simd(XMM opcode, double d);   // intrinsic
    pure @safe void16 __simd(XMM opcode, float f);    // intrinsic

    ///
    unittest
    {
        float4 a;
        a = simd!(XMM.LODSS)(a);
    }

    /****
    * For instructions:
    * CMPPD, CMPSS, CMPSD, CMPPS,
    * PSHUFD, PSHUFHW, PSHUFLW,
    * BLENDPD, BLENDPS, DPPD, DPPS,
    * MPSADBW, PBLENDW,
    * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
    * Parameters:
    *      opcode = any of the above XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      op2    = second operand
    *      imm8   = third operand; must be a compile time constant
    * Returns:
    *      result of opcode
    */
    pure @safe V1 simd(XMM opcode, ubyte imm8, V1, V2)(V1 op1, V2 op2)
        if (is(V1 == __vector) && is(V2 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd(opcode, op1, op2, imm8);
    }

    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8); // intrinsic

    ///
    unittest
    {
        float4 a;
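        // the compile-time 0x7A is passed through as the imm8 operand
        // (for CMPPD, the immediate selects the comparison predicate)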
        a = simd!(XMM.CMPPD, 0x7A)(a, a);
    }

    /***
    * For instructions with the imm8 version:
    * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
    * PSRLDQ, PSLLDQ
    * Parameters:
    *      opcode = any of the XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      imm8   = second operand; must be a compile time constant
    * Returns:
    *      result of opcode
    */
    pure @safe V1 simd(XMM opcode, ubyte imm8, V1)(V1 op1)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_ib(opcode, op1, imm8);
    }

    pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);  // intrinsic

    ///
    unittest
    {
        float4 a;
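        // PSRLQ shifts each 64-bit lane right by the immediate bit count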
        a = simd!(XMM.PSRLQ, 0x7A)(a);
    }

    /*****
    * For "store" operations of the form:
    *    op1 op= op2
    * Returns:
    *    op2
    * These cannot be marked as pure, as semantic() doesn't check them.
    */
    @safe V1 simd_sto(XMM opcode, V1, V2)(V1 op1, V2 op2)
        if (is(V1 == __vector) && is(V2 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_sto(opcode, op1, op2);
    }

    ///
    @safe V1 simd_stod(XMM opcode, V1)(double op1, V1 op2)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_sto(opcode, op1, op2);
    }

    ///
    @safe V1 simd_stof(XMM opcode, V1)(float op1, V1 op2)
        if (is(V1 == __vector))
    {
        pragma(inline, true);
        return cast(V1)__simd_sto(opcode, op1, op2);
    }

    @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);  // intrinsic
    @safe void16 __simd_sto(XMM opcode, double op1, void16 op2);  // intrinsic
    @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);   // intrinsic

    ///
    unittest
    {
        void16 a;
        float f = 1;
        double d = 1;

        cast(void)simd_sto!(XMM.STOUPS)(a, a);
        //simd_sto!(XMM.STOUPS)(f, a);
        //simd_sto!(XMM.STOUPS)(d, a);
    }

    /* The following use overloading to ensure correct typing.
    * Compile with inlining on for best performance.
    */

    pure @safe short8 pcmpeq()(short8 v1, short8 v2)
    {
        return cast(short8)__simd(XMM.PCMPEQW, v1, v2);
    }

    pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
    {
        return cast(ushort8)__simd(XMM.PCMPEQW, v1, v2);
    }
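
    /// A short usage sketch: lanes that compare equal are filled with all
    /// one-bits (-1 in a signed lane); unequal lanes are zeroed.
    unittest
    {
        short8 a = 3, b = 3;
        short8 r = pcmpeq(a, b);   // PCMPEQW compares the eight 16-bit lanes
        assert(r.array[0] == -1);  // 3 == 3, so the lane is all ones
    }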

    /*********************
    * Emit prefetch instruction.
    * Params:
    *    address = address to be prefetched
    *    writeFetch = true for write fetch, false for read fetch
    *    locality = 0..3 (0 meaning least local, 3 meaning most local)
    * Note:
    *    The Intel mappings are:
    *    $(TABLE
    *    $(THEAD writeFetch, locality, Instruction)
    *    $(TROW false, 0, prefetchnta)
    *    $(TROW false, 1, prefetcht2)
    *    $(TROW false, 2, prefetcht1)
    *    $(TROW false, 3, prefetcht0)
    *    $(TROW true, 0, prefetchw)
    *    $(TROW true, 1, prefetchw)
    *    $(TROW true, 2, prefetchw)
    *    $(TROW true, 3, prefetchw)
    *    )
    */
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        static if (writeFetch)
            __prefetch(address, 4);
        else static if (locality < 4)
            __prefetch(address, 3 - locality);
        else
            static assert(0, "0..3 expected for locality");
    }

    private void __prefetch(const(void*) address, ubyte encoding);
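
    /* A short usage sketch: hint that `buf` is about to be read, with
     * maximum temporal locality (prefetcht0 on Intel).
     */
    @system
    unittest
    {
        ubyte[64] buf;
        prefetch!(false, 3)(&buf[0]); // read fetch, locality 3
    }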

    /*************************************
    * Load unaligned vector from address.
    * This is a compiler intrinsic.
    * Params:
    *    p = pointer to vector
    * Returns:
    *    vector
    */

    V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd(XMM.LODUPD, *cast(const void16*)p);
        else static if (is(V == float4))
            return cast(V)__simd(XMM.LODUPS, *cast(const void16*)p);
        else
            return cast(V)__simd(XMM.LODDQU, *cast(const void16*)p);
    }

    @system
    unittest
    {
        // Memory to load into the vector:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;
        foreach (i; 0..data.length)
        {
            data[i] = cast(ubyte)i;
        }

        // test all 16 possible byte alignments
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                // load the data
                T v = loadUnaligned(cast(T*)d);

                // check that the data was loaded correctly
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }

    /*************************************
    * Store vector to unaligned address.
    * This is a compiler intrinsic.
    * Params:
    *    p = pointer to vector
    *    value = value to store
    * Returns:
    *    value
    */

    V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        static if (is(V == double2))
            return cast(V)__simd_sto(XMM.STOUPD, *cast(void16*)p, value);
        else static if (is(V == float4))
            return cast(V)__simd_sto(XMM.STOUPS, *cast(void16*)p, value);
        else
            return cast(V)__simd_sto(XMM.STODQU, *cast(void16*)p, value);
    }

    @system
    unittest
    {
        // Memory to store the vector to:
        // Should have enough data to test all 16-byte alignments, and still
        // have room for a 16-byte vector
        ubyte[32] data;

        // test all 16 possible byte alignments
        foreach (i; 0..16)
        {
            ubyte* d = &data[i];

            void test(T)()
            {
                T v;

                // populate `v` with data
                ubyte* ptrToV = cast(ubyte*)&v;
                foreach (j; 0..T.sizeof)
                {
                    ptrToV[j] = cast(ubyte)j;
                }

                // store `v` to location pointed to by `d`
                storeUnaligned(cast(T*)d, v);

                // check that the data was stored correctly
                foreach (j; 0..T.sizeof)
                {
                    assert(ptrToV[j] == d[j]);
                }
            }

            test!void16();
            test!byte16();
            test!ubyte16();
            test!short8();
            test!ushort8();
            test!int4();
            test!uint4();
            test!long2();
            test!ulong2();
            test!double2();
            test!float4();
        }
    }
}