EigenRand 0.5.0

arch/SSE/MorePacketMath.h
#ifndef EIGENRAND_MORE_PACKET_MATH_SSE_H
#define EIGENRAND_MORE_PACKET_MATH_SSE_H

#include <xmmintrin.h>   // SSE
#include <emmintrin.h>   // SSE2 intrinsics used throughout this file
#include <cstdint>       // uint64_t / int64_t
#include <type_traits>   // std::true_type

namespace Eigen
{
    namespace internal
    {
        // Marker traits: tell EigenRand's generic code which category each
        // SSE packet type belongs to.
        template<>
        struct IsIntPacket<Packet4i> : std::true_type {};

        template<>
        struct IsFloatPacket<Packet4f> : std::true_type {};

        template<>
        struct IsDoublePacket<Packet2d> : std::true_type {};

        // Half of a 128-bit integer packet is a single 64-bit scalar.
        template<>
        struct HalfPacket<Packet4i>
        {
            using type = uint64_t;
        };

#ifdef EIGEN_VECTORIZE_AVX
#else
        // No half-width float packet is available in pure SSE mode.
        template<>
        struct HalfPacket<Packet4f>
        {
            //using type = Packet2f;
        };
#endif

        // reinterpreter: bit-level casts between packet types; these are
        // pure reinterpretations of the 128-bit register, not value
        // conversions.
        template<>
        struct reinterpreter<Packet4i>
        {
            EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
            {
                return _mm_castsi128_ps(x);
            }

            EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
            {
                return _mm_castsi128_pd(x);
            }

            EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x)
            {
                return x;
            }
        };

        template<>
        struct reinterpreter<Packet4f>
        {
            EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
            {
                return _mm_castps_pd(x);
            }

            EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x)
            {
                return _mm_castps_si128(x);
            }
        };

        template<>
        struct reinterpreter<Packet2d>
        {
            EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
            {
                return _mm_castpd_ps(x);
            }

            EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
            {
                return _mm_castpd_si128(x);
            }
        };
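
        // Illustrative sketch (not from the original source): reinterpreter
        // performs a pure bit cast, so a value survives a round trip through
        // to_int()/to_float() exactly. The function name is hypothetical.
        inline void example_reinterpreter()
        {
            Packet4f f = _mm_set1_ps(1.5f);
            Packet4i bits = reinterpreter<Packet4f>{}.to_int(f);   // raw IEEE-754 bits, not (int)1.5
            Packet4f back = reinterpreter<Packet4i>{}.to_float(bits);
            (void)back; // back == [1.5, 1.5, 1.5, 1.5]
        }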

        // Split a 128-bit integer packet into its two 64-bit halves.
        template<>
        EIGEN_STRONG_INLINE void split_two<Packet4i>(const Packet4i& x, uint64_t& a, uint64_t& b)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            a = _mm_extract_epi64(x, 0);
            b = _mm_extract_epi64(x, 1);
#else
            uint64_t u[2];
            _mm_storeu_si128((__m128i*)u, x);
            a = u[0];
            b = u[1];
#endif
        }
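
        // Illustrative sketch (not from the original source): how split_two
        // decomposes a packet; the function name is hypothetical.
        inline void example_split_two()
        {
            Packet4i x = _mm_setr_epi32(0, 1, 2, 3); // two little-endian 64-bit lanes
            uint64_t lo, hi;
            split_two<Packet4i>(x, lo, hi);
            (void)lo; (void)hi; // lo == (1ull << 32) | 0, hi == (3ull << 32) | 2
        }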

        // Combine the low 32 bits of each 64-bit lane of a and b,
        // producing [a0, a2, b0, b2].
        EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet4i& a, const Packet4i& b)
        {
            auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0)); // [a0, a2, a1, a3]
            auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1)); // [b1, b3, b0, b2]
            sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));    // keep [a0, a2, 0, 0]
            sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));    // keep [0, 0, b0, b2]
            return _mm_or_si128(sa, sb);
        }
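
        // Illustrative sketch (not from the original source):
        inline void example_combine_low32()
        {
            Packet4i a = _mm_setr_epi32(1, 2, 3, 4);
            Packet4i b = _mm_setr_epi32(5, 6, 7, 8);
            Packet4i r = combine_low32(a, b);
            (void)r; // r == [1, 3, 5, 7]
        }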

        // Broadcast a 64-bit value to both lanes.
        template<>
        EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
        {
            return _mm_set1_epi64x(a);
        }

        // 64-bit lane-wise addition and subtraction.
        template<>
        EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return _mm_add_epi64(a, b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return _mm_sub_epi64(a, b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return _mm_cmpeq_epi32(a, b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pcmpeq<Packet4f>(const Packet4f& a, const Packet4f& b)
        {
            return _mm_cmpeq_ps(a, b);
        }

        // Compile-time bit shifts. srl additionally accepts a runtime count:
        // instantiating it with a negative b makes the shift amount come from
        // the runtime argument _b instead of the template parameter.
        template<>
        struct BitShifter<Packet4i>
        {
            template<int b>
            EIGEN_STRONG_INLINE Packet4i sll(const Packet4i& a)
            {
                return _mm_slli_epi32(a, b);
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet4i srl(const Packet4i& a, int _b = b)
            {
                if (b >= 0)
                {
                    return _mm_srli_epi32(a, b);
                }
                else
                {
                    return _mm_srli_epi32(a, _b);
                }
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet4i sll64(const Packet4i& a)
            {
                return _mm_slli_epi64(a, b);
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet4i srl64(const Packet4i& a)
            {
                return _mm_srli_epi64(a, b);
            }
        };
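
        // Illustrative sketch (not from the original source): typical use of
        // BitShifter with compile-time shift counts; the name is hypothetical.
        inline void example_bitshifter()
        {
            BitShifter<Packet4i> sh;
            Packet4i x = _mm_set1_epi32(0xF0);
            Packet4i l = sh.sll<4>(x); // each 32-bit lane << 4 -> 0xF00
            Packet4i r = sh.srl<4>(x); // each 32-bit lane >> 4 -> 0x0F (logical shift)
            (void)l; (void)r;
        }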

        // Lane-wise comparisons; each result lane is all-ones or all-zeros.
        template<>
        EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return _mm_cmplt_epi32(a, b);
        }

        // 64-bit signed less-than: SSE has no such instruction before
        // SSE4.2, so the fallback compares the scalar halves.
        template<>
        EIGEN_STRONG_INLINE Packet4i pcmplt64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
#ifdef EIGEN_VECTORIZE_SSE4_2
            return _mm_cmpgt_epi64(b, a); // a < b  <=>  b > a
#else
            int64_t u[2], v[2];
            _mm_storeu_si128((__m128i*)u, a);
            _mm_storeu_si128((__m128i*)v, b);
            return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0);
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
        {
            return _mm_cmplt_ps(a, b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
        {
            return _mm_cmple_ps(a, b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
        {
            return _mm_cmplt_pd(a, b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
        {
            return _mm_cmple_pd(a, b);
        }

        // Lane-wise select: ifPacket is expected to be a full comparison
        // mask (all-ones or all-zeros per lane), as produced by pcmp*.
        template<>
        EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
#else
            // (if & then) | (~if & else)
            return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
        {
            return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
#else
            return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
#else
            return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
        {
            return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
        }
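
        // Illustrative sketch (not from the original source): the usual
        // compare-then-select pattern built from pcmplt and pblendv.
        inline void example_pblendv_select()
        {
            Packet4f x = _mm_setr_ps(-1.f, 2.f, -3.f, 4.f);
            Packet4f mask = pcmplt<Packet4f>(x, _mm_setzero_ps()); // all-ones where x < 0
            Packet4f r = pblendv(mask, _mm_setzero_ps(), x);       // clamp negatives to 0
            (void)r; // r == [0, 2, 0, 4]
        }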

        // Gather 32-bit ints/floats (and pairs of doubles) from addr at the
        // 32-bit indices in index. Without AVX2 the gather is emulated with
        // scalar loads.
        template<>
        EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm_i32gather_epi32(addr, index, 4);
#else
            uint32_t u[4];
            _mm_storeu_si128((__m128i*)u, index);
            return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(const float* addr, const Packet4i& index)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm_i32gather_ps(addr, index, 4);
#else
            uint32_t u[4];
            _mm_storeu_si128((__m128i*)u, index);
            return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(const double* addr, const Packet4i& index, bool upperhalf)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            // Use the upper pair of 32-bit indices when upperhalf is set, so
            // this branch agrees with the scalar fallback below.
            return _mm_i32gather_pd(addr, upperhalf ? _mm_unpackhi_epi64(index, index) : index, 8);
#else
            uint32_t u[4];
            _mm_storeu_si128((__m128i*)u, index);
            if (upperhalf)
            {
                return _mm_setr_pd(addr[u[2]], addr[u[3]]);
            }
            else
            {
                return _mm_setr_pd(addr[u[0]], addr[u[1]]);
            }
#endif
        }
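
        // Illustrative sketch (not from the original source): gathering from
        // a lookup table with pgather; table and indices are sample values.
        inline void example_pgather()
        {
            static const int table[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
            Packet4i idx = _mm_setr_epi32(3, 0, 7, 2);
            Packet4i r = pgather<Packet4i>(table, idx);
            (void)r; // r == [13, 10, 17, 12]
        }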

        // Collect the sign bit of each lane into the low bits of an int.
        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
        {
            return _mm_movemask_ps(a);
        }

        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet2d>(const Packet2d& a)
        {
            return _mm_movemask_pd(a);
        }

        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet4i>(const Packet4i& a)
        {
            return pmovemask(_mm_castsi128_ps(a));
        }

        // Round toward zero. Without SSE4.1's _mm_round_*, the MXCSR
        // rounding mode is temporarily switched for the conversion.
        template<>
        EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
            auto round = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
            auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
            _MM_SET_ROUNDING_MODE(round);
            return ret;
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(const Packet2d& a)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
#else
            auto round = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
            auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
            _MM_SET_ROUNDING_MODE(round);
            return ret;
#endif
        }
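
        // Illustrative sketch (not from the original source): ptruncate
        // rounds toward zero, unlike floor or round-to-nearest.
        inline void example_ptruncate()
        {
            Packet4f r = ptruncate<Packet4f>(_mm_setr_ps(1.7f, -1.7f, 2.5f, -0.3f));
            (void)r; // r == [1, -1, 2, -0]
        }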

        // 64-bit equality: compare the 32-bit halves, then AND each half
        // with its swapped partner so a 64-bit lane is all-ones only when
        // both halves matched.
        template<>
        EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
#ifdef EIGEN_VECTORIZE_SSE4_1
            return _mm_cmpeq_epi64(a, b);
#else
            Packet4i c = _mm_cmpeq_epi32(a, b);
            return pand(c, _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
#endif
        }

        // a * b + c on each 64-bit lane; SSE2 has no 64-bit integer
        // multiply, so this goes through scalar arithmetic.
        template<>
        EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(const Packet4i& a, uint64_t b, uint64_t c)
        {
            uint64_t u[2];
            _mm_storeu_si128((__m128i*)u, a);
            u[0] = u[0] * b + c;
            u[1] = u[1] * b + c;
            return _mm_loadu_si128((__m128i*)u);
        }
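
        // Illustrative sketch (not from the original source): pmuluadd64 is
        // the shape of an LCG-style state update. The multiplier and
        // increment below are sample values (Knuth's MMIX constants), not
        // EigenRand's actual engine parameters.
        inline Packet4i example_lcg_step(const Packet4i& state)
        {
            return pmuluadd64<Packet4i>(state, 6364136223846793005ull, 1442695040888963407ull);
        }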

        // uint64 -> double for values < 2^52, via the magic-number trick:
        // OR-ing in the exponent bits of 2^52 puts the integer in the
        // mantissa, and subtracting 2^52 leaves the intended value.
        EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) {
            x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
            return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
        }

        // int64 -> double for |value| < 2^51; the magic constant is
        // 2^52 + 2^51 so that negative inputs remain representable.
        EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) {
            x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
            return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
        }

        // double -> int64 for |value| < 2^51, inverting the trick above;
        // rounding is forced downward while the magic constant is added.
        EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) {
            int _mm_rounding = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
            x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
            _MM_SET_ROUNDING_MODE(_mm_rounding);
            return _mm_sub_epi64(
                _mm_castpd_si128(x),
                _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
            );
        }
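
        // Illustrative sketch (not from the original source): the helpers
        // round-trip any integer with |value| < 2^51.
        inline void example_int64_double_roundtrip()
        {
            Packet4i v = _mm_set1_epi64x(-12345);
            Packet2d d = int64_to_double(v);     // [-12345.0, -12345.0]
            Packet4i back = double_to_int64(d);  // [-12345, -12345]
            (void)back;
        }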

        template<>
        EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(const Packet2d& a)
        {
            return double_to_int64(a);
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(const Packet4i& a)
        {
            return int64_to_double(a);
        }

        template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
        Packet2d psin<Packet2d>(const Packet2d& x)
        {
            return _psin(x);
        }

        // predux_all: true iff every lane of the comparison mask is set.
        template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
        {
            return _mm_movemask_ps(x) == 0x0F;
        }

        template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4i& x)
        {
            return predux_all(_mm_castsi128_ps(x));
        }
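
        // Illustrative sketch (not from the original source): combining a
        // comparison with predux_all to test a whole packet at once.
        inline void example_predux_all()
        {
            Packet4f x = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
            bool all_positive = predux_all(pcmplt<Packet4f>(_mm_setzero_ps(), x));
            (void)all_positive; // true: every lane of the mask is set
        }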

#ifdef EIGENRAND_EIGEN_33_MODE

        // predux_any: true iff at least one lane is set. Only needed in
        // Eigen 3.3 mode; Eigen 3.4 ships its own predux_any.
        template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
        {
            return !!_mm_movemask_ps(x);
        }

        template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
        {
            return predux_any(_mm_castsi128_ps(x));
        }

        // Natural logarithm of two doubles at once, using the cephes
        // polynomial approximation. Returns NaN for x < 0 and -inf for x == 0.
        template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
        Packet2d plog<Packet2d>(const Packet2d& _x)
        {
            Packet2d x = _x;
            _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
            _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
            _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); // unused

            auto inv_mant_mask = _mm_castsi128_pd(pseti64<Packet4i>(~0x7ff0000000000000));
            auto min_norm_pos = _mm_castsi128_pd(pseti64<Packet4i>(0x10000000000000));
            auto minus_inf = _mm_castsi128_pd(pseti64<Packet4i>(0xfff0000000000000));

            // polynomial coefficients from the cephes library
            _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4);
            _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375);

            Packet4i emm0;

            // not-greater-equal is also true when x is NaN
            Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd());
            Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd());

            x = pmax(x, min_norm_pos); // cut off denormalized values
            emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52); // biased exponent

            // keep only the fractional part
            x = _mm_and_pd(x, inv_mant_mask);
            x = _mm_or_pd(x, p2d_half);

            // exponent, re-biased for a mantissa in [0.5, 1)
            Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1<Packet2d>(1022));

            /* part2:
               if( x < SQRTHF ) {
                 e -= 1;
                 x = x + x - 1.0;
               } else { x = x - 1.0; }
            */
            Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF);
            Packet2d tmp = pand(x, mask);
            x = psub(x, p2d_1);
            e = psub(e, pand(p2d_1, mask));
            x = padd(x, tmp);

            Packet2d x2 = pmul(x, x);
            Packet2d x3 = pmul(x2, x);

            Packet2d y, y1, y2;
            y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1);
            y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4);
            y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7);
            y = pmadd(y, x, p2d_cephes_log_p2);
            y1 = pmadd(y1, x, p2d_cephes_log_p5);
            y2 = pmadd(y2, x, p2d_cephes_log_p8);
            y = pmadd(y, x3, y1);
            y = pmadd(y, x3, y2);
            y = pmul(y, x3);

            y1 = pmul(e, p2d_cephes_log_q1);
            tmp = pmul(x2, p2d_half);
            y = padd(y, y1);
            x = psub(x, tmp);
            y2 = pmul(e, p2d_cephes_log_q2);
            x = padd(x, y);
            x = padd(x, y2);
            // negative args become NaN, zero becomes -inf
            return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask));
        }
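
        // Illustrative sketch (not from the original source): the special
        // cases handled by plog above.
        inline void example_plog_special_cases()
        {
            Packet2d r1 = plog<Packet2d>(pset1<Packet2d>(1.0));  // [0, 0]
            Packet2d r0 = plog<Packet2d>(_mm_setzero_pd());      // [-inf, -inf]
            Packet2d rn = plog<Packet2d>(pset1<Packet2d>(-1.0)); // [NaN, NaN]
            (void)r1; (void)r0; (void)rn;
        }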
#endif
    }
}

#endif