EigenRand 0.5.0
 
arch/AVX/MorePacketMath.h
#ifndef EIGENRAND_MORE_PACKET_MATH_AVX_H
#define EIGENRAND_MORE_PACKET_MATH_AVX_H

#include <immintrin.h>

namespace Eigen
{
    namespace internal
    {
        template<>
        struct IsIntPacket<Packet8i> : std::true_type {};

        template<>
        struct HalfPacket<Packet8i>
        {
            using type = Packet4i;
        };

        template<>
        struct HalfPacket<Packet8f>
        {
            using type = Packet4f;
        };

        template<>
        struct IsFloatPacket<Packet8f> : std::true_type {};

        template<>
        struct IsDoublePacket<Packet4d> : std::true_type {};

        template<>
        struct reinterpreter<Packet8i>
        {
            EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x)
            {
                return _mm256_castsi256_ps(x);
            }

            EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
            {
                return _mm256_castsi256_pd(x);
            }

            EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x)
            {
                return x;
            }
        };

        template<>
        struct reinterpreter<Packet8f>
        {
            EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x)
            {
                return _mm256_castps_pd(x);
            }

            EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x)
            {
                return _mm256_castps_si256(x);
            }
        };

        template<>
        struct reinterpreter<Packet4d>
        {
            EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
            {
                return _mm256_castpd_ps(x);
            }

            EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x)
            {
                return _mm256_castpd_si256(x);
            }
        };

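        // On targets with AVX1 but not AVX2, 256-bit integer arithmetic is not
        // available in hardware. The helpers below split a 256-bit packet into
        // two 128-bit SSE halves (split_two), operate on each half, and stitch
        // the results back together (combine_two).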
        template<>
        EIGEN_STRONG_INLINE void split_two<Packet8i>(const Packet8i& x, Packet4i& a, Packet4i& b)
        {
            a = _mm256_extractf128_si256(x, 0);
            b = _mm256_extractf128_si256(x, 1);
        }

        EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b)
        {
            return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
        }

        template<>
        EIGEN_STRONG_INLINE void split_two<Packet8f>(const Packet8f& x, Packet4f& a, Packet4f& b)
        {
            a = _mm256_extractf128_ps(x, 0);
            b = _mm256_extractf128_ps(x, 1);
        }

        EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
        {
            return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
        }

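        // Collects the low 32 bits of each of the four 64-bit elements of `a`
        // into a single Packet4i. With AVX2 this is one cross-lane permute;
        // the AVX1 fallback permutes within each 128-bit lane and then blends
        // the two halves.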
        EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
#else
            auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
            return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
#endif
        }

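        // 64-bit lane operations: here a Packet8i is reinterpreted as four
        // 64-bit lanes. pseti64 broadcasts one 64-bit value to all four lanes;
        // the add/sub variants need AVX2 (_mm256_add_epi64/_mm256_sub_epi64)
        // and otherwise fall back to the split/combine emulation above.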
        template<>
        EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
        {
            return _mm256_set1_epi64x(a);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_add_epi64(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_sub_epi64(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpeq_epi32(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f pcmpeq<Packet8f>(const Packet8f& a, const Packet8f& b)
        {
            return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pnegate<Packet8i>(const Packet8i& a)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_sub_epi32(pset1<Packet8i>(0), a);
#else
            Packet4i a1, a2;
            split_two(a, a1, a2);
            return combine_two(_mm_sub_epi32(pset1<Packet4i>(0), a1), _mm_sub_epi32(pset1<Packet4i>(0), a2));
#endif
        }

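        // Bit-shift helper with the shift amount as a template parameter,
        // presumably so that the shift-by-immediate instruction forms can be
        // used whenever the amount is known at compile time.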
        template<>
        struct BitShifter<Packet8i>
        {
            template<int b>
            EIGEN_STRONG_INLINE Packet8i sll(const Packet8i& a)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                return _mm256_slli_epi32(a, b);
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
#endif
            }

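            // A negative template argument `b` signals that the actual shift
            // amount is only known at run time and is passed via `_b`;
            // otherwise the compile-time `b` is used directly.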
            template<int b>
            EIGEN_STRONG_INLINE Packet8i srl(const Packet8i& a, int _b = b)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                if (b >= 0)
                {
                    return _mm256_srli_epi32(a, b);
                }
                else
                {
                    return _mm256_srli_epi32(a, _b);
                }
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                if (b >= 0)
                {
                    return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
                }
                else
                {
                    return combine_two((Packet4i)_mm_srli_epi32(a1, _b), (Packet4i)_mm_srli_epi32(a2, _b));
                }
#endif
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet8i sll64(const Packet8i& a)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                return _mm256_slli_epi64(a, b);
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
#endif
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet8i srl64(const Packet8i& a)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                return _mm256_srli_epi64(a, b);
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
#endif
            }
        };
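
        // Integer packet primitives that are only needed when building in
        // Eigen 3.3 compatibility mode (newer Eigen versions appear to define
        // these specializations themselves).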
#ifdef EIGENRAND_EIGEN_33_MODE
        template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_add_epi32(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
#endif
        }

        template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_sub_epi32(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
#endif
        }

        template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_and_si256(a, b);
#else
            return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
        }

        template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_andnot_si256(a, b);
#else
            return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
        }

        template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_or_si256(a, b);
#else
            return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
        }

        template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_xor_si256(a, b);
#else
            return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
        }

        template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
        {
            return !!_mm256_movemask_ps(x);
        }

        template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x)
        {
            return predux_any(_mm256_castsi256_ps(x));
        }
#endif

        template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
        {
            return _mm256_movemask_ps(x) == 0xFF;
        }

        template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8i& x)
        {
            return predux_all(_mm256_castsi256_ps(x));
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpgt_epi32(b, a);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpgt_epi64(b, a);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
        {
            return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
        {
            return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
        {
            return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
        {
            return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
        }

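        // Lane-wise select: for each lane, returns thenPacket where the mask
        // in ifPacket is set (all-ones, i.e. sign bit set) and elsePacket
        // otherwise. Note that _mm256_blendv_ps/_pd take their arguments in
        // (else, then, mask) order.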
        template<>
        EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
        {
            return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
        {
            return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
        {
            return _mm256_castps_si256(_mm256_blendv_ps(
                _mm256_castsi256_ps(elsePacket),
                _mm256_castsi256_ps(thenPacket),
                _mm256_castsi256_ps(ifPacket)
            ));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
        {
            return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
        {
            return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
        }

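        // Indexed loads. With AVX2 these map to hardware gather instructions;
        // the AVX1 fallback spills the indices to memory and performs scalar
        // loads instead. In the double variant's fallback path, `upperhalf`
        // selects which four of the eight 32-bit indices are used.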
        template<>
        EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_i32gather_epi32(addr, index, 4);
#else
            uint32_t u[8];
            _mm256_storeu_si256((Packet8i*)u, index);
            return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
                addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float* addr, const Packet8i& index)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_i32gather_ps(addr, index, 4);
#else
            uint32_t u[8];
            _mm256_storeu_si256((Packet8i*)u, index);
            return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
                addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
#endif
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double* addr, const Packet8i& index, bool upperhalf)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
#else
            uint32_t u[8];
            _mm256_storeu_si256((Packet8i*)u, index);
            if (upperhalf)
            {
                return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
            }
            else
            {
                return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
            }
#endif
        }

        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet8f>(const Packet8f& a)
        {
            return _mm256_movemask_ps(a);
        }

        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet4d>(const Packet4d& a)
        {
            return _mm256_movemask_pd(a);
        }

        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet8i>(const Packet8i& a)
        {
            return pmovemask(_mm256_castsi256_ps(a));
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(const Packet8f& a)
        {
            return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(const Packet4d& a)
        {
            return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpeq_epi64(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
#endif
        }

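        // Computes lane * b + c for each 64-bit lane. There is no full 64x64-bit
        // vector multiply even in AVX2 (that requires AVX-512), so the lanes are
        // spilled to memory and processed with scalar arithmetic.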
        template<>
        EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(const Packet8i& a, uint64_t b, uint64_t c)
        {
            uint64_t u[4];
            _mm256_storeu_si256((__m256i*)u, a);
            u[0] = u[0] * b + c;
            u[1] = u[1] * b + c;
            u[2] = u[2] * b + c;
            u[3] = u[3] * b + c;
            return _mm256_loadu_si256((__m256i*)u);
        }

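        // int64 <-> double conversions via the classic "magic number" trick.
        // _mm256_set1_pd(0x0010000000000000) is the double 2^52, whose bit
        // pattern has exponent 52 and a zero mantissa; OR-ing a 52-bit unsigned
        // integer n into the mantissa yields the double 2^52 + n, and
        // subtracting 2^52 recovers n exactly. The signed variant uses
        // 2^52 + 2^51 (0x0018000000000000 as an integer bit count) so that
        // adding the constant as int64, reinterpreting as double, and
        // subtracting it again round-trips any |n| < 2^51.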
        EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
            auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
            return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
        }

        EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
            x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
            return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
        }

        EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
            x = _mm256_add_pd(_mm256_floor_pd(x), _mm256_set1_pd(0x0018000000000000));
            return psub64(
                _mm256_castpd_si256(x),
                _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
            );
        }

        template<>
        EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(const Packet4d& a)
        {
            return double_to_int64(a);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(const Packet8i& a)
        {
            return int64_to_double(a);
        }

        template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
        Packet4d psin<Packet4d>(const Packet4d& x)
        {
            return _psin(x);
        }

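        // Double-precision plog for Eigen 3.3 mode, following the classic
        // cephes log(): split the input into a mantissa in [0.5, 1) and an
        // exponent, evaluate a degree-8 polynomial on the shifted mantissa,
        // then add e * log(2) back in two parts (q1 + q2) for precision.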
#ifdef EIGENRAND_EIGEN_33_MODE
        template <>
        EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
        plog<Packet4d>(const Packet4d& _x) {
            Packet4d x = _x;
            _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
            _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);

            auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
            auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
            auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));

            // Polynomial coefficients.
            _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);

            Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not-greater-equal is true if x is NaN
            Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);

            // Truncate input values to the minimum positive normal.
            x = pmax(x, min_norm_pos);

            Packet4d emm0 = uint64_to_double(psrl64<52>(_mm256_castpd_si256(x)));
            Packet4d e = psub(emm0, pset1<Packet4d>(1022));

            // Set the exponent to -1, i.e. x is in the range [0.5, 1).
            x = _mm256_and_pd(x, inv_mant_mask);
            x = _mm256_or_pd(x, p4d_half);

            // part2: Shift the inputs from the range [0.5, 1) to [sqrt(1/2), sqrt(2))
            // and shift by -1. The values are then centered around 0, which improves
            // the stability of the polynomial evaluation.
            //   if (x < SQRTHF) {
            //     e -= 1;
            //     x = x + x - 1.0;
            //   } else { x = x - 1.0; }
            Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
            Packet4d tmp = _mm256_and_pd(x, mask);
            x = psub(x, p4d_1);
            e = psub(e, _mm256_and_pd(p4d_1, mask));
            x = padd(x, tmp);

            Packet4d x2 = pmul(x, x);
            Packet4d x3 = pmul(x2, x);

            // Evaluate the polynomial approximant of degree 8 in three parts,
            // probably to improve instruction-level parallelism.
            Packet4d y, y1, y2;
            y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
            y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
            y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
            y = pmadd(y, x, p4d_cephes_log_p2);
            y1 = pmadd(y1, x, p4d_cephes_log_p5);
            y2 = pmadd(y2, x, p4d_cephes_log_p8);
            y = pmadd(y, x3, y1);
            y = pmadd(y, x3, y2);
            y = pmul(y, x3);

            // Add the logarithm of the exponent back to the result of the interpolation.
            y1 = pmul(e, p4d_cephes_log_q1);
            tmp = pmul(x2, p4d_half);
            y = padd(y, y1);
            x = psub(x, tmp);
            y2 = pmul(e, p4d_cephes_log_q2);
            x = padd(x, y);
            x = padd(x, y2);

            // Filter out invalid inputs: negative arguments yield NaN, 0 yields -inf.
            return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
        }
#endif

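        // Eigen versions before 3.3.5 do not ship these basic SSE float/int
        // pcast specializations, so they are supplied here.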
#if !(EIGEN_VERSION_AT_LEAST(3,3,5))
        template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
            return _mm_cvtepi32_ps(a);
        }

        template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
            return _mm_cvttps_epi32(a);
        }
#endif
    }
}

#endif