12 #ifndef EIGENRAND_MORE_PACKET_MATH_H
13 #define EIGENRAND_MORE_PACKET_MATH_H
15 #include <Eigen/Dense>
22 struct IsIntPacket : std::false_type {};
25 struct IsFloatPacket : std::false_type {};
28 struct IsDoublePacket : std::false_type {};
33 #ifdef EIGEN_VECTORIZE_AVX2
35 struct IsIntPacket<Packet8i> : std::true_type {};
38 struct HalfPacket<Packet8i>
40 using type = Packet4i;
43 #ifdef EIGEN_VECTORIZE_AVX
45 struct IsFloatPacket<Packet8f> : std::true_type {};
48 struct IsDoublePacket<Packet4d> : std::true_type {};
50 #ifdef EIGEN_VECTORIZE_SSE2
52 struct IsIntPacket<Packet4i> : std::true_type {};
55 struct IsFloatPacket<Packet4f> : std::true_type {};
58 struct IsDoublePacket<Packet2d> : std::true_type {};
61 struct HalfPacket<Packet4i>
63 using type = uint64_t;
66 template<
typename Packet>
71 template<
typename Packet>
72 inline auto reinterpret_to_float(
const Packet& x)
73 -> decltype(reinterpreter<Packet>{}.to_float(x))
75 return reinterpreter<Packet>{}.to_float(x);
78 template<
typename Packet>
79 inline auto reinterpret_to_double(
const Packet& x)
80 -> decltype(reinterpreter<Packet>{}.to_double(x))
82 return reinterpreter<Packet>{}.to_double(x);
85 template<
typename Packet>
86 inline auto reinterpret_to_int(
const Packet& x)
87 -> decltype(reinterpreter<Packet>{}.to_int(x))
89 return reinterpreter<Packet>{}.to_int(x);
92 template<
typename Packet>
93 EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
95 template<
typename Packet>
96 EIGEN_STRONG_INLINE Packet padd64(
const Packet& a,
const Packet& b);
98 template<
typename Packet>
99 EIGEN_STRONG_INLINE Packet psub64(
const Packet& a,
const Packet& b);
101 template <
typename SrcPacket,
typename TgtPacket>
102 EIGEN_STRONG_INLINE TgtPacket pcast64(
const SrcPacket& a);
104 template<
typename Packet>
105 EIGEN_STRONG_INLINE Packet pcmpeq(
const Packet& a,
const Packet& b);
107 template<
typename Packet>
108 EIGEN_STRONG_INLINE Packet psll(
const Packet& a,
int b);
110 template<
typename Packet>
111 EIGEN_STRONG_INLINE Packet psrl(
const Packet& a,
int b);
113 template<
typename Packet>
114 EIGEN_STRONG_INLINE Packet psll64(
const Packet& a,
int b);
116 template<
typename Packet>
117 EIGEN_STRONG_INLINE Packet psrl64(
const Packet& a,
int b);
119 template<
typename Packet>
120 EIGEN_STRONG_INLINE
int pmovemask(
const Packet& a);
122 template<
typename Packet>
123 EIGEN_STRONG_INLINE
typename std::enable_if<
124 IsFloatPacket<Packet>::value, Packet
125 >::type pext_sign(
const Packet& a)
127 using IntPacket = decltype(reinterpret_to_int(a));
128 return reinterpret_to_float(
129 pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
133 template<
typename Packet>
134 EIGEN_STRONG_INLINE
typename std::enable_if<
135 IsDoublePacket<Packet>::value, Packet
136 >::type pext_sign(
const Packet& a)
138 using IntPacket = decltype(reinterpret_to_int(a));
139 return reinterpret_to_double(
140 pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
145 EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(
const uint64_t& a,
int b)
151 EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(
const uint64_t& a,
int b)
157 template<
typename Packet>
158 EIGEN_STRONG_INLINE Packet plgamma_approx(
const Packet& x)
160 auto x_3 = padd(x, pset1<Packet>(3));
161 auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
162 ret = psub(ret, x_3);
163 ret = padd(ret, pset1<Packet>(0.9189385332046727));
164 ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
165 ret = psub(ret, plog(pmul(
166 pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
170 template<
typename Packet>
171 EIGEN_STRONG_INLINE Packet pcmplt(
const Packet& a,
const Packet& b);
173 template<
typename Packet>
174 EIGEN_STRONG_INLINE Packet pcmple(
const Packet& a,
const Packet& b);
176 template<
typename PacketIf,
typename Packet>
177 EIGEN_STRONG_INLINE Packet pblendv(
const PacketIf& ifPacket,
const Packet& thenPacket,
const Packet& elsePacket);
179 template<
typename Packet>
180 EIGEN_STRONG_INLINE Packet pgather(
const int* addr,
const Packet& index);
182 template<
typename Packet>
183 EIGEN_STRONG_INLINE
auto pgather(
const float* addr,
const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
185 template<
typename Packet>
186 EIGEN_STRONG_INLINE
auto pgather(
const double* addr,
const Packet& index,
bool upperhalf =
false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
188 template<
typename Packet>
189 EIGEN_STRONG_INLINE Packet ptruncate(
const Packet& a);
191 template<
typename Packet>
192 EIGEN_STRONG_INLINE Packet pcmpeq64(
const Packet& a,
const Packet& b);
194 template<
typename Packet>
195 EIGEN_STRONG_INLINE Packet pcmplt64(
const Packet& a,
const Packet& b);
197 template<
typename Packet>
198 EIGEN_STRONG_INLINE Packet pmuluadd64(
const Packet& a, uint64_t b, uint64_t c);
200 template<
typename IntPacket>
201 EIGEN_STRONG_INLINE
auto bit_to_ur_float(
const IntPacket& x) -> decltype(reinterpret_to_float(x))
203 using FloatPacket = decltype(reinterpret_to_float(x));
205 const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
206 upper = pset1<IntPacket>(127 << 23);
207 const FloatPacket one = pset1<FloatPacket>(1);
209 return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
212 template<
typename IntPacket>
213 EIGEN_STRONG_INLINE
auto bit_to_ur_double(
const IntPacket& x) -> decltype(reinterpret_to_double(x))
215 using DoublePacket = decltype(reinterpret_to_double(x));
217 const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
218 upper = pseti64<IntPacket>(1023ull << 52);
219 const DoublePacket one = pset1<DoublePacket>(1);
221 return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
224 template<
typename _Scalar>
228 struct bit_scalar<float>
230 float to_ur(uint32_t x)
237 u = (x & 0x7FFFFF) | (127 << 23);
241 float to_nzur(uint32_t x)
243 return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
248 struct bit_scalar<double>
250 double to_ur(uint64_t x)
257 u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
261 double to_nzur(uint64_t x)
263 return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
273 EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
275 bit_scalar<float> bs;
277 ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
278 ret.f[1] = bs.to_ur(x >> 32);
282 template<
typename Packet>
283 EIGEN_STRONG_INLINE
typename std::enable_if<
284 IsFloatPacket<Packet>::value
285 >::type psincos(Packet x, Packet& s, Packet& c)
287 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
288 using IntPacket = decltype(reinterpret_to_int(x));
289 IntPacket emm0, emm2, emm4;
295 sign_bit_sin = pext_sign(sign_bit_sin);
298 y = pmul(x, pset1<Packet>(1.27323954473516));
301 emm2 = pcast<Packet, IntPacket>(y);
304 emm2 = padd(emm2, pset1<IntPacket>(1));
305 emm2 = pand(emm2, pset1<IntPacket>(~1));
306 y = pcast<IntPacket, Packet>(emm2);
311 emm0 = pand(emm2, pset1<IntPacket>(4));
312 emm0 = psll(emm0, 29);
313 Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
316 emm2 = pand(emm2, pset1<IntPacket>(2));
318 emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
319 Packet poly_mask = reinterpret_to_float(emm2);
323 xmm1 = pset1<Packet>(-0.78515625);
324 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
325 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
326 xmm1 = pmul(y, xmm1);
327 xmm2 = pmul(y, xmm2);
328 xmm3 = pmul(y, xmm3);
333 emm4 = psub(emm4, pset1<IntPacket>(2));
334 emm4 = pandnot(emm4, pset1<IntPacket>(4));
335 emm4 = psll(emm4, 29);
336 Packet sign_bit_cos = reinterpret_to_float(emm4);
337 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
341 Packet z = pmul(x, x);
342 y = pset1<Packet>(2.443315711809948E-005);
345 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
347 y = padd(y, pset1<Packet>(4.166664568298827E-002));
350 Packet tmp = pmul(z, pset1<Packet>(0.5));
352 y = padd(y, pset1<Packet>(1));
356 Packet y2 = pset1<Packet>(-1.9515295891E-4);
358 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
360 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
367 Packet ysin2 = pand(xmm3, y2);
368 Packet ysin1 = pandnot(xmm3, y);
369 y2 = psub(y2, ysin2);
372 xmm1 = padd(ysin1, ysin2);
376 s = pxor(xmm1, sign_bit_sin);
377 c = pxor(xmm2, sign_bit_cos);
380 template<
typename Packet>
381 EIGEN_STRONG_INLINE
typename std::enable_if<
382 IsDoublePacket<Packet>::value
383 >::type psincos(Packet x, Packet& s, Packet& c)
385 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
386 using IntPacket = decltype(reinterpret_to_int(x));
387 IntPacket emm0, emm2, emm4;
393 sign_bit_sin = pext_sign(sign_bit_sin);
396 y = pmul(x, pset1<Packet>(1.27323954473516));
399 emm2 = pcast64<Packet, IntPacket>(y);
402 emm2 = padd64(emm2, pseti64<IntPacket>(1));
403 emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
404 y = pcast64<IntPacket, Packet>(emm2);
409 emm0 = pand(emm2, pseti64<IntPacket>(4));
410 emm0 = psll64(emm0, 61);
411 Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
414 emm2 = pand(emm2, pseti64<IntPacket>(2));
416 emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
417 Packet poly_mask = reinterpret_to_double(emm2);
421 xmm1 = pset1<Packet>(-0.78515625);
422 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
423 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
424 xmm1 = pmul(y, xmm1);
425 xmm2 = pmul(y, xmm2);
426 xmm3 = pmul(y, xmm3);
431 emm4 = psub64(emm4, pseti64<IntPacket>(2));
432 emm4 = pandnot(emm4, pseti64<IntPacket>(4));
433 emm4 = psll64(emm4, 61);
434 Packet sign_bit_cos = reinterpret_to_double(emm4);
435 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
439 Packet z = pmul(x, x);
440 y = pset1<Packet>(2.443315711809948E-005);
443 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
445 y = padd(y, pset1<Packet>(4.166664568298827E-002));
448 Packet tmp = pmul(z, pset1<Packet>(0.5));
450 y = padd(y, pset1<Packet>(1));
454 Packet y2 = pset1<Packet>(-1.9515295891E-4);
456 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
458 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
465 Packet ysin2 = pand(xmm3, y2);
466 Packet ysin1 = pandnot(xmm3, y);
467 y2 = psub(y2, ysin2);
470 xmm1 = padd(ysin1, ysin2);
474 s = pxor(xmm1, sign_bit_sin);
475 c = pxor(xmm2, sign_bit_cos);
478 template<
typename Packet>
479 EIGEN_STRONG_INLINE
typename std::enable_if<
480 IsDoublePacket<Packet>::value, Packet
481 >::type _psin(Packet x)
483 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
484 using IntPacket = decltype(reinterpret_to_int(x));
485 IntPacket emm0, emm2;
491 sign_bit_sin = pext_sign(sign_bit_sin);
494 y = pmul(x, pset1<Packet>(1.27323954473516));
497 emm2 = pcast64<Packet, IntPacket>(y);
500 emm2 = padd64(emm2, pseti64<IntPacket>(1));
501 emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
502 y = pcast64<IntPacket, Packet>(emm2);
505 emm0 = pand(emm2, pseti64<IntPacket>(4));
506 emm0 = psll64(emm0, 61);
507 Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
510 emm2 = pand(emm2, pseti64<IntPacket>(2));
512 emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
513 Packet poly_mask = reinterpret_to_double(emm2);
517 xmm1 = pset1<Packet>(-0.78515625);
518 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
519 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
520 xmm1 = pmul(y, xmm1);
521 xmm2 = pmul(y, xmm2);
522 xmm3 = pmul(y, xmm3);
527 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
531 Packet z = pmul(x, x);
532 y = pset1<Packet>(2.443315711809948E-005);
535 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
537 y = padd(y, pset1<Packet>(4.166664568298827E-002));
540 Packet tmp = pmul(z, pset1<Packet>(0.5));
542 y = padd(y, pset1<Packet>(1));
546 Packet y2 = pset1<Packet>(-1.9515295891E-4);
548 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
550 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
557 Packet ysin2 = pand(xmm3, y2);
558 Packet ysin1 = pandnot(xmm3, y);
560 xmm1 = padd(ysin1, ysin2);
563 return pxor(xmm1, sign_bit_sin);
568 #ifdef EIGEN_VECTORIZE_AVX
569 #include <immintrin.h>
576 struct reinterpreter<Packet8i>
578 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8i& x)
580 return _mm256_castsi256_ps(x);
583 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8i& x)
585 return _mm256_castsi256_pd(x);
588 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8i& x)
595 struct reinterpreter<Packet8f>
597 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8f& x)
602 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8f& x)
604 return _mm256_castps_pd(x);
607 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8f& x)
609 return _mm256_castps_si256(x);
614 struct reinterpreter<Packet4d>
616 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet4d& x)
618 return _mm256_castpd_ps(x);
621 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet4d& x)
626 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet4d& x)
628 return _mm256_castpd_si256(x);
632 EIGEN_STRONG_INLINE
void split_two(
const Packet8i& x, Packet4i& a, Packet4i& b)
634 a = _mm256_extractf128_si256(x, 0);
635 b = _mm256_extractf128_si256(x, 1);
638 EIGEN_STRONG_INLINE Packet8i combine_two(
const Packet4i& a,
const Packet4i& b)
640 return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
643 EIGEN_STRONG_INLINE
void split_two(
const Packet8f& x, Packet4f& a, Packet4f& b)
645 a = _mm256_extractf128_ps(x, 0);
646 b = _mm256_extractf128_ps(x, 1);
649 EIGEN_STRONG_INLINE Packet8f combine_two(
const Packet4f& a,
const Packet4f& b)
651 return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
655 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet8i& a)
657 #ifdef EIGEN_VECTORIZE_AVX2
658 return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
660 auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
661 return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
666 EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
668 return _mm256_set1_epi64x(a);
672 EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(
const Packet8i& a,
const Packet8i& b)
674 #ifdef EIGEN_VECTORIZE_AVX2
675 return _mm256_add_epi64(a, b);
677 Packet4i a1, a2, b1, b2;
678 split_two(a, a1, a2);
679 split_two(b, b1, b2);
680 return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
685 EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(
const Packet8i& a,
const Packet8i& b)
687 #ifdef EIGEN_VECTORIZE_AVX2
688 return _mm256_sub_epi64(a, b);
690 Packet4i a1, a2, b1, b2;
691 split_two(a, a1, a2);
692 split_two(b, b1, b2);
693 return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
698 EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(
const Packet8i& a,
const Packet8i& b)
700 #ifdef EIGEN_VECTORIZE_AVX2
701 return _mm256_cmpeq_epi32(a, b);
703 Packet4i a1, a2, b1, b2;
704 split_two(a, a1, a2);
705 split_two(b, b1, b2);
706 return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
711 EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(
const Packet8i& a,
int b)
713 #ifdef EIGEN_VECTORIZE_AVX2
714 return _mm256_slli_epi32(a, b);
717 split_two(a, a1, a2);
718 return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
723 EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(
const Packet8i& a,
int b)
725 #ifdef EIGEN_VECTORIZE_AVX2
726 return _mm256_srli_epi32(a, b);
729 split_two(a, a1, a2);
730 return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
735 EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(
const Packet8i& a,
int b)
737 #ifdef EIGEN_VECTORIZE_AVX2
738 return _mm256_slli_epi64(a, b);
741 split_two(a, a1, a2);
742 return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
747 EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(
const Packet8i& a,
int b)
749 #ifdef EIGEN_VECTORIZE_AVX2
750 return _mm256_srli_epi64(a, b);
753 split_two(a, a1, a2);
754 return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
758 template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(
const Packet8i& a,
const Packet8i& b)
760 #ifdef EIGEN_VECTORIZE_AVX2
761 return _mm256_add_epi32(a, b);
763 Packet4i a1, a2, b1, b2;
764 split_two(a, a1, a2);
765 split_two(b, b1, b2);
766 return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
770 template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(
const Packet8i& a,
const Packet8i& b)
772 #ifdef EIGEN_VECTORIZE_AVX2
773 return _mm256_sub_epi32(a, b);
775 Packet4i a1, a2, b1, b2;
776 split_two(a, a1, a2);
777 split_two(b, b1, b2);
778 return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
782 template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(
const Packet8i& a,
const Packet8i& b)
784 #ifdef EIGEN_VECTORIZE_AVX2
785 return _mm256_and_si256(a, b);
787 return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
791 template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(
const Packet8i& a,
const Packet8i& b)
793 #ifdef EIGEN_VECTORIZE_AVX2
794 return _mm256_andnot_si256(a, b);
796 return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
800 template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(
const Packet8i& a,
const Packet8i& b)
802 #ifdef EIGEN_VECTORIZE_AVX2
803 return _mm256_or_si256(a, b);
805 return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
809 template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(
const Packet8i& a,
const Packet8i& b)
811 #ifdef EIGEN_VECTORIZE_AVX2
812 return _mm256_xor_si256(a, b);
814 return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
819 EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(
const Packet8i& a,
const Packet8i& b)
821 #ifdef EIGEN_VECTORIZE_AVX2
822 return _mm256_cmpgt_epi32(b, a);
824 Packet4i a1, a2, b1, b2;
825 split_two(a, a1, a2);
826 split_two(b, b1, b2);
827 return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
832 EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(
const Packet8i& a,
const Packet8i& b)
834 #ifdef EIGEN_VECTORIZE_AVX2
835 return _mm256_cmpgt_epi64(b, a);
837 Packet4i a1, a2, b1, b2;
838 split_two(a, a1, a2);
839 split_two(b, b1, b2);
840 return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
845 EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(
const Packet8f& a,
const Packet8f& b)
847 return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
851 EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(
const Packet8f& a,
const Packet8f& b)
853 return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
857 EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(
const Packet4d& a,
const Packet4d& b)
859 return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
863 EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(
const Packet4d& a,
const Packet4d& b)
865 return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
869 EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8f& ifPacket,
const Packet8f& thenPacket,
const Packet8f& elsePacket)
871 return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
875 EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8i& ifPacket,
const Packet8f& thenPacket,
const Packet8f& elsePacket)
877 return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
881 EIGEN_STRONG_INLINE Packet8i pblendv(
const Packet8i& ifPacket,
const Packet8i& thenPacket,
const Packet8i& elsePacket)
883 return _mm256_castps_si256(_mm256_blendv_ps(
884 _mm256_castsi256_ps(elsePacket),
885 _mm256_castsi256_ps(thenPacket),
886 _mm256_castsi256_ps(ifPacket)
891 EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet4d& ifPacket,
const Packet4d& thenPacket,
const Packet4d& elsePacket)
893 return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
897 EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet8i& ifPacket,
const Packet4d& thenPacket,
const Packet4d& elsePacket)
899 return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
903 EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(
const int* addr,
const Packet8i& index)
905 #ifdef EIGEN_VECTORIZE_AVX2
906 return _mm256_i32gather_epi32(addr, index, 4);
909 _mm256_storeu_si256((Packet8i*)u, index);
910 return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
911 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
916 EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(
const float *addr,
const Packet8i& index)
918 #ifdef EIGEN_VECTORIZE_AVX2
919 return _mm256_i32gather_ps(addr, index, 4);
922 _mm256_storeu_si256((Packet8i*)u, index);
923 return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
924 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
929 EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(
const double *addr,
const Packet8i& index,
bool upperhalf)
931 #ifdef EIGEN_VECTORIZE_AVX2
932 return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
935 _mm256_storeu_si256((Packet8i*)u, index);
938 return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
942 return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
948 EIGEN_STRONG_INLINE
int pmovemask<Packet8f>(
const Packet8f& a)
950 return _mm256_movemask_ps(a);
954 EIGEN_STRONG_INLINE
int pmovemask<Packet4d>(
const Packet4d& a)
956 return _mm256_movemask_pd(a);
960 EIGEN_STRONG_INLINE
int pmovemask<Packet8i>(
const Packet8i& a)
962 return pmovemask(_mm256_castsi256_ps(a));
966 EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(
const Packet8f& a)
968 return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
972 EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(
const Packet4d& a)
974 return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
978 EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(
const Packet8i& a,
const Packet8i& b)
980 #ifdef EIGEN_VECTORIZE_AVX2
981 return _mm256_cmpeq_epi64(a, b);
983 Packet4i a1, a2, b1, b2;
984 split_two(a, a1, a2);
985 split_two(b, b1, b2);
986 return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
991 EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(
const Packet8i& a, uint64_t b, uint64_t c)
994 _mm256_storeu_si256((__m256i*)u, a);
999 return _mm256_loadu_si256((__m256i*)u);
1002 EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
1003 auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
1004 return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
1007 EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
1008 x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
1009 return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
1012 EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
1013 x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000));
1015 _mm256_castpd_si256(x),
1016 _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
1021 EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(
const Packet4d& a)
1023 return double_to_int64(a);
1027 EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(
const Packet8i& a)
1029 return int64_to_double(a);
1032 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
1033 Packet4d psin<Packet4d>(
const Packet4d& x)
1039 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
1040 plog<Packet4d>(
const Packet4d& _x) {
1042 _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
1043 _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
1045 auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
1046 auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
1047 auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));
1050 _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
1051 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
1052 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
1053 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
1054 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
1055 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
1056 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
1057 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
1058 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
1059 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
1060 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
1061 _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);
1063 Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ);
1064 Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
1067 x = pmax(x, min_norm_pos);
1069 Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52));
1070 Packet4d e = psub(emm0, pset1<Packet4d>(1022));
1073 x = _mm256_and_pd(x, inv_mant_mask);
1074 x = _mm256_or_pd(x, p4d_half);
1083 Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
1084 Packet4d tmp = _mm256_and_pd(x, mask);
1086 e = psub(e, _mm256_and_pd(p4d_1, mask));
1089 Packet4d x2 = pmul(x, x);
1090 Packet4d x3 = pmul(x2, x);
1095 y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
1096 y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
1097 y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
1098 y = pmadd(y, x, p4d_cephes_log_p2);
1099 y1 = pmadd(y1, x, p4d_cephes_log_p5);
1100 y2 = pmadd(y2, x, p4d_cephes_log_p8);
1101 y = pmadd(y, x3, y1);
1102 y = pmadd(y, x3, y2);
1106 y1 = pmul(e, p4d_cephes_log_q1);
1107 tmp = pmul(x2, p4d_half);
1110 y2 = pmul(e, p4d_cephes_log_q2);
1115 return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
1121 #ifdef EIGEN_VECTORIZE_SSE2
1122 #include <xmmintrin.h>
1129 struct reinterpreter<Packet4i>
1131 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
1133 return _mm_castsi128_ps(x);
1136 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
1138 return _mm_castsi128_pd(x);
1141 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
1148 struct reinterpreter<Packet4f>
1150 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
1155 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
1157 return _mm_castps_pd(x);
1160 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
1162 return _mm_castps_si128(x);
1167 struct reinterpreter<Packet2d>
1169 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
1171 return _mm_castpd_ps(x);
1174 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
1179 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
1181 return _mm_castpd_si128(x);
1185 EIGEN_STRONG_INLINE
void split_two(
const Packet4i& x, uint64_t& a, uint64_t& b)
1187 #ifdef EIGEN_VECTORIZE_SSE4_1
1188 a = _mm_extract_epi64(x, 0);
1189 b = _mm_extract_epi64(x, 1);
1192 _mm_storeu_si128((__m128i*)u, x);
1198 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet4i& a,
const Packet4i& b)
1200 auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
1201 auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
1202 sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
1203 sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
1204 return _mm_or_si128(sa, sb);
1208 EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
1210 return _mm_set1_epi64x(a);
1214 EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1216 return _mm_add_epi64(a, b);
1220 EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1222 return _mm_sub_epi64(a, b);
1226 EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1228 return _mm_cmpeq_epi32(a, b);
1232 EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(
const Packet4i& a,
int b)
1234 return _mm_slli_epi32(a, b);
1238 EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(
const Packet4i& a,
int b)
1240 return _mm_srli_epi32(a, b);
1245 EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(
const Packet4i& a,
int b)
1247 return _mm_slli_epi64(a, b);
1251 EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(
const Packet4i& a,
int b)
1253 return _mm_srli_epi64(a, b);
1257 EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1259 return _mm_cmplt_epi32(a, b);
1263 EIGEN_STRONG_INLINE Packet4i pcmplt64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1265 #ifdef EIGEN_VECTORIZE_SSE4_2
1266 return _mm_cmpgt_epi64(b, a);
1269 _mm_storeu_si128((__m128i*)u, a);
1270 _mm_storeu_si128((__m128i*)v, b);
1271 return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0);
1276 EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1278 return _mm_cmplt_ps(a, b);
1282 EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a,
const Packet4f& b)
1284 return _mm_cmple_ps(a, b);
1288 EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a,
const Packet2d& b)
1290 return _mm_cmplt_pd(a, b);
1294 EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a,
const Packet2d& b)
1296 return _mm_cmple_pd(a, b);
1300 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
1302 #ifdef EIGEN_VECTORIZE_SSE4_1
1303 return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
1305 return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
1310 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4i& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
1312 return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
1316 EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket,
const Packet4i& thenPacket,
const Packet4i& elsePacket)
1318 #ifdef EIGEN_VECTORIZE_SSE4_1
1319 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
1321 return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
1326 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
1328 #ifdef EIGEN_VECTORIZE_SSE4_1
1329 return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
1331 return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
1337 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet4i& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
1339 return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
1343 EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr,
const Packet4i& index)
1345 #ifdef EIGEN_VECTORIZE_AVX2
1346 return _mm_i32gather_epi32(addr, index, 4);
1349 _mm_storeu_si128((__m128i*)u, index);
1350 return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
1355 EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr,
const Packet4i& index)
1357 #ifdef EIGEN_VECTORIZE_AVX2
1358 return _mm_i32gather_ps(addr, index, 4);
1361 _mm_storeu_si128((__m128i*)u, index);
1362 return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
1367 EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(
const double* addr,
const Packet4i& index,
bool upperhalf)
1369 #ifdef EIGEN_VECTORIZE_AVX2
1370 return _mm_i32gather_pd(addr, index, 8);
1373 _mm_storeu_si128((__m128i*)u, index);
1376 return _mm_setr_pd(addr[u[2]], addr[u[3]]);
1380 return _mm_setr_pd(addr[u[0]], addr[u[1]]);
1386 EIGEN_STRONG_INLINE
int pmovemask<Packet4f>(
const Packet4f& a)
1388 return _mm_movemask_ps(a);
1392 EIGEN_STRONG_INLINE
int pmovemask<Packet2d>(
const Packet2d& a)
1394 return _mm_movemask_pd(a);
1398 EIGEN_STRONG_INLINE
int pmovemask<Packet4i>(
const Packet4i& a)
1400 return pmovemask((Packet4f)_mm_castsi128_ps(a));
1404 EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(
const Packet4f& a)
1406 #ifdef EIGEN_VECTORIZE_SSE4_1
1407 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1409 auto round = _MM_GET_ROUNDING_MODE();
1410 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1411 auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
1412 _MM_SET_ROUNDING_MODE(round);
1418 EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(
const Packet2d& a)
1420 #ifdef EIGEN_VECTORIZE_SSE4_1
1421 return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1423 auto round = _MM_GET_ROUNDING_MODE();
1424 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1425 auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
1426 _MM_SET_ROUNDING_MODE(round);
1432 EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
1434 #ifdef EIGEN_VECTORIZE_SSE4_1
1435 return _mm_cmpeq_epi64(a, b);
1437 Packet4i c = _mm_cmpeq_epi32(a, b);
1438 return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
1443 EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(
const Packet4i& a, uint64_t b, uint64_t c)
1446 _mm_storeu_si128((__m128i*)u, a);
1447 u[0] = u[0] * b + c;
1448 u[1] = u[1] * b + c;
1449 return _mm_loadu_si128((__m128i*)u);
1452 EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) {
1453 x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
1454 return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
1457 EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) {
1458 x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
1459 return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
1462 EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) {
1463 x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
1464 return _mm_sub_epi64(
1465 _mm_castpd_si128(x),
1466 _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
1471 EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(
const Packet2d& a)
1473 return double_to_int64(a);
1477 EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(
const Packet4i& a)
1479 return int64_to_double(a);
1482 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
1483 Packet2d psin<Packet2d>(
const Packet2d& x)
1488 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
1489 Packet2d plog<Packet2d>(
const Packet2d& _x)
1492 _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f);
1493 _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f);
1494 _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
1496 auto inv_mant_mask = _mm_castsi128_pd(pseti64<Packet4i>(~0x7ff0000000000000));
1497 auto min_norm_pos = _mm_castsi128_pd(pseti64<Packet4i>(0x10000000000000));
1498 auto minus_inf = _mm_castsi128_pd(pseti64<Packet4i>(0xfff0000000000000));
1503 _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524);
1504 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2);
1505 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1);
1506 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1);
1507 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1);
1508 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1);
1509 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1);
1510 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1);
1511 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1);
1512 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1);
1513 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4);
1514 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375);
1519 Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd());
1520 Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
1522 x = pmax(x, min_norm_pos);
1523 emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52);
1526 x = _mm_and_pd(x, inv_mant_mask);
1527 x = _mm_or_pd(x, p2d_half);
1529 Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1<Packet2d>(1022));
1537 Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF);
1538 Packet2d tmp = pand(x, mask);
1540 e = psub(e, pand(p2d_1, mask));
1543 Packet2d x2 = pmul(x, x);
1544 Packet2d x3 = pmul(x2, x);
1547 y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1);
1548 y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4);
1549 y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7);
1550 y = pmadd(y, x, p2d_cephes_log_p2);
1551 y1 = pmadd(y1, x, p2d_cephes_log_p5);
1552 y2 = pmadd(y2, x, p2d_cephes_log_p8);
1553 y = pmadd(y, x3, y1);
1554 y = pmadd(y, x3, y2);
1557 y1 = pmul(e, p2d_cephes_log_q1);
1558 tmp = pmul(x2, p2d_half);
1561 y2 = pmul(e, p2d_cephes_log_q2);
1565 return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask));