12 #ifndef EIGENRAND_MORE_PACKET_MATH_H
13 #define EIGENRAND_MORE_PACKET_MATH_H
15 #include <Eigen/Dense>
// Template header whose declaration line is not visible in this excerpt.
// NOTE(review): presumably the primary `reinterpreter<Packet>` template that the
// reinterpret_to_* helpers use and that is specialized per packet type further
// down - confirm against the full file.
21 template<
typename Packet>
// Forwards `x` to `reinterpreter<Packet>{}.to_float(x)`.
// The return type is whatever that member returns for the given packet type,
// so the matching float packet is deduced rather than spelled out.
26 template<
typename Packet>
27 inline auto reinterpret_to_float(
const Packet& x)
28 -> decltype(reinterpreter<Packet>{}.to_float(x))
30 return reinterpreter<Packet>{}.to_float(x);
// Forwards `x` to `reinterpreter<Packet>{}.to_double(x)`.
// The return type is deduced from that member, i.e. the double packet that
// corresponds to `Packet`.
33 template<
typename Packet>
34 inline auto reinterpret_to_double(
const Packet& x)
35 -> decltype(reinterpreter<Packet>{}.to_double(x))
37 return reinterpreter<Packet>{}.to_double(x);
// Forwards `x` to `reinterpreter<Packet>{}.to_int(x)`.
// The return type is deduced from that member, i.e. the integer packet that
// corresponds to `Packet`.
40 template<
typename Packet>
41 inline auto reinterpret_to_int(
const Packet& x)
42 -> decltype(reinterpreter<Packet>{}.to_int(x))
44 return reinterpreter<Packet>{}.to_int(x);
// Forward declarations of packet primitives; only explicit specializations
// (further down, per SIMD instruction set) provide definitions.

// Builds an integer packet from a 64-bit value (the specializations below
// broadcast it to every 64-bit element).
47 template<
typename Packet>
48 EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
// Lane-wise equality comparison of `a` and `b`.
50 template<
typename Packet>
51 EIGEN_STRONG_INLINE Packet pcmpeq(
const Packet& a,
const Packet& b);
// Logical left shift of every 32-bit lane by `b` bits.
53 template<
typename Packet>
54 EIGEN_STRONG_INLINE Packet psll(
const Packet& a,
int b);
// Logical right shift of every 32-bit lane by `b` bits.
56 template<
typename Packet>
57 EIGEN_STRONG_INLINE Packet psrl(
const Packet& a,
int b);
// Logical left shift of every 64-bit element by `b` bits.
59 template<
typename Packet>
60 EIGEN_STRONG_INLINE Packet psll64(
const Packet& a,
int b);
// Logical right shift of every 64-bit element by `b` bits.
62 template<
typename Packet>
63 EIGEN_STRONG_INLINE Packet psrl64(
const Packet& a,
int b);
// Packs one bit per lane of `a` into an int (the specializations below use the
// movemask intrinsics, which collect each lane's most significant bit).
65 template<
typename Packet>
66 EIGEN_STRONG_INLINE
int pmovemask(
const Packet& a);
// Vectorized routine that writes two results, `s` and `c`, for every lane of `x`.
// NOTE(review): the constants and structure look like a port of the
// Cephes / sse_mathfun single-precision sincos kernel (s = sin(x), c = cos(x));
// several statements (absolute value, range-reduction adds, Horner steps,
// the mask assignment to xmm3) are not visible in this excerpt - confirm
// against the full file before relying on the notes below.
68 template<
typename Packet>
69 EIGEN_STRONG_INLINE
void psincos(Packet x, Packet &s, Packet &c)
71 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
// Integer packet type with the same bit width as `Packet`.
72 using IntPacket = decltype(reinterpret_to_int(x));
73 IntPacket emm0, emm2, emm4;
// Keep only bit 31 of every 32-bit lane, i.e. the IEEE-754 float sign bit.
79 sign_bit_sin = reinterpret_to_float(
80 pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
// Scale by 4/pi (1.27323954... == 4/pi) to count pi/4-wide octants.
84 y = pmul(x, pset1<Packet>(1.27323954473516));
// Integer octant index j (float -> int conversion via pcast).
87 emm2 = pcast<Packet, IntPacket>(y);
// j = (j + 1) & ~1 : round the octant index up to the next even number.
90 emm2 = padd(emm2, pset1<IntPacket>(1));
91 emm2 = pand(emm2, pset1<IntPacket>(~1));
92 y = pcast<IntPacket, Packet>(emm2);
// Bit 2 of j (value 4) shifted left by 29 lands on bit 31, producing a
// sign-bit pattern used to flip the sign of the sine result.
97 emm0 = pand(emm2, pset1<IntPacket>(4));
98 emm0 = psll(emm0, 29);
99 Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
// All-ones lanes where (j & 2) == 0; used as the polynomial selection mask.
102 emm2 = pand(emm2, pset1<IntPacket>(2));
104 emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
105 Packet poly_mask = reinterpret_to_float(emm2);
// Three-part split constant; the three values sum to approximately -pi/4, so
// y * (their sum) removes y * pi/4 from the argument in extended precision.
// NOTE(review): the additions of these products to x are not visible here.
109 xmm1 = pset1<Packet>(-0.78515625);
110 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
111 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
112 xmm1 = pmul(y, xmm1);
113 xmm2 = pmul(y, xmm2);
114 xmm3 = pmul(y, xmm3);
// Derive the cosine sign bit from the octant index (bit 2 moved to bit 31).
// NOTE(review): the result depends on which operand pandnot complements; the
// Packet8i overload in this file maps to _mm256_andnot_si256(a, b), which
// complements its FIRST argument - verify the other packet types agree.
119 emm4 = psub(emm4, pset1<IntPacket>(2));
120 emm4 = pandnot(emm4, pset1<IntPacket>(4));
121 emm4 = psll(emm4, 29);
122 Packet sign_bit_cos = reinterpret_to_float(emm4);
// Combine the input sign with the octant-dependent sign flip.
123 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
// First polynomial in z = x^2.
// NOTE(review): coefficients match the Cephes cosine polynomial; the
// multiplications between the additions are not visible in this excerpt.
127 Packet z = pmul(x, x);
128 y = pset1<Packet>(2.443315711809948E-005);
131 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
133 y = padd(y, pset1<Packet>(4.166664568298827E-002));
136 Packet tmp = pmul(z, pset1<Packet>(0.5));
138 y = padd(y, pset1<Packet>(1));
// Second polynomial.
// NOTE(review): coefficients match the Cephes sine polynomial; again the
// intermediate multiplications are not visible.
142 Packet y2 = pset1<Packet>(-1.9515295891E-4);
144 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
146 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
// Mask-select between the two polynomials using xmm3 as the lane mask.
// NOTE(review): presumably xmm3 holds poly_mask at this point; that
// assignment is not visible here.
153 Packet ysin2 = pand(xmm3, y2);
154 Packet ysin1 = pandnot(xmm3, y);
155 y2 = psub(y2, ysin2);
158 xmm1 = padd(ysin1, ysin2);
// Apply the sign bits by XOR and store the two outputs.
162 s = pxor(xmm1, sign_bit_sin);
163 c = pxor(xmm2, sign_bit_cos);
// Lane-wise approximation built from the visible arithmetic:
//   (x+3 - 0.5) * log(x+3) - (x+3) + 0.9189385332046727 + (1/12)/(x+3)
//   - log((x+2) * (x+1) * x)
// 0.9189385332046727 equals log(2*pi)/2, so the first four terms are the
// Stirling series for log Gamma(x+3); subtracting log(x(x+1)(x+2)) shifts the
// result back to log Gamma(x) via Gamma(x+3) = (x+2)(x+1)x * Gamma(x).
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `ret` is returned.
167 template<
typename Packet>
168 EIGEN_STRONG_INLINE Packet plgamma(
const Packet& x)
// Evaluate the series at x+3, where it is more accurate than at x itself.
170 auto x_3 = padd(x, pset1<Packet>(3));
171 auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
172 ret = psub(ret, x_3);
173 ret = padd(ret, pset1<Packet>(0.9189385332046727));
174 ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
// (x_3 - 1) * (x_3 - 2) * x == (x+2) * (x+1) * x
175 ret = psub(ret, plog(pmul(
176 pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
// Forward declarations of further packet primitives; definitions are the
// per-instruction-set specializations and overloads further down.

// Lane-wise `a < b`.
180 template<
typename Packet>
181 EIGEN_STRONG_INLINE Packet pcmplt(
const Packet& a,
const Packet& b);
// Lane-wise `a <= b`.
183 template<
typename Packet>
184 EIGEN_STRONG_INLINE Packet pcmple(
const Packet& a,
const Packet& b);
// Lane-wise select: lanes chosen by `ifPacket` come from `thenPacket`, the
// others from `elsePacket`.
186 template<
typename Packet>
187 EIGEN_STRONG_INLINE Packet pblendv(
const Packet& ifPacket,
const Packet& thenPacket,
const Packet& elsePacket);
// Gathers `addr[index[i]]` for every 32-bit lane of `index` (int elements).
189 template<
typename Packet>
190 EIGEN_STRONG_INLINE Packet pgather(
const int* addr,
const Packet& index);
// Same gather for float elements; returns the float packet matching `Packet`.
192 template<
typename Packet>
193 EIGEN_STRONG_INLINE
auto pgather(
const float* addr,
const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
// Gather for double elements. A double packet holds half as many lanes as the
// index packet; in the non-AVX2 code paths below `upperhalf` selects the upper
// half of the index lanes instead of the lower half (default: lower).
195 template<
typename Packet>
196 EIGEN_STRONG_INLINE
auto pgather(
const double* addr,
const Packet& index,
bool upperhalf =
false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
// Lane-wise rounding toward zero.
198 template<
typename Packet>
199 EIGEN_STRONG_INLINE Packet ptruncate(
const Packet& a);
// Turns the bits of every 32-bit lane of `x` into a float in [0, 1).
// The low 23 bits are kept as the mantissa and the exponent field is forced to
// 127 (the bit pattern of 1.0f), giving a value in [1, 2); subtracting 1 maps
// it to [0, 1).
201 template<
typename IntPacket>
202 EIGEN_STRONG_INLINE
auto bit_to_ur_float(
const IntPacket& x) -> decltype(reinterpret_to_float(x))
204 using FloatPacket = decltype(reinterpret_to_float(x));
// lower: 23-bit mantissa mask; upper: exponent bits of 1.0f (0x3F800000).
206 const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
207 upper = pset1<IntPacket>(127 << 23);
208 const FloatPacket one = pset1<FloatPacket>(1);
210 return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
// Turns the bits of every 64-bit element of `x` into a double in [0, 1).
// The low 52 bits are kept as the mantissa and the exponent field is forced to
// 1023 (the bit pattern of 1.0), giving a value in [1, 2); subtracting 1 maps
// it to [0, 1).
213 template<
typename IntPacket>
214 EIGEN_STRONG_INLINE
auto bit_to_ur_double(
const IntPacket& x) -> decltype(reinterpret_to_double(x))
216 using DoublePacket = decltype(reinterpret_to_double(x));
// lower: 52-bit mantissa mask; upper: exponent bits of 1.0.
218 const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
219 upper = pseti64<IntPacket>(1023ull << 52);
220 const DoublePacket one = pset1<DoublePacket>(1);
222 return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
// Template header whose declaration line is not visible in this excerpt.
// NOTE(review): presumably the primary `bit_scalar<Scalar>` template that is
// specialized for float and double right below - confirm against the full file.
225 template<
typename Scalar>
// Scalar counterpart of bit_to_ur_float for a single 32-bit value.
229 struct bit_scalar<float>
// Builds a float from the low 23 bits of `x` with the exponent of 1.0f.
// NOTE(review): the declaration of `u` and the final subtraction/return are
// not visible in this excerpt; presumably the result lies in [0, 1).
231 float to_ur(uint32_t x)
238 u = (x & 0x7FFFFF) | (127 << 23);
// Same as to_ur() but offset by epsilon/8, so the smallest result is not zero.
242 float to_nzur(uint32_t x)
244 return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
// Scalar counterpart of bit_to_ur_double for a single 64-bit value.
249 struct bit_scalar<double>
// Builds a double from the low 52 bits of `x` with the exponent of 1.0.
// NOTE(review): the declaration of `u` and the final subtraction/return are
// not visible in this excerpt; presumably the result lies in [0, 1).
251 double to_ur(uint64_t x)
258 u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
// Same as to_ur() but offset by epsilon/8, so the smallest result is not zero.
262 double to_nzur(uint64_t x)
264 return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
270 #ifdef EIGEN_VECTORIZE_AVX
271 #include <immintrin.h>
// Bit-level reinterpretation of a 256-bit integer packet. The cast intrinsics
// only change the static type; the 256 bits themselves are left untouched.
278 struct reinterpreter<Packet8i>
280 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8i& x)
282 return _mm256_castsi256_ps(x);
285 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8i& x)
287 return _mm256_castsi256_pd(x);
// Same-type case. NOTE(review): body not visible here; presumably returns x.
290 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8i& x)
// Bit-level reinterpretation of a 256-bit float packet. The cast intrinsics
// only change the static type; the 256 bits themselves are left untouched.
297 struct reinterpreter<Packet8f>
// Same-type case. NOTE(review): body not visible here; presumably returns x.
299 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8f& x)
304 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8f& x)
306 return _mm256_castps_pd(x);
309 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8f& x)
311 return _mm256_castps_si256(x);
// Bit-level reinterpretation of a 256-bit double packet. The cast intrinsics
// only change the static type; the 256 bits themselves are left untouched.
316 struct reinterpreter<Packet4d>
318 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet4d& x)
320 return _mm256_castpd_ps(x);
// Same-type case. NOTE(review): body not visible here; presumably returns x.
323 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet4d& x)
328 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet4d& x)
330 return _mm256_castpd_si256(x);
// Splits a 256-bit integer packet into its lower (`a`) and upper (`b`)
// 128-bit halves.
334 EIGEN_STRONG_INLINE
void split_two(
const Packet8i& x, Packet4i& a, Packet4i& b)
336 a = _mm256_extractf128_si256(x, 0);
337 b = _mm256_extractf128_si256(x, 1);
// Joins two 128-bit integer packets into one 256-bit packet: `a` becomes the
// lower half, `b` is inserted as the upper half.
340 EIGEN_STRONG_INLINE Packet8i combine_two(
const Packet4i& a,
const Packet4i& b)
342 return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
// Splits a 256-bit float packet into its lower (`a`) and upper (`b`)
// 128-bit halves.
345 EIGEN_STRONG_INLINE
void split_two(
const Packet8f& x, Packet4f& a, Packet4f& b)
347 a = _mm256_extractf128_ps(x, 0);
348 b = _mm256_extractf128_ps(x, 1);
// Joins two 128-bit float packets into one 256-bit packet: `a` becomes the
// lower half, `b` is inserted as the upper half.
351 EIGEN_STRONG_INLINE Packet8f combine_two(
const Packet4f& a,
const Packet4f& b)
353 return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
// Collects the even-indexed 32-bit lanes (0, 2, 4, 6) of `a` - i.e. the low
// 32 bits of each 64-bit element - into a 128-bit packet.
357 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet8i& a)
359 #ifdef EIGEN_VECTORIZE_AVX2
// Cross-lane permute puts lanes 0,2,4,6 into the lower 128 bits, which the
// cast then extracts.
360 return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
// AVX1 path: permutevar_ps only shuffles within each 128-bit half, so the
// wanted lanes are moved to positions 0,1 of the low half and 2,3 of the high
// half, then merged with a blend (mask 0b1100 takes positions 2,3 from the
// second operand).
362 auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
363 return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
// Broadcasts the 64-bit value `a` to all four 64-bit elements of the packet.
368 EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
370 return _mm256_set1_epi64x(a);
// Lane-wise 32-bit equality: each result lane is all ones where a == b and
// zero otherwise.
374 EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(
const Packet8i& a,
const Packet8i& b)
376 #ifdef EIGEN_VECTORIZE_AVX2
377 return _mm256_cmpeq_epi32(a, b);
// Without AVX2 there is no 256-bit integer compare: process the two 128-bit
// halves with SSE and recombine.
379 Packet4i a1, a2, b1, b2;
380 split_two(a, a1, a2);
381 split_two(b, b1, b2);
382 return combine_two(_mm_cmpeq_epi32(a1, b1), _mm_cmpeq_epi32(a2, b2));
// Logical left shift of every 32-bit lane of `a` by `b` bits.
387 EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(
const Packet8i& a,
int b)
389 #ifdef EIGEN_VECTORIZE_AVX2
390 return _mm256_slli_epi32(a, b);
// AVX1 path: shift the two 128-bit halves separately.
// NOTE(review): the declaration of a1/a2 is not visible in this excerpt.
393 split_two(a, a1, a2);
394 return combine_two(_mm_slli_epi32(a1, b), _mm_slli_epi32(a2, b));
// Logical (zero-filling) right shift of every 32-bit lane of `a` by `b` bits.
399 EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(
const Packet8i& a,
int b)
401 #ifdef EIGEN_VECTORIZE_AVX2
402 return _mm256_srli_epi32(a, b);
// AVX1 path: shift the two 128-bit halves separately.
// NOTE(review): the declaration of a1/a2 is not visible in this excerpt.
405 split_two(a, a1, a2);
406 return combine_two(_mm_srli_epi32(a1, b), _mm_srli_epi32(a2, b));
// Logical left shift of every 64-bit element of `a` by `b` bits.
411 EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(
const Packet8i& a,
int b)
413 #ifdef EIGEN_VECTORIZE_AVX2
414 return _mm256_slli_epi64(a, b);
// AVX1 path: shift the two 128-bit halves separately.
// NOTE(review): the declaration of a1/a2 is not visible in this excerpt.
417 split_two(a, a1, a2);
418 return combine_two(_mm_slli_epi64(a1, b), _mm_slli_epi64(a2, b));
// Logical (zero-filling) right shift of every 64-bit element of `a` by `b` bits.
423 EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(
const Packet8i& a,
int b)
425 #ifdef EIGEN_VECTORIZE_AVX2
426 return _mm256_srli_epi64(a, b);
// AVX1 path: shift the two 128-bit halves separately.
// NOTE(review): the declaration of a1/a2 is not visible in this excerpt.
429 split_two(a, a1, a2);
430 return combine_two(_mm_srli_epi64(a1, b), _mm_srli_epi64(a2, b));
// Lane-wise 32-bit integer addition of two eight-lane packets.
434 template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(
const Packet8i& a,
const Packet8i& b)
436 #ifdef EIGEN_VECTORIZE_AVX2
437 return _mm256_add_epi32(a, b);
// AVX1 path: add the two 128-bit halves with SSE and recombine.
439 Packet4i a1, a2, b1, b2;
440 split_two(a, a1, a2);
441 split_two(b, b1, b2);
442 return combine_two(_mm_add_epi32(a1, b1), _mm_add_epi32(a2, b2));
// Lane-wise 32-bit integer subtraction (a - b) of two eight-lane packets.
446 template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(
const Packet8i& a,
const Packet8i& b)
448 #ifdef EIGEN_VECTORIZE_AVX2
449 return _mm256_sub_epi32(a, b);
// AVX1 path: subtract the two 128-bit halves with SSE and recombine.
451 Packet4i a1, a2, b1, b2;
452 split_two(a, a1, a2);
453 split_two(b, b1, b2);
454 return combine_two(_mm_sub_epi32(a1, b1), _mm_sub_epi32(a2, b2));
// Bitwise AND of two 256-bit integer packets.
458 template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(
const Packet8i& a,
const Packet8i& b)
460 #ifdef EIGEN_VECTORIZE_AVX2
461 return _mm256_and_si256(a, b);
// AVX1 has no 256-bit integer logic ops; the float variant yields the same bits.
463 return reinterpret_to_int(_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
// Bitwise AND-NOT of two 256-bit integer packets.
// Operand order follows the intrinsic: the result is (~a) & b, i.e. the FIRST
// argument is the one that gets complemented.
467 template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(
const Packet8i& a,
const Packet8i& b)
469 #ifdef EIGEN_VECTORIZE_AVX2
470 return _mm256_andnot_si256(a, b);
// AVX1 has no 256-bit integer logic ops; the float variant yields the same bits.
472 return reinterpret_to_int(_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
// Bitwise OR of two 256-bit integer packets.
476 template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(
const Packet8i& a,
const Packet8i& b)
478 #ifdef EIGEN_VECTORIZE_AVX2
479 return _mm256_or_si256(a, b);
// AVX1 has no 256-bit integer logic ops; the float variant yields the same bits.
481 return reinterpret_to_int(_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
// Bitwise XOR of two 256-bit integer packets.
485 template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(
const Packet8i& a,
const Packet8i& b)
487 #ifdef EIGEN_VECTORIZE_AVX2
488 return _mm256_xor_si256(a, b);
// AVX1 has no 256-bit integer logic ops; the float variant yields the same bits.
490 return reinterpret_to_int(_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
495 EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(
const Packet8i& a,
const Packet8i& b)
497 return _mm256_cmpgt_epi32(b, a);
// Lane-wise float `a < b`; _CMP_LT_OQ is the ordered, non-signaling predicate,
// so lanes containing NaN compare false.
501 EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(
const Packet8f& a,
const Packet8f& b)
503 return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
// Lane-wise float `a <= b`; _CMP_LE_OQ is the ordered, non-signaling predicate,
// so lanes containing NaN compare false.
507 EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(
const Packet8f& a,
const Packet8f& b)
509 return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
// Lane-wise double `a < b`; _CMP_LT_OQ is the ordered, non-signaling predicate,
// so lanes containing NaN compare false.
513 EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(
const Packet4d& a,
const Packet4d& b)
515 return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
// Lane-wise double `a <= b`; _CMP_LE_OQ is the ordered, non-signaling predicate,
// so lanes containing NaN compare false.
519 EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(
const Packet4d& a,
const Packet4d& b)
521 return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
// Lane-wise select for float packets: where the sign bit of an `ifPacket` lane
// is set the lane comes from `thenPacket`, otherwise from `elsePacket`.
// (The intrinsic takes its operands in else/then/mask order.)
525 EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8f& ifPacket,
const Packet8f& thenPacket,
const Packet8f& elsePacket)
527 return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
// Lane-wise select for 32-bit integer packets, implemented with the float
// blend on reinterpreted bits: where bit 31 of an `ifPacket` lane is set the
// lane comes from `thenPacket`, otherwise from `elsePacket`.
531 EIGEN_STRONG_INLINE Packet8i pblendv(
const Packet8i& ifPacket,
const Packet8i& thenPacket,
const Packet8i& elsePacket)
533 return _mm256_castps_si256(_mm256_blendv_ps(
534 _mm256_castsi256_ps(elsePacket),
535 _mm256_castsi256_ps(thenPacket),
536 _mm256_castsi256_ps(ifPacket)
// Lane-wise select for double packets: where the sign bit of an `ifPacket`
// lane is set the lane comes from `thenPacket`, otherwise from `elsePacket`.
541 EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet4d& ifPacket,
const Packet4d& thenPacket,
const Packet4d& elsePacket)
543 return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
// Gathers eight ints: result lane i is addr[index lane i].
547 EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(
const int* addr,
const Packet8i& index)
549 #ifdef EIGEN_VECTORIZE_AVX2
// Hardware gather; scale 4 == sizeof(int).
550 return _mm256_i32gather_epi32(addr, index, 4);
// AVX1 path: spill the indices to memory and load element by element.
// NOTE(review): the declaration of the buffer `u` is not visible here.
553 _mm256_storeu_si256((Packet8i*)u, index);
554 return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
555 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
// Gathers eight floats: result lane i is addr[index lane i].
560 EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(
const float *addr,
const Packet8i& index)
562 #ifdef EIGEN_VECTORIZE_AVX2
// Hardware gather; scale 4 == sizeof(float).
563 return _mm256_i32gather_ps(addr, index, 4);
// AVX1 path: spill the indices to memory and load element by element.
// NOTE(review): the declaration of the buffer `u` is not visible here.
566 _mm256_storeu_si256((Packet8i*)u, index);
567 return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
568 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
573 EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(
const double *addr,
const Packet8i& index,
bool upperhalf)
575 #ifdef EIGEN_VECTORIZE_AVX2
576 return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
579 _mm256_storeu_si256((Packet8i*)u, index);
582 return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
586 return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
// Returns an 8-bit mask built from the sign bit of each float lane
// (bit i of the result corresponds to lane i).
592 EIGEN_STRONG_INLINE
int pmovemask<Packet8f>(
const Packet8f& a)
594 return _mm256_movemask_ps(a);
// Returns a 4-bit mask built from the sign bit of each double lane
// (bit i of the result corresponds to lane i).
598 EIGEN_STRONG_INLINE
int pmovemask<Packet4d>(
const Packet4d& a)
600 return _mm256_movemask_pd(a);
// Integer variant: reinterprets the bits as floats and reuses the float
// overload, so the mask holds bit 31 of each 32-bit lane.
604 EIGEN_STRONG_INLINE
int pmovemask<Packet8i>(
const Packet8i& a)
606 return pmovemask(_mm256_castsi256_ps(a));
// Rounds every float lane toward zero; _MM_FROUND_NO_EXC suppresses
// floating-point exceptions for the operation.
610 EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(
const Packet8f& a)
612 return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
// Rounds every double lane toward zero; _MM_FROUND_NO_EXC suppresses
// floating-point exceptions for the operation.
616 EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(
const Packet4d& a)
618 return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
624 #ifdef EIGEN_VECTORIZE_SSE2
625 #include <xmmintrin.h>
// Bit-level reinterpretation of a 128-bit integer packet. The cast intrinsics
// only change the static type; the 128 bits themselves are left untouched.
632 struct reinterpreter<Packet4i>
634 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
636 return _mm_castsi128_ps(x);
639 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
641 return _mm_castsi128_pd(x);
// Same-type case. NOTE(review): body not visible here; presumably returns x.
644 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
// Bit-level reinterpretation of a 128-bit float packet. The cast intrinsics
// only change the static type; the 128 bits themselves are left untouched.
651 struct reinterpreter<Packet4f>
// Same-type case. NOTE(review): body not visible here; presumably returns x.
653 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
658 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
660 return _mm_castps_pd(x);
663 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
665 return _mm_castps_si128(x);
// Bit-level reinterpretation of a 128-bit double packet. The cast intrinsics
// only change the static type; the 128 bits themselves are left untouched.
670 struct reinterpreter<Packet2d>
672 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
674 return _mm_castpd_ps(x);
// Same-type case. NOTE(review): body not visible here; presumably returns x.
677 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
682 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
684 return _mm_castpd_si128(x);
// Splits a 128-bit integer packet into its lower (`a`) and upper (`b`)
// 64-bit halves, returned as scalars.
688 EIGEN_STRONG_INLINE
void split_two(
const Packet4i& x, uint64_t& a, uint64_t& b)
690 #ifdef EIGEN_VECTORIZE_SSE4_1
691 a = _mm_extract_epi64(x, 0);
692 b = _mm_extract_epi64(x, 1);
// Pre-SSE4.1 path: spill the packet to a buffer `u`.
// NOTE(review): the declaration of `u` and the assignments to a/b are not
// visible in this excerpt.
695 _mm_storeu_si128((__m128i*)u, x);
// Builds { a[0], a[2], b[0], b[2] }: the low 32 bits of each 64-bit element of
// `a` followed by those of `b`.
701 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet4i& a,
const Packet4i& b)
// sa = { a0, a2, a1, a3 } : wanted lanes of `a` moved to positions 0 and 1.
703 auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
// sb = { b1, b3, b0, b2 } : wanted lanes of `b` moved to positions 2 and 3.
704 auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
// Zero the unwanted positions, then merge the two halves.
705 sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
706 sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
707 return _mm_or_si128(sa, sb);
// Broadcasts the 64-bit value `a` to both 64-bit elements of the packet.
711 EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
713 return _mm_set1_epi64x(a);
// Lane-wise 32-bit equality: each result lane is all ones where a == b and
// zero otherwise.
717 EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(
const Packet4i& a,
const Packet4i& b)
719 return _mm_cmpeq_epi32(a, b);
// Logical left shift of every 32-bit lane of `a` by `b` bits.
723 EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(
const Packet4i& a,
int b)
725 return _mm_slli_epi32(a, b);
// Logical (zero-filling) right shift of every 32-bit lane of `a` by `b` bits.
729 EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(
const Packet4i& a,
int b)
731 return _mm_srli_epi32(a, b);
// Logical left shift of every 64-bit element of `a` by `b` bits.
736 EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(
const Packet4i& a,
int b)
738 return _mm_slli_epi64(a, b);
// Logical (zero-filling) right shift of every 64-bit element of `a` by `b` bits.
742 EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(
const Packet4i& a,
int b)
744 return _mm_srli_epi64(a, b);
// Lane-wise signed 32-bit `a < b`: all ones where true, zero otherwise.
748 EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a,
const Packet4i& b)
750 return _mm_cmplt_epi32(a, b);
// Lane-wise float `a < b`: all-ones bit pattern where true, zero otherwise.
754 EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a,
const Packet4f& b)
756 return _mm_cmplt_ps(a, b);
// Lane-wise float `a <= b`: all-ones bit pattern where true, zero otherwise.
760 EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a,
const Packet4f& b)
762 return _mm_cmple_ps(a, b);
// Lane-wise double `a < b`: all-ones bit pattern where true, zero otherwise.
766 EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a,
const Packet2d& b)
768 return _mm_cmplt_pd(a, b);
// Lane-wise double `a <= b`: all-ones bit pattern where true, zero otherwise.
772 EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a,
const Packet2d& b)
774 return _mm_cmple_pd(a, b);
// Lane-wise select for float packets: lanes chosen by `ifPacket` come from
// `thenPacket`, the others from `elsePacket`.
778 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
780 #ifdef EIGEN_VECTORIZE_SSE4_1
// SSE4.1 blend looks only at the sign bit of each mask lane.
781 return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
// SSE2 path: (mask & then) | (~mask & else). This selects bit by bit, so it
// matches the blend only when every mask lane is all ones or all zeros.
783 return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
// Lane-wise select for 32-bit integer packets: lanes chosen by `ifPacket` come
// from `thenPacket`, the others from `elsePacket`.
788 EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket,
const Packet4i& thenPacket,
const Packet4i& elsePacket)
790 #ifdef EIGEN_VECTORIZE_SSE4_1
// SSE4.1: float blend on reinterpreted bits; looks only at bit 31 of each
// mask lane.
791 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
// SSE2 path: (mask & then) | (~mask & else). This selects bit by bit, so it
// matches the blend only when every mask lane is all ones or all zeros.
793 return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
// Lane-wise select for double packets: lanes chosen by `ifPacket` come from
// `thenPacket`, the others from `elsePacket`.
798 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
800 #ifdef EIGEN_VECTORIZE_SSE4_1
// SSE4.1 blend looks only at the sign bit of each mask lane.
801 return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
// SSE2 path: (mask & then) | (~mask & else). This selects bit by bit, so it
// matches the blend only when every mask lane is all ones or all zeros.
803 return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
// Gathers four ints: result lane i is addr[index lane i].
808 EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr,
const Packet4i& index)
810 #ifdef EIGEN_VECTORIZE_AVX2
// Hardware gather; scale 4 == sizeof(int).
811 return _mm_i32gather_epi32(addr, index, 4);
// Path without AVX2: spill the indices to memory and load element by element.
// NOTE(review): the declaration of the buffer `u` is not visible here.
814 _mm_storeu_si128((Packet4i*)u, index);
815 return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
// Gathers four floats: result lane i is addr[index lane i].
820 EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr,
const Packet4i& index)
822 #ifdef EIGEN_VECTORIZE_AVX2
// Hardware gather; scale 4 == sizeof(float).
823 return _mm_i32gather_ps(addr, index, 4);
// Path without AVX2: spill the indices to memory and load element by element.
// NOTE(review): the declaration of the buffer `u` is not visible here.
826 _mm_storeu_si128((Packet4i*)u, index);
827 return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
832 EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(
const double* addr,
const Packet4i& index,
bool upperhalf)
834 #ifdef EIGEN_VECTORIZE_AVX2
835 return _mm_i32gather_pd(addr, index, 8);
838 _mm_storeu_si128((Packet4i*)u, index);
841 return _mm_setr_pd(addr[u[2]], addr[u[3]]);
845 return _mm_setr_pd(addr[u[0]], addr[u[1]]);
// Returns a 4-bit mask built from the sign bit of each float lane
// (bit i of the result corresponds to lane i).
851 EIGEN_STRONG_INLINE
int pmovemask<Packet4f>(
const Packet4f& a)
853 return _mm_movemask_ps(a);
// Returns a 2-bit mask built from the sign bit of each double lane
// (bit i of the result corresponds to lane i).
857 EIGEN_STRONG_INLINE
int pmovemask<Packet2d>(
const Packet2d& a)
859 return _mm_movemask_pd(a);
// Integer variant: reinterprets the bits as floats and reuses the float
// overload, so the mask holds bit 31 of each 32-bit lane.
863 EIGEN_STRONG_INLINE
int pmovemask<Packet4i>(
const Packet4i& a)
865 return pmovemask(_mm_castsi128_ps(a));
869 EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(
const Packet4f& a)
871 #ifdef EIGEN_VECTORIZE_SSE4_1
872 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
874 auto round = _MM_GET_ROUNDING_MODE();
875 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
876 auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
877 _MM_SET_ROUNDING_MODE(round);
883 EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(
const Packet2d& a)
885 #ifdef EIGEN_VECTORIZE_SSE4_1
886 return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
888 auto round = _MM_GET_ROUNDING_MODE();
889 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
890 auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
891 _MM_SET_ROUNDING_MODE(round);