12 #ifndef EIGENRAND_MORE_PACKET_MATH_H
13 #define EIGENRAND_MORE_PACKET_MATH_H
15 #include <Eigen/Dense>
21 template<
typename Packet>
26 template<
typename Packet>
27 inline auto reinterpret_to_float(
const Packet& x)
28 -> decltype(reinterpreter<Packet>{}.to_float(x))
30 return reinterpreter<Packet>{}.to_float(x);
33 template<
typename Packet>
34 inline auto reinterpret_to_double(
const Packet& x)
35 -> decltype(reinterpreter<Packet>{}.to_double(x))
37 return reinterpreter<Packet>{}.to_double(x);
40 template<
typename Packet>
41 inline auto reinterpret_to_int(
const Packet& x)
42 -> decltype(reinterpreter<Packet>{}.to_int(x))
44 return reinterpreter<Packet>{}.to_int(x);
// Forward declarations of packet primitives. Only the signatures live here;
// the per-architecture explicit specializations further down provide the bodies.

// pseti64: broadcast the 64-bit value `a` into an integer packet
// (the specializations in this file use *_set1_epi64x).
47 template<
typename Packet>
48 EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
// pcmpeq: lane-wise equality on 32-bit integer lanes (specializations use *_cmpeq_epi32).
50 template<
typename Packet>
51 EIGEN_STRONG_INLINE Packet pcmpeq(
const Packet& a,
const Packet& b);
// psll: logical left shift of every 32-bit lane of `a` by `b` bits.
53 template<
typename Packet>
54 EIGEN_STRONG_INLINE Packet psll(
const Packet& a,
int b);
// psrl: logical right shift of every 32-bit lane of `a` by `b` bits.
56 template<
typename Packet>
57 EIGEN_STRONG_INLINE Packet psrl(
const Packet& a,
int b);
// psll64: logical left shift of every 64-bit lane of `a` by `b` bits.
59 template<
typename Packet>
60 EIGEN_STRONG_INLINE Packet psll64(
const Packet& a,
int b);
// psrl64: logical right shift of every 64-bit lane of `a` by `b` bits.
62 template<
typename Packet>
63 EIGEN_STRONG_INLINE Packet psrl64(
const Packet& a,
int b);
// pmovemask: pack per-lane mask bits into an int (specializations use *_movemask_ps / *_movemask_pd).
65 template<
typename Packet>
66 EIGEN_STRONG_INLINE
int pmovemask(
const Packet& a);
// Specialization of psll64 for a bare uint64_t operand instead of a SIMD packet.
// NOTE(review): the function body is not present in this listing; presumably a
// plain left shift of `a` by `b` bits — confirm against the complete file.
69 EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(
const uint64_t& a,
int b)
// Specialization of psrl64 for a bare uint64_t operand instead of a SIMD packet.
// NOTE(review): the function body is not present in this listing; presumably a
// plain right shift of `a` by `b` bits — confirm against the complete file.
75 EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(
const uint64_t& a,
int b)
// Evaluates sine and cosine for every lane of `x` in one pass and writes the
// results to `s` (sine) and `c` (cosine). `x` is taken by value.
// NOTE(review): several statements this routine depends on are not visible in
// this listing (e.g. the first assignments of sign_bit_sin and emm4, and the
// multiply steps between the polynomial coefficients) — confirm every note
// below against the complete file.
80 template<
typename Packet>
81 EIGEN_STRONG_INLINE
void psincos(Packet x, Packet &s, Packet &c)
83 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
// Integer packet type produced by reinterpret_to_int for this Packet.
84 using IntPacket = decltype(reinterpret_to_int(x));
85 IntPacket emm0, emm2, emm4;
// Keep only bit 31 (0x80000000) of each lane of sign_bit_sin.
91 sign_bit_sin = reinterpret_to_float(
92 pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
// Scale x by 1.27323954473516 (numerically 4/pi).
96 y = pmul(x, pset1<Packet>(1.27323954473516));
// Convert the scaled value to integer lanes.
99 emm2 = pcast<Packet, IntPacket>(y);
// Add 1 and clear the lowest bit, then convert back to floating point.
102 emm2 = padd(emm2, pset1<IntPacket>(1));
103 emm2 = pand(emm2, pset1<IntPacket>(~1));
104 y = pcast<IntPacket, Packet>(emm2);
// Isolate bit 2 (value 4) and move it to bit 31: a mask used to flip the sine's sign.
109 emm0 = pand(emm2, pset1<IntPacket>(4));
110 emm0 = psll(emm0, 29);
111 Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
// Build a lane mask that is set where bit 1 (value 2) is clear; it later selects a polynomial.
114 emm2 = pand(emm2, pset1<IntPacket>(2));
116 emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
117 Packet poly_mask = reinterpret_to_float(emm2);
// Three constants whose magnitudes sum to pi/4, each multiplied by y.
// NOTE(review): the additions of these products into x are not visible here.
121 xmm1 = pset1<Packet>(-0.78515625);
122 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
123 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
124 xmm1 = pmul(y, xmm1);
125 xmm2 = pmul(y, xmm2);
126 xmm3 = pmul(y, xmm3);
// Derive the cosine sign mask: subtract 2, take (~emm4 & 4), move bit 2 to bit 31.
// NOTE(review): emm4 is read here but its initial assignment is not visible.
131 emm4 = psub(emm4, pset1<IntPacket>(2));
132 emm4 = pandnot(emm4, pset1<IntPacket>(4));
133 emm4 = psll(emm4, 29);
134 Packet sign_bit_cos = reinterpret_to_float(emm4);
// Combine the input sign with the swap mask for the sine.
135 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
// First polynomial, expressed in z = x*x; only the coefficient additions are visible.
139 Packet z = pmul(x, x);
140 y = pset1<Packet>(2.443315711809948E-005);
143 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
145 y = padd(y, pset1<Packet>(4.166664568298827E-002));
148 Packet tmp = pmul(z, pset1<Packet>(0.5));
150 y = padd(y, pset1<Packet>(1));
// Second polynomial; again only the coefficient additions are visible.
154 Packet y2 = pset1<Packet>(-1.9515295891E-4);
156 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
158 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
// Per lane, pick between the two polynomial results using xmm3 as a bit mask.
// NOTE(review): presumably xmm3 was set to poly_mask just before this — not visible here.
165 Packet ysin2 = pand(xmm3, y2);
166 Packet ysin1 = pandnot(xmm3, y);
167 y2 = psub(y2, ysin2);
170 xmm1 = padd(ysin1, ysin2);
// Apply the sign masks by XOR and write both outputs.
174 s = pxor(xmm1, sign_bit_sin);
175 c = pxor(xmm2, sign_bit_cos);
179 template<
typename Packet>
180 EIGEN_STRONG_INLINE Packet plgamma(
const Packet& x)
182 auto x_3 = padd(x, pset1<Packet>(3));
183 auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
184 ret = psub(ret, x_3);
185 ret = padd(ret, pset1<Packet>(0.9189385332046727));
186 ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
187 ret = psub(ret, plog(pmul(
188 pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
// Forward declarations of further packet primitives; bodies are supplied by
// the per-architecture explicit specializations later in this file.

// pcmplt: lane-wise `a < b`, returned as a mask packet.
192 template<
typename Packet>
193 EIGEN_STRONG_INLINE Packet pcmplt(
const Packet& a,
const Packet& b);
// pcmple: lane-wise `a <= b`, returned as a mask packet.
195 template<
typename Packet>
196 EIGEN_STRONG_INLINE Packet pcmple(
const Packet& a,
const Packet& b);
// pblendv: per lane, yields thenPacket where ifPacket selects it, otherwise elsePacket.
198 template<
typename Packet>
199 EIGEN_STRONG_INLINE Packet pblendv(
const Packet& ifPacket,
const Packet& thenPacket,
const Packet& elsePacket);
// pgather (int): loads addr[index[i]] for each 32-bit lane of `index`.
201 template<
typename Packet>
202 EIGEN_STRONG_INLINE Packet pgather(
const int* addr,
const Packet& index);
// pgather (float): as above for a float table; the result is the float packet
// type that reinterpret_to_float yields for Packet.
204 template<
typename Packet>
205 EIGEN_STRONG_INLINE
auto pgather(
const float* addr,
const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
// pgather (double): a double packet has half as many lanes as the index packet,
// so only half of the indices are consumed; in the non-AVX2 specializations
// `upperhalf` picks the upper (true) or lower (false, default) half.
207 template<
typename Packet>
208 EIGEN_STRONG_INLINE
auto pgather(
const double* addr,
const Packet& index,
bool upperhalf =
false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
// ptruncate: rounds every lane toward zero.
210 template<
typename Packet>
211 EIGEN_STRONG_INLINE Packet ptruncate(
const Packet& a);
// pcmpeq64: lane-wise equality on 64-bit integer lanes.
213 template<
typename Packet>
214 EIGEN_STRONG_INLINE Packet pcmpeq64(
const Packet& a,
const Packet& b);
// pmuluadd64: NOTE(review): presumably computes a * b + c on unsigned 64-bit
// lanes — the specialization bodies are not fully visible; confirm.
216 template<
typename Packet>
217 EIGEN_STRONG_INLINE Packet pmuluadd64(
const Packet& a, uint64_t b, uint64_t c);
219 template<
typename IntPacket>
220 EIGEN_STRONG_INLINE
auto bit_to_ur_float(
const IntPacket& x) -> decltype(reinterpret_to_float(x))
222 using FloatPacket = decltype(reinterpret_to_float(x));
224 const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
225 upper = pset1<IntPacket>(127 << 23);
226 const FloatPacket one = pset1<FloatPacket>(1);
228 return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
231 template<
typename IntPacket>
232 EIGEN_STRONG_INLINE
auto bit_to_ur_double(
const IntPacket& x) -> decltype(reinterpret_to_double(x))
234 using DoublePacket = decltype(reinterpret_to_double(x));
236 const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
237 upper = pseti64<IntPacket>(1023ull << 52);
238 const DoublePacket one = pset1<DoublePacket>(1);
240 return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
243 template<
typename _Scalar>
// Scalar (non-SIMD) helpers that turn 32 random bits into a float.
// NOTE(review): the lines declaring `u` and the return statement of to_ur are
// not visible in this listing; the notes below describe only what is shown.
247 struct bit_scalar<float>
// to_ur: builds a bit pattern from the low 23 bits of x with the exponent
// field set to 127 (the bits of a float in [1, 2)).
// Presumably the result is that float minus 1, as in the packet version — confirm.
249 float to_ur(uint32_t x)
256 u = (x & 0x7FFFFF) | (127 << 23);
// to_nzur: to_ur(x) offset by epsilon/8.
260 float to_nzur(uint32_t x)
262 return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
// Scalar (non-SIMD) helpers that turn 64 random bits into a double.
// NOTE(review): the lines declaring `u` and the return statement of to_ur are
// not visible in this listing; the notes below describe only what is shown.
267 struct bit_scalar<double>
// to_ur: builds a bit pattern from the low 52 bits of x with the exponent
// field set to 1023 (the bits of a double in [1, 2)).
// Presumably the result is that double minus 1, as in the packet version — confirm.
269 double to_ur(uint64_t x)
276 u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
// to_nzur: to_ur(x) offset by epsilon/8.
280 double to_nzur(uint64_t x)
282 return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
// Scalar overload: converts one 64-bit word into two floats via bit_scalar<float>::to_ur.
// NOTE(review): the declaration of `ret` (a float2) and the return statement
// are not visible in this listing.
292 EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
294 bit_scalar<float> bs;
// Low 32 bits feed element 0, high 32 bits feed element 1.
296 ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
297 ret.f[1] = bs.to_ur(x >> 32);
303 #ifdef EIGEN_VECTORIZE_AVX
304 #include <immintrin.h>
311 struct reinterpreter<Packet8i>
313 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8i& x)
315 return _mm256_castsi256_ps(x);
318 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8i& x)
320 return _mm256_castsi256_pd(x);
323 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8i& x)
330 struct reinterpreter<Packet8f>
332 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8f& x)
337 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8f& x)
339 return _mm256_castps_pd(x);
342 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8f& x)
344 return _mm256_castps_si256(x);
349 struct reinterpreter<Packet4d>
351 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet4d& x)
353 return _mm256_castpd_ps(x);
356 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet4d& x)
361 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet4d& x)
363 return _mm256_castpd_si256(x);
367 EIGEN_STRONG_INLINE
void split_two(
const Packet8i& x, Packet4i& a, Packet4i& b)
369 a = _mm256_extractf128_si256(x, 0);
370 b = _mm256_extractf128_si256(x, 1);
373 EIGEN_STRONG_INLINE Packet8i combine_two(
const Packet4i& a,
const Packet4i& b)
375 return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
378 EIGEN_STRONG_INLINE
void split_two(
const Packet8f& x, Packet4f& a, Packet4f& b)
380 a = _mm256_extractf128_ps(x, 0);
381 b = _mm256_extractf128_ps(x, 1);
384 EIGEN_STRONG_INLINE Packet8f combine_two(
const Packet4f& a,
const Packet4f& b)
386 return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
390 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet8i& a)
392 #ifdef EIGEN_VECTORIZE_AVX2
393 return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
395 auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
396 return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
401 EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
403 return _mm256_set1_epi64x(a);
407 EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(
const Packet8i& a,
const Packet8i& b)
409 #ifdef EIGEN_VECTORIZE_AVX2
410 return _mm256_cmpeq_epi32(a, b);
412 Packet4i a1, a2, b1, b2;
413 split_two(a, a1, a2);
414 split_two(b, b1, b2);
415 return combine_two(_mm_cmpeq_epi32(a1, b1), _mm_cmpeq_epi32(a2, b2));
420 EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(
const Packet8i& a,
int b)
422 #ifdef EIGEN_VECTORIZE_AVX2
423 return _mm256_slli_epi32(a, b);
426 split_two(a, a1, a2);
427 return combine_two(_mm_slli_epi32(a1, b), _mm_slli_epi32(a2, b));
432 EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(
const Packet8i& a,
int b)
434 #ifdef EIGEN_VECTORIZE_AVX2
435 return _mm256_srli_epi32(a, b);
438 split_two(a, a1, a2);
439 return combine_two(_mm_srli_epi32(a1, b), _mm_srli_epi32(a2, b));
444 EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(
const Packet8i& a,
int b)
446 #ifdef EIGEN_VECTORIZE_AVX2
447 return _mm256_slli_epi64(a, b);
450 split_two(a, a1, a2);
451 return combine_two(_mm_slli_epi64(a1, b), _mm_slli_epi64(a2, b));
456 EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(
const Packet8i& a,
int b)
458 #ifdef EIGEN_VECTORIZE_AVX2
459 return _mm256_srli_epi64(a, b);
462 split_two(a, a1, a2);
463 return combine_two(_mm_srli_epi64(a1, b), _mm_srli_epi64(a2, b));
467 template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(
const Packet8i& a,
const Packet8i& b)
469 #ifdef EIGEN_VECTORIZE_AVX2
470 return _mm256_add_epi32(a, b);
472 Packet4i a1, a2, b1, b2;
473 split_two(a, a1, a2);
474 split_two(b, b1, b2);
475 return combine_two(_mm_add_epi32(a1, b1), _mm_add_epi32(a2, b2));
479 template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(
const Packet8i& a,
const Packet8i& b)
481 #ifdef EIGEN_VECTORIZE_AVX2
482 return _mm256_sub_epi32(a, b);
484 Packet4i a1, a2, b1, b2;
485 split_two(a, a1, a2);
486 split_two(b, b1, b2);
487 return combine_two(_mm_sub_epi32(a1, b1), _mm_sub_epi32(a2, b2));
491 template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(
const Packet8i& a,
const Packet8i& b)
493 #ifdef EIGEN_VECTORIZE_AVX2
494 return _mm256_and_si256(a, b);
496 return reinterpret_to_int(_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
500 template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(
const Packet8i& a,
const Packet8i& b)
502 #ifdef EIGEN_VECTORIZE_AVX2
503 return _mm256_andnot_si256(a, b);
505 return reinterpret_to_int(_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
509 template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(
const Packet8i& a,
const Packet8i& b)
511 #ifdef EIGEN_VECTORIZE_AVX2
512 return _mm256_or_si256(a, b);
514 return reinterpret_to_int(_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
518 template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(
const Packet8i& a,
const Packet8i& b)
520 #ifdef EIGEN_VECTORIZE_AVX2
521 return _mm256_xor_si256(a, b);
523 return reinterpret_to_int(_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
528 EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(
const Packet8i& a,
const Packet8i& b)
530 return _mm256_cmpgt_epi32(b, a);
534 EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(
const Packet8f& a,
const Packet8f& b)
536 return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
540 EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(
const Packet8f& a,
const Packet8f& b)
542 return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
546 EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(
const Packet4d& a,
const Packet4d& b)
548 return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
552 EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(
const Packet4d& a,
const Packet4d& b)
554 return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
558 EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8f& ifPacket,
const Packet8f& thenPacket,
const Packet8f& elsePacket)
560 return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
564 EIGEN_STRONG_INLINE Packet8i pblendv(
const Packet8i& ifPacket,
const Packet8i& thenPacket,
const Packet8i& elsePacket)
566 return _mm256_castps_si256(_mm256_blendv_ps(
567 _mm256_castsi256_ps(elsePacket),
568 _mm256_castsi256_ps(thenPacket),
569 _mm256_castsi256_ps(ifPacket)
574 EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet4d& ifPacket,
const Packet4d& thenPacket,
const Packet4d& elsePacket)
576 return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
580 EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(
const int* addr,
const Packet8i& index)
582 #ifdef EIGEN_VECTORIZE_AVX2
583 return _mm256_i32gather_epi32(addr, index, 4);
586 _mm256_storeu_si256((Packet8i*)u, index);
587 return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
588 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
593 EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(
const float *addr,
const Packet8i& index)
595 #ifdef EIGEN_VECTORIZE_AVX2
596 return _mm256_i32gather_ps(addr, index, 4);
599 _mm256_storeu_si256((Packet8i*)u, index);
600 return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
601 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
606 EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(
const double *addr,
const Packet8i& index,
bool upperhalf)
608 #ifdef EIGEN_VECTORIZE_AVX2
609 return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
612 _mm256_storeu_si256((Packet8i*)u, index);
615 return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
619 return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
625 EIGEN_STRONG_INLINE
int pmovemask<Packet8f>(
const Packet8f& a)
627 return _mm256_movemask_ps(a);
631 EIGEN_STRONG_INLINE
int pmovemask<Packet4d>(
const Packet4d& a)
633 return _mm256_movemask_pd(a);
637 EIGEN_STRONG_INLINE
int pmovemask<Packet8i>(
const Packet8i& a)
639 return pmovemask(_mm256_castsi256_ps(a));
643 EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(
const Packet8f& a)
645 return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
649 EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(
const Packet4d& a)
651 return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
655 EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(
const Packet8i& a,
const Packet8i& b)
657 #ifdef EIGEN_VECTORIZE_AVX2
658 return _mm256_cmpeq_epi64(a, b);
660 Packet4i a1, a2, b1, b2;
661 split_two(a, a1, a2);
662 split_two(b, b1, b2);
663 return combine_two(_mm_cmpeq_epi64(a1, b1), _mm_cmpeq_epi64(a2, b2));
// 256-bit specialization of pmuluadd64.
// NOTE(review): the declaration of `u` and the statements between the store
// and the reload are not visible in this listing; presumably each 64-bit
// element is replaced by element * b + c — confirm against the complete file.
668 EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(
const Packet8i& a, uint64_t b, uint64_t c)
// Spill the packet to the scratch buffer `u` with an unaligned store.
671 _mm256_storeu_si256((__m256i*)u, a);
// Reload the buffer as a packet and return it.
676 return _mm256_loadu_si256((__m256i*)u);
682 #ifdef EIGEN_VECTORIZE_SSE2
683 #include <xmmintrin.h>
690 struct reinterpreter<Packet4i>
692 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
694 return _mm_castsi128_ps(x);
697 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
699 return _mm_castsi128_pd(x);
702 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
709 struct reinterpreter<Packet4f>
711 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
716 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
718 return _mm_castps_pd(x);
721 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
723 return _mm_castps_si128(x);
728 struct reinterpreter<Packet2d>
730 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
732 return _mm_castpd_ps(x);
735 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
740 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
742 return _mm_castpd_si128(x);
746 EIGEN_STRONG_INLINE
void split_two(
const Packet4i& x, uint64_t& a, uint64_t& b)
748 #ifdef EIGEN_VECTORIZE_SSE4_1
749 a = _mm_extract_epi64(x, 0);
750 b = _mm_extract_epi64(x, 1);
753 _mm_storeu_si128((__m128i*)u, x);
759 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet4i& a,
const Packet4i& b)
761 auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
762 auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
763 sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
764 sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
765 return _mm_or_si128(sa, sb);
769 EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
771 return _mm_set1_epi64x(a);
775 EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(
const Packet4i& a,
const Packet4i& b)
777 return _mm_cmpeq_epi32(a, b);
781 EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(
const Packet4i& a,
int b)
783 return _mm_slli_epi32(a, b);
787 EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(
const Packet4i& a,
int b)
789 return _mm_srli_epi32(a, b);
794 EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(
const Packet4i& a,
int b)
796 return _mm_slli_epi64(a, b);
800 EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(
const Packet4i& a,
int b)
802 return _mm_srli_epi64(a, b);
806 EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a,
const Packet4i& b)
808 return _mm_cmplt_epi32(a, b);
812 EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a,
const Packet4f& b)
814 return _mm_cmplt_ps(a, b);
818 EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a,
const Packet4f& b)
820 return _mm_cmple_ps(a, b);
824 EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a,
const Packet2d& b)
826 return _mm_cmplt_pd(a, b);
830 EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a,
const Packet2d& b)
832 return _mm_cmple_pd(a, b);
836 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
838 #ifdef EIGEN_VECTORIZE_SSE4_1
839 return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
841 return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
846 EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket,
const Packet4i& thenPacket,
const Packet4i& elsePacket)
848 #ifdef EIGEN_VECTORIZE_SSE4_1
849 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
851 return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
856 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
858 #ifdef EIGEN_VECTORIZE_SSE4_1
859 return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
861 return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
866 EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr,
const Packet4i& index)
868 #ifdef EIGEN_VECTORIZE_AVX2
869 return _mm_i32gather_epi32(addr, index, 4);
872 _mm_storeu_si128((Packet4i*)u, index);
873 return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
878 EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr,
const Packet4i& index)
880 #ifdef EIGEN_VECTORIZE_AVX2
881 return _mm_i32gather_ps(addr, index, 4);
884 _mm_storeu_si128((Packet4i*)u, index);
885 return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
890 EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(
const double* addr,
const Packet4i& index,
bool upperhalf)
892 #ifdef EIGEN_VECTORIZE_AVX2
893 return _mm_i32gather_pd(addr, index, 8);
896 _mm_storeu_si128((Packet4i*)u, index);
899 return _mm_setr_pd(addr[u[2]], addr[u[3]]);
903 return _mm_setr_pd(addr[u[0]], addr[u[1]]);
909 EIGEN_STRONG_INLINE
int pmovemask<Packet4f>(
const Packet4f& a)
911 return _mm_movemask_ps(a);
915 EIGEN_STRONG_INLINE
int pmovemask<Packet2d>(
const Packet2d& a)
917 return _mm_movemask_pd(a);
921 EIGEN_STRONG_INLINE
int pmovemask<Packet4i>(
const Packet4i& a)
923 return pmovemask(_mm_castsi128_ps(a));
927 EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(
const Packet4f& a)
929 #ifdef EIGEN_VECTORIZE_SSE4_1
930 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
932 auto round = _MM_GET_ROUNDING_MODE();
933 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
934 auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
935 _MM_SET_ROUNDING_MODE(round);
941 EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(
const Packet2d& a)
943 #ifdef EIGEN_VECTORIZE_SSE4_1
944 return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
946 auto round = _MM_GET_ROUNDING_MODE();
947 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
948 auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
949 _MM_SET_ROUNDING_MODE(round);
955 EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
957 #ifdef EIGEN_VECTORIZE_SSE4_1
958 return _mm_cmpeq_epi64(a, b);
960 Packet4i c = _mm_cmpeq_epi32(a, b);
961 return pand(c, _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
// 128-bit specialization of pmuluadd64.
// NOTE(review): the declaration of `u` and the statements between the store
// and the reload are not visible in this listing; presumably each 64-bit
// element is replaced by element * b + c — confirm against the complete file.
966 EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(
const Packet4i& a, uint64_t b, uint64_t c)
// Spill the packet to the scratch buffer `u` with an unaligned store.
969 _mm_storeu_si128((__m128i*)u, a);
// Reload the buffer as a packet and return it.
972 return _mm_loadu_si128((__m128i*)u);