#ifndef EIGENRAND_MORE_PACKET_MATH_AVX_H
#define EIGENRAND_MORE_PACKET_MATH_AVX_H

#include <immintrin.h>

namespace Eigen
{
namespace internal
{
template<>
struct IsIntPacket<Packet8i> : std::true_type {};

template<>
struct HalfPacket<Packet8i>
{
    using type = Packet4i;
};

template<>
struct HalfPacket<Packet8f>
{
    using type = Packet4f;
};

template<>
struct IsFloatPacket<Packet8f> : std::true_type {};

template<>
struct IsDoublePacket<Packet4d> : std::true_type {};
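
// The reinterpreter specializations below expose bitwise casts between the
// AVX float, double, and integer packet types; the casts compile to no
// instructions and only change how the 256-bit register is typed.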
template<>
struct reinterpreter<Packet8i>
{
    EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x)
    {
        return _mm256_castsi256_ps(x);
    }

    EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
    {
        return _mm256_castsi256_pd(x);
    }

    EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x)
    {
        return x;
    }
};

template<>
struct reinterpreter<Packet8f>
{
    EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
    {
        return x;
    }

    EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x)
    {
        return _mm256_castps_pd(x);
    }

    EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x)
    {
        return _mm256_castps_si256(x);
    }
};

template<>
struct reinterpreter<Packet4d>
{
    EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
    {
        return _mm256_castpd_ps(x);
    }

    EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x)
    {
        return x;
    }

    EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x)
    {
        return _mm256_castpd_si256(x);
    }
};
template<>
EIGEN_STRONG_INLINE void split_two<Packet8i>(const Packet8i& x, Packet4i& a, Packet4i& b)
{
    a = _mm256_extractf128_si256(x, 0);
    b = _mm256_extractf128_si256(x, 1);
}

EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b)
{
    return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
}

template<>
EIGEN_STRONG_INLINE void split_two<Packet8f>(const Packet8f& x, Packet4f& a, Packet4f& b)
{
    a = _mm256_extractf128_ps(x, 0);
    b = _mm256_extractf128_ps(x, 1);
}

EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
{
    return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
}
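
// split_two/combine_two are the workhorses of the AVX1 fallback paths below:
// AVX1 has no 256-bit integer ALU, so the 256-bit integer operations are
// emulated by splitting a Packet8i into two 128-bit SSE halves, operating on
// each half, and recombining the results.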
// Collects the even-indexed 32-bit lanes of `a` (the low 32 bits of each
// 64-bit lane) into a single 128-bit packet.
EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
#else
    auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
    return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
#endif
}

template<>
EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
{
    return _mm256_set1_epi64x(a);
}
template<>
EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_add_epi64(a, b);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
#endif
}

template<>
EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_sub_epi64(a, b);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
#endif
}

template<>
EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_cmpeq_epi32(a, b);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
#endif
}
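
// BitShifter wraps the packet shifts with a compile-time shift amount so
// that the immediate forms of the shift instructions can be used.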
template<>
struct BitShifter<Packet8i>
{
    template<int b>
    EIGEN_STRONG_INLINE Packet8i sll(const Packet8i& a)
    {
#ifdef EIGEN_VECTORIZE_AVX2
        return _mm256_slli_epi32(a, b);
#else
        Packet4i a1, a2;
        split_two(a, a1, a2);
        return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
#endif
    }

    template<int b>
    EIGEN_STRONG_INLINE Packet8i srl(const Packet8i& a, int _b = b)
    {
#ifdef EIGEN_VECTORIZE_AVX2
        if (b >= 0)
        {
            return _mm256_srli_epi32(a, b);
        }
        else
        {
            // A negative template argument means the shift amount is only
            // known at runtime; use the function argument instead.
            return _mm256_srli_epi32(a, _b);
        }
#else
        Packet4i a1, a2;
        split_two(a, a1, a2);
        if (b >= 0)
        {
            return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
        }
        else
        {
            return combine_two((Packet4i)_mm_srli_epi32(a1, _b), (Packet4i)_mm_srli_epi32(a2, _b));
        }
#endif
    }

    template<int b>
    EIGEN_STRONG_INLINE Packet8i sll64(const Packet8i& a)
    {
#ifdef EIGEN_VECTORIZE_AVX2
        return _mm256_slli_epi64(a, b);
#else
        Packet4i a1, a2;
        split_two(a, a1, a2);
        return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
#endif
    }

    template<int b>
    EIGEN_STRONG_INLINE Packet8i srl64(const Packet8i& a)
    {
#ifdef EIGEN_VECTORIZE_AVX2
        return _mm256_srli_epi64(a, b);
#else
        Packet4i a1, a2;
        split_two(a, a1, a2);
        return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
#endif
    }
};
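
// A hypothetical usage sketch (not part of the original API surface): a
// compile-time 64-bit right shift can extract the biased exponent field of
// packed doubles, e.g.
//   Packet8i bits = _mm256_castpd_si256(pd);           // raw bits of 4 doubles
//   Packet8i biasedExp = BitShifter<Packet8i>{}.srl64<52>(bits);
// plog below performs exactly this kind of shift via psrl64<52>.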
#ifdef EIGENRAND_EIGEN_33_MODE
template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_add_epi32(a, b);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_sub_epi32(a, b);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_and_si256(a, b);
#else
    // Bitwise ops need no lane splitting on AVX1; borrow the float domain.
    return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_andnot_si256(a, b);
#else
    return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_or_si256(a, b);
#else
    return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
}

template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_xor_si256(a, b);
#else
    return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
#endif
}
#endif
template<>
EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_cmpgt_epi32(b, a);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
#endif
}

template<>
EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_cmpgt_epi64(b, a);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
#endif
}
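
// There is no packed-integer "less-than" intrinsic, so pcmplt is expressed
// as cmpgt with the operands swapped. The AVX1 path of pcmplt64 relies on
// SSE4.2's _mm_cmpgt_epi64, which every AVX-capable CPU also provides.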
template<>
EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
{
    return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
}

template<>
EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
{
    return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
}

template<>
EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
{
    return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
}

template<>
EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
{
    return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
}
EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
{
    return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
}

EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
{
    return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
}

EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
{
    return _mm256_castps_si256(_mm256_blendv_ps(
        _mm256_castsi256_ps(elsePacket),
        _mm256_castsi256_ps(thenPacket),
        _mm256_castsi256_ps(ifPacket)
    ));
}

EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
{
    return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
}

EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
{
    return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
}
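
// pblendv(mask, a, b) yields a in lanes where the mask is all-ones and b
// where it is all-zeros; note that _mm256_blendv_ps/pd take their arguments
// in the opposite (else, then, if) order.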
template<>
EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_i32gather_epi32(addr, index, 4);
#else
    // No hardware gather on AVX1: spill the indices and gather in scalar code.
    uint32_t u[8];
    _mm256_storeu_si256((Packet8i*)u, index);
    return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
        addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
#endif
}

template<>
EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float* addr, const Packet8i& index)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_i32gather_ps(addr, index, 4);
#else
    uint32_t u[8];
    _mm256_storeu_si256((Packet8i*)u, index);
    return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
        addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
#endif
}

// Gathers four doubles. On the scalar path, `upperhalf` selects the high
// four 32-bit indices instead of the low four.
template<>
EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double* addr, const Packet8i& index, bool upperhalf)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
#else
    uint32_t u[8];
    _mm256_storeu_si256((Packet8i*)u, index);
    if (upperhalf)
    {
        return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
    }
    else
    {
        return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
    }
#endif
}
template<>
EIGEN_STRONG_INLINE int pmovemask<Packet8f>(const Packet8f& a)
{
    return _mm256_movemask_ps(a);
}

template<>
EIGEN_STRONG_INLINE int pmovemask<Packet4d>(const Packet4d& a)
{
    return _mm256_movemask_pd(a);
}

template<>
EIGEN_STRONG_INLINE int pmovemask<Packet8i>(const Packet8i& a)
{
    return pmovemask(_mm256_castsi256_ps(a));
}

template<>
EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(const Packet8f& a)
{
    return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}

template<>
EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(const Packet4d& a)
{
    return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
template<>
EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
{
#ifdef EIGEN_VECTORIZE_AVX2
    return _mm256_cmpeq_epi64(a, b);
#else
    Packet4i a1, a2, b1, b2;
    split_two(a, a1, a2);
    split_two(b, b1, b2);
    return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
#endif
}
template<>
EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(const Packet8i& a, uint64_t b, uint64_t c)
{
    // No packed 64-bit multiply before AVX-512; compute a * b + c lane by lane.
    uint64_t u[4];
    _mm256_storeu_si256((__m256i*)u, a);
    u[0] = u[0] * b + c;
    u[1] = u[1] * b + c;
    u[2] = u[2] * b + c;
    u[3] = u[3] * b + c;
    return _mm256_loadu_si256((__m256i*)u);
}
EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
    auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
    return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
}

EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
    x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
    return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
}

EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
    x = _mm256_add_pd(_mm256_floor_pd(x), _mm256_set1_pd(0x0018000000000000));
    return psub64(
        _mm256_castpd_si256(x),
        _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
    );
}
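
// A sketch of the magic-number trick used above: as a double,
// 0x0010000000000000 is 2^52 and 0x0018000000000000 is 1.5 * 2^52. For an
// integer n of sufficiently small magnitude, the bit pattern of the double
// (1.5 * 2^52 + n) is exactly bits(1.5 * 2^52) + n, i.e. n lands in the low
// mantissa bits. The integer can therefore be moved in and out of the
// mantissa with one floating-point add/sub paired with one integer add/sub,
// without AVX-512 conversion instructions. The conversions are exact only
// while the values fit in the mantissa (roughly |n| < 2^51 for the signed
// variants).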
template<>
EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(const Packet4d& a)
{
    return double_to_int64(a);
}

template<>
EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(const Packet8i& a)
{
    return int64_to_double(a);
}
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
Packet4d psin<Packet4d>(const Packet4d& x)
{
    return _psin(x);
}
#ifdef EIGENRAND_EIGEN_33_MODE
template<>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d plog<Packet4d>(const Packet4d& _x)
{
    Packet4d x = _x;
    _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
    _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);

    auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
    auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
    auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));

    // Polynomial coefficients of the cephes log approximation.
    _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
    _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);

    Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not greater-equal is true if x is NaN
    Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);

    // Truncate input values to the minimum positive normal.
    x = pmax(x, min_norm_pos);

    // Extract the biased exponent and recover the unbiased exponent e.
    Packet4d emm0 = uint64_to_double(psrl64<52>(_mm256_castpd_si256(x)));
    Packet4d e = psub(emm0, pset1<Packet4d>(1022));

    // Force the exponent to -1, i.e. map x into [0.5, 1).
    x = _mm256_and_pd(x, inv_mant_mask);
    x = _mm256_or_pd(x, p4d_half);

    // Shift the inputs from [0.5, 1) to [sqrt(1/2), sqrt(2)) and subtract 1,
    // centering the values around 0 for a more stable polynomial evaluation:
    //   if (x < SQRTHF) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; }
    Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
    Packet4d tmp = _mm256_and_pd(x, mask);
    x = psub(x, p4d_1);
    e = psub(e, _mm256_and_pd(p4d_1, mask));
    x = padd(x, tmp);

    Packet4d x2 = pmul(x, x);
    Packet4d x3 = pmul(x2, x);

    // Evaluate the polynomial approximant, split into three independent
    // chains to improve instruction-level parallelism.
    Packet4d y, y1, y2;
    y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
    y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
    y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
    y = pmadd(y, x, p4d_cephes_log_p2);
    y1 = pmadd(y1, x, p4d_cephes_log_p5);
    y2 = pmadd(y2, x, p4d_cephes_log_p8);
    y = pmadd(y, x3, y1);
    y = pmadd(y, x3, y2);
    y = pmul(y, x3);

    // Add the logarithm of the exponent back to the polynomial result.
    y1 = pmul(e, p4d_cephes_log_q1);
    tmp = pmul(x2, p4d_half);
    y = padd(y, y1);
    x = psub(x, tmp);
    y2 = pmul(e, p4d_cephes_log_q2);
    x = padd(x, y);
    x = padd(x, y2);

    // Filter out invalid inputs: negative arguments yield NaN, zero yields -inf.
    return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
}
#endif
#if !(EIGEN_VERSION_AT_LEAST(3,3,5))
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a)
{
    return _mm_cvtepi32_ps(a);
}

template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a)
{
    return _mm_cvttps_epi32(a);
}
#endif
}
}

#endif