#ifndef EIGENRAND_MORE_PACKET_MATH_AVX_H
#define EIGENRAND_MORE_PACKET_MATH_AVX_H

#include <immintrin.h>

namespace Eigen
{
    namespace internal
    {
        template<>
        struct IsIntPacket<Packet8i> : std::true_type {};

        template<>
        struct HalfPacket<Packet8i>
        {
            using type = Packet4i;
        };

        template<>
        struct HalfPacket<Packet8f>
        {
            using type = Packet4f;
        };

        template<>
        struct IsFloatPacket<Packet8f> : std::true_type {};

        template<>
        struct IsDoublePacket<Packet4d> : std::true_type {};
        template<>
        struct reinterpreter<Packet8i>
        {
            EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x)
            {
                return _mm256_castsi256_ps(x);
            }

            EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
            {
                return _mm256_castsi256_pd(x);
            }

            EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x)
            {
                return x;
            }
        };
        template<>
        struct reinterpreter<Packet8f>
        {
            EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x)
            {
                return _mm256_castps_pd(x);
            }

            EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x)
            {
                return _mm256_castps_si256(x);
            }
        };
        template<>
        struct reinterpreter<Packet4d>
        {
            EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
            {
                return _mm256_castpd_ps(x);
            }

            EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x)
            {
                return _mm256_castpd_si256(x);
            }
        };
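        // Note: the reinterpreter<> casts above are pure bit reinterpretations
        // (the vcast* intrinsics compile to no instruction); they do not convert
        // values. A minimal usage sketch, assuming a Packet8i `bits` built
        // elsewhere (e.g. random mantissa bits from a generator):
        //
        //   Packet8i bits = ...;
        //   Packet8f f = reinterpreter<Packet8i>{}.to_float(bits);
        //
        // Value-converting casts are handled separately by pcast64 and
        // double_to_int64 further below.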
        template<>
        EIGEN_STRONG_INLINE void split_two<Packet8i>(const Packet8i& x, Packet4i& a, Packet4i& b)
        {
            a = _mm256_extractf128_si256(x, 0);
            b = _mm256_extractf128_si256(x, 1);
        }
        EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b)
        {
            return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
        }
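        // split_two/combine_two exist because plain AVX (without AVX2) has no
        // 256-bit integer arithmetic: 256-bit integer packets must be split into
        // two 128-bit SSE halves, processed, and recombined. Most integer
        // routines below follow this pattern in their #else branches, e.g.:
        //
        //   Packet4i lo, hi;
        //   split_two(x, lo, hi);              // lo = lanes 0-3, hi = lanes 4-7
        //   // ... operate on lo/hi with SSE intrinsics ...
        //   Packet8i y = combine_two(lo, hi);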
        template<>
        EIGEN_STRONG_INLINE void split_two<Packet8f>(const Packet8f& x, Packet4f& a, Packet4f& b)
        {
            a = _mm256_extractf128_ps(x, 0);
            b = _mm256_extractf128_ps(x, 1);
        }
        EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
        {
            return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
        }
        EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
#else
            auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
            return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
#endif
        }
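        // Sketch of what combine_low32 computes: it gathers the even 32-bit
        // lanes of `a`, i.e. the low half of each 64-bit element. For
        // a = {e0, e1, e2, e3, e4, e5, e6, e7} it returns {e0, e2, e4, e6}.
        // On AVX2 a single cross-lane permute suffices; the AVX1 fallback
        // permutes within each 128-bit lane ({e0,e2,e1,e3} and {e5,e7,e4,e6})
        // and then blends the halves (mask 0b1100 keeps {e0,e2} from the low
        // lane and {e4,e6} from the high lane).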
        template<>
        EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
        {
            return _mm256_set1_epi64x(a);
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_add_epi64(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_sub_epi64(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpeq_epi32(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet8f pcmpeq<Packet8f>(const Packet8f& a, const Packet8f& b)
        {
            return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i pnegate<Packet8i>(const Packet8i& a)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_sub_epi32(pset1<Packet8i>(0), a);
#else
            Packet4i a1, a2;
            split_two(a, a1, a2);
            return combine_two(_mm_sub_epi32(pset1<Packet4i>(0), a1), _mm_sub_epi32(pset1<Packet4i>(0), a2));
#endif
        }
        template<>
        struct BitShifter<Packet8i>
        {
            template<int b>
            EIGEN_STRONG_INLINE Packet8i sll(const Packet8i& a)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                return _mm256_slli_epi32(a, b);
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
#endif
            }

            // A negative compile-time b selects the runtime shift count _b instead.
            template<int b>
            EIGEN_STRONG_INLINE Packet8i srl(const Packet8i& a, int _b = b)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                if (b >= 0)
                {
                    return _mm256_srli_epi32(a, b);
                }
                else
                {
                    return _mm256_srli_epi32(a, _b);
                }
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                if (b >= 0)
                {
                    return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
                }
                else
                {
                    return combine_two((Packet4i)_mm_srli_epi32(a1, _b), (Packet4i)_mm_srli_epi32(a2, _b));
                }
#endif
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet8i sll64(const Packet8i& a)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                return _mm256_slli_epi64(a, b);
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
#endif
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet8i srl64(const Packet8i& a)
            {
#ifdef EIGEN_VECTORIZE_AVX2
                return _mm256_srli_epi64(a, b);
#else
                Packet4i a1, a2;
                split_two(a, a1, a2);
                return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
#endif
            }
        };
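        // Usage sketch for BitShifter, assuming a Packet8i `x` (this mirrors how
        // the generators shift mantissa/exponent bits elsewhere in EigenRand):
        //
        //   BitShifter<Packet8i> bs;
        //   Packet8i hi = bs.srl<23>(x);     // logical >> 23 on each 32-bit lane
        //   Packet8i lo = bs.sll64<13>(x);   // logical << 13 on each 64-bit lane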
#ifdef EIGENRAND_EIGEN_33_MODE
        // Supplements integer packet primitives that Eigen 3.3 does not provide.
        template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
        #ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_add_epi32(a, b);
        #else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
        #endif
        }
        template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
        #ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_sub_epi32(a, b);
        #else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
        #endif
        }
        template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
        #ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_and_si256(a, b);
        #else
            return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
        #endif
        }
        template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
        #ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_andnot_si256(a, b);
        #else
            return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
        #endif
        }
        template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
        #ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_or_si256(a, b);
        #else
            return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
        #endif
        }
        template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
        #ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_xor_si256(a, b);
        #else
            return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
        #endif
        }
        template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
        {
            return !!_mm256_movemask_ps(x);
        }
        template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x)
        {
            return predux_any(_mm256_castsi256_ps(x));
        }
        template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
        {
            return _mm256_movemask_ps(x) == 0xFF;
        }
        template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8i& x)
        {
            return predux_all(_mm256_castsi256_ps(x));
        }
#endif
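        // Sketch of the movemask-based reductions above: _mm256_movemask_ps
        // packs the sign bit of each of the 8 lanes into the low 8 bits of an
        // int, so for a comparison result m = pcmplt(a, b),
        //
        //   predux_any(m)   // true if at least one lane compared true
        //   predux_all(m)   // true only if all 8 lanes compared true (mask == 0xFF)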
        template<>
        EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            // a < b is computed as b > a, since the ISA only provides cmpgt.
            return _mm256_cmpgt_epi32(b, a);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpgt_epi64(b, a);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            // _mm_cmpgt_epi64 requires SSE4.2.
            return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
        {
            return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
        {
            return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
        {
            return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
        {
            return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
        }
        EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
        {
            return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
        }

        EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
        {
            return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
        }
        EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
        {
            return _mm256_castps_si256(_mm256_blendv_ps(
                _mm256_castsi256_ps(elsePacket),
                _mm256_castsi256_ps(thenPacket),
                _mm256_castsi256_ps(ifPacket)
            ));
        }
        EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
        {
            return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
        }

        EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
        {
            return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
        }
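        // pblendv is a per-lane select: lanes whose mask lane has its high bit
        // set take thenPacket, the rest take elsePacket. A minimal sketch,
        // assuming two Packet8f values a and b:
        //
        //   Packet8f mask = pcmplt(a, b);       // all-ones where a < b
        //   Packet8f m = pblendv(mask, a, b);   // per-lane min(a, b)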
        template<>
        EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_i32gather_epi32(addr, index, 4);
#else
            uint32_t u[8];
            _mm256_storeu_si256((Packet8i*)u, index);
            return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
                addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
#endif
        }
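        // Without AVX2's hardware gather, the fallback above spills the index
        // vector to a stack array and performs eight scalar loads. The scale
        // argument of _mm256_i32gather_epi32/_ps/_pd (4 or 8 here) is the byte
        // stride applied per index, i.e. the size of the gathered element type.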
        template<>
        EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float* addr, const Packet8i& index)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_i32gather_ps(addr, index, 4);
#else
            uint32_t u[8];
            _mm256_storeu_si256((Packet8i*)u, index);
            return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
                addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double* addr, const Packet8i& index, bool upperhalf)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
#else
            uint32_t u[8];
            _mm256_storeu_si256((Packet8i*)u, index);
            if (upperhalf)
            {
                return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
            }
            else
            {
                return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
            }
#endif
        }
        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet8f>(const Packet8f& a)
        {
            return _mm256_movemask_ps(a);
        }
        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet4d>(const Packet4d& a)
        {
            return _mm256_movemask_pd(a);
        }
        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet8i>(const Packet8i& a)
        {
            return pmovemask(_mm256_castsi256_ps(a));
        }
        template<>
        EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(const Packet8f& a)
        {
            return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
        }
        template<>
        EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(const Packet4d& a)
        {
            return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
        {
#ifdef EIGEN_VECTORIZE_AVX2
            return _mm256_cmpeq_epi64(a, b);
#else
            Packet4i a1, a2, b1, b2;
            split_two(a, a1, a2);
            split_two(b, b1, b2);
            return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
#endif
        }
        template<>
        EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(const Packet8i& a, uint64_t b, uint64_t c)
        {
            uint64_t u[4];
            _mm256_storeu_si256((__m256i*)u, a);
            u[0] = u[0] * b + c;
            u[1] = u[1] * b + c;
            u[2] = u[2] * b + c;
            u[3] = u[3] * b + c;
            return _mm256_loadu_si256((__m256i*)u);
        }
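        // pmuluadd64 computes u*b + c (mod 2^64) in each 64-bit lane. There is
        // no 64x64->64 vector multiply before AVX-512DQ, so the lanes are
        // processed as scalars. This is exactly the shape of an LCG step; a
        // scalar sketch with arbitrary example constants:
        //
        //   uint64_t lcg_step(uint64_t x) { return x * 6364136223846793005ULL + 1ULL; }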
        EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
            auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
            return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
        }
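        // How uint64_to_double works: 0x0010000000000000 is 2^52, whose double
        // bit pattern is 0x4330000000000000 (exponent bits only). ORing that
        // pattern onto an integer x < 2^52 yields the double 2^52 + x exactly,
        // so subtracting 2^52 recovers x as a double. E.g. x = 7:
        //   bits 0x4330000000000007 == 2^52 + 7  ->  (2^52 + 7) - 2^52 = 7.0
        // Only valid for x < 2^52, which is sufficient here.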
        EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
            x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
            return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
        }
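        // Same trick for signed values: 0x0018000000000000 is 1.5 * 2^52 (bit
        // pattern 0x4338000000000000). Adding that pattern as a 64-bit integer
        // biases x into the mantissa of 1.5 * 2^52, and the final floating-point
        // subtraction removes the bias, so negative values round-trip correctly
        // (valid roughly for |x| < 2^51).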
        EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
            x = _mm256_add_pd(_mm256_floor_pd(x), _mm256_set1_pd(0x0018000000000000));
            return psub64(
                _mm256_castpd_si256(x),
                _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
            );
        }
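        // The reverse direction: after flooring, adding 1.5 * 2^52 pins the
        // exponent so the integer value sits in the mantissa bits; subtracting
        // the magic constant's bit pattern as a 64-bit integer then leaves
        // floor(x) as an int64 in each lane (again only for |x| < 2^51 or so).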
        template<>
        EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(const Packet4d& a)
        {
            return double_to_int64(a);
        }
        template<>
        EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(const Packet8i& a)
        {
            return int64_to_double(a);
        }
        template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
        Packet4d psin<Packet4d>(const Packet4d& x)
        {
            // Delegates to EigenRand's generic vectorized sine approximation.
            return _psin(x);
        }
        #ifdef EIGENRAND_EIGEN_33_MODE
        // Natural logarithm for Packet4d, following the Cephes algorithm used by
        // Eigen's float plog: log(x) = log(mantissa) + exponent * log(2), with
        // log(mantissa) approximated by a polynomial.
        template<>
        EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d plog<Packet4d>(const Packet4d& _x)
        {
            Packet4d x = _x;
            _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
            _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);

            auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
            auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
            auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));

            // Polynomial coefficients (Cephes).
            _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
            _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);
            Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // true if x is NaN or x < 0
            Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);

            // Cut off denormalized values, then split x into exponent and mantissa.
            x = pmax(x, min_norm_pos);

            Packet4d emm0 = uint64_to_double(psrl64<52>(_mm256_castpd_si256(x)));
            Packet4d e = psub(emm0, pset1<Packet4d>(1022));

            // Clear the exponent bits and force the mantissa into [0.5, 1).
            x = _mm256_and_pd(x, inv_mant_mask);
            x = _mm256_or_pd(x, p4d_half);
            // If x < sqrt(1/2): x <- 2x - 1 and e <- e - 1; otherwise x <- x - 1.
            Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
            Packet4d tmp = _mm256_and_pd(x, mask);
            x = psub(x, p4d_1);
            e = psub(e, _mm256_and_pd(p4d_1, mask));
            x = padd(x, tmp);

            Packet4d x2 = pmul(x, x);
            Packet4d x3 = pmul(x2, x);

            // Evaluate the degree-8 polynomial in three interleaved parts
            // to shorten the dependency chain.
            Packet4d y, y1, y2;
            y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
            y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
            y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
            y = pmadd(y, x, p4d_cephes_log_p2);
            y1 = pmadd(y1, x, p4d_cephes_log_p5);
            y2 = pmadd(y2, x, p4d_cephes_log_p8);
            y = pmadd(y, x3, y1);
            y = pmadd(y, x3, y2);
            y = pmul(y, x3);

            // Add the pieces back together: y + e*q1, the -x^2/2 term, and e*q2.
            y1 = pmul(e, p4d_cephes_log_q1);
            tmp = pmul(x2, p4d_half);
            y = padd(y, y1);
            x = psub(x, tmp);
            y2 = pmul(e, p4d_cephes_log_q2);
            x = padd(x, y);
            x = padd(x, y2);

            // log(0) = -inf; negative or NaN input yields NaN via invalid_mask.
            return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
        }
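        // Quick sanity examples for the branches above (a sketch, not a test):
        //   plog(pset1<Packet4d>(1.0))  -> 0.0 in every lane
        //   plog(pset1<Packet4d>(0.0))  -> -inf (iszero_mask path)
        //   plog(pset1<Packet4d>(-1.0)) -> NaN  (invalid_mask path)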
        #if !(EIGEN_VERSION_AT_LEAST(3,3,5))
        template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
            return _mm_cvtepi32_ps(a);
        }

        template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
            return _mm_cvttps_epi32(a);
        }
        #endif
        #endif
    }
}

#endif