12 #ifndef EIGENRAND_MORE_PACKET_MATH_H
13 #define EIGENRAND_MORE_PACKET_MATH_H
15 #include <Eigen/Dense>
21 template<
typename Packet>
26 template<
typename Packet>
27 inline auto reinterpret_to_float(
const Packet& x)
28 -> decltype(reinterpreter<Packet>{}.to_float(x))
30 return reinterpreter<Packet>{}.to_float(x);
33 template<
typename Packet>
34 inline auto reinterpret_to_double(
const Packet& x)
35 -> decltype(reinterpreter<Packet>{}.to_double(x))
37 return reinterpreter<Packet>{}.to_double(x);
40 template<
typename Packet>
41 inline auto reinterpret_to_int(
const Packet& x)
42 -> decltype(reinterpreter<Packet>{}.to_int(x))
44 return reinterpreter<Packet>{}.to_int(x);
47 template<
typename Packet>
48 EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
50 template<
typename Packet>
51 EIGEN_STRONG_INLINE Packet pcmpeq(
const Packet& a,
const Packet& b);
53 template<
typename Packet>
54 EIGEN_STRONG_INLINE Packet psll(
const Packet& a,
int b);
56 template<
typename Packet>
57 EIGEN_STRONG_INLINE Packet psrl(
const Packet& a,
int b);
59 template<
typename Packet>
60 EIGEN_STRONG_INLINE Packet psll64(
const Packet& a,
int b);
62 template<
typename Packet>
63 EIGEN_STRONG_INLINE Packet psrl64(
const Packet& a,
int b);
65 template<
typename Packet>
66 EIGEN_STRONG_INLINE
void psincos(Packet x, Packet &s, Packet &c)
68 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
69 using IntPacket = decltype(reinterpret_to_int(x));
70 IntPacket emm0, emm2, emm4;
76 sign_bit_sin = reinterpret_to_float(
77 pand(reinterpret_to_int(sign_bit_sin), pset1<IntPacket>(0x80000000))
81 y = pmul(x, pset1<Packet>(1.27323954473516));
84 emm2 = pcast<Packet, IntPacket>(y);
87 emm2 = padd(emm2, pset1<IntPacket>(1));
88 emm2 = pand(emm2, pset1<IntPacket>(~1));
89 y = pcast<IntPacket, Packet>(emm2);
94 emm0 = pand(emm2, pset1<IntPacket>(4));
95 emm0 = psll(emm0, 29);
96 Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
99 emm2 = pand(emm2, pset1<IntPacket>(2));
101 emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
102 Packet poly_mask = reinterpret_to_float(emm2);
106 xmm1 = pset1<Packet>(-0.78515625);
107 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
108 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
109 xmm1 = pmul(y, xmm1);
110 xmm2 = pmul(y, xmm2);
111 xmm3 = pmul(y, xmm3);
116 emm4 = psub(emm4, pset1<IntPacket>(2));
117 emm4 = pandnot(emm4, pset1<IntPacket>(4));
118 emm4 = psll(emm4, 29);
119 Packet sign_bit_cos = reinterpret_to_float(emm4);
120 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
124 Packet z = pmul(x, x);
125 y = pset1<Packet>(2.443315711809948E-005);
128 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
130 y = padd(y, pset1<Packet>(4.166664568298827E-002));
133 Packet tmp = pmul(z, pset1<Packet>(0.5));
135 y = padd(y, pset1<Packet>(1));
139 Packet y2 = pset1<Packet>(-1.9515295891E-4);
141 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
143 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
150 Packet ysin2 = pand(xmm3, y2);
151 Packet ysin1 = pandnot(xmm3, y);
152 y2 = psub(y2, ysin2);
155 xmm1 = padd(ysin1, ysin2);
159 s = pxor(xmm1, sign_bit_sin);
160 c = pxor(xmm2, sign_bit_cos);
163 template<
typename Packet>
164 EIGEN_STRONG_INLINE Packet pcmplt(
const Packet& a,
const Packet& b);
166 template<
typename Packet>
167 EIGEN_STRONG_INLINE Packet pcmple(
const Packet& a,
const Packet& b);
169 template<
typename Packet>
170 EIGEN_STRONG_INLINE Packet pblendv(
const Packet& ifPacket,
const Packet& thenPacket,
const Packet& elsePacket);
172 template<
typename Packet>
173 EIGEN_STRONG_INLINE Packet pgather(
const int* addr,
const Packet& index);
175 template<
typename Packet>
176 EIGEN_STRONG_INLINE
auto pgather(
const float* addr,
const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
178 template<
typename Packet>
179 EIGEN_STRONG_INLINE
auto pgather(
const double* addr,
const Packet& index,
bool upperhalf =
false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
183 #ifdef EIGEN_VECTORIZE_AVX
184 #include <immintrin.h>
191 struct reinterpreter<Packet8i>
193 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8i& x)
195 return _mm256_castsi256_ps(x);
198 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8i& x)
200 return _mm256_castsi256_pd(x);
203 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8i& x)
210 struct reinterpreter<Packet8f>
212 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8f& x)
217 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8f& x)
219 return _mm256_castps_pd(x);
222 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8f& x)
224 return _mm256_castps_si256(x);
229 struct reinterpreter<Packet4d>
231 EIGEN_STRONG_INLINE Packet8f to_float(
const Packet4d& x)
233 return _mm256_castpd_ps(x);
236 EIGEN_STRONG_INLINE Packet4d to_double(
const Packet4d& x)
241 EIGEN_STRONG_INLINE Packet8i to_int(
const Packet4d& x)
243 return _mm256_castpd_si256(x);
247 EIGEN_STRONG_INLINE
void split_two(
const Packet8i& x, Packet4i& a, Packet4i& b)
249 a = _mm256_extractf128_si256(x, 0);
250 b = _mm256_extractf128_si256(x, 1);
253 EIGEN_STRONG_INLINE Packet8i combine_two(
const Packet4i& a,
const Packet4i& b)
255 return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
258 EIGEN_STRONG_INLINE
void split_two(
const Packet8f& x, Packet4f& a, Packet4f& b)
260 a = _mm256_extractf128_ps(x, 0);
261 b = _mm256_extractf128_ps(x, 1);
264 EIGEN_STRONG_INLINE Packet8f combine_two(
const Packet4f& a,
const Packet4f& b)
266 return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
270 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet8i& a)
272 #ifdef EIGEN_VECTORIZE_AVX2
273 return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
275 auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
276 return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
281 EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
283 return _mm256_set1_epi64x(a);
287 EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(
const Packet8i& a,
const Packet8i& b)
289 #ifdef EIGEN_VECTORIZE_AVX2
290 return _mm256_cmpeq_epi32(a, b);
292 Packet4i a1, a2, b1, b2;
293 split_two(a, a1, a2);
294 split_two(b, b1, b2);
295 return combine_two(_mm_cmpeq_epi32(a1, b1), _mm_cmpeq_epi32(a2, b2));
300 EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(
const Packet8i& a,
int b)
302 #ifdef EIGEN_VECTORIZE_AVX2
303 return _mm256_slli_epi32(a, b);
306 split_two(a, a1, a2);
307 return combine_two(_mm_slli_epi32(a1, b), _mm_slli_epi32(a2, b));
312 EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(
const Packet8i& a,
int b)
314 #ifdef EIGEN_VECTORIZE_AVX2
315 return _mm256_srli_epi32(a, b);
318 split_two(a, a1, a2);
319 return combine_two(_mm_srli_epi32(a1, b), _mm_srli_epi32(a2, b));
324 EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(
const Packet8i& a,
int b)
326 #ifdef EIGEN_VECTORIZE_AVX2
327 return _mm256_slli_epi64(a, b);
330 split_two(a, a1, a2);
331 return combine_two(_mm_slli_epi64(a1, b), _mm_slli_epi64(a2, b));
336 EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(
const Packet8i& a,
int b)
338 #ifdef EIGEN_VECTORIZE_AVX2
339 return _mm256_srli_epi64(a, b);
342 split_two(a, a1, a2);
343 return combine_two(_mm_srli_epi64(a1, b), _mm_srli_epi64(a2, b));
347 template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(
const Packet8i& a,
const Packet8i& b)
349 #ifdef EIGEN_VECTORIZE_AVX2
350 return _mm256_add_epi32(a, b);
352 Packet4i a1, a2, b1, b2;
353 split_two(a, a1, a2);
354 split_two(b, b1, b2);
355 return combine_two(_mm_add_epi32(a1, b1), _mm_add_epi32(a2, b2));
359 template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(
const Packet8i& a,
const Packet8i& b)
361 #ifdef EIGEN_VECTORIZE_AVX2
362 return _mm256_sub_epi32(a, b);
364 Packet4i a1, a2, b1, b2;
365 split_two(a, a1, a2);
366 split_two(b, b1, b2);
367 return combine_two(_mm_sub_epi32(a1, b1), _mm_sub_epi32(a2, b2));
371 template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(
const Packet8i& a,
const Packet8i& b)
373 #ifdef EIGEN_VECTORIZE_AVX2
374 return _mm256_and_si256(a, b);
376 return reinterpret_to_int(_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
380 template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(
const Packet8i& a,
const Packet8i& b)
382 #ifdef EIGEN_VECTORIZE_AVX2
383 return _mm256_andnot_si256(a, b);
385 return reinterpret_to_int(_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
389 template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(
const Packet8i& a,
const Packet8i& b)
391 #ifdef EIGEN_VECTORIZE_AVX2
392 return _mm256_or_si256(a, b);
394 return reinterpret_to_int(_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
398 template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(
const Packet8i& a,
const Packet8i& b)
400 #ifdef EIGEN_VECTORIZE_AVX2
401 return _mm256_xor_si256(a, b);
403 return reinterpret_to_int(_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
408 EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(
const Packet8i& a,
const Packet8i& b)
410 return _mm256_cmpgt_epi32(b, a);
414 EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(
const Packet8f& a,
const Packet8f& b)
416 return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
420 EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(
const Packet8f& a,
const Packet8f& b)
422 return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
426 EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(
const Packet4d& a,
const Packet4d& b)
428 return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
432 EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(
const Packet4d& a,
const Packet4d& b)
434 return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
438 EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8f& ifPacket,
const Packet8f& thenPacket,
const Packet8f& elsePacket)
440 return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
444 EIGEN_STRONG_INLINE Packet8i pblendv(
const Packet8i& ifPacket,
const Packet8i& thenPacket,
const Packet8i& elsePacket)
446 return _mm256_castps_si256(_mm256_blendv_ps(
447 _mm256_castsi256_ps(elsePacket),
448 _mm256_castsi256_ps(thenPacket),
449 _mm256_castsi256_ps(ifPacket)
454 EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet4d& ifPacket,
const Packet4d& thenPacket,
const Packet4d& elsePacket)
456 return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
460 EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(
const int* addr,
const Packet8i& index)
462 #ifdef EIGEN_VECTORIZE_AVX2
463 return _mm256_i32gather_epi32(addr, index, 4);
466 _mm256_storeu_si256((Packet8i*)u, index);
467 return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
468 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
473 EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(
const float *addr,
const Packet8i& index)
475 #ifdef EIGEN_VECTORIZE_AVX2
476 return _mm256_i32gather_ps(addr, index, 4);
479 _mm256_storeu_si256((Packet8i*)u, index);
480 return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
481 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
486 EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(
const double *addr,
const Packet8i& index,
bool upperhalf)
488 #ifdef EIGEN_VECTORIZE_AVX2
489 return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
492 _mm256_storeu_si256((Packet8i*)u, index);
495 return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
499 return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
507 #ifdef EIGEN_VECTORIZE_SSE2
508 #include <xmmintrin.h>
515 struct reinterpreter<Packet4i>
517 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
519 return _mm_castsi128_ps(x);
522 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
524 return _mm_castsi128_pd(x);
527 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
534 struct reinterpreter<Packet4f>
536 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
541 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
543 return _mm_castps_pd(x);
546 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
548 return _mm_castps_si128(x);
553 struct reinterpreter<Packet2d>
555 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
557 return _mm_castpd_ps(x);
560 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
565 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
567 return _mm_castpd_si128(x);
571 EIGEN_STRONG_INLINE
void split_two(
const Packet4i& x, uint64_t& a, uint64_t& b)
573 #ifdef EIGEN_VECTORIZE_SSE4_1
574 a = _mm_extract_epi64(x, 0);
575 b = _mm_extract_epi64(x, 1);
578 _mm_storeu_si128((__m128i*)u, x);
584 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet4i& a,
const Packet4i& b)
586 auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
587 auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
588 sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
589 sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
590 return _mm_or_si128(sa, sb);
594 EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
596 return _mm_set1_epi64x(a);
600 Packet4i pcmpeq<Packet4i>(
const Packet4i& a,
const Packet4i& b)
602 return _mm_cmpeq_epi32(a, b);
606 Packet4i psll<Packet4i>(
const Packet4i& a,
int b)
608 return _mm_slli_epi32(a, b);
612 Packet4i psrl<Packet4i>(
const Packet4i& a,
int b)
614 return _mm_srli_epi32(a, b);
619 Packet4i psll64<Packet4i>(
const Packet4i& a,
int b)
621 return _mm_slli_epi64(a, b);
625 Packet4i psrl64<Packet4i>(
const Packet4i& a,
int b)
627 return _mm_srli_epi64(a, b);
631 EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a,
const Packet4i& b)
633 return _mm_cmplt_epi32(a, b);
637 EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a,
const Packet4f& b)
639 return _mm_cmplt_ps(a, b);
643 EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a,
const Packet4f& b)
645 return _mm_cmple_ps(a, b);
649 EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a,
const Packet2d& b)
651 return _mm_cmplt_pd(a, b);
655 EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a,
const Packet2d& b)
657 return _mm_cmple_pd(a, b);
661 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
663 #ifdef XXX_EIGEN_VECTORIZE_SSE4_1
664 return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
666 return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
671 EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket,
const Packet4i& thenPacket,
const Packet4i& elsePacket)
673 #ifdef XXX_EIGEN_VECTORIZE_SSE4_1
674 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
676 return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
681 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
683 #ifdef XXX_EIGEN_VECTORIZE_SSE4_1
684 return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
686 return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
691 EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr,
const Packet4i& index)
693 #ifdef EIGEN_VECTORIZE_AVX2
694 return _mm_i32gather_epi32(addr, index, 4);
697 _mm_storeu_si128((Packet4i*)u, index);
698 return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
703 EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr,
const Packet4i& index)
705 #ifdef EIGEN_VECTORIZE_AVX2
706 return _mm_i32gather_ps(addr, index, 4);
709 _mm_storeu_si128((Packet4i*)u, index);
710 return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
715 EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(
const double* addr,
const Packet4i& index,
bool upperhalf)
717 #ifdef EIGEN_VECTORIZE_AVX2
718 return _mm_i32gather_pd(addr, index, 8);
721 _mm_storeu_si128((Packet4i*)u, index);
724 return _mm_setr_pd(addr[u[2]], addr[u[3]]);
728 return _mm_setr_pd(addr[u[0]], addr[u[1]]);