#ifndef EIGENRAND_MORE_PACKET_MATH_SSE_H
#define EIGENRAND_MORE_PACKET_MATH_SSE_H
22 struct IsIntPacket<Packet4i> : std::true_type {};
25 struct IsFloatPacket<Packet4f> : std::true_type {};
28 struct IsDoublePacket<Packet2d> : std::true_type {};
31 struct HalfPacket<Packet4i>
33 using type = uint64_t;
36#ifdef EIGEN_VECTORIZE_AVX
39 struct HalfPacket<Packet4f>
45 struct reinterpreter<Packet4i>
47 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
49 return _mm_castsi128_ps(x);
52 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
54 return _mm_castsi128_pd(x);
57 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
64 struct reinterpreter<Packet4f>
66 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
71 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
73 return _mm_castps_pd(x);
76 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
78 return _mm_castps_si128(x);
83 struct reinterpreter<Packet2d>
85 EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
87 return _mm_castpd_ps(x);
90 EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
95 EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
97 return _mm_castpd_si128(x);
102 EIGEN_STRONG_INLINE
void split_two<Packet4i>(
const Packet4i& x, uint64_t& a, uint64_t& b)
104#ifdef EIGEN_VECTORIZE_SSE4_1
105 a = _mm_extract_epi64(x, 0);
106 b = _mm_extract_epi64(x, 1);
109 _mm_storeu_si128((__m128i*)u, x);
115 EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet4i& a,
const Packet4i& b)
117 auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
118 auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
119 sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
120 sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
121 return _mm_or_si128(sa, sb);
125 EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
127 return _mm_set1_epi64x(a);
131 EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
133 return _mm_add_epi64(a, b);
137 EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
139 return _mm_sub_epi64(a, b);
143 EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(
const Packet4i& a,
const Packet4i& b)
145 return _mm_cmpeq_epi32(a, b);
149 EIGEN_STRONG_INLINE Packet4f pcmpeq<Packet4f>(
const Packet4f& a,
const Packet4f& b)
151 return _mm_cmpeq_ps(a, b);
155 struct BitShifter<Packet4i>
158 EIGEN_STRONG_INLINE Packet4i sll(
const Packet4i& a)
160 return _mm_slli_epi32(a, b);
164 EIGEN_STRONG_INLINE Packet4i srl(
const Packet4i& a,
int _b = b)
168 return _mm_srli_epi32(a, b);
172 return _mm_srli_epi32(a, _b);
177 EIGEN_STRONG_INLINE Packet4i sll64(
const Packet4i& a)
179 return _mm_slli_epi64(a, b);
183 EIGEN_STRONG_INLINE Packet4i srl64(
const Packet4i& a)
185 return _mm_srli_epi64(a, b);
190 EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a,
const Packet4i& b)
192 return _mm_cmplt_epi32(a, b);
196 EIGEN_STRONG_INLINE Packet4i pcmplt64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
198#ifdef EIGEN_VECTORIZE_SSE4_2
199 return _mm_cmpgt_epi64(b, a);
202 _mm_storeu_si128((__m128i*)u, a);
203 _mm_storeu_si128((__m128i*)v, b);
204 return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0);
209 EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a,
const Packet4f& b)
211 return _mm_cmplt_ps(a, b);
215 EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a,
const Packet4f& b)
217 return _mm_cmple_ps(a, b);
221 EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a,
const Packet2d& b)
223 return _mm_cmplt_pd(a, b);
227 EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a,
const Packet2d& b)
229 return _mm_cmple_pd(a, b);
233 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
235#ifdef EIGEN_VECTORIZE_SSE4_1
236 return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
238 return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
243 EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4i& ifPacket,
const Packet4f& thenPacket,
const Packet4f& elsePacket)
245 return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
249 EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket,
const Packet4i& thenPacket,
const Packet4i& elsePacket)
251#ifdef EIGEN_VECTORIZE_SSE4_1
252 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
254 return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
259 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
261#ifdef EIGEN_VECTORIZE_SSE4_1
262 return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
264 return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
270 EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet4i& ifPacket,
const Packet2d& thenPacket,
const Packet2d& elsePacket)
272 return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
276 EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr,
const Packet4i& index)
278#ifdef EIGEN_VECTORIZE_AVX2
279 return _mm_i32gather_epi32(addr, index, 4);
282 _mm_storeu_si128((__m128i*)u, index);
283 return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
288 EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr,
const Packet4i& index)
290#ifdef EIGEN_VECTORIZE_AVX2
291 return _mm_i32gather_ps(addr, index, 4);
294 _mm_storeu_si128((__m128i*)u, index);
295 return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
300 EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(
const double* addr,
const Packet4i& index,
bool upperhalf)
302#ifdef EIGEN_VECTORIZE_AVX2
303 return _mm_i32gather_pd(addr, index, 8);
306 _mm_storeu_si128((__m128i*)u, index);
309 return _mm_setr_pd(addr[u[2]], addr[u[3]]);
313 return _mm_setr_pd(addr[u[0]], addr[u[1]]);
319 EIGEN_STRONG_INLINE
int pmovemask<Packet4f>(
const Packet4f& a)
321 return _mm_movemask_ps(a);
325 EIGEN_STRONG_INLINE
int pmovemask<Packet2d>(
const Packet2d& a)
327 return _mm_movemask_pd(a);
331 EIGEN_STRONG_INLINE
int pmovemask<Packet4i>(
const Packet4i& a)
333 return pmovemask((Packet4f)_mm_castsi128_ps(a));
337 EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(
const Packet4f& a)
339#ifdef EIGEN_VECTORIZE_SSE4_1
340 return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
342 auto round = _MM_GET_ROUNDING_MODE();
343 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
344 auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
345 _MM_SET_ROUNDING_MODE(round);
351 EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(
const Packet2d& a)
353#ifdef EIGEN_VECTORIZE_SSE4_1
354 return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
356 auto round = _MM_GET_ROUNDING_MODE();
357 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
358 auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
359 _MM_SET_ROUNDING_MODE(round);
365 EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(
const Packet4i& a,
const Packet4i& b)
367#ifdef EIGEN_VECTORIZE_SSE4_1
368 return _mm_cmpeq_epi64(a, b);
370 Packet4i c = _mm_cmpeq_epi32(a, b);
371 return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
376 EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(
const Packet4i& a, uint64_t b, uint64_t c)
379 _mm_storeu_si128((__m128i*)u, a);
382 return _mm_loadu_si128((__m128i*)u);
385 EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) {
386 x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
387 return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
390 EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) {
391 x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
392 return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
395 EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) {
396 int _mm_rounding = _MM_GET_ROUNDING_MODE();
397 _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
398 x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
399 _MM_SET_ROUNDING_MODE(_mm_rounding);
400 return _mm_sub_epi64(
402 _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
407 EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(
const Packet2d& a)
409 return double_to_int64(a);
413 EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(
const Packet4i& a)
415 return int64_to_double(a);
418 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
419 Packet2d psin<Packet2d>(
const Packet2d& x)
424 template<> EIGEN_STRONG_INLINE
bool predux_all(
const Packet4f& x)
426 return _mm_movemask_ps(x) == 0x0F;
429 template<> EIGEN_STRONG_INLINE
bool predux_all(
const Packet4i& x)
431 return predux_all(_mm_castsi128_ps(x));
434#ifdef EIGENRAND_EIGEN_33_MODE
436 template<> EIGEN_STRONG_INLINE
bool predux_any(
const Packet4f& x)
438 return !!_mm_movemask_ps(x);
441 template<> EIGEN_STRONG_INLINE
bool predux_any(
const Packet4i& x)
443 return predux_any(_mm_castsi128_ps(x));
446 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
447 Packet2d plog<Packet2d>(
const Packet2d& _x)
450 _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f);
451 _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f);
452 _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
454 auto inv_mant_mask = _mm_castsi128_pd(pseti64<Packet4i>(~0x7ff0000000000000));
455 auto min_norm_pos = _mm_castsi128_pd(pseti64<Packet4i>(0x10000000000000));
456 auto minus_inf = _mm_castsi128_pd(pseti64<Packet4i>(0xfff0000000000000));
461 _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524);
462 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2);
463 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1);
464 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1);
465 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1);
466 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1);
467 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1);
468 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1);
469 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1);
470 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1);
471 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4);
472 _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375);
477 Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd());
478 Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
480 x = pmax(x, min_norm_pos);
481 emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52);
484 x = _mm_and_pd(x, inv_mant_mask);
485 x = _mm_or_pd(x, p2d_half);
487 Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1<Packet2d>(1022));
495 Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF);
496 Packet2d tmp = pand(x, mask);
498 e = psub(e, pand(p2d_1, mask));
501 Packet2d x2 = pmul(x, x);
502 Packet2d x3 = pmul(x2, x);
505 y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1);
506 y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4);
507 y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7);
508 y = pmadd(y, x, p2d_cephes_log_p2);
509 y1 = pmadd(y1, x, p2d_cephes_log_p5);
510 y2 = pmadd(y2, x, p2d_cephes_log_p8);
511 y = pmadd(y, x3, y1);
512 y = pmadd(y, x3, y2);
515 y1 = pmul(e, p2d_cephes_log_q1);
516 tmp = pmul(x2, p2d_half);
519 y2 = pmul(e, p2d_cephes_log_q2);
523 return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask));