#ifndef EIGENRAND_MORE_PACKET_MATH_NEON_H
#define EIGENRAND_MORE_PACKET_MATH_NEON_H
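
// NEON specializations of EigenRand's extra packet-math primitives for
// Packet4f / Packet4i / Packet2d: reinterpret casts, comparisons, bit shifts,
// blends, gathers, and Cephes-style log/sin approximations.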
#include <arm_neon.h>

namespace Eigen
{
	namespace internal
	{
#ifdef EIGENRAND_EIGEN_33_MODE
		template<>
		EIGEN_DEVICE_FUNC inline Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a)
		{
			return vcvtq_f32_s32(a);
		}

		template<>
		EIGEN_DEVICE_FUNC inline Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a)
		{
			return vcvtq_s32_f32(a);
		}
#endif

		template<>
		struct IsIntPacket<Packet4i> : std::true_type {};

		template<>
		struct IsFloatPacket<Packet4f> : std::true_type {};

		template<>
		struct IsDoublePacket<Packet2d> : std::true_type {};

		template<>
		struct HalfPacket<Packet4i>
		{
			using type = uint64_t;
		};

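		// reinterpreter<> provides bit-pattern casts between integer, float and double
		// packets; each member maps to the matching vreinterpretq intrinsic (or returns
		// its argument unchanged when the types already match).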
		template<>
		struct reinterpreter<Packet4i>
		{
			EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
			{
				return (Packet4f)vreinterpretq_f32_s32(x);
			}

			EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x)
			{
				return x;
			}

			EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
			{
				return (Packet2d)vreinterpretq_f64_s32(x);
			}
		};

		template<>
		struct reinterpreter<Packet4f>
		{
			EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
			{
				return x;
			}

			EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x)
			{
				return (Packet4i)vreinterpretq_s32_f32(x);
			}

			EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
			{
				return (Packet2d)vreinterpretq_f64_f32(x);
			}
		};

		template<>
		struct reinterpreter<Packet2d>
		{
			EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
			{
				return vreinterpretq_f32_f64(x);
			}

			EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
			{
				return x;
			}

			EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
			{
				return vreinterpretq_s32_f64(x);
			}
		};

		template<>
		EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
		{
			return vreinterpretq_s32_u32(vceqq_s32(a, b));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f pcmpeq<Packet4f>(const Packet4f& a, const Packet4f& b)
		{
			return vreinterpretq_f32_u32(vceqq_f32(a, b));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i pbitnot<Packet4i>(const Packet4i& a)
		{
			return vmvnq_s32(a);
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f pbitnot<Packet4f>(const Packet4f& a)
		{
			return (Packet4f)vreinterpretq_f32_s32(pbitnot((Packet4i)vreinterpretq_s32_f32(a)));
		}

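		// BitShifter implements logical shifts on 32-bit and 64-bit lanes with a
		// compile-time shift count. NEON's vshlq_n/vshrq_n intrinsics only accept
		// immediate counts, so srl falls back to a switch over every possible count
		// when the amount is only known at run time.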
		template<>
		struct BitShifter<Packet4i>
		{
			template<int b>
			EIGEN_STRONG_INLINE Packet4i sll(const Packet4i& a)
			{
				return vreinterpretq_s32_u32(vshlq_n_u32(vreinterpretq_u32_s32(a), b));
			}

			template<int b>
			EIGEN_STRONG_INLINE Packet4i srl(const Packet4i& a, int _b = b)
			{
				if (b > 0)
				{
					return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), b > 0 ? b : 1));
				}
				else if (b == 0)
				{
					return a;
				}
				// b < 0: the count is only known at run time (_b), so dispatch explicitly
				switch (_b)
				{
				case 0: return a;
				case 1: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 1));
				case 2: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 2));
				case 3: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 3));
				case 4: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 4));
				case 5: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 5));
				case 6: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 6));
				case 7: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 7));
				case 8: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 8));
				case 9: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 9));
				case 10: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 10));
				case 11: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 11));
				case 12: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 12));
				case 13: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 13));
				case 14: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 14));
				case 15: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 15));
				case 16: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 16));
				case 17: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 17));
				case 18: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 18));
				case 19: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 19));
				case 20: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 20));
				case 21: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 21));
				case 22: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 22));
				case 23: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 23));
				case 24: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 24));
				case 25: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 25));
				case 26: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 26));
				case 27: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 27));
				case 28: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 28));
				case 29: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 29));
				case 30: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 30));
				case 31: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
				}
				return vdupq_n_s32(0);
			}

			template<int b>
			EIGEN_STRONG_INLINE Packet4i sll64(const Packet4i& a)
			{
				return vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(a), b));
			}

			template<int b>
			EIGEN_STRONG_INLINE Packet4i srl64(const Packet4i& a)
			{
				return vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(a), b));
			}
		};

		template<>
		EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(const Packet4i& a, const Packet4i& b)
		{
			return vreinterpretq_s32_u32(vcltq_s32(a, b));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
		{
			return vreinterpretq_f32_u32(vcltq_f32(a, b));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
		{
			return vreinterpretq_f32_u32(vcleq_f32(a, b));
		}

		template<>
		EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
		{
			return vreinterpretq_f64_u64(vcltq_f64(a, b));
		}

		template<>
		EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
		{
			return vreinterpretq_f64_u64(vcleq_f64(a, b));
		}

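		// pblendv(mask, then, else) selects bits from thenPacket where the mask bit is
		// set and from elsePacket otherwise, via NEON's bit-select intrinsic (vbslq_*).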
		EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
		{
			return vbslq_f32(vreinterpretq_u32_f32(ifPacket), thenPacket, elsePacket);
		}

		EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
		{
			return vbslq_f32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
		}

		EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket)
		{
			return vbslq_s32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
		}

		EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
		{
			return vbslq_f64(vreinterpretq_u64_f64(ifPacket), thenPacket, elsePacket);
		}

		EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
		{
			return vbslq_f64(vreinterpretq_u64_s32(ifPacket), thenPacket, elsePacket);
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
		{
			// no native NEON gather: spill the indices, then load each lane from memory
			int32_t u[4];
			vst1q_s32(u, index);
			const int32_t t[4] = { addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]] };
			return vld1q_s32(t);
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(const float* addr, const Packet4i& index)
		{
			int32_t u[4];
			vst1q_s32(u, index);
			const float t[4] = { addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]] };
			return vld1q_f32(t);
		}

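		// pmovemask emulates SSE's movemask: select {1, 2, 4, 8} for lanes whose mask
		// is set, then horizontally add to pack the lane flags into the low bits of an int.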
		template<>
		EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
		{
			int32_t bits[4] = { 1, 2, 4, 8 };
			auto r = vbslq_s32(vreinterpretq_u32_f32(a), vld1q_s32(bits), vdupq_n_s32(0));
			auto s = vadd_s32(vget_low_s32(r), vget_high_s32(r));
			return vget_lane_s32(vpadd_s32(s, s), 0);
		}

		template<>
		EIGEN_STRONG_INLINE int pmovemask<Packet4i>(const Packet4i& a)
		{
			return pmovemask((Packet4f)vreinterpretq_f32_s32(a));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
		{
			return vrndq_f32(a); // round toward zero
		}

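		// The *64 helpers below treat a Packet4i as two 64-bit lanes by reinterpreting
		// int32x4_t as int64x2_t/uint64x2_t.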
		template<>
		EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(const Packet2d& a)
		{
			return (Packet4i)vcvtq_s64_f64(a);
		}

		template<>
		EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(const Packet4i& a)
		{
			return vcvtq_f64_s64((int64x2_t)a);
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(const Packet4i& a, const Packet4i& b)
		{
			return (Packet4i)vaddq_s64((int64x2_t)a, (int64x2_t)b);
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(const Packet4i& a, const Packet4i& b)
		{
			return (Packet4i)vsubq_s64((int64x2_t)a, (int64x2_t)b);
		}

		template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
		Packet2d psin<Packet2d>(const Packet2d& x)
		{
			// forwards to the generic double-precision sine helper
			return _psin(x);
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
		{
			return vreinterpretq_s32_u64(vdupq_n_u64(a));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(const Packet4i& a, const Packet4i& b)
		{
			return vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(a), vreinterpretq_s64_s32(b)));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(const Packet4i& a, uint64_t b, uint64_t c)
		{
			// per 64-bit lane: a * b + c, computed in scalar code
			uint64_t u[2];
			vst1q_u64(u, vreinterpretq_u64_s32(a));
			u[0] = u[0] * b + c;
			u[1] = u[1] * b + c;
			return vreinterpretq_s32_u64(vld1q_u64(u));
		}

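		// predux_all / predux_any reduce a comparison mask to a single bool:
		// AND + pairwise min for "all lanes set", OR + pairwise max for "any lane set".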
		EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
		{
			uint32x2_t tmp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(x)),
				vget_high_u32(vreinterpretq_u32_f32(x)));
			return vget_lane_u32(vpmin_u32(tmp, tmp), 0);
		}

		EIGEN_STRONG_INLINE bool predux_all(const Packet4i& x)
		{
			return predux_all((Packet4f)vreinterpretq_f32_s32(x));
		}

#ifdef EIGENRAND_EIGEN_33_MODE
		EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
		{
			uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)),
				vget_high_u32(vreinterpretq_u32_f32(x)));
			return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
		}

		EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
		{
			return predux_any((Packet4f)vreinterpretq_f32_s32(x));
		}

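		// plog: Cephes-style natural-log approximation (the polynomial used by Eigen's
		// SSE plog), written with the mask/blend helpers above; returns -inf for 0 and
		// NaN for negative or NaN input.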
		template<>
		EIGEN_STRONG_INLINE Packet4f plog<Packet4f>(const Packet4f& _x)
		{
			_EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
			_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
			_EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);

			const Packet4f p4f_inv_mant_mask = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(~0x7f800000));

			/* the smallest non denormalized float number */
			const Packet4f p4f_min_norm_pos = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0x00800000));
			const Packet4f p4f_minus_inf = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0xff800000));

			/* natural logarithm computed for 4 simultaneous floats */
			_EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);

			Packet4f x = _x;
			Packet4i emm0;

			Packet4f invalid_mask = pbitnot(pcmple(pset1<Packet4f>(0), x));
			Packet4f iszero_mask = pcmpeq(x, pset1<Packet4f>(0));

			x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
			emm0 = BitShifter<Packet4i>{}.template srl<23>((Packet4i)vreinterpretq_s32_f32(x));

			/* keep only the fractional part */
			x = pand(x, p4f_inv_mant_mask);
			x = por(x, p4f_half);

			emm0 = psub(emm0, p4i_0x7f);
			Packet4f e = padd(Packet4f(vcvtq_f32_s32(emm0)), p4f_1);

			/* part2:
			   if( x < SQRTHF ) {
			     e -= 1;
			     x = x + x - 1.0;
			   } else { x = x - 1.0; }
			*/
			Packet4f mask = pcmplt(x, p4f_cephes_SQRTHF);
			Packet4f tmp = pand(x, mask);
			x = psub(x, p4f_1);
			e = psub(e, pand(p4f_1, mask));
			x = padd(x, tmp);

			Packet4f x2 = pmul(x, x);
			Packet4f x3 = pmul(x2, x);

			Packet4f y, y1, y2;
			y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
			y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
			y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
			y = pmadd(y, x, p4f_cephes_log_p2);
			y1 = pmadd(y1, x, p4f_cephes_log_p5);
			y2 = pmadd(y2, x, p4f_cephes_log_p8);
			y = pmadd(y, x3, y1);
			y = pmadd(y, x3, y2);
			y = pmul(y, x3);

			y1 = pmul(e, p4f_cephes_log_q1);
			tmp = pmul(x2, p4f_half);
			y = padd(y, y1);
			x = psub(x, tmp);
			y2 = pmul(e, p4f_cephes_log_q2);
			x = padd(x, y);
			x = padd(x, y2);

			/* x == 0 -> -inf, x < 0 or NaN -> NaN */
			return pblendv(iszero_mask, p4f_minus_inf, por(x, invalid_mask));
		}

		template<>
		EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x)
		{
			return vsqrtq_f32(x);
		}

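		// psin: Cephes-style single-precision sine (range reduction by 4/Pi followed by
		// separate sine/cosine polynomials), following Eigen's SSE psin.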
		template<>
		EIGEN_STRONG_INLINE Packet4f psin<Packet4f>(const Packet4f& _x)
		{
			_EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
			_EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);

			_EIGEN_DECLARE_CONST_Packet4i(1, 1);
			_EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
			_EIGEN_DECLARE_CONST_Packet4i(2, 2);
			_EIGEN_DECLARE_CONST_Packet4i(4, 4);

			const Packet4f p4f_sign_mask = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0x80000000));

			_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
			_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
			_EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
			_EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
			_EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
			_EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
			_EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
			_EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
			_EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
			_EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / Pi

			Packet4f xmm1, xmm2, xmm3, sign_bit, y;
			Packet4i emm0, emm2;

			sign_bit = _x;
			/* take the absolute value */
			Packet4f x = pabs(_x);
			/* extract the sign bit (upper one) */
			sign_bit = pand(sign_bit, p4f_sign_mask);

			/* scale by 4/Pi */
			y = pmul(x, p4f_cephes_FOPI);

			/* store the integer part of y in emm2 */
			emm2 = vcvtq_s32_f32(y);
			/* j=(j+1) & (~1) (see the cephes sources) */
			emm2 = padd(emm2, p4i_1);
			emm2 = pand(emm2, p4i_not1);
			y = vcvtq_f32_s32(emm2);

			/* get the swap sign flag */
			emm0 = pand(emm2, p4i_4);
			emm0 = BitShifter<Packet4i>{}.template sll<29>(emm0);
			/* get the polynom selection mask:
			   one polynom for 0 <= x <= Pi/4 and another for Pi/4 < x <= Pi/2;
			   both branches will be computed */
			emm2 = pand(emm2, p4i_2);
			emm2 = pcmpeq(emm2, pset1<Packet4i>(0));

			Packet4f swap_sign_bit = (Packet4f)vreinterpretq_f32_s32(emm0);
			Packet4f poly_mask = (Packet4f)vreinterpretq_f32_s32(emm2);
			sign_bit = pxor(sign_bit, swap_sign_bit);

			/* the magic pass: "extended precision modular arithmetic"
			   x = ((x - y * DP1) - y * DP2) - y * DP3; */
			xmm1 = pmul(y, p4f_minus_cephes_DP1);
			xmm2 = pmul(y, p4f_minus_cephes_DP2);
			xmm3 = pmul(y, p4f_minus_cephes_DP3);
			x = padd(x, xmm1);
			x = padd(x, xmm2);
			x = padd(x, xmm3);

			/* evaluate the first polynom (0 <= x <= Pi/4) */
			Packet4f z = pmul(x, x);
			y = p4f_coscof_p0;
			y = pmadd(y, z, p4f_coscof_p1);
			y = pmadd(y, z, p4f_coscof_p2);
			y = pmul(y, z);
			y = pmul(y, z);
			Packet4f tmp = pmul(z, p4f_half);
			y = psub(y, tmp);
			y = padd(y, p4f_1);

			/* evaluate the second polynom (Pi/4 <= x <= 0) */
			Packet4f y2 = p4f_sincof_p0;
			y2 = pmadd(y2, z, p4f_sincof_p1);
			y2 = pmadd(y2, z, p4f_sincof_p2);
			y2 = pmul(y2, z);
			y2 = pmul(y2, x);
			y2 = padd(y2, x);

			/* select the correct result from the two polynoms and update the sign */
			y = pblendv(poly_mask, y2, y);
			return pxor(y, sign_bit);
		}