12#ifndef EIGENRAND_MORE_PACKET_MATH_NEON_H 
   13#define EIGENRAND_MORE_PACKET_MATH_NEON_H 
   18#ifdef EIGENRAND_EIGEN_33_MODE 
   // pcast specialization Packet4i -> Packet4f.
   // Converts each signed 32-bit integer lane to float by value (vcvtq_f32_s32);
   // this is a numeric conversion, not a bit reinterpretation.
   24        EIGEN_DEVICE_FUNC 
inline Packet4f pcast<Packet4i, Packet4f>(
const Packet4i& a)
 
   26            return vcvtq_f32_s32(a);
 
   // pcast specialization Packet4f -> Packet4i.
   // Converts each float lane to a signed 32-bit integer by value (vcvtq_s32_f32,
   // which rounds toward zero per the Arm intrinsics reference).
   30        EIGEN_DEVICE_FUNC 
inline Packet4i pcast<Packet4f, Packet4i>(
const Packet4f& a)
 
   32            return vcvtq_s32_f32(a);
 
   // Trait specialization: flags Packet4i as an integer packet (inherits std::true_type).
   44        struct IsIntPacket<Packet4i> : std::true_type {};
 
   // Trait specialization: flags Packet4f as a float packet (inherits std::true_type).
   47        struct IsFloatPacket<Packet4f> : std::true_type {};
 
   // Trait specialization: flags Packet2d as a double packet (inherits std::true_type).
   50        struct IsDoublePacket<Packet2d> : std::true_type {};
 
   // HalfPacket specialization for Packet4i: exposes uint64_t as its nested `type`.
   // NOTE(review): presumably uint64_t stands for one 64-bit half of the 128-bit
   // register -- confirm against the primary template, which is not visible here.
   53        struct HalfPacket<Packet4i>
 
   55            using type = uint64_t;
 
   // reinterpreter specialization for Packet4i: re-types the same 128 bits as
   // another packet type without changing any bit (vreinterpretq_* intrinsics).
   59        struct reinterpreter<Packet4i>
 
   // Views the four int32 lanes as four float lanes (bit pattern preserved).
   // NOTE(review): the C-style (Packet4f) cast looks redundant if Packet4f is
   // float32x4_t -- confirm before removing.
   61            EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
 
   63                return (Packet4f)vreinterpretq_f32_s32(x);
 
   // Packet4i -> Packet4i. NOTE(review): the body is not visible in this listing;
   // presumably it returns x unchanged -- confirm.
   66            EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
 
   // Views the same 128 bits as two double lanes (bit pattern preserved).
   71            EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
 
   73                return (Packet2d)vreinterpretq_f64_s32(x);
 
   // reinterpreter specialization for Packet4f: re-types the same 128 bits as
   // another packet type without changing any bit (vreinterpretq_* intrinsics).
   78        struct reinterpreter<Packet4f>
 
   // Packet4f -> Packet4f. NOTE(review): the body is not visible in this listing;
   // presumably it returns x unchanged -- confirm.
   80            EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
 
   // Views the four float lanes as four int32 lanes (bit pattern preserved,
   // no float-to-int value conversion).
   85            EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
 
   87                return (Packet4i)vreinterpretq_s32_f32(x);
 
   // Views the same 128 bits as two double lanes (bit pattern preserved).
   90            EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
 
   92                return (Packet2d)vreinterpretq_f64_f32(x);
 
   // reinterpreter specialization for Packet2d: re-types the same 128 bits as
   // another packet type without changing any bit (vreinterpretq_* intrinsics).
   97        struct reinterpreter<Packet2d>
 
   // Views the two double lanes as four float lanes (bit pattern preserved).
   99            EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
 
  101                return vreinterpretq_f32_f64(x);
 
   // Packet2d -> Packet2d. NOTE(review): the body is not visible in this listing;
   // presumably it returns x unchanged -- confirm.
  104            EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
 
   // Views the two double lanes as four int32 lanes (bit pattern preserved).
  109            EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
 
  111                return vreinterpretq_s32_f64(x);
 
  // Lane-wise equality of two int32 packets.
  // vceqq_s32 produces an unsigned mask per lane (all bits set where a == b,
  // zero otherwise); the mask is returned re-typed as Packet4i.
  116        EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
  118            return vreinterpretq_s32_u32(vceqq_s32(a, b));
 
  // Lane-wise equality of two float packets.
  // vceqq_f32 produces an unsigned mask per lane (all bits set where a == b,
  // zero otherwise); the mask bits are returned re-typed as Packet4f.
  122        EIGEN_STRONG_INLINE Packet4f pcmpeq<Packet4f>(
const Packet4f& a, 
const Packet4f& b)
 
  124            return vreinterpretq_f32_u32(vceqq_f32(a, b));
 
  // pbitnot specialization for Packet4i.
  // NOTE(review): the body is not visible in this listing; by name it should
  // return the bitwise complement of a -- confirm against the full header.
  128        EIGEN_STRONG_INLINE Packet4i pbitnot<Packet4i>(
const Packet4i& a)
 
  // pbitnot specialization for Packet4f.
  // Reinterprets the float bits as int32 lanes, delegates to the Packet4i
  // pbitnot, and reinterprets the result back to float lanes.
  134        EIGEN_STRONG_INLINE Packet4f pbitnot<Packet4f>(
const Packet4f& a)
 
  136            return (Packet4f)vreinterpretq_f32_s32(pbitnot((Packet4i)vreinterpretq_s32_f32(a)));
 
  // BitShifter specialization for Packet4i: logical (zero-filling) shifts on
  // 32-bit and 64-bit lanes. All shifts go through the unsigned intrinsics, so
  // no sign extension takes place.
  // `b` is the integer argument of each member template; callers in this file
  // write e.g. `.template srl<23>(...)` and `.template sll<29>(...)`.
  // NOTE(review): the `template<int b>` headers and braces are missing from
  // this listing.
  140        struct BitShifter<Packet4i>
 
  // Shift every 32-bit lane left by the compile-time amount b.
  143            EIGEN_STRONG_INLINE Packet4i sll(
const Packet4i& a)
 
  145                return vreinterpretq_s32_u32(vshlq_n_u32(vreinterpretq_u32_s32(a), b));
 
  // Shift every 32-bit lane right (logical). _b is a run-time amount that
  // defaults to the compile-time b.
  // The first return uses b as the immediate; `b > 0 ? b : 1` keeps the
  // immediate legal for vshrq_n_u32 even when b <= 0.
  // The case labels below repeat the shift with literal immediates 1..31
  // because vshrq_n_u32 needs a compile-time constant; the final return
  // yields an all-zero packet.
  // NOTE(review): the branch condition, the switch selector (presumably _b)
  // and any `case 0` label are not visible in this listing -- confirm.
  149            EIGEN_STRONG_INLINE Packet4i srl(
const Packet4i& a, 
int _b = b)
 
  153                    return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), b > 0 ? b : 1));
 
  160                    case 1: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 1));
 
  161                    case 2: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 2));
 
  162                    case 3: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 3));
 
  163                    case 4: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 4));
 
  164                    case 5: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 5));
 
  165                    case 6: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 6));
 
  166                    case 7: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 7));
 
  167                    case 8: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 8));
 
  168                    case 9: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 9));
 
  169                    case 10: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 10));
 
  170                    case 11: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 11));
 
  171                    case 12: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 12));
 
  172                    case 13: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 13));
 
  173                    case 14: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 14));
 
  174                    case 15: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 15));
 
  175                    case 16: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 16));
 
  176                    case 17: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 17));
 
  177                    case 18: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 18));
 
  178                    case 19: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 19));
 
  179                    case 20: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 20));
 
  180                    case 21: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 21));
 
  181                    case 22: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 22));
 
  182                    case 23: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 23));
 
  183                    case 24: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 24));
 
  184                    case 25: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 25));
 
  185                    case 26: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 26));
 
  186                    case 27: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 27));
 
  187                    case 28: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 28));
 
  188                    case 29: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 29));
 
  189                    case 30: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 30));
 
  190                    case 31: 
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
 
  192                    return vdupq_n_s32(0);
 
  // Shift left by b, treating the register as two 64-bit lanes.
  197            EIGEN_STRONG_INLINE Packet4i sll64(
const Packet4i& a)
 
  199                return vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(a), b));
 
  // Shift right (logical) by b, treating the register as two 64-bit lanes.
  203            EIGEN_STRONG_INLINE Packet4i srl64(
const Packet4i& a)
 
  205                return vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(a), b));
 
  // Lane-wise signed "a < b" on int32 packets.
  // vcltq_s32 yields an all-ones / all-zeros mask per lane, returned re-typed
  // as Packet4i.
  210        EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
  212            return vreinterpretq_s32_u32(vcltq_s32(a, b));
 
  // Lane-wise "a < b" on float packets.
  // vcltq_f32 yields an all-ones / all-zeros mask per lane, returned with its
  // bits re-typed as Packet4f.
  216        EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a, 
const Packet4f& b)
 
  218            return vreinterpretq_f32_u32(vcltq_f32(a, b));
 
  // Lane-wise "a <= b" on float packets.
  // vcleq_f32 yields an all-ones / all-zeros mask per lane, returned with its
  // bits re-typed as Packet4f.
  222        EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a, 
const Packet4f& b)
 
  224            return vreinterpretq_f32_u32(vcleq_f32(a, b));
 
  // Lane-wise "a < b" on double packets.
  // vcltq_f64 yields a 64-bit all-ones / all-zeros mask per lane, returned
  // with its bits re-typed as Packet2d.
  228        EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a, 
const Packet2d& b)
 
  230            return vreinterpretq_f64_u64(vcltq_f64(a,b));
 
  // Lane-wise "a <= b" on double packets.
  // vcleq_f64 yields a 64-bit all-ones / all-zeros mask per lane, returned
  // with its bits re-typed as Packet2d.
  234        EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a, 
const Packet2d& b)
 
  236            return vreinterpretq_f64_u64(vcleq_f64(a,b));
 
  // Select between two float packets using a float-typed mask.
  // vbslq_f32 is a bitwise select: each result bit comes from thenPacket where
  // the corresponding mask bit is 1 and from elsePacket where it is 0, so a
  // mask of all-ones / all-zeros lanes gives a per-lane select.
  240        EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket, 
const Packet4f& thenPacket, 
const Packet4f& elsePacket)
 
  242            return vbslq_f32(vreinterpretq_u32_f32(ifPacket), thenPacket, elsePacket);
 
  // Select between two float packets using an int32-typed mask.
  // Same bitwise select as the float-mask overload; only the mask's static
  // type differs (it is re-typed to uint32 lanes for vbslq_f32).
  246        EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4i& ifPacket, 
const Packet4f& thenPacket, 
const Packet4f& elsePacket)
 
  248            return vbslq_f32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
 
  // Select between two int32 packets using an int32-typed mask.
  // vbslq_s32 takes each result bit from thenPacket where the mask bit is 1
  // and from elsePacket where it is 0.
  252        EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket, 
const Packet4i& thenPacket, 
const Packet4i& elsePacket)
 
  254            return vbslq_s32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
 
  // Select between two double packets using a double-typed mask.
  // vbslq_f64 takes each result bit from thenPacket where the mask bit is 1
  // and from elsePacket where it is 0.
  258        EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket, 
const Packet2d& thenPacket, 
const Packet2d& elsePacket)
 
  260            return vbslq_f64(vreinterpretq_u64_f64(ifPacket), thenPacket, elsePacket);
 
  // Select between two double packets using an int32-typed mask.
  // The mask is re-typed as two 64-bit lanes before the bitwise select, so for
  // a whole-double select both 32-bit halves of each 64-bit lane must carry
  // the same mask value.
  264        EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet4i& ifPacket, 
const Packet2d& thenPacket, 
const Packet2d& elsePacket)
 
  266            return vbslq_f64(vreinterpretq_u64_s32(ifPacket), thenPacket, elsePacket);
 
  // pgather for int data: takes a base pointer and a Packet4i of indices and
  // returns a Packet4i.
  // NOTE(review): the body is not visible in this listing; presumably lane i
  // is loaded from addr[index[i]] -- confirm against the full header.
  270        EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr, 
const Packet4i& index)
 
  // pgather for float data: takes a base pointer and a Packet4i of indices and
  // returns a Packet4f (the template argument names the index packet type).
  // NOTE(review): the body is not visible in this listing; presumably lane i
  // is loaded from addr[index[i]] -- confirm against the full header.
  283        EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr, 
const Packet4i& index)
 
  // Packs a 4-lane mask into the low four bits of an int: lane i contributes
  // the value (1 << i).
  // Assumes each lane of a is an all-ones or all-zeros mask; the select below
  // is bitwise, so a partial mask would only pass the overlapping bits.
  296        EIGEN_STRONG_INLINE 
int pmovemask<Packet4f>(
const Packet4f& a)
 
  // Per-lane weights 1, 2, 4, 8.
  // NOTE(review): this array is rebuilt on every call; consider making it
  // static const if that matters on the hot path.
  298            int32_t bits[4] = { 1, 2, 4, 8 };
 
  // Keep the weight in lanes whose mask is set, zero elsewhere.
  299            auto r = vbslq_s32(vreinterpretq_u32_f32(a), vld1q_s32(bits), vdupq_n_s32(0));
 
  // Horizontal sum of the four lanes: low half + high half, then pairwise add.
  300            auto s = vadd_s32(vget_low_s32(r), vget_high_s32(r));
 
  301            return vget_lane_s32(vpadd_s32(s, s), 0);
 
  // pmovemask for an int32 mask packet: re-types the bits as Packet4f and
  // forwards to the Packet4f overload.
  305        EIGEN_STRONG_INLINE 
int pmovemask<Packet4i>(
const Packet4i& a)
 
  307            return pmovemask((Packet4f)vreinterpretq_f32_s32(a));
 
  // ptruncate specialization for Packet4f.
  // NOTE(review): the body is not visible in this listing; by name it should
  // round each lane toward zero -- confirm against the full header.
  311        EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(
const Packet4f& a)
 
  // Converts two double lanes to two signed 64-bit integers by value
  // (vcvtq_s64_f64) and returns the 128-bit result typed as Packet4i.
  // NOTE(review): the C-style (Packet4i) cast re-types int64x2_t via the
  // compiler's vector-cast extension; vreinterpretq_s32_s64 is the intrinsic
  // spelling used elsewhere in this file -- consider switching.
  317        EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(
const Packet2d& a)
 
  319            return (Packet4i)vcvtq_s64_f64(a);
 
  // Treats the Packet4i register as two signed 64-bit integers and converts
  // them to two doubles by value (vcvtq_f64_s64).
  // NOTE(review): the C-style (int64x2_t) cast relies on the compiler's
  // vector-cast extension; vreinterpretq_s64_s32 is the intrinsic spelling.
  323        EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(
const Packet4i& a)
 
  325            return vcvtq_f64_s64((int64x2_t)a);
 
  // Adds a and b as two 64-bit integer lanes (vaddq_s64) and returns the
  // result typed as Packet4i.
  // NOTE(review): the C-style vector casts could be written with
  // vreinterpretq_s64_s32 / vreinterpretq_s32_s64.
  330        EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
  332            return (Packet4i)vaddq_s64((int64x2_t)a, (int64x2_t)b);
 
  // Subtracts b from a as two 64-bit integer lanes (vsubq_s64) and returns the
  // result typed as Packet4i.
  // NOTE(review): the C-style vector casts could be written with
  // vreinterpretq_s64_s32 / vreinterpretq_s32_s64.
  336        EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
  338            return (Packet4i)vsubq_s64((int64x2_t)a, (int64x2_t)b);
 
  // psin specialization for Packet2d.
  // NOTE(review): the body is not visible in this listing, so nothing about
  // the algorithm or its accuracy can be stated from here.
  341        template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 
  342            Packet2d psin<Packet2d>(
const Packet2d& x)
 
  // Broadcasts the 64-bit value a into both 64-bit lanes (vdupq_n_u64) and
  // returns the register typed as Packet4i.
  348        EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
 
  350            return vreinterpretq_s32_u64(vdupq_n_u64(a));
 
  // Equality compare on two 64-bit lanes: a and b are viewed as int64 pairs,
  // vceqq_s64 yields a 64-bit all-ones / all-zeros mask per lane, and the mask
  // is returned typed as Packet4i.
  354        EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
  356            return vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(a), vreinterpretq_s64_s32(b)));
 
  // pmuluadd64: combines the two 64-bit lanes of a with the scalars b and c.
  // The visible code spills a to a scalar uint64_t buffer u and reloads it.
  // NOTE(review): the declaration of u and the per-element arithmetic are not
  // visible in this listing; by name it is presumably u[i] = u[i] * b + c in
  // unsigned 64-bit arithmetic -- confirm against the full header.
  360        EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(
const Packet4i& a, uint64_t b, uint64_t c)
 
  // Store both 64-bit lanes to memory so they can be processed as scalars.
  363            vst1q_u64(u, vreinterpretq_u64_s32(a));
 
  // Reload the buffer as the result packet.
  366            return vreinterpretq_s32_u64(vld1q_u64(u));
 
  // Reduces a float-typed mask packet to a single bool.
  // The low and high 64-bit halves are ANDed (lane0 & lane2, lane1 & lane3),
  // the pairwise minimum of those two values is taken, and the result is
  // converted to bool, so it is true only when both ANDed values are nonzero.
  // With all-ones / all-zeros lane masks that means "every lane is set".
  370        EIGEN_STRONG_INLINE 
bool predux_all(
const Packet4f& x)
 
  372            uint32x2_t tmp = vand_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
 
  373                                        vget_high_u32(vreinterpretq_u32_f32(x)));
 
  374            return vget_lane_u32(vpmin_u32(tmp, tmp), 0);
 
  // predux_all for an int32 mask packet: re-types the bits as Packet4f and
  // forwards to the Packet4f overload.
  378        EIGEN_STRONG_INLINE 
bool predux_all(
const Packet4i& x)
 
  380            return predux_all((Packet4f)vreinterpretq_f32_s32(x));
 
  383    #ifdef EIGENRAND_EIGEN_33_MODE 
  // Reduces a float-typed mask packet to a single bool.
  // The low and high 64-bit halves are ORed (lane0 | lane2, lane1 | lane3),
  // the pairwise maximum of those two values is taken, and the result is
  // converted to bool, so it is true when any bit of any lane is set.
  385        EIGEN_STRONG_INLINE 
bool predux_any(
const Packet4f& x)
 
  387            uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
 
  388                                        vget_high_u32(vreinterpretq_u32_f32(x)));
 
  389            return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
 
  // predux_any for an int32 mask packet: re-types the bits as Packet4f and
  // forwards to the Packet4f overload.
  393        EIGEN_STRONG_INLINE 
bool predux_any(
const Packet4i& x)
 
  395            return predux_any((Packet4f)vreinterpretq_f32_s32(x));
 
  // plog specialization for Packet4f: logarithm per float lane, built from the
  // coefficient set named cephes_log_p0..p8 / q1 / q2 below.
  // Visible result contract (see the final return): lanes equal to zero yield
  // -inf; lanes that fail `0 <= x` (negative or NaN) have all bits ORed in.
  // NOTE(review): this listing omits several body lines (the declarations of
  // x, emm0, y, y1, y2, the adjustments of x after the SQRTHF compare, and the
  // final accumulation into x). The comments describe only what is visible.
  399        EIGEN_STRONG_INLINE Packet4f plog<Packet4f>(
const Packet4f& _x)
 
  402            _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
 
  403            _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
 
  // 0x7f = 127, used below to remove the bias from the extracted exponent field.
  404            _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
 
  // Mask that clears the 8 exponent bits (0x7f800000) and keeps sign + mantissa.
  406            const Packet4f p4f_inv_mant_mask = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(~0x7f800000));
 
  // Bit pattern 0x00800000: the smallest positive normalized float.
  409            const Packet4f p4f_min_norm_pos = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0x00800000));
 
  // Bit pattern 0xff800000: negative infinity.
  // NOTE(review): 0xff800000 is an unsigned literal; if pset1<Packet4i> takes
  // an int this is a narrowing conversion -- confirm it is intended.
  410            const Packet4f p4f_minus_inf = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0xff800000));
 
  // sqrt(1/2): threshold for the mantissa adjustment below.
  415            _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
 
  // Polynomial coefficients p0..p8.
  416            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
 
  417            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
 
  418            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
 
  419            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
 
  420            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
 
  421            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
 
  422            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
 
  423            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
 
  424            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
 
  // q1 + q2 = 0.693147... (ln 2), split into a small and a large part.
  425            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
 
  426            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
 
  // Lanes where `0 <= x` is false (negative inputs and NaN).
  431            Packet4f invalid_mask = pbitnot(pcmple(pset1<Packet4f>(0), x)); 
 
  // Lanes where x == 0; these are replaced by -inf in the final return.
  432            Packet4f iszero_mask = pcmpeq(x, pset1<Packet4f>(0));
 
  // Clamp to the smallest normalized value so the exponent field is meaningful.
  434            x = pmax(x, p4f_min_norm_pos);  
 
  // Logical shift right by 23 moves the biased exponent into the low bits.
  435            emm0 = BitShifter<Packet4i>{}.template srl<23>((Packet4i)vreinterpretq_s32_f32(x));
 
  // Drop the exponent bits and OR in the bits of 0.5, leaving the mantissa
  // scaled into [0.5, 1).
  438            x = pand(x, p4f_inv_mant_mask);
 
  439            x = por(x, p4f_half);
 
  // Remove the bias (127); convert to float and add 1 to match the [0.5, 1)
  // mantissa scaling.
  441            emm0 = psub(emm0, p4i_0x7f);
 
  442            Packet4f e = padd(Packet4f(vcvtq_f32_s32(emm0)), p4f_1);
 
  // Lanes whose mantissa is below sqrt(1/2): tmp keeps x there (0 elsewhere)
  // and the exponent e is decremented by 1 there.
  450            Packet4f mask = pcmplt(x, p4f_cephes_SQRTHF);
 
  451            Packet4f tmp = pand(x, mask);
 
  453            e = psub(e, pand(p4f_1, mask));
 
  456            Packet4f x2 = pmul(x, x);
 
  457            Packet4f x3 = pmul(x2, x);
 
  // Degree-8 polynomial split into three independent 3-term Horner chains
  // (p0..p2, p3..p5, p6..p8), then recombined with powers of x^3.
  460            y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
 
  461            y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
 
  462            y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
 
  463            y = pmadd(y, x, p4f_cephes_log_p2);
 
  464            y1 = pmadd(y1, x, p4f_cephes_log_p5);
 
  465            y2 = pmadd(y2, x, p4f_cephes_log_p8);
 
  466            y = pmadd(y, x3, y1);
 
  467            y = pmadd(y, x3, y2);
 
  // Exponent contribution in two parts (e * q1 and e * q2) plus the x^2 / 2
  // term; the statements that fold these into x are not visible here.
  470            y1 = pmul(e, p4f_cephes_log_q1);
 
  471            tmp = pmul(x2, p4f_half);
 
  474            y2 = pmul(e, p4f_cephes_log_q2);
 
  // Zero inputs -> -inf; otherwise x with invalid lanes forced to all-ones bits.
  478            return pblendv(iszero_mask, p4f_minus_inf, por(x, invalid_mask));
 
  // psqrt specialization for Packet4f: per-lane square root via vsqrtq_f32.
  482        EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(
const Packet4f& x)
 
  484            return vsqrtq_f32(x);
 
  // psin specialization for Packet4f: sine per float lane.
  // Visible scheme: scale by 4/pi, pick an even multiple j of pi/4, subtract it
  // from the argument, evaluate a cosine-style and a sine-style polynomial in
  // z = x*x, select one per lane, and apply the sign with an XOR.
  // NOTE(review): this listing omits several body lines (the declarations of
  // x, emm0, emm2, the initial sign_bit / absolute-value setup, the additions
  // that fold xmm1..xmm3 into x, and parts of both polynomials). The comments
  // describe only what is visible.
  488        EIGEN_STRONG_INLINE Packet4f psin<Packet4f>(
const Packet4f& _x)
 
  491            _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
 
  492            _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
 
  494            _EIGEN_DECLARE_CONST_Packet4i(1, 1);
 
  495            _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
 
  496            _EIGEN_DECLARE_CONST_Packet4i(2, 2);
 
  497            _EIGEN_DECLARE_CONST_Packet4i(4, 4);
 
  // Bit pattern 0x80000000: only the float sign bit set.
  // NOTE(review): 0x80000000 is an unsigned literal; if pset1<Packet4i> takes
  // an int this is a narrowing conversion -- confirm it is intended.
  499            const Packet4f p4f_sign_mask = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0x80000000));
 
  // DP1 + DP2 + DP3 = 0.785398... (pi/4), stored negated and split in three
  // parts so the multiple of pi/4 can be removed in stages.
  501            _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
 
  502            _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
 
  503            _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
 
  // Coefficients of the sine-style polynomial.
  504            _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
 
  505            _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
 
  506            _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
 
  // Coefficients of the cosine-style polynomial.
  507            _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
 
  508            _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
 
  509            _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
 
  // 1.2732... = 4 / pi.
  510            _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); 
 
  512            Packet4f xmm1, xmm2, xmm3, sign_bit, y;
 
  // Keep only the sign bit of sign_bit (its initial value is set on a line
  // not visible in this listing).
  522            sign_bit = pand(sign_bit, p4f_sign_mask);
 
  // Scale by 4/pi.
  525            y = pmul(x, p4f_cephes_FOPI);
 
  // j = truncated integer part of y.
  528            emm2 = vcvtq_s32_f32(y);
 
  // j = (j + 1) & ~1 rounds j up to an even number; y becomes float(j).
  530            emm2 = padd(emm2, p4i_1);
 
  531            emm2 = pand(emm2, p4i_not1);
 
  532            y = vcvtq_f32_s32(emm2);
 
  // Bit 2 of j, shifted left by 29, lands on bit 31 (the float sign position):
  // this is the "flip the sign" flag.
  534            emm0 = pand(emm2, p4i_4);
 
  535            emm0 = BitShifter<Packet4i>{}.template sll<29>(emm0);
 
  // Lanes where bit 1 of j is clear become an all-ones mask; it chooses
  // between the two polynomials in the blend below.
  542            emm2 = pand(emm2, p4i_2);
 
  543            emm2 = pcmpeq(emm2, pset1<Packet4i>(0));
 
  545            Packet4f swap_sign_bit = (Packet4f)vreinterpretq_f32_s32(emm0);
 
  546            Packet4f poly_mask = (Packet4f)vreinterpretq_f32_s32(emm2);
 
  // Combine the input's sign with the flip flag.
  547            sign_bit = pxor(sign_bit, swap_sign_bit);
 
  // The three partial products y * (-DPk); the statements adding them to x
  // are not visible here.
  551            xmm1 = pmul(y, p4f_minus_cephes_DP1);
 
  552            xmm2 = pmul(y, p4f_minus_cephes_DP2);
 
  553            xmm3 = pmul(y, p4f_minus_cephes_DP3);
 
  560            Packet4f z = pmul(x, x);
 
  // Cosine-style polynomial in z (Horner steps; the seed with coscof_p0 is
  // on a line not visible here).
  562            y = pmadd(y, z, p4f_coscof_p1);
 
  563            y = pmadd(y, z, p4f_coscof_p2);
 
  566            Packet4f tmp = pmul(z, p4f_half);
 
  // Sine-style polynomial in z (Horner steps starting from sincof_p0).
  572            Packet4f y2 = p4f_sincof_p0;
 
  573            y2 = pmadd(y2, z, p4f_sincof_p1);
 
  574            y2 = pmadd(y2, z, p4f_sincof_p2);
 
  // Take y2 where poly_mask is set, y elsewhere.
  580            y = pblendv(poly_mask, y2, y);
 
  // XOR with sign_bit flips the result's sign in flagged lanes.
  582            return pxor(y, sign_bit);