12 #ifndef EIGENRAND_MORE_PACKET_MATH_H 
   13 #define EIGENRAND_MORE_PACKET_MATH_H 
   15 #include <Eigen/Dense> 
   22         struct IsIntPacket : std::false_type {};
 
   25         struct IsFloatPacket : std::false_type {};
 
   28         struct IsDoublePacket : std::false_type {};
 
   33 #ifdef EIGEN_VECTORIZE_AVX2 
   35         struct IsIntPacket<Packet8i> : std::true_type {};
 
   38         struct HalfPacket<Packet8i>
 
   40             using type = Packet4i;
 
   43 #ifdef EIGEN_VECTORIZE_AVX 
   45         struct IsFloatPacket<Packet8f> : std::true_type {};
 
   48         struct IsDoublePacket<Packet4d> : std::true_type {};
 
   50 #ifdef EIGEN_VECTORIZE_SSE2 
   52         struct IsIntPacket<Packet4i> : std::true_type {};
 
   55         struct IsFloatPacket<Packet4f> : std::true_type {};
 
   58         struct IsDoublePacket<Packet2d> : std::true_type {};
 
   61         struct HalfPacket<Packet4i>
 
   63             using type = uint64_t;
 
   66         template<
typename Packet>
 
   71         template<
typename Packet>
 
   72         inline auto reinterpret_to_float(
const Packet& x)
 
   73             -> decltype(reinterpreter<Packet>{}.to_float(x))
 
   75             return reinterpreter<Packet>{}.to_float(x);
 
   78         template<
typename Packet>
 
   79         inline auto reinterpret_to_double(
const Packet& x)
 
   80             -> decltype(reinterpreter<Packet>{}.to_double(x))
 
   82             return reinterpreter<Packet>{}.to_double(x);
 
   85         template<
typename Packet>
 
   86         inline auto reinterpret_to_int(
const Packet& x)
 
   87             -> decltype(reinterpreter<Packet>{}.to_int(x))
 
   89             return reinterpreter<Packet>{}.to_int(x);
 
   92         template<
typename Packet>
 
   93         EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
 
   95         template<
typename Packet>
 
   96         EIGEN_STRONG_INLINE Packet padd64(
const Packet& a, 
const Packet& b);
 
   98         template<
typename Packet>
 
   99         EIGEN_STRONG_INLINE Packet psub64(
const Packet& a, 
const Packet& b);
 
  101         template <
typename SrcPacket, 
typename TgtPacket>
 
  102         EIGEN_STRONG_INLINE TgtPacket pcast64(
const SrcPacket& a);
 
  104         template<
typename Packet>
 
  105         EIGEN_STRONG_INLINE Packet pcmpeq(
const Packet& a, 
const Packet& b);
 
  107         template<
typename Packet>
 
  108         EIGEN_STRONG_INLINE Packet psll(
const Packet& a, 
int b);
 
  110         template<
typename Packet>
 
  111         EIGEN_STRONG_INLINE Packet psrl(
const Packet& a, 
int b);
 
  113         template<
typename Packet>
 
  114         EIGEN_STRONG_INLINE Packet psll64(
const Packet& a, 
int b);
 
  116         template<
typename Packet>
 
  117         EIGEN_STRONG_INLINE Packet psrl64(
const Packet& a, 
int b);
 
  119         template<
typename Packet>
 
  120         EIGEN_STRONG_INLINE 
int pmovemask(
const Packet& a);
 
  122         template<
typename Packet>
 
  123         EIGEN_STRONG_INLINE 
typename std::enable_if<
 
  124             IsFloatPacket<Packet>::value, Packet
 
  125         >::type pext_sign(
const Packet& a)
 
  127             using IntPacket = decltype(reinterpret_to_int(a));
 
  128             return reinterpret_to_float(
 
  129                 pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
 
  133         template<
typename Packet>
 
  134         EIGEN_STRONG_INLINE 
typename std::enable_if<
 
  135             IsDoublePacket<Packet>::value, Packet
 
  136         >::type pext_sign(
const Packet& a)
 
  138             using IntPacket = decltype(reinterpret_to_int(a));
 
  139             return reinterpret_to_double(
 
  140                 pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
 
  145         EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(
const uint64_t& a, 
int b)
 
  151         EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(
const uint64_t& a, 
int b)
 
  157         template<
typename Packet>
 
  158         EIGEN_STRONG_INLINE Packet plgamma_approx(
const Packet& x)
 
  160             auto x_3 = padd(x, pset1<Packet>(3));
 
  161             auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
 
  162             ret = psub(ret, x_3);
 
  163             ret = padd(ret, pset1<Packet>(0.9189385332046727));
 
  164             ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
 
  165             ret = psub(ret, plog(pmul(
 
  166                 pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
 
  170         template<
typename Packet>
 
  171         EIGEN_STRONG_INLINE Packet pcmplt(
const Packet& a, 
const Packet& b);
 
  173         template<
typename Packet>
 
  174         EIGEN_STRONG_INLINE Packet pcmple(
const Packet& a, 
const Packet& b);
 
  176         template<
typename PacketIf, 
typename Packet>
 
  177         EIGEN_STRONG_INLINE Packet pblendv(
const PacketIf& ifPacket, 
const Packet& thenPacket, 
const Packet& elsePacket);
 
  179         template<
typename Packet>
 
  180         EIGEN_STRONG_INLINE Packet pgather(
const int* addr, 
const Packet& index);
 
  182         template<
typename Packet>
 
  183         EIGEN_STRONG_INLINE 
auto pgather(
const float* addr, 
const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
 
  185         template<
typename Packet>
 
  186         EIGEN_STRONG_INLINE 
auto pgather(
const double* addr, 
const Packet& index, 
bool upperhalf = 
false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
 
  188         template<
typename Packet>
 
  189         EIGEN_STRONG_INLINE Packet ptruncate(
const Packet& a);
 
  191         template<
typename Packet>
 
  192         EIGEN_STRONG_INLINE Packet pcmpeq64(
const Packet& a, 
const Packet& b);
 
  194         template<
typename Packet>
 
  195         EIGEN_STRONG_INLINE Packet pcmplt64(
const Packet& a, 
const Packet& b);
 
  197         template<
typename Packet>
 
  198         EIGEN_STRONG_INLINE Packet pmuluadd64(
const Packet& a, uint64_t b, uint64_t c);
 
  200         template<
typename IntPacket>
 
  201         EIGEN_STRONG_INLINE 
auto bit_to_ur_float(
const IntPacket& x) -> decltype(reinterpret_to_float(x))
 
  203             using FloatPacket = decltype(reinterpret_to_float(x));
 
  205             const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
 
  206                 upper = pset1<IntPacket>(127 << 23);
 
  207             const FloatPacket one = pset1<FloatPacket>(1);
 
  209             return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
 
  212         template<
typename IntPacket>
 
  213         EIGEN_STRONG_INLINE 
auto bit_to_ur_double(
const IntPacket& x) -> decltype(reinterpret_to_double(x))
 
  215             using DoublePacket = decltype(reinterpret_to_double(x));
 
  217             const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
 
  218                 upper = pseti64<IntPacket>(1023ull << 52);
 
  219             const DoublePacket one = pset1<DoublePacket>(1);
 
  221             return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
 
  224         template<
typename _Scalar>
 
  228         struct bit_scalar<float>
 
  230             float to_ur(uint32_t x)
 
  237                 u = (x & 0x7FFFFF) | (127 << 23);
 
  241             float to_nzur(uint32_t x)
 
  243                 return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
 
  248         struct bit_scalar<double>
 
  250             double to_ur(uint64_t x)
 
  257                 u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
 
  261             double to_nzur(uint64_t x)
 
  263                 return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
 
  273         EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
 
  275             bit_scalar<float> bs;
 
  277             ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
 
  278             ret.f[1] = bs.to_ur(x >> 32);
 
  282         template<
typename Packet>
 
  283         EIGEN_STRONG_INLINE 
typename std::enable_if<
 
  284             IsFloatPacket<Packet>::value
 
  285         >::type psincos(Packet x, Packet& s, Packet& c)
 
  287             Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
 
  288             using IntPacket = decltype(reinterpret_to_int(x));
 
  289             IntPacket emm0, emm2, emm4;
 
  295             sign_bit_sin = pext_sign(sign_bit_sin);
 
  298             y = pmul(x, pset1<Packet>(1.27323954473516));
 
  301             emm2 = pcast<Packet, IntPacket>(y);
 
  304             emm2 = padd(emm2, pset1<IntPacket>(1));
 
  305             emm2 = pand(emm2, pset1<IntPacket>(~1));
 
  306             y = pcast<IntPacket, Packet>(emm2);
 
  311             emm0 = pand(emm2, pset1<IntPacket>(4));
 
  312             emm0 = psll(emm0, 29);
 
  313             Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
 
  316             emm2 = pand(emm2, pset1<IntPacket>(2));
 
  318             emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
 
  319             Packet poly_mask = reinterpret_to_float(emm2);
 
  323             xmm1 = pset1<Packet>(-0.78515625);
 
  324             xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
 
  325             xmm3 = pset1<Packet>(-3.77489497744594108e-8);
 
  326             xmm1 = pmul(y, xmm1);
 
  327             xmm2 = pmul(y, xmm2);
 
  328             xmm3 = pmul(y, xmm3);
 
  333             emm4 = psub(emm4, pset1<IntPacket>(2));
 
  334             emm4 = pandnot(emm4, pset1<IntPacket>(4));
 
  335             emm4 = psll(emm4, 29);
 
  336             Packet sign_bit_cos = reinterpret_to_float(emm4);
 
  337             sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
 
  341             Packet z = pmul(x, x);
 
  342             y = pset1<Packet>(2.443315711809948E-005);
 
  345             y = padd(y, pset1<Packet>(-1.388731625493765E-003));
 
  347             y = padd(y, pset1<Packet>(4.166664568298827E-002));
 
  350             Packet tmp = pmul(z, pset1<Packet>(0.5));
 
  352             y = padd(y, pset1<Packet>(1));
 
  356             Packet y2 = pset1<Packet>(-1.9515295891E-4);
 
  358             y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
 
  360             y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
 
  367             Packet ysin2 = pand(xmm3, y2);
 
  368             Packet ysin1 = pandnot(xmm3, y);
 
  369             y2 = psub(y2, ysin2);
 
  372             xmm1 = padd(ysin1, ysin2);
 
  376             s = pxor(xmm1, sign_bit_sin);
 
  377             c = pxor(xmm2, sign_bit_cos);
 
  380         template<
typename Packet>
 
  381         EIGEN_STRONG_INLINE 
typename std::enable_if<
 
  382             IsDoublePacket<Packet>::value
 
  383         >::type psincos(Packet x, Packet& s, Packet& c)
 
  385             Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
 
  386             using IntPacket = decltype(reinterpret_to_int(x));
 
  387             IntPacket emm0, emm2, emm4;
 
  393             sign_bit_sin = pext_sign(sign_bit_sin);
 
  396             y = pmul(x, pset1<Packet>(1.27323954473516));
 
  399             emm2 = pcast64<Packet, IntPacket>(y);
 
  402             emm2 = padd64(emm2, pseti64<IntPacket>(1));
 
  403             emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
 
  404             y = pcast64<IntPacket, Packet>(emm2);
 
  409             emm0 = pand(emm2, pseti64<IntPacket>(4));
 
  410             emm0 = psll64(emm0, 61);
 
  411             Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
 
  414             emm2 = pand(emm2, pseti64<IntPacket>(2));
 
  416             emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
 
  417             Packet poly_mask = reinterpret_to_double(emm2);
 
  421             xmm1 = pset1<Packet>(-0.78515625);
 
  422             xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
 
  423             xmm3 = pset1<Packet>(-3.77489497744594108e-8);
 
  424             xmm1 = pmul(y, xmm1);
 
  425             xmm2 = pmul(y, xmm2);
 
  426             xmm3 = pmul(y, xmm3);
 
  431             emm4 = psub64(emm4, pseti64<IntPacket>(2));
 
  432             emm4 = pandnot(emm4, pseti64<IntPacket>(4));
 
  433             emm4 = psll64(emm4, 61);
 
  434             Packet sign_bit_cos = reinterpret_to_double(emm4);
 
  435             sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
 
  439             Packet z = pmul(x, x);
 
  440             y = pset1<Packet>(2.443315711809948E-005);
 
  443             y = padd(y, pset1<Packet>(-1.388731625493765E-003));
 
  445             y = padd(y, pset1<Packet>(4.166664568298827E-002));
 
  448             Packet tmp = pmul(z, pset1<Packet>(0.5));
 
  450             y = padd(y, pset1<Packet>(1));
 
  454             Packet y2 = pset1<Packet>(-1.9515295891E-4);
 
  456             y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
 
  458             y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
 
  465             Packet ysin2 = pand(xmm3, y2);
 
  466             Packet ysin1 = pandnot(xmm3, y);
 
  467             y2 = psub(y2, ysin2);
 
  470             xmm1 = padd(ysin1, ysin2);
 
  474             s = pxor(xmm1, sign_bit_sin);
 
  475             c = pxor(xmm2, sign_bit_cos);
 
  478         template<
typename Packet>
 
  479         EIGEN_STRONG_INLINE 
typename std::enable_if<
 
  480             IsDoublePacket<Packet>::value, Packet
 
  481         >::type _psin(Packet x)
 
  483             Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
 
  484             using IntPacket = decltype(reinterpret_to_int(x));
 
  485             IntPacket emm0, emm2;
 
  491             sign_bit_sin = pext_sign(sign_bit_sin);
 
  494             y = pmul(x, pset1<Packet>(1.27323954473516));
 
  497             emm2 = pcast64<Packet, IntPacket>(y);
 
  500             emm2 = padd64(emm2, pseti64<IntPacket>(1));
 
  501             emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
 
  502             y = pcast64<IntPacket, Packet>(emm2);
 
  505             emm0 = pand(emm2, pseti64<IntPacket>(4));
 
  506             emm0 = psll64(emm0, 61);
 
  507             Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
 
  510             emm2 = pand(emm2, pseti64<IntPacket>(2));
 
  512             emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
 
  513             Packet poly_mask = reinterpret_to_double(emm2);
 
  517             xmm1 = pset1<Packet>(-0.78515625);
 
  518             xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
 
  519             xmm3 = pset1<Packet>(-3.77489497744594108e-8);
 
  520             xmm1 = pmul(y, xmm1);
 
  521             xmm2 = pmul(y, xmm2);
 
  522             xmm3 = pmul(y, xmm3);
 
  527             sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
 
  531             Packet z = pmul(x, x);
 
  532             y = pset1<Packet>(2.443315711809948E-005);
 
  535             y = padd(y, pset1<Packet>(-1.388731625493765E-003));
 
  537             y = padd(y, pset1<Packet>(4.166664568298827E-002));
 
  540             Packet tmp = pmul(z, pset1<Packet>(0.5));
 
  542             y = padd(y, pset1<Packet>(1));
 
  546             Packet y2 = pset1<Packet>(-1.9515295891E-4);
 
  548             y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
 
  550             y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
 
  557             Packet ysin2 = pand(xmm3, y2);
 
  558             Packet ysin1 = pandnot(xmm3, y);
 
  560             xmm1 = padd(ysin1, ysin2);
 
  563             return pxor(xmm1, sign_bit_sin);
 
  568 #ifdef EIGEN_VECTORIZE_AVX 
  569 #include <immintrin.h> 
  576         struct reinterpreter<Packet8i>
 
  578             EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8i& x)
 
  580                 return _mm256_castsi256_ps(x);
 
  583             EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8i& x)
 
  585                 return _mm256_castsi256_pd(x);
 
  588             EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8i& x)
 
  595         struct reinterpreter<Packet8f>
 
  597             EIGEN_STRONG_INLINE Packet8f to_float(
const Packet8f& x)
 
  602             EIGEN_STRONG_INLINE Packet4d to_double(
const Packet8f& x)
 
  604                 return _mm256_castps_pd(x);
 
  607             EIGEN_STRONG_INLINE Packet8i to_int(
const Packet8f& x)
 
  609                 return _mm256_castps_si256(x);
 
  614         struct reinterpreter<Packet4d>
 
  616             EIGEN_STRONG_INLINE Packet8f to_float(
const Packet4d& x)
 
  618                 return _mm256_castpd_ps(x);
 
  621             EIGEN_STRONG_INLINE Packet4d to_double(
const Packet4d& x)
 
  626             EIGEN_STRONG_INLINE Packet8i to_int(
const Packet4d& x)
 
  628                 return _mm256_castpd_si256(x);
 
  632         EIGEN_STRONG_INLINE 
void split_two(
const Packet8i& x, Packet4i& a, Packet4i& b)
 
  634             a = _mm256_extractf128_si256(x, 0);
 
  635             b = _mm256_extractf128_si256(x, 1);
 
  638         EIGEN_STRONG_INLINE Packet8i combine_two(
const Packet4i& a, 
const Packet4i& b)
 
  640             return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
 
  643         EIGEN_STRONG_INLINE 
void split_two(
const Packet8f& x, Packet4f& a, Packet4f& b)
 
  645             a = _mm256_extractf128_ps(x, 0);
 
  646             b = _mm256_extractf128_ps(x, 1);
 
  649         EIGEN_STRONG_INLINE Packet8f combine_two(
const Packet4f& a, 
const Packet4f& b)
 
  651             return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
 
  655         EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet8i& a)
 
  657 #ifdef EIGEN_VECTORIZE_AVX2 
  658             return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
 
  660             auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
 
  661             return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
 
  666         EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
 
  668             return _mm256_set1_epi64x(a);
 
  672         EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  674 #ifdef EIGEN_VECTORIZE_AVX2 
  675             return _mm256_add_epi64(a, b);
 
  677             Packet4i a1, a2, b1, b2;
 
  678             split_two(a, a1, a2);
 
  679             split_two(b, b1, b2);
 
  680             return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
 
  685         EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  687 #ifdef EIGEN_VECTORIZE_AVX2 
  688             return _mm256_sub_epi64(a, b);
 
  690             Packet4i a1, a2, b1, b2;
 
  691             split_two(a, a1, a2);
 
  692             split_two(b, b1, b2);
 
  693             return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
 
  698         EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  700 #ifdef EIGEN_VECTORIZE_AVX2 
  701             return _mm256_cmpeq_epi32(a, b);
 
  703             Packet4i a1, a2, b1, b2;
 
  704             split_two(a, a1, a2);
 
  705             split_two(b, b1, b2);
 
  706             return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
 
  711         EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(
const Packet8i& a, 
int b)
 
  713 #ifdef EIGEN_VECTORIZE_AVX2 
  714             return _mm256_slli_epi32(a, b);
 
  717             split_two(a, a1, a2);
 
  718             return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
 
  723         EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(
const Packet8i& a, 
int b)
 
  725 #ifdef EIGEN_VECTORIZE_AVX2 
  726             return _mm256_srli_epi32(a, b);
 
  729             split_two(a, a1, a2);
 
  730             return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
 
  735         EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(
const Packet8i& a, 
int b)
 
  737 #ifdef EIGEN_VECTORIZE_AVX2 
  738             return _mm256_slli_epi64(a, b);
 
  741             split_two(a, a1, a2);
 
  742             return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
 
  747         EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(
const Packet8i& a, 
int b)
 
  749 #ifdef EIGEN_VECTORIZE_AVX2 
  750             return _mm256_srli_epi64(a, b);
 
  753             split_two(a, a1, a2);
 
  754             return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
 
  758         template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  760 #ifdef EIGEN_VECTORIZE_AVX2 
  761             return _mm256_add_epi32(a, b);
 
  763             Packet4i a1, a2, b1, b2;
 
  764             split_two(a, a1, a2);
 
  765             split_two(b, b1, b2);
 
  766             return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
 
  770         template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  772 #ifdef EIGEN_VECTORIZE_AVX2 
  773             return _mm256_sub_epi32(a, b);
 
  775             Packet4i a1, a2, b1, b2;
 
  776             split_two(a, a1, a2);
 
  777             split_two(b, b1, b2);
 
  778             return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
 
  782         template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  784 #ifdef EIGEN_VECTORIZE_AVX2 
  785             return _mm256_and_si256(a, b);
 
  787             return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
 
  791         template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  793 #ifdef EIGEN_VECTORIZE_AVX2 
  794             return _mm256_andnot_si256(a, b);
 
  796             return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
 
  800         template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  802 #ifdef EIGEN_VECTORIZE_AVX2 
  803             return _mm256_or_si256(a, b);
 
  805             return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
 
  809         template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  811 #ifdef EIGEN_VECTORIZE_AVX2 
  812             return _mm256_xor_si256(a, b);
 
  814             return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
 
  819         EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  821 #ifdef EIGEN_VECTORIZE_AVX2 
  822             return _mm256_cmpgt_epi32(b, a);
 
  824             Packet4i a1, a2, b1, b2;
 
  825             split_two(a, a1, a2);
 
  826             split_two(b, b1, b2);
 
  827             return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
 
  832         EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  834 #ifdef EIGEN_VECTORIZE_AVX2 
  835             return _mm256_cmpgt_epi64(b, a);
 
  837             Packet4i a1, a2, b1, b2;
 
  838             split_two(a, a1, a2);
 
  839             split_two(b, b1, b2);
 
  840             return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
 
  845         EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(
const Packet8f& a, 
const Packet8f& b)
 
  847             return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
 
  851         EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(
const Packet8f& a, 
const Packet8f& b)
 
  853             return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
 
  857         EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(
const Packet4d& a, 
const Packet4d& b)
 
  859             return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
 
  863         EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(
const Packet4d& a, 
const Packet4d& b)
 
  865             return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
 
  869         EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8f& ifPacket, 
const Packet8f& thenPacket, 
const Packet8f& elsePacket) 
 
  871             return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
 
  875         EIGEN_STRONG_INLINE Packet8f pblendv(
const Packet8i& ifPacket, 
const Packet8f& thenPacket, 
const Packet8f& elsePacket)
 
  877             return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
 
  881         EIGEN_STRONG_INLINE Packet8i pblendv(
const Packet8i& ifPacket, 
const Packet8i& thenPacket, 
const Packet8i& elsePacket)
 
  883             return _mm256_castps_si256(_mm256_blendv_ps(
 
  884                 _mm256_castsi256_ps(elsePacket), 
 
  885                 _mm256_castsi256_ps(thenPacket), 
 
  886                 _mm256_castsi256_ps(ifPacket)
 
  891         EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet4d& ifPacket, 
const Packet4d& thenPacket, 
const Packet4d& elsePacket)
 
  893             return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
 
  897         EIGEN_STRONG_INLINE Packet4d pblendv(
const Packet8i& ifPacket, 
const Packet4d& thenPacket, 
const Packet4d& elsePacket)
 
  899             return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
 
  903         EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(
const int* addr, 
const Packet8i& index)
 
  905 #ifdef EIGEN_VECTORIZE_AVX2 
  906             return _mm256_i32gather_epi32(addr, index, 4);
 
  909             _mm256_storeu_si256((Packet8i*)u, index);
 
  910             return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
 
  911                 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
 
  916         EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(
const float *addr, 
const Packet8i& index)
 
  918 #ifdef EIGEN_VECTORIZE_AVX2 
  919             return _mm256_i32gather_ps(addr, index, 4);
 
  922             _mm256_storeu_si256((Packet8i*)u, index);
 
  923             return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
 
  924                 addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
 
  929         EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(
const double *addr, 
const Packet8i& index, 
bool upperhalf)
 
  931 #ifdef EIGEN_VECTORIZE_AVX2 
  932             return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
 
  935             _mm256_storeu_si256((Packet8i*)u, index);
 
  938                 return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
 
  942                 return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
 
  948         EIGEN_STRONG_INLINE 
int pmovemask<Packet8f>(
const Packet8f& a)
 
  950             return _mm256_movemask_ps(a);
 
  954         EIGEN_STRONG_INLINE 
int pmovemask<Packet4d>(
const Packet4d& a)
 
  956             return _mm256_movemask_pd(a);
 
  960         EIGEN_STRONG_INLINE 
int pmovemask<Packet8i>(
const Packet8i& a)
 
  962             return pmovemask(_mm256_castsi256_ps(a));
 
  966         EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(
const Packet8f& a)
 
  968             return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 
  972         EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(
const Packet4d& a)
 
  974             return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 
  978         EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(
const Packet8i& a, 
const Packet8i& b)
 
  980 #ifdef EIGEN_VECTORIZE_AVX2 
  981             return _mm256_cmpeq_epi64(a, b);
 
  983             Packet4i a1, a2, b1, b2;
 
  984             split_two(a, a1, a2);
 
  985             split_two(b, b1, b2);
 
  986             return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
 
  991         EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(
const Packet8i& a, uint64_t b, uint64_t c)
 
  994             _mm256_storeu_si256((__m256i*)u, a);
 
  999             return _mm256_loadu_si256((__m256i*)u);
 
 1002         EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
 
 1003             auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
 
 1004             return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
 
 1007         EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
 
 1008             x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
 
 1009             return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
 
 1012         EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
 
 1013             x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000));
 
 1015                 _mm256_castpd_si256(x),
 
 1016                 _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
 
 1021         EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(
const Packet4d& a)
 
 1023             return double_to_int64(a);
 
 1027         EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(
const Packet8i& a)
 
 1029             return int64_to_double(a);
 
 1032         template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 
 1033             Packet4d psin<Packet4d>(
const Packet4d& x)
 
 1039         EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
 
 1040             plog<Packet4d>(
const Packet4d& _x) {
 
 1042             _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
 
 1043             _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
 
 1045             auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
 
 1046             auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
 
 1047             auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));
 
 1050             _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
 
 1051             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
 
 1052             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
 
 1053             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
 
 1054             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
 
 1055             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
 
 1056             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
 
 1057             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
 
 1058             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
 
 1059             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
 
 1060             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
 
 1061             _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);
 
 1063             Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); 
 
 1064             Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
 
 1067             x = pmax(x, min_norm_pos);
 
 1069             Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52));
 
 1070             Packet4d e = psub(emm0, pset1<Packet4d>(1022));
 
 1073             x = _mm256_and_pd(x, inv_mant_mask);
 
 1074             x = _mm256_or_pd(x, p4d_half);
 
 1083             Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
 
 1084             Packet4d tmp = _mm256_and_pd(x, mask);
 
 1086             e = psub(e, _mm256_and_pd(p4d_1, mask));
 
 1089             Packet4d x2 = pmul(x, x);
 
 1090             Packet4d x3 = pmul(x2, x);
 
 1095             y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
 
 1096             y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
 
 1097             y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
 
 1098             y = pmadd(y, x, p4d_cephes_log_p2);
 
 1099             y1 = pmadd(y1, x, p4d_cephes_log_p5);
 
 1100             y2 = pmadd(y2, x, p4d_cephes_log_p8);
 
 1101             y = pmadd(y, x3, y1);
 
 1102             y = pmadd(y, x3, y2);
 
 1106             y1 = pmul(e, p4d_cephes_log_q1);
 
 1107             tmp = pmul(x2, p4d_half);
 
 1110             y2 = pmul(e, p4d_cephes_log_q2);
 
 1115             return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
 
 1121 #ifdef EIGEN_VECTORIZE_SSE2 
 1122 #include <xmmintrin.h> 
 1129         struct reinterpreter<Packet4i>
 
 1131             EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4i& x)
 
 1133                 return _mm_castsi128_ps(x);
 
 1136             EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4i& x)
 
 1138                 return _mm_castsi128_pd(x);
 
 1141             EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4i& x)
 
 1148         struct reinterpreter<Packet4f>
 
 1150             EIGEN_STRONG_INLINE Packet4f to_float(
const Packet4f& x)
 
 1155             EIGEN_STRONG_INLINE Packet2d to_double(
const Packet4f& x)
 
 1157                 return _mm_castps_pd(x);
 
 1160             EIGEN_STRONG_INLINE Packet4i to_int(
const Packet4f& x)
 
 1162                 return _mm_castps_si128(x);
 
 1167         struct reinterpreter<Packet2d>
 
 1169             EIGEN_STRONG_INLINE Packet4f to_float(
const Packet2d& x)
 
 1171                 return _mm_castpd_ps(x);
 
 1174             EIGEN_STRONG_INLINE Packet2d to_double(
const Packet2d& x)
 
 1179             EIGEN_STRONG_INLINE Packet4i to_int(
const Packet2d& x)
 
 1181                 return _mm_castpd_si128(x);
 
 1185         EIGEN_STRONG_INLINE 
void split_two(
const Packet4i& x, uint64_t& a, uint64_t& b)
 
 1187 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1188             a = _mm_extract_epi64(x, 0);
 
 1189             b = _mm_extract_epi64(x, 1);
 
 1192             _mm_storeu_si128((__m128i*)u, x);
 
 1198         EIGEN_STRONG_INLINE Packet4i combine_low32(
const Packet4i& a, 
const Packet4i& b)
 
 1200             auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
 
 1201             auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
 
 1202             sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
 
 1203             sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
 
 1204             return _mm_or_si128(sa, sb);
 
 1208         EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
 
 1210             return _mm_set1_epi64x(a);
 
 1214         EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
 1216             return _mm_add_epi64(a, b);
 
 1220         EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
 1222             return _mm_sub_epi64(a, b);
 
 1226         EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
 1228             return _mm_cmpeq_epi32(a, b);
 
 1232         EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(
const Packet4i& a, 
int b)
 
 1234             return _mm_slli_epi32(a, b);
 
 1238         EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(
const Packet4i& a, 
int b)
 
 1240             return _mm_srli_epi32(a, b);
 
 1245         EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(
const Packet4i& a, 
int b)
 
 1247             return _mm_slli_epi64(a, b);
 
 1251         EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(
const Packet4i& a, 
int b)
 
 1253             return _mm_srli_epi64(a, b);
 
 1257         EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
 1259             return _mm_cmplt_epi32(a, b);
 
 1263         EIGEN_STRONG_INLINE Packet4i pcmplt64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
 1265 #ifdef EIGEN_VECTORIZE_SSE4_2 
 1266             return _mm_cmpgt_epi64(b, a);
 
 1269             _mm_storeu_si128((__m128i*)u, a);
 
 1270             _mm_storeu_si128((__m128i*)v, b);
 
 1271             return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0);
 
 1276         EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(
const Packet4f& a, 
const Packet4f& b)
 
 1278             return _mm_cmplt_ps(a, b);
 
 1282         EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(
const Packet4f& a, 
const Packet4f& b)
 
 1284             return _mm_cmple_ps(a, b);
 
 1288         EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(
const Packet2d& a, 
const Packet2d& b)
 
 1290             return _mm_cmplt_pd(a, b);
 
 1294         EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(
const Packet2d& a, 
const Packet2d& b)
 
 1296             return _mm_cmple_pd(a, b);
 
 1300         EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4f& ifPacket, 
const Packet4f& thenPacket, 
const Packet4f& elsePacket) 
 
 1302 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1303             return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
 
 1305             return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
 
 1310         EIGEN_STRONG_INLINE Packet4f pblendv(
const Packet4i& ifPacket, 
const Packet4f& thenPacket, 
const Packet4f& elsePacket)
 
 1312             return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
 
 1316         EIGEN_STRONG_INLINE Packet4i pblendv(
const Packet4i& ifPacket, 
const Packet4i& thenPacket, 
const Packet4i& elsePacket)
 
 1318 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1319             return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
 
 1321             return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
 
 1326         EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet2d& ifPacket, 
const Packet2d& thenPacket, 
const Packet2d& elsePacket) 
 
 1328 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1329             return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
 
 1331             return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
 
 1337         EIGEN_STRONG_INLINE Packet2d pblendv(
const Packet4i& ifPacket, 
const Packet2d& thenPacket, 
const Packet2d& elsePacket)
 
 1339             return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
 
 1343         EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(
const int* addr, 
const Packet4i& index)
 
 1345 #ifdef EIGEN_VECTORIZE_AVX2 
 1346             return _mm_i32gather_epi32(addr, index, 4);
 
 1349             _mm_storeu_si128((__m128i*)u, index);
 
 1350             return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
 
 1355         EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(
const float* addr, 
const Packet4i& index)
 
 1357 #ifdef EIGEN_VECTORIZE_AVX2 
 1358             return _mm_i32gather_ps(addr, index, 4);
 
 1361             _mm_storeu_si128((__m128i*)u, index);
 
 1362             return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
 
 1367         EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(
const double* addr, 
const Packet4i& index, 
bool upperhalf)
 
 1369 #ifdef EIGEN_VECTORIZE_AVX2 
 1370             return _mm_i32gather_pd(addr, index, 8);
 
 1373             _mm_storeu_si128((__m128i*)u, index);
 
 1376                 return _mm_setr_pd(addr[u[2]], addr[u[3]]);
 
 1380                 return _mm_setr_pd(addr[u[0]], addr[u[1]]);
 
 1386         EIGEN_STRONG_INLINE 
int pmovemask<Packet4f>(
const Packet4f& a)
 
 1388             return _mm_movemask_ps(a);
 
 1392         EIGEN_STRONG_INLINE 
int pmovemask<Packet2d>(
const Packet2d& a)
 
 1394             return _mm_movemask_pd(a);
 
 1398         EIGEN_STRONG_INLINE 
int pmovemask<Packet4i>(
const Packet4i& a)
 
 1400             return pmovemask((Packet4f)_mm_castsi128_ps(a));
 
 1404         EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(
const Packet4f& a)
 
 1406 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1407             return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 
 1409             auto round = _MM_GET_ROUNDING_MODE();
 
 1410             _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 
 1411             auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
 
 1412             _MM_SET_ROUNDING_MODE(round);
 
 1418         EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(
const Packet2d& a)
 
 1420 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1421             return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 
 1423             auto round = _MM_GET_ROUNDING_MODE();
 
 1424             _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 
 1425             auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
 
 1426             _MM_SET_ROUNDING_MODE(round);
 
 1432         EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(
const Packet4i& a, 
const Packet4i& b)
 
 1434 #ifdef EIGEN_VECTORIZE_SSE4_1 
 1435             return _mm_cmpeq_epi64(a, b);
 
 1437             Packet4i c = _mm_cmpeq_epi32(a, b);
 
 1438             return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
 
 1443         EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(
const Packet4i& a, uint64_t b, uint64_t c)
 
 1446             _mm_storeu_si128((__m128i*)u, a);
 
 1447             u[0] = u[0] * b + c;
 
 1448             u[1] = u[1] * b + c;
 
 1449             return _mm_loadu_si128((__m128i*)u);
 
 1452         EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) {
 
 1453             x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
 
 1454             return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
 
 1457         EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) {
 
 1458             x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
 
 1459             return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
 
 1462         EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) {
 
 1463             x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
 
 1464             return _mm_sub_epi64(
 
 1465                 _mm_castpd_si128(x),
 
 1466                 _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
 
 1471         EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(
const Packet2d& a)
 
 1473             return double_to_int64(a);
 
 1477         EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(
const Packet4i& a)
 
 1479             return int64_to_double(a);
 
 1482         template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 
 1483             Packet2d psin<Packet2d>(
const Packet2d& x)
 
 1488         template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 
 1489             Packet2d plog<Packet2d>(
const Packet2d& _x)
 
 1492             _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f);
 
 1493             _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f);
 
 1494             _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
 
 1496             auto inv_mant_mask = _mm_castsi128_pd(pseti64<Packet4i>(~0x7ff0000000000000));
 
 1497             auto min_norm_pos = _mm_castsi128_pd(pseti64<Packet4i>(0x10000000000000));
 
 1498             auto minus_inf = _mm_castsi128_pd(pseti64<Packet4i>(0xfff0000000000000));
 
 1503             _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524);
 
 1504             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2);
 
 1505             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1);
 
 1506             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1);
 
 1507             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1);
 
 1508             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1);
 
 1509             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1);
 
 1510             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1);
 
 1511             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1);
 
 1512             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1);
 
 1513             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4);
 
 1514             _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375);
 
 1519             Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd()); 
 
 1520             Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
 
 1522             x = pmax(x, min_norm_pos);  
 
 1523             emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52);
 
 1526             x = _mm_and_pd(x, inv_mant_mask);
 
 1527             x = _mm_or_pd(x, p2d_half);
 
 1529             Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1<Packet2d>(1022));
 
 1537             Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF);
 
 1538             Packet2d tmp = pand(x, mask);
 
 1540             e = psub(e, pand(p2d_1, mask));
 
 1543             Packet2d x2 = pmul(x, x);
 
 1544             Packet2d x3 = pmul(x2, x);
 
 1547             y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1);
 
 1548             y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4);
 
 1549             y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7);
 
 1550             y = pmadd(y, x, p2d_cephes_log_p2);
 
 1551             y1 = pmadd(y1, x, p2d_cephes_log_p5);
 
 1552             y2 = pmadd(y2, x, p2d_cephes_log_p8);
 
 1553             y = pmadd(y, x3, y1);
 
 1554             y = pmadd(y, x3, y2);
 
 1557             y1 = pmul(e, p2d_cephes_log_q1);
 
 1558             tmp = pmul(x2, p2d_half);
 
 1561             y2 = pmul(e, p2d_cephes_log_q2);
 
 1565             return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask));