EigenRand  0.3.0
MorePacketMath.h
Go to the documentation of this file.
1 
12 #ifndef EIGENRAND_MORE_PACKET_MATH_H
13 #define EIGENRAND_MORE_PACKET_MATH_H
14 
15 #include <Eigen/Dense>
16 
17 namespace Eigen
18 {
19  namespace internal
20  {
21  template<typename Ty>
22  struct IsIntPacket : std::false_type {};
23 
24  template<typename Ty>
25  struct IsFloatPacket : std::false_type {};
26 
27  template<typename Ty>
28  struct IsDoublePacket : std::false_type {};
29 
30  template<typename Ty>
31  struct HalfPacket;
32 
33 #ifdef EIGEN_VECTORIZE_AVX2
34  template<>
35  struct IsIntPacket<Packet8i> : std::true_type {};
36 
37  template<>
38  struct HalfPacket<Packet8i>
39  {
40  using type = Packet4i;
41  };
42 #endif
43 #ifdef EIGEN_VECTORIZE_AVX
44  template<>
45  struct IsFloatPacket<Packet8f> : std::true_type {};
46 
47  template<>
48  struct IsDoublePacket<Packet4d> : std::true_type {};
49 #endif
50 #ifdef EIGEN_VECTORIZE_SSE2
51  template<>
52  struct IsIntPacket<Packet4i> : std::true_type {};
53 
54  template<>
55  struct IsFloatPacket<Packet4f> : std::true_type {};
56 
57  template<>
58  struct IsDoublePacket<Packet2d> : std::true_type {};
59 
60  template<>
61  struct HalfPacket<Packet4i>
62  {
63  using type = uint64_t;
64  };
65 #endif
66  template<typename Packet>
67  struct reinterpreter
68  {
69  };
70 
71  template<typename Packet>
72  inline auto reinterpret_to_float(const Packet& x)
73  -> decltype(reinterpreter<Packet>{}.to_float(x))
74  {
75  return reinterpreter<Packet>{}.to_float(x);
76  }
77 
78  template<typename Packet>
79  inline auto reinterpret_to_double(const Packet& x)
80  -> decltype(reinterpreter<Packet>{}.to_double(x))
81  {
82  return reinterpreter<Packet>{}.to_double(x);
83  }
84 
85  template<typename Packet>
86  inline auto reinterpret_to_int(const Packet& x)
87  -> decltype(reinterpreter<Packet>{}.to_int(x))
88  {
89  return reinterpreter<Packet>{}.to_int(x);
90  }
91 
92  template<typename Packet>
93  EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
94 
95  template<typename Packet>
96  EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b);
97 
98  template<typename Packet>
99  EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b);
100 
101  template <typename SrcPacket, typename TgtPacket>
102  EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a);
103 
104  template<typename Packet>
105  EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b);
106 
107  template<typename Packet>
108  EIGEN_STRONG_INLINE Packet psll(const Packet& a, int b);
109 
110  template<typename Packet>
111  EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b);
112 
113  template<typename Packet>
114  EIGEN_STRONG_INLINE Packet psll64(const Packet& a, int b);
115 
116  template<typename Packet>
117  EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);
118 
119  template<typename Packet>
120  EIGEN_STRONG_INLINE int pmovemask(const Packet& a);
121 
122  template<typename Packet>
123  EIGEN_STRONG_INLINE typename std::enable_if<
124  IsFloatPacket<Packet>::value, Packet
125  >::type pext_sign(const Packet& a)
126  {
127  using IntPacket = decltype(reinterpret_to_int(a));
128  return reinterpret_to_float(
129  pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
130  );
131  }
132 
133  template<typename Packet>
134  EIGEN_STRONG_INLINE typename std::enable_if<
135  IsDoublePacket<Packet>::value, Packet
136  >::type pext_sign(const Packet& a)
137  {
138  using IntPacket = decltype(reinterpret_to_int(a));
139  return reinterpret_to_double(
140  pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
141  );
142  }
143 
144  template<>
145  EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(const uint64_t& a, int b)
146  {
147  return a << b;
148  }
149 
150  template<>
151  EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(const uint64_t& a, int b)
152  {
153  return a >> b;
154  }
155 
156  // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
157  template<typename Packet>
158  EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x)
159  {
160  auto x_3 = padd(x, pset1<Packet>(3));
161  auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
162  ret = psub(ret, x_3);
163  ret = padd(ret, pset1<Packet>(0.9189385332046727));
164  ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
165  ret = psub(ret, plog(pmul(
166  pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
167  return ret;
168  }
169 
170  template<typename Packet>
171  EIGEN_STRONG_INLINE Packet pcmplt(const Packet& a, const Packet& b);
172 
173  template<typename Packet>
174  EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
175 
176  template<typename PacketIf, typename Packet>
177  EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
178 
179  template<typename Packet>
180  EIGEN_STRONG_INLINE Packet pgather(const int* addr, const Packet& index);
181 
182  template<typename Packet>
183  EIGEN_STRONG_INLINE auto pgather(const float* addr, const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
184 
185  template<typename Packet>
186  EIGEN_STRONG_INLINE auto pgather(const double* addr, const Packet& index, bool upperhalf = false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
187 
188  template<typename Packet>
189  EIGEN_STRONG_INLINE Packet ptruncate(const Packet& a);
190 
191  template<typename Packet>
192  EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
193 
194  template<typename Packet>
195  EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b);
196 
197  template<typename Packet>
198  EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
199 
200  template<typename IntPacket>
201  EIGEN_STRONG_INLINE auto bit_to_ur_float(const IntPacket& x) -> decltype(reinterpret_to_float(x))
202  {
203  using FloatPacket = decltype(reinterpret_to_float(x));
204 
205  const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
206  upper = pset1<IntPacket>(127 << 23);
207  const FloatPacket one = pset1<FloatPacket>(1);
208 
209  return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
210  }
211 
212  template<typename IntPacket>
213  EIGEN_STRONG_INLINE auto bit_to_ur_double(const IntPacket& x) -> decltype(reinterpret_to_double(x))
214  {
215  using DoublePacket = decltype(reinterpret_to_double(x));
216 
217  const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
218  upper = pseti64<IntPacket>(1023ull << 52);
219  const DoublePacket one = pset1<DoublePacket>(1);
220 
221  return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
222  }
223 
224  template<typename _Scalar>
225  struct bit_scalar;
226 
227  template<>
228  struct bit_scalar<float>
229  {
230  float to_ur(uint32_t x)
231  {
232  union
233  {
234  uint32_t u;
235  float f;
236  };
237  u = (x & 0x7FFFFF) | (127 << 23);
238  return f - 1.f;
239  }
240 
241  float to_nzur(uint32_t x)
242  {
243  return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
244  }
245  };
246 
247  template<>
248  struct bit_scalar<double>
249  {
250  double to_ur(uint64_t x)
251  {
252  union
253  {
254  uint64_t u;
255  double f;
256  };
257  u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
258  return f - 1.;
259  }
260 
261  double to_nzur(uint64_t x)
262  {
263  return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
264  }
265  };
266 
267 
268  struct float2
269  {
270  float f[2];
271  };
272 
273  EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
274  {
275  bit_scalar<float> bs;
276  float2 ret;
277  ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
278  ret.f[1] = bs.to_ur(x >> 32);
279  return ret;
280  }
281 
282  template<typename Packet>
283  EIGEN_STRONG_INLINE typename std::enable_if<
284  IsFloatPacket<Packet>::value
285  >::type psincos(Packet x, Packet& s, Packet& c)
286  {
287  Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
288  using IntPacket = decltype(reinterpret_to_int(x));
289  IntPacket emm0, emm2, emm4;
290 
291  sign_bit_sin = x;
292  /* take the absolute value */
293  x = pabs(x);
294  /* extract the sign bit (upper one) */
295  sign_bit_sin = pext_sign(sign_bit_sin);
296 
297  /* scale by 4/Pi */
298  y = pmul(x, pset1<Packet>(1.27323954473516));
299 
300  /* store the integer part of y in emm2 */
301  emm2 = pcast<Packet, IntPacket>(y);
302 
303  /* j=(j+1) & (~1) (see the cephes sources) */
304  emm2 = padd(emm2, pset1<IntPacket>(1));
305  emm2 = pand(emm2, pset1<IntPacket>(~1));
306  y = pcast<IntPacket, Packet>(emm2);
307 
308  emm4 = emm2;
309 
310  /* get the swap sign flag for the sine */
311  emm0 = pand(emm2, pset1<IntPacket>(4));
312  emm0 = psll(emm0, 29);
313  Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
314 
315  /* get the polynom selection mask for the sine*/
316  emm2 = pand(emm2, pset1<IntPacket>(2));
317 
318  emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
319  Packet poly_mask = reinterpret_to_float(emm2);
320 
321  /* The magic pass: "Extended precision modular arithmetic"
322  x = ((x - y * DP1) - y * DP2) - y * DP3; */
323  xmm1 = pset1<Packet>(-0.78515625);
324  xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
325  xmm3 = pset1<Packet>(-3.77489497744594108e-8);
326  xmm1 = pmul(y, xmm1);
327  xmm2 = pmul(y, xmm2);
328  xmm3 = pmul(y, xmm3);
329  x = padd(x, xmm1);
330  x = padd(x, xmm2);
331  x = padd(x, xmm3);
332 
333  emm4 = psub(emm4, pset1<IntPacket>(2));
334  emm4 = pandnot(emm4, pset1<IntPacket>(4));
335  emm4 = psll(emm4, 29);
336  Packet sign_bit_cos = reinterpret_to_float(emm4);
337  sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
338 
339 
340  /* Evaluate the first polynom (0 <= x <= Pi/4) */
341  Packet z = pmul(x, x);
342  y = pset1<Packet>(2.443315711809948E-005);
343 
344  y = pmul(y, z);
345  y = padd(y, pset1<Packet>(-1.388731625493765E-003));
346  y = pmul(y, z);
347  y = padd(y, pset1<Packet>(4.166664568298827E-002));
348  y = pmul(y, z);
349  y = pmul(y, z);
350  Packet tmp = pmul(z, pset1<Packet>(0.5));
351  y = psub(y, tmp);
352  y = padd(y, pset1<Packet>(1));
353 
354  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
355 
356  Packet y2 = pset1<Packet>(-1.9515295891E-4);
357  y2 = pmul(y2, z);
358  y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
359  y2 = pmul(y2, z);
360  y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
361  y2 = pmul(y2, z);
362  y2 = pmul(y2, x);
363  y2 = padd(y2, x);
364 
365  /* select the correct result from the two polynoms */
366  xmm3 = poly_mask;
367  Packet ysin2 = pand(xmm3, y2);
368  Packet ysin1 = pandnot(xmm3, y);
369  y2 = psub(y2, ysin2);
370  y = psub(y, ysin1);
371 
372  xmm1 = padd(ysin1, ysin2);
373  xmm2 = padd(y, y2);
374 
375  /* update the sign */
376  s = pxor(xmm1, sign_bit_sin);
377  c = pxor(xmm2, sign_bit_cos);
378  }
379 
380  template<typename Packet>
381  EIGEN_STRONG_INLINE typename std::enable_if<
382  IsDoublePacket<Packet>::value
383  >::type psincos(Packet x, Packet& s, Packet& c)
384  {
385  Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
386  using IntPacket = decltype(reinterpret_to_int(x));
387  IntPacket emm0, emm2, emm4;
388 
389  sign_bit_sin = x;
390  /* take the absolute value */
391  x = pabs(x);
392  /* extract the sign bit (upper one) */
393  sign_bit_sin = pext_sign(sign_bit_sin);
394 
395  /* scale by 4/Pi */
396  y = pmul(x, pset1<Packet>(1.27323954473516));
397 
398  /* store the integer part of y in emm2 */
399  emm2 = pcast64<Packet, IntPacket>(y);
400 
401  /* j=(j+1) & (~1) (see the cephes sources) */
402  emm2 = padd64(emm2, pseti64<IntPacket>(1));
403  emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
404  y = pcast64<IntPacket, Packet>(emm2);
405 
406  emm4 = emm2;
407 
408  /* get the swap sign flag for the sine */
409  emm0 = pand(emm2, pseti64<IntPacket>(4));
410  emm0 = psll64(emm0, 61);
411  Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
412 
413  /* get the polynom selection mask for the sine*/
414  emm2 = pand(emm2, pseti64<IntPacket>(2));
415 
416  emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
417  Packet poly_mask = reinterpret_to_double(emm2);
418 
419  /* The magic pass: "Extended precision modular arithmetic"
420  x = ((x - y * DP1) - y * DP2) - y * DP3; */
421  xmm1 = pset1<Packet>(-0.78515625);
422  xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
423  xmm3 = pset1<Packet>(-3.77489497744594108e-8);
424  xmm1 = pmul(y, xmm1);
425  xmm2 = pmul(y, xmm2);
426  xmm3 = pmul(y, xmm3);
427  x = padd(x, xmm1);
428  x = padd(x, xmm2);
429  x = padd(x, xmm3);
430 
431  emm4 = psub64(emm4, pseti64<IntPacket>(2));
432  emm4 = pandnot(emm4, pseti64<IntPacket>(4));
433  emm4 = psll64(emm4, 61);
434  Packet sign_bit_cos = reinterpret_to_double(emm4);
435  sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
436 
437 
438  /* Evaluate the first polynom (0 <= x <= Pi/4) */
439  Packet z = pmul(x, x);
440  y = pset1<Packet>(2.443315711809948E-005);
441 
442  y = pmul(y, z);
443  y = padd(y, pset1<Packet>(-1.388731625493765E-003));
444  y = pmul(y, z);
445  y = padd(y, pset1<Packet>(4.166664568298827E-002));
446  y = pmul(y, z);
447  y = pmul(y, z);
448  Packet tmp = pmul(z, pset1<Packet>(0.5));
449  y = psub(y, tmp);
450  y = padd(y, pset1<Packet>(1));
451 
452  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
453 
454  Packet y2 = pset1<Packet>(-1.9515295891E-4);
455  y2 = pmul(y2, z);
456  y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
457  y2 = pmul(y2, z);
458  y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
459  y2 = pmul(y2, z);
460  y2 = pmul(y2, x);
461  y2 = padd(y2, x);
462 
463  /* select the correct result from the two polynoms */
464  xmm3 = poly_mask;
465  Packet ysin2 = pand(xmm3, y2);
466  Packet ysin1 = pandnot(xmm3, y);
467  y2 = psub(y2, ysin2);
468  y = psub(y, ysin1);
469 
470  xmm1 = padd(ysin1, ysin2);
471  xmm2 = padd(y, y2);
472 
473  /* update the sign */
474  s = pxor(xmm1, sign_bit_sin);
475  c = pxor(xmm2, sign_bit_cos);
476  }
477 
478  template<typename Packet>
479  EIGEN_STRONG_INLINE typename std::enable_if<
480  IsDoublePacket<Packet>::value, Packet
481  >::type _psin(Packet x)
482  {
483  Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
484  using IntPacket = decltype(reinterpret_to_int(x));
485  IntPacket emm0, emm2;
486 
487  sign_bit_sin = x;
488  /* take the absolute value */
489  x = pabs(x);
490  /* extract the sign bit (upper one) */
491  sign_bit_sin = pext_sign(sign_bit_sin);
492 
493  /* scale by 4/Pi */
494  y = pmul(x, pset1<Packet>(1.27323954473516));
495 
496  /* store the integer part of y in emm2 */
497  emm2 = pcast64<Packet, IntPacket>(y);
498 
499  /* j=(j+1) & (~1) (see the cephes sources) */
500  emm2 = padd64(emm2, pseti64<IntPacket>(1));
501  emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
502  y = pcast64<IntPacket, Packet>(emm2);
503 
504  /* get the swap sign flag for the sine */
505  emm0 = pand(emm2, pseti64<IntPacket>(4));
506  emm0 = psll64(emm0, 61);
507  Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
508 
509  /* get the polynom selection mask for the sine*/
510  emm2 = pand(emm2, pseti64<IntPacket>(2));
511 
512  emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
513  Packet poly_mask = reinterpret_to_double(emm2);
514 
515  /* The magic pass: "Extended precision modular arithmetic"
516  x = ((x - y * DP1) - y * DP2) - y * DP3; */
517  xmm1 = pset1<Packet>(-0.78515625);
518  xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
519  xmm3 = pset1<Packet>(-3.77489497744594108e-8);
520  xmm1 = pmul(y, xmm1);
521  xmm2 = pmul(y, xmm2);
522  xmm3 = pmul(y, xmm3);
523  x = padd(x, xmm1);
524  x = padd(x, xmm2);
525  x = padd(x, xmm3);
526 
527  sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
528 
529 
530  /* Evaluate the first polynom (0 <= x <= Pi/4) */
531  Packet z = pmul(x, x);
532  y = pset1<Packet>(2.443315711809948E-005);
533 
534  y = pmul(y, z);
535  y = padd(y, pset1<Packet>(-1.388731625493765E-003));
536  y = pmul(y, z);
537  y = padd(y, pset1<Packet>(4.166664568298827E-002));
538  y = pmul(y, z);
539  y = pmul(y, z);
540  Packet tmp = pmul(z, pset1<Packet>(0.5));
541  y = psub(y, tmp);
542  y = padd(y, pset1<Packet>(1));
543 
544  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
545 
546  Packet y2 = pset1<Packet>(-1.9515295891E-4);
547  y2 = pmul(y2, z);
548  y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
549  y2 = pmul(y2, z);
550  y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
551  y2 = pmul(y2, z);
552  y2 = pmul(y2, x);
553  y2 = padd(y2, x);
554 
555  /* select the correct result from the two polynoms */
556  xmm3 = poly_mask;
557  Packet ysin2 = pand(xmm3, y2);
558  Packet ysin1 = pandnot(xmm3, y);
559 
560  xmm1 = padd(ysin1, ysin2);
561 
562  /* update the sign */
563  return pxor(xmm1, sign_bit_sin);
564  }
565  }
566 }
567 
568 #ifdef EIGEN_VECTORIZE_AVX
569 #include <immintrin.h>
570 
571 namespace Eigen
572 {
573  namespace internal
574  {
575  template<>
576  struct reinterpreter<Packet8i>
577  {
578  EIGEN_STRONG_INLINE Packet8f to_float(const Packet8i& x)
579  {
580  return _mm256_castsi256_ps(x);
581  }
582 
583  EIGEN_STRONG_INLINE Packet4d to_double(const Packet8i& x)
584  {
585  return _mm256_castsi256_pd(x);
586  }
587 
588  EIGEN_STRONG_INLINE Packet8i to_int(const Packet8i& x)
589  {
590  return x;
591  }
592  };
593 
594  template<>
595  struct reinterpreter<Packet8f>
596  {
597  EIGEN_STRONG_INLINE Packet8f to_float(const Packet8f& x)
598  {
599  return x;
600  }
601 
602  EIGEN_STRONG_INLINE Packet4d to_double(const Packet8f& x)
603  {
604  return _mm256_castps_pd(x);
605  }
606 
607  EIGEN_STRONG_INLINE Packet8i to_int(const Packet8f& x)
608  {
609  return _mm256_castps_si256(x);
610  }
611  };
612 
613  template<>
614  struct reinterpreter<Packet4d>
615  {
616  EIGEN_STRONG_INLINE Packet8f to_float(const Packet4d& x)
617  {
618  return _mm256_castpd_ps(x);
619  }
620 
621  EIGEN_STRONG_INLINE Packet4d to_double(const Packet4d& x)
622  {
623  return x;
624  }
625 
626  EIGEN_STRONG_INLINE Packet8i to_int(const Packet4d& x)
627  {
628  return _mm256_castpd_si256(x);
629  }
630  };
631 
632  EIGEN_STRONG_INLINE void split_two(const Packet8i& x, Packet4i& a, Packet4i& b)
633  {
634  a = _mm256_extractf128_si256(x, 0);
635  b = _mm256_extractf128_si256(x, 1);
636  }
637 
638  EIGEN_STRONG_INLINE Packet8i combine_two(const Packet4i& a, const Packet4i& b)
639  {
640  return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
641  }
642 
643  EIGEN_STRONG_INLINE void split_two(const Packet8f& x, Packet4f& a, Packet4f& b)
644  {
645  a = _mm256_extractf128_ps(x, 0);
646  b = _mm256_extractf128_ps(x, 1);
647  }
648 
649  EIGEN_STRONG_INLINE Packet8f combine_two(const Packet4f& a, const Packet4f& b)
650  {
651  return _mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1);
652  }
653 
654 
655  EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet8i& a)
656  {
657 #ifdef EIGEN_VECTORIZE_AVX2
658  return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7)));
659 #else
660  auto sc = _mm256_permutevar_ps(_mm256_castsi256_ps(a), _mm256_setr_epi32(0, 2, 1, 3, 1, 3, 0, 2));
661  return _mm_castps_si128(_mm_blend_ps(_mm256_extractf128_ps(sc, 0), _mm256_extractf128_ps(sc, 1), 0b1100));
662 #endif
663  }
664 
665  template<>
666  EIGEN_STRONG_INLINE Packet8i pseti64<Packet8i>(uint64_t a)
667  {
668  return _mm256_set1_epi64x(a);
669  }
670 
671  template<>
672  EIGEN_STRONG_INLINE Packet8i padd64<Packet8i>(const Packet8i& a, const Packet8i& b)
673  {
674 #ifdef EIGEN_VECTORIZE_AVX2
675  return _mm256_add_epi64(a, b);
676 #else
677  Packet4i a1, a2, b1, b2;
678  split_two(a, a1, a2);
679  split_two(b, b1, b2);
680  return combine_two((Packet4i)_mm_add_epi64(a1, b1), (Packet4i)_mm_add_epi64(a2, b2));
681 #endif
682  }
683 
684  template<>
685  EIGEN_STRONG_INLINE Packet8i psub64<Packet8i>(const Packet8i& a, const Packet8i& b)
686  {
687 #ifdef EIGEN_VECTORIZE_AVX2
688  return _mm256_sub_epi64(a, b);
689 #else
690  Packet4i a1, a2, b1, b2;
691  split_two(a, a1, a2);
692  split_two(b, b1, b2);
693  return combine_two((Packet4i)_mm_sub_epi64(a1, b1), (Packet4i)_mm_sub_epi64(a2, b2));
694 #endif
695  }
696 
697  template<>
698  EIGEN_STRONG_INLINE Packet8i pcmpeq<Packet8i>(const Packet8i& a, const Packet8i& b)
699  {
700 #ifdef EIGEN_VECTORIZE_AVX2
701  return _mm256_cmpeq_epi32(a, b);
702 #else
703  Packet4i a1, a2, b1, b2;
704  split_two(a, a1, a2);
705  split_two(b, b1, b2);
706  return combine_two((Packet4i)_mm_cmpeq_epi32(a1, b1), (Packet4i)_mm_cmpeq_epi32(a2, b2));
707 #endif
708  }
709 
710  template<>
711  EIGEN_STRONG_INLINE Packet8i psll<Packet8i>(const Packet8i& a, int b)
712  {
713 #ifdef EIGEN_VECTORIZE_AVX2
714  return _mm256_slli_epi32(a, b);
715 #else
716  Packet4i a1, a2;
717  split_two(a, a1, a2);
718  return combine_two((Packet4i)_mm_slli_epi32(a1, b), (Packet4i)_mm_slli_epi32(a2, b));
719 #endif
720  }
721 
722  template<>
723  EIGEN_STRONG_INLINE Packet8i psrl<Packet8i>(const Packet8i& a, int b)
724  {
725 #ifdef EIGEN_VECTORIZE_AVX2
726  return _mm256_srli_epi32(a, b);
727 #else
728  Packet4i a1, a2;
729  split_two(a, a1, a2);
730  return combine_two((Packet4i)_mm_srli_epi32(a1, b), (Packet4i)_mm_srli_epi32(a2, b));
731 #endif
732  }
733 
734  template<>
735  EIGEN_STRONG_INLINE Packet8i psll64<Packet8i>(const Packet8i& a, int b)
736  {
737 #ifdef EIGEN_VECTORIZE_AVX2
738  return _mm256_slli_epi64(a, b);
739 #else
740  Packet4i a1, a2;
741  split_two(a, a1, a2);
742  return combine_two((Packet4i)_mm_slli_epi64(a1, b), (Packet4i)_mm_slli_epi64(a2, b));
743 #endif
744  }
745 
746  template<>
747  EIGEN_STRONG_INLINE Packet8i psrl64<Packet8i>(const Packet8i& a, int b)
748  {
749 #ifdef EIGEN_VECTORIZE_AVX2
750  return _mm256_srli_epi64(a, b);
751 #else
752  Packet4i a1, a2;
753  split_two(a, a1, a2);
754  return combine_two((Packet4i)_mm_srli_epi64(a1, b), (Packet4i)_mm_srli_epi64(a2, b));
755 #endif
756  }
757 
758  template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b)
759  {
760 #ifdef EIGEN_VECTORIZE_AVX2
761  return _mm256_add_epi32(a, b);
762 #else
763  Packet4i a1, a2, b1, b2;
764  split_two(a, a1, a2);
765  split_two(b, b1, b2);
766  return combine_two((Packet4i)_mm_add_epi32(a1, b1), (Packet4i)_mm_add_epi32(a2, b2));
767 #endif
768  }
769 
770  template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b)
771  {
772 #ifdef EIGEN_VECTORIZE_AVX2
773  return _mm256_sub_epi32(a, b);
774 #else
775  Packet4i a1, a2, b1, b2;
776  split_two(a, a1, a2);
777  split_two(b, b1, b2);
778  return combine_two((Packet4i)_mm_sub_epi32(a1, b1), (Packet4i)_mm_sub_epi32(a2, b2));
779 #endif
780  }
781 
782  template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b)
783  {
784 #ifdef EIGEN_VECTORIZE_AVX2
785  return _mm256_and_si256(a, b);
786 #else
787  return reinterpret_to_int((Packet8f)_mm256_and_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
788 #endif
789  }
790 
791  template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b)
792  {
793 #ifdef EIGEN_VECTORIZE_AVX2
794  return _mm256_andnot_si256(a, b);
795 #else
796  return reinterpret_to_int((Packet8f)_mm256_andnot_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
797 #endif
798  }
799 
800  template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b)
801  {
802 #ifdef EIGEN_VECTORIZE_AVX2
803  return _mm256_or_si256(a, b);
804 #else
805  return reinterpret_to_int((Packet8f)_mm256_or_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
806 #endif
807  }
808 
809  template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b)
810  {
811 #ifdef EIGEN_VECTORIZE_AVX2
812  return _mm256_xor_si256(a, b);
813 #else
814  return reinterpret_to_int((Packet8f)_mm256_xor_ps(reinterpret_to_float(a), reinterpret_to_float(b)));
815 #endif
816  }
817 
818  template<>
819  EIGEN_STRONG_INLINE Packet8i pcmplt<Packet8i>(const Packet8i& a, const Packet8i& b)
820  {
821 #ifdef EIGEN_VECTORIZE_AVX2
822  return _mm256_cmpgt_epi32(b, a);
823 #else
824  Packet4i a1, a2, b1, b2;
825  split_two(a, a1, a2);
826  split_two(b, b1, b2);
827  return combine_two((Packet4i)_mm_cmpgt_epi32(b1, a1), (Packet4i)_mm_cmpgt_epi32(b2, a2));
828 #endif
829  }
830 
831  template<>
832  EIGEN_STRONG_INLINE Packet8i pcmplt64<Packet8i>(const Packet8i& a, const Packet8i& b)
833  {
834 #ifdef EIGEN_VECTORIZE_AVX2
835  return _mm256_cmpgt_epi64(b, a);
836 #else
837  Packet4i a1, a2, b1, b2;
838  split_two(a, a1, a2);
839  split_two(b, b1, b2);
840  return combine_two((Packet4i)_mm_cmpgt_epi64(b1, a1), (Packet4i)_mm_cmpgt_epi64(b2, a2));
841 #endif
842  }
843 
844  template<>
845  EIGEN_STRONG_INLINE Packet8f pcmplt<Packet8f>(const Packet8f& a, const Packet8f& b)
846  {
847  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
848  }
849 
850  template<>
851  EIGEN_STRONG_INLINE Packet8f pcmple<Packet8f>(const Packet8f& a, const Packet8f& b)
852  {
853  return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
854  }
855 
856  template<>
857  EIGEN_STRONG_INLINE Packet4d pcmplt<Packet4d>(const Packet4d& a, const Packet4d& b)
858  {
859  return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
860  }
861 
862  template<>
863  EIGEN_STRONG_INLINE Packet4d pcmple<Packet4d>(const Packet4d& a, const Packet4d& b)
864  {
865  return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
866  }
867 
868  template<>
869  EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8f& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
870  {
871  return _mm256_blendv_ps(elsePacket, thenPacket, ifPacket);
872  }
873 
874  template<>
875  EIGEN_STRONG_INLINE Packet8f pblendv(const Packet8i& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket)
876  {
877  return pblendv(_mm256_castsi256_ps(ifPacket), thenPacket, elsePacket);
878  }
879 
880  template<>
881  EIGEN_STRONG_INLINE Packet8i pblendv(const Packet8i& ifPacket, const Packet8i& thenPacket, const Packet8i& elsePacket)
882  {
883  return _mm256_castps_si256(_mm256_blendv_ps(
884  _mm256_castsi256_ps(elsePacket),
885  _mm256_castsi256_ps(thenPacket),
886  _mm256_castsi256_ps(ifPacket)
887  ));
888  }
889 
890  template<>
891  EIGEN_STRONG_INLINE Packet4d pblendv(const Packet4d& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
892  {
893  return _mm256_blendv_pd(elsePacket, thenPacket, ifPacket);
894  }
895 
896  template<>
897  EIGEN_STRONG_INLINE Packet4d pblendv(const Packet8i& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket)
898  {
899  return pblendv(_mm256_castsi256_pd(ifPacket), thenPacket, elsePacket);
900  }
901 
902  template<>
903  EIGEN_STRONG_INLINE Packet8i pgather<Packet8i>(const int* addr, const Packet8i& index)
904  {
905 #ifdef EIGEN_VECTORIZE_AVX2
906  return _mm256_i32gather_epi32(addr, index, 4);
907 #else
908  uint32_t u[8];
909  _mm256_storeu_si256((Packet8i*)u, index);
910  return _mm256_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
911  addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
912 #endif
913  }
914 
915  template<>
916  EIGEN_STRONG_INLINE Packet8f pgather<Packet8i>(const float *addr, const Packet8i& index)
917  {
918 #ifdef EIGEN_VECTORIZE_AVX2
919  return _mm256_i32gather_ps(addr, index, 4);
920 #else
921  uint32_t u[8];
922  _mm256_storeu_si256((Packet8i*)u, index);
923  return _mm256_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]],
924  addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
925 #endif
926  }
927 
928  template<>
929  EIGEN_STRONG_INLINE Packet4d pgather<Packet8i>(const double *addr, const Packet8i& index, bool upperhalf)
930  {
931 #ifdef EIGEN_VECTORIZE_AVX2
932  return _mm256_i32gather_pd(addr, _mm256_castsi256_si128(index), 8);
933 #else
934  uint32_t u[8];
935  _mm256_storeu_si256((Packet8i*)u, index);
936  if (upperhalf)
937  {
938  return _mm256_setr_pd(addr[u[4]], addr[u[5]], addr[u[6]], addr[u[7]]);
939  }
940  else
941  {
942  return _mm256_setr_pd(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
943  }
944 #endif
945  }
946 
947  template<>
948  EIGEN_STRONG_INLINE int pmovemask<Packet8f>(const Packet8f& a)
949  {
950  return _mm256_movemask_ps(a);
951  }
952 
953  template<>
954  EIGEN_STRONG_INLINE int pmovemask<Packet4d>(const Packet4d& a)
955  {
956  return _mm256_movemask_pd(a);
957  }
958 
959  template<>
960  EIGEN_STRONG_INLINE int pmovemask<Packet8i>(const Packet8i& a)
961  {
962  return pmovemask(_mm256_castsi256_ps(a));
963  }
964 
965  template<>
966  EIGEN_STRONG_INLINE Packet8f ptruncate<Packet8f>(const Packet8f& a)
967  {
968  return _mm256_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
969  }
970 
971  template<>
972  EIGEN_STRONG_INLINE Packet4d ptruncate<Packet4d>(const Packet4d& a)
973  {
974  return _mm256_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
975  }
976 
977  template<>
978  EIGEN_STRONG_INLINE Packet8i pcmpeq64<Packet8i>(const Packet8i& a, const Packet8i& b)
979  {
980 #ifdef EIGEN_VECTORIZE_AVX2
981  return _mm256_cmpeq_epi64(a, b);
982 #else
983  Packet4i a1, a2, b1, b2;
984  split_two(a, a1, a2);
985  split_two(b, b1, b2);
986  return combine_two((Packet4i)_mm_cmpeq_epi64(a1, b1), (Packet4i)_mm_cmpeq_epi64(a2, b2));
987 #endif
988  }
989 
990  template<>
991  EIGEN_STRONG_INLINE Packet8i pmuluadd64<Packet8i>(const Packet8i& a, uint64_t b, uint64_t c)
992  {
993  uint64_t u[4];
994  _mm256_storeu_si256((__m256i*)u, a);
995  u[0] = u[0] * b + c;
996  u[1] = u[1] * b + c;
997  u[2] = u[2] * b + c;
998  u[3] = u[3] * b + c;
999  return _mm256_loadu_si256((__m256i*)u);
1000  }
1001 
1002  EIGEN_STRONG_INLINE __m256d uint64_to_double(__m256i x) {
1003  auto y = _mm256_or_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0010000000000000));
1004  return _mm256_sub_pd(y, _mm256_set1_pd(0x0010000000000000));
1005  }
1006 
1007  EIGEN_STRONG_INLINE __m256d int64_to_double(__m256i x) {
1008  x = padd64(x, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000)));
1009  return _mm256_sub_pd(_mm256_castsi256_pd(x), _mm256_set1_pd(0x0018000000000000));
1010  }
1011 
1012  EIGEN_STRONG_INLINE __m256i double_to_int64(__m256d x) {
1013  x = _mm256_add_pd(x, _mm256_set1_pd(0x0018000000000000));
1014  return psub64(
1015  _mm256_castpd_si256(x),
1016  _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))
1017  );
1018  }
1019 
1020  template<>
1021  EIGEN_STRONG_INLINE Packet8i pcast64<Packet4d, Packet8i>(const Packet4d& a)
1022  {
1023  return double_to_int64(a);
1024  }
1025 
1026  template<>
1027  EIGEN_STRONG_INLINE Packet4d pcast64<Packet8i, Packet4d>(const Packet8i& a)
1028  {
1029  return int64_to_double(a);
1030  }
1031 
1032  template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
1033  Packet4d psin<Packet4d>(const Packet4d& x)
1034  {
1035  return _psin(x);
1036  }
1037 
1038  template <>
1039  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
1040  plog<Packet4d>(const Packet4d& _x) {
1041  Packet4d x = _x;
1042  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
1043  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
1044 
1045  auto inv_mant_mask = _mm256_castsi256_pd(pseti64<Packet8i>(~0x7ff0000000000000));
1046  auto min_norm_pos = _mm256_castsi256_pd(pseti64<Packet8i>(0x10000000000000));
1047  auto minus_inf = _mm256_castsi256_pd(pseti64<Packet8i>(0xfff0000000000000));
1048 
1049  // Polynomial coefficients.
1050  _EIGEN_DECLARE_CONST_Packet4d(cephes_SQRTHF, 0.707106781186547524);
1051  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p0, 7.0376836292E-2);
1052  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p1, -1.1514610310E-1);
1053  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p2, 1.1676998740E-1);
1054  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p3, -1.2420140846E-1);
1055  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p4, +1.4249322787E-1);
1056  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p5, -1.6668057665E-1);
1057  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p6, +2.0000714765E-1);
1058  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p7, -2.4999993993E-1);
1059  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_p8, +3.3333331174E-1);
1060  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q1, -2.12194440e-4);
1061  _EIGEN_DECLARE_CONST_Packet4d(cephes_log_q2, 0.693359375);
1062 
1063  Packet4d invalid_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
1064  Packet4d iszero_mask = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
1065 
1066  // Truncate input values to the minimum positive normal.
1067  x = pmax(x, min_norm_pos);
1068 
1069  Packet4d emm0 = uint64_to_double(psrl64(_mm256_castpd_si256(x), 52));
1070  Packet4d e = psub(emm0, pset1<Packet4d>(1022));
1071 
1072  // Set the exponents to -1, i.e. x are in the range [0.5,1).
1073  x = _mm256_and_pd(x, inv_mant_mask);
1074  x = _mm256_or_pd(x, p4d_half);
1075 
1076  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
1077  // and shift by -1. The values are then centered around 0, which improves
1078  // the stability of the polynomial evaluation.
1079  // if( x < SQRTHF ) {
1080  // e -= 1;
1081  // x = x + x - 1.0;
1082  // } else { x = x - 1.0; }
1083  Packet4d mask = _mm256_cmp_pd(x, p4d_cephes_SQRTHF, _CMP_LT_OQ);
1084  Packet4d tmp = _mm256_and_pd(x, mask);
1085  x = psub(x, p4d_1);
1086  e = psub(e, _mm256_and_pd(p4d_1, mask));
1087  x = padd(x, tmp);
1088 
1089  Packet4d x2 = pmul(x, x);
1090  Packet4d x3 = pmul(x2, x);
1091 
1092  // Evaluate the polynomial approximant of degree 8 in three parts, probably
1093  // to improve instruction-level parallelism.
1094  Packet4d y, y1, y2;
1095  y = pmadd(p4d_cephes_log_p0, x, p4d_cephes_log_p1);
1096  y1 = pmadd(p4d_cephes_log_p3, x, p4d_cephes_log_p4);
1097  y2 = pmadd(p4d_cephes_log_p6, x, p4d_cephes_log_p7);
1098  y = pmadd(y, x, p4d_cephes_log_p2);
1099  y1 = pmadd(y1, x, p4d_cephes_log_p5);
1100  y2 = pmadd(y2, x, p4d_cephes_log_p8);
1101  y = pmadd(y, x3, y1);
1102  y = pmadd(y, x3, y2);
1103  y = pmul(y, x3);
1104 
1105  // Add the logarithm of the exponent back to the result of the interpolation.
1106  y1 = pmul(e, p4d_cephes_log_q1);
1107  tmp = pmul(x2, p4d_half);
1108  y = padd(y, y1);
1109  x = psub(x, tmp);
1110  y2 = pmul(e, p4d_cephes_log_q2);
1111  x = padd(x, y);
1112  x = padd(x, y2);
1113 
1114  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
1115  return pblendv(iszero_mask, minus_inf, _mm256_or_pd(x, invalid_mask));
1116  }
1117  }
1118 }
1119 #endif
1120 
1121 #ifdef EIGEN_VECTORIZE_SSE2
1122 #include <xmmintrin.h>
1123 
1124 namespace Eigen
1125 {
1126  namespace internal
1127  {
1128  template<>
1129  struct reinterpreter<Packet4i>
1130  {
1131  EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
1132  {
1133  return _mm_castsi128_ps(x);
1134  }
1135 
1136  EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
1137  {
1138  return _mm_castsi128_pd(x);
1139  }
1140 
1141  EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x)
1142  {
1143  return x;
1144  }
1145  };
1146 
1147  template<>
1148  struct reinterpreter<Packet4f>
1149  {
1150  EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
1151  {
1152  return x;
1153  }
1154 
1155  EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
1156  {
1157  return _mm_castps_pd(x);
1158  }
1159 
1160  EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x)
1161  {
1162  return _mm_castps_si128(x);
1163  }
1164  };
1165 
1166  template<>
1167  struct reinterpreter<Packet2d>
1168  {
1169  EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
1170  {
1171  return _mm_castpd_ps(x);
1172  }
1173 
1174  EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
1175  {
1176  return x;
1177  }
1178 
1179  EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
1180  {
1181  return _mm_castpd_si128(x);
1182  }
1183  };
1184 
1185  EIGEN_STRONG_INLINE void split_two(const Packet4i& x, uint64_t& a, uint64_t& b)
1186  {
1187 #ifdef EIGEN_VECTORIZE_SSE4_1
1188  a = _mm_extract_epi64(x, 0);
1189  b = _mm_extract_epi64(x, 1);
1190 #else
1191  uint64_t u[2];
1192  _mm_storeu_si128((__m128i*)u, x);
1193  a = u[0];
1194  b = u[1];
1195 #endif
1196  }
1197 
1198  EIGEN_STRONG_INLINE Packet4i combine_low32(const Packet4i& a, const Packet4i& b)
1199  {
1200  auto sa = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 2, 0));
1201  auto sb = _mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 3, 1));
1202  sa = _mm_and_si128(sa, _mm_setr_epi32(-1, -1, 0, 0));
1203  sb = _mm_and_si128(sb, _mm_setr_epi32(0, 0, -1, -1));
1204  return _mm_or_si128(sa, sb);
1205  }
1206 
1207  template<>
1208  EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
1209  {
1210  return _mm_set1_epi64x(a);
1211  }
1212 
1213  template<>
1214  EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(const Packet4i& a, const Packet4i& b)
1215  {
1216  return _mm_add_epi64(a, b);
1217  }
1218 
1219  template<>
1220  EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(const Packet4i& a, const Packet4i& b)
1221  {
1222  return _mm_sub_epi64(a, b);
1223  }
1224 
1225  template<>
1226  EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
1227  {
1228  return _mm_cmpeq_epi32(a, b);
1229  }
1230 
1231  template<>
1232  EIGEN_STRONG_INLINE Packet4i psll<Packet4i>(const Packet4i& a, int b)
1233  {
1234  return _mm_slli_epi32(a, b);
1235  }
1236 
1237  template<>
1238  EIGEN_STRONG_INLINE Packet4i psrl<Packet4i>(const Packet4i& a, int b)
1239  {
1240  return _mm_srli_epi32(a, b);
1241  }
1242 
1243 
1244  template<>
1245  EIGEN_STRONG_INLINE Packet4i psll64<Packet4i>(const Packet4i& a, int b)
1246  {
1247  return _mm_slli_epi64(a, b);
1248  }
1249 
1250  template<>
1251  EIGEN_STRONG_INLINE Packet4i psrl64<Packet4i>(const Packet4i& a, int b)
1252  {
1253  return _mm_srli_epi64(a, b);
1254  }
1255 
1256  template<>
1257  EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(const Packet4i& a, const Packet4i& b)
1258  {
1259  return _mm_cmplt_epi32(a, b);
1260  }
1261 
1262  template<>
1263  EIGEN_STRONG_INLINE Packet4i pcmplt64<Packet4i>(const Packet4i& a, const Packet4i& b)
1264  {
1265 #ifdef EIGEN_VECTORIZE_SSE4_2
1266  return _mm_cmpgt_epi64(b, a);
1267 #else
1268  int64_t u[2], v[2];
1269  _mm_storeu_si128((__m128i*)u, a);
1270  _mm_storeu_si128((__m128i*)v, b);
1271  return _mm_set_epi64x(u[1] < v[1] ? -1 : 0, u[0] < v[0] ? -1 : 0);
1272 #endif
1273  }
1274 
1275  template<>
1276  EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
1277  {
1278  return _mm_cmplt_ps(a, b);
1279  }
1280 
1281  template<>
1282  EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
1283  {
1284  return _mm_cmple_ps(a, b);
1285  }
1286 
1287  template<>
1288  EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
1289  {
1290  return _mm_cmplt_pd(a, b);
1291  }
1292 
1293  template<>
1294  EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
1295  {
1296  return _mm_cmple_pd(a, b);
1297  }
1298 
1299  template<>
1300  EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
1301  {
1302 #ifdef EIGEN_VECTORIZE_SSE4_1
1303  return _mm_blendv_ps(elsePacket, thenPacket, ifPacket);
1304 #else
1305  return _mm_or_ps(_mm_and_ps(ifPacket, thenPacket), _mm_andnot_ps(ifPacket, elsePacket));
1306 #endif
1307  }
1308 
1309  template<>
1310  EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
1311  {
1312  return pblendv(_mm_castsi128_ps(ifPacket), thenPacket, elsePacket);
1313  }
1314 
1315  template<>
1316  EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket)
1317  {
1318 #ifdef EIGEN_VECTORIZE_SSE4_1
1319  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(elsePacket), _mm_castsi128_ps(thenPacket), _mm_castsi128_ps(ifPacket)));
1320 #else
1321  return _mm_or_si128(_mm_and_si128(ifPacket, thenPacket), _mm_andnot_si128(ifPacket, elsePacket));
1322 #endif
1323  }
1324 
1325  template<>
1326  EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
1327  {
1328 #ifdef EIGEN_VECTORIZE_SSE4_1
1329  return _mm_blendv_pd(elsePacket, thenPacket, ifPacket);
1330 #else
1331  return _mm_or_pd(_mm_and_pd(ifPacket, thenPacket), _mm_andnot_pd(ifPacket, elsePacket));
1332 #endif
1333  }
1334 
1335 
1336  template<>
1337  EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
1338  {
1339  return pblendv(_mm_castsi128_pd(ifPacket), thenPacket, elsePacket);
1340  }
1341 
1342  template<>
1343  EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
1344  {
1345 #ifdef EIGEN_VECTORIZE_AVX2
1346  return _mm_i32gather_epi32(addr, index, 4);
1347 #else
1348  uint32_t u[4];
1349  _mm_storeu_si128((__m128i*)u, index);
1350  return _mm_setr_epi32(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
1351 #endif
1352  }
1353 
1354  template<>
1355  EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(const float* addr, const Packet4i& index)
1356  {
1357 #ifdef EIGEN_VECTORIZE_AVX2
1358  return _mm_i32gather_ps(addr, index, 4);
1359 #else
1360  uint32_t u[4];
1361  _mm_storeu_si128((__m128i*)u, index);
1362  return _mm_setr_ps(addr[u[0]], addr[u[1]], addr[u[2]], addr[u[3]]);
1363 #endif
1364  }
1365 
1366  template<>
1367  EIGEN_STRONG_INLINE Packet2d pgather<Packet4i>(const double* addr, const Packet4i& index, bool upperhalf)
1368  {
1369 #ifdef EIGEN_VECTORIZE_AVX2
1370  return _mm_i32gather_pd(addr, index, 8);
1371 #else
1372  uint32_t u[4];
1373  _mm_storeu_si128((__m128i*)u, index);
1374  if (upperhalf)
1375  {
1376  return _mm_setr_pd(addr[u[2]], addr[u[3]]);
1377  }
1378  else
1379  {
1380  return _mm_setr_pd(addr[u[0]], addr[u[1]]);
1381  }
1382 #endif
1383  }
1384 
1385  template<>
1386  EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
1387  {
1388  return _mm_movemask_ps(a);
1389  }
1390 
1391  template<>
1392  EIGEN_STRONG_INLINE int pmovemask<Packet2d>(const Packet2d& a)
1393  {
1394  return _mm_movemask_pd(a);
1395  }
1396 
1397  template<>
1398  EIGEN_STRONG_INLINE int pmovemask<Packet4i>(const Packet4i& a)
1399  {
1400  return pmovemask((Packet4f)_mm_castsi128_ps(a));
1401  }
1402 
1403  template<>
1404  EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
1405  {
1406 #ifdef EIGEN_VECTORIZE_SSE4_1
1407  return _mm_round_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1408 #else
1409  auto round = _MM_GET_ROUNDING_MODE();
1410  _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1411  auto ret = _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
1412  _MM_SET_ROUNDING_MODE(round);
1413  return ret;
1414 #endif
1415  }
1416 
1417  template<>
1418  EIGEN_STRONG_INLINE Packet2d ptruncate<Packet2d>(const Packet2d& a)
1419  {
1420 #ifdef EIGEN_VECTORIZE_SSE4_1
1421  return _mm_round_pd(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
1422 #else
1423  auto round = _MM_GET_ROUNDING_MODE();
1424  _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1425  auto ret = _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
1426  _MM_SET_ROUNDING_MODE(round);
1427  return ret;
1428 #endif
1429  }
1430 
1431  template<>
1432  EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(const Packet4i& a, const Packet4i& b)
1433  {
1434 #ifdef EIGEN_VECTORIZE_SSE4_1
1435  return _mm_cmpeq_epi64(a, b);
1436 #else
1437  Packet4i c = _mm_cmpeq_epi32(a, b);
1438  return pand(c, (Packet4i)_mm_shuffle_epi32(c, _MM_SHUFFLE(2, 3, 0, 1)));
1439 #endif
1440  }
1441 
1442  template<>
1443  EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(const Packet4i& a, uint64_t b, uint64_t c)
1444  {
1445  uint64_t u[2];
1446  _mm_storeu_si128((__m128i*)u, a);
1447  u[0] = u[0] * b + c;
1448  u[1] = u[1] * b + c;
1449  return _mm_loadu_si128((__m128i*)u);
1450  }
1451 
1452  EIGEN_STRONG_INLINE __m128d uint64_to_double(__m128i x) {
1453  x = _mm_or_si128(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)));
1454  return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0010000000000000));
1455  }
1456 
1457  EIGEN_STRONG_INLINE __m128d int64_to_double(__m128i x) {
1458  x = _mm_add_epi64(x, _mm_castpd_si128(_mm_set1_pd(0x0018000000000000)));
1459  return _mm_sub_pd(_mm_castsi128_pd(x), _mm_set1_pd(0x0018000000000000));
1460  }
1461 
1462  EIGEN_STRONG_INLINE __m128i double_to_int64(__m128d x) {
1463  x = _mm_add_pd(x, _mm_set1_pd(0x0018000000000000));
1464  return _mm_sub_epi64(
1465  _mm_castpd_si128(x),
1466  _mm_castpd_si128(_mm_set1_pd(0x0018000000000000))
1467  );
1468  }
1469 
1470  template<>
1471  EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(const Packet2d& a)
1472  {
1473  return double_to_int64(a);
1474  }
1475 
1476  template<>
1477  EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(const Packet4i& a)
1478  {
1479  return int64_to_double(a);
1480  }
1481 
1482  template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
1483  Packet2d psin<Packet2d>(const Packet2d& x)
1484  {
1485  return _psin(x);
1486  }
1487 
1488  template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
1489  Packet2d plog<Packet2d>(const Packet2d& _x)
1490  {
1491  Packet2d x = _x;
1492  _EIGEN_DECLARE_CONST_Packet2d(1, 1.0f);
1493  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5f);
1494  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
1495 
1496  auto inv_mant_mask = _mm_castsi128_pd(pseti64<Packet4i>(~0x7ff0000000000000));
1497  auto min_norm_pos = _mm_castsi128_pd(pseti64<Packet4i>(0x10000000000000));
1498  auto minus_inf = _mm_castsi128_pd(pseti64<Packet4i>(0xfff0000000000000));
1499 
1500  /* natural logarithm computed for 4 simultaneous float
1501  return NaN for x <= 0
1502  */
1503  _EIGEN_DECLARE_CONST_Packet2d(cephes_SQRTHF, 0.707106781186547524);
1504  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p0, 7.0376836292E-2);
1505  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p1, -1.1514610310E-1);
1506  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p2, 1.1676998740E-1);
1507  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p3, -1.2420140846E-1);
1508  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p4, +1.4249322787E-1);
1509  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p5, -1.6668057665E-1);
1510  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p6, +2.0000714765E-1);
1511  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p7, -2.4999993993E-1);
1512  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_p8, +3.3333331174E-1);
1513  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q1, -2.12194440e-4);
1514  _EIGEN_DECLARE_CONST_Packet2d(cephes_log_q2, 0.693359375);
1515 
1516 
1517  Packet4i emm0;
1518 
1519  Packet2d invalid_mask = _mm_cmpnge_pd(x, _mm_setzero_pd()); // not greater equal is true if x is NaN
1520  Packet2d iszero_mask = _mm_cmpeq_pd(x, _mm_setzero_pd());
1521 
1522  x = pmax(x, min_norm_pos); /* cut off denormalized stuff */
1523  emm0 = _mm_srli_epi64(_mm_castpd_si128(x), 52);
1524 
1525  /* keep only the fractional part */
1526  x = _mm_and_pd(x, inv_mant_mask);
1527  x = _mm_or_pd(x, p2d_half);
1528 
1529  Packet2d e = _mm_sub_pd(uint64_to_double(emm0), pset1<Packet2d>(1022));
1530 
1531  /* part2:
1532  if( x < SQRTHF ) {
1533  e -= 1;
1534  x = x + x - 1.0;
1535  } else { x = x - 1.0; }
1536  */
1537  Packet2d mask = _mm_cmplt_pd(x, p2d_cephes_SQRTHF);
1538  Packet2d tmp = pand(x, mask);
1539  x = psub(x, p2d_1);
1540  e = psub(e, pand(p2d_1, mask));
1541  x = padd(x, tmp);
1542 
1543  Packet2d x2 = pmul(x, x);
1544  Packet2d x3 = pmul(x2, x);
1545 
1546  Packet2d y, y1, y2;
1547  y = pmadd(p2d_cephes_log_p0, x, p2d_cephes_log_p1);
1548  y1 = pmadd(p2d_cephes_log_p3, x, p2d_cephes_log_p4);
1549  y2 = pmadd(p2d_cephes_log_p6, x, p2d_cephes_log_p7);
1550  y = pmadd(y, x, p2d_cephes_log_p2);
1551  y1 = pmadd(y1, x, p2d_cephes_log_p5);
1552  y2 = pmadd(y2, x, p2d_cephes_log_p8);
1553  y = pmadd(y, x3, y1);
1554  y = pmadd(y, x3, y2);
1555  y = pmul(y, x3);
1556 
1557  y1 = pmul(e, p2d_cephes_log_q1);
1558  tmp = pmul(x2, p2d_half);
1559  y = padd(y, y1);
1560  x = psub(x, tmp);
1561  y2 = pmul(e, p2d_cephes_log_q2);
1562  x = padd(x, y);
1563  x = padd(x, y2);
1564  // negative arg will be NAN, 0 will be -INF
1565  return pblendv(iszero_mask, minus_inf, _mm_or_pd(x, invalid_mask));
1566  }
1567  }
1568 }
1569 #endif
1570 
1571 #endif