EigenRand  0.5.0
 
Loading...
Searching...
No Matches
MorePacketMath.h
Go to the documentation of this file.
1
12#ifndef EIGENRAND_MORE_PACKET_MATH_H
13#define EIGENRAND_MORE_PACKET_MATH_H
14
15#include <Eigen/Dense>
16#include <cstdint>
17
18#define EIGENRAND_PRINT_PACKET(p) do { using _MTy = typename std::remove_const<typename std::remove_reference<decltype(p)>::type>::type; typename std::conditional<Eigen::internal::IsFloatPacket<_MTy>::value, float, typename std::conditional<Eigen::internal::IsDoublePacket<_MTy>::value, double, int>::type>::type f[4]; Eigen::internal::pstore(f, p); std::cout << #p " " << f[0] << " " << f[1] << " " << f[2] << " " << f[3] << std::endl; } while(0)
19
20namespace Eigen
21{
22 namespace internal
23 {
24 template<typename Ty>
25 struct IsIntPacket : std::false_type {};
26
27 template<typename Ty>
28 struct IsFloatPacket : std::false_type {};
29
30 template<typename Ty>
31 struct IsDoublePacket : std::false_type {};
32
33 template<typename Ty>
34 struct HalfPacket;
35
36 template<typename Packet>
37 struct reinterpreter{};
38
39 template<typename Packet>
40 inline auto reinterpret_to_float(const Packet& x)
41 -> decltype(reinterpreter<Packet>{}.to_float(x))
42 {
43 return reinterpreter<Packet>{}.to_float(x);
44 }
45
46 template<typename Packet>
47 inline auto reinterpret_to_double(const Packet& x)
48 -> decltype(reinterpreter<Packet>{}.to_double(x))
49 {
50 return reinterpreter<Packet>{}.to_double(x);
51 }
52
53 template<typename Packet>
54 inline auto reinterpret_to_int(const Packet& x)
55 -> decltype(reinterpreter<Packet>{}.to_int(x))
56 {
57 return reinterpreter<Packet>{}.to_int(x);
58 }
59
60 template<typename Packet>
61 EIGEN_STRONG_INLINE void split_two(const Packet& p, typename HalfPacket<Packet>::type& a, typename HalfPacket<Packet>::type& b);
62
63 template<typename Packet>
64 EIGEN_STRONG_INLINE Packet pseti64(uint64_t a);
65
66 template<typename Packet>
67 EIGEN_STRONG_INLINE Packet padd64(const Packet& a, const Packet& b);
68
69 template<typename Packet>
70 EIGEN_STRONG_INLINE Packet psub64(const Packet& a, const Packet& b);
71
72 template <typename SrcPacket, typename TgtPacket>
73 EIGEN_STRONG_INLINE TgtPacket pcast64(const SrcPacket& a);
74
75 template<typename Packet>
76 EIGEN_STRONG_INLINE Packet pcmpeq(const Packet& a, const Packet& b);
77
78 template<typename Packet>
79 struct BitShifter {};
80
81 template<int b, typename Packet>
82 EIGEN_STRONG_INLINE Packet psll(const Packet& a);
83
84 template<int _b, typename Packet>
85 EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b = _b);
86
87 template<int b, typename Packet>
88 EIGEN_STRONG_INLINE Packet psll64(const Packet& a);
89
90 template<int b, typename Packet>
91 EIGEN_STRONG_INLINE Packet psrl64(const Packet& a);
92
93 template<typename Packet>
94 EIGEN_STRONG_INLINE bool predux_all(const Packet& a);
95
96 template<typename Packet>
97 EIGEN_STRONG_INLINE bool predux_any(const Packet& a);
98
99 /*template<typename Packet>
100 EIGEN_STRONG_INLINE Packet psll(const Packet& a, int b);
101
102 template<typename Packet>
103 EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b);
104
105 template<typename Packet>
106 EIGEN_STRONG_INLINE Packet psll64(const Packet& a, int b);
107
108 template<typename Packet>
109 EIGEN_STRONG_INLINE Packet psrl64(const Packet& a, int b);*/
110
111 template<typename Packet>
112 EIGEN_STRONG_INLINE int pmovemask(const Packet& a);
113
114 template<typename Packet>
115 EIGEN_STRONG_INLINE typename std::enable_if<
116 IsFloatPacket<Packet>::value, Packet
117 >::type pext_sign(const Packet& a)
118 {
119 using IntPacket = decltype(reinterpret_to_int(a));
120 return reinterpret_to_float(
121 pand(reinterpret_to_int(a), pset1<IntPacket>(0x80000000))
122 );
123 }
124
125 template<typename Packet>
126 EIGEN_STRONG_INLINE typename std::enable_if<
127 IsDoublePacket<Packet>::value, Packet
128 >::type pext_sign(const Packet& a)
129 {
130 using IntPacket = decltype(reinterpret_to_int(a));
131 return reinterpret_to_double(
132 pand(reinterpret_to_int(a), pseti64<IntPacket>(0x8000000000000000))
133 );
134 }
135
136 /*template<>
137 EIGEN_STRONG_INLINE uint64_t psll64<uint64_t>(const uint64_t& a, int b)
138 {
139 return a << b;
140 }
141
142 template<>
143 EIGEN_STRONG_INLINE uint64_t psrl64<uint64_t>(const uint64_t& a, int b)
144 {
145 return a >> b;
146 }*/
147
148 // approximation : lgamma(z) ~= (z+2.5)ln(z+3) - z - 3 + 0.5 ln (2pi) + 1/12/(z + 3) - ln (z(z+1)(z+2))
149 template<typename Packet>
150 EIGEN_STRONG_INLINE Packet plgamma_approx(const Packet& x)
151 {
152 auto x_3 = padd(x, pset1<Packet>(3));
153 auto ret = pmul(padd(x_3, pset1<Packet>(-0.5)), plog(x_3));
154 ret = psub(ret, x_3);
155 ret = padd(ret, pset1<Packet>(0.9189385332046727));
156 ret = padd(ret, pdiv(pset1<Packet>(1 / 12.), x_3));
157 ret = psub(ret, plog(pmul(
158 pmul(psub(x_3, pset1<Packet>(1)), psub(x_3, pset1<Packet>(2))), x)));
159 return ret;
160 }
161
162 template<typename Packet>
163 EIGEN_STRONG_INLINE Packet pcmplt(const Packet& a, const Packet& b);
164
165 template<typename Packet>
166 EIGEN_STRONG_INLINE Packet pcmple(const Packet& a, const Packet& b);
167
168 template<typename Packet>
169 EIGEN_STRONG_INLINE Packet pbitnot(const Packet& a);
170
171 template<typename PacketIf, typename Packet>
172 EIGEN_STRONG_INLINE Packet pblendv(const PacketIf& ifPacket, const Packet& thenPacket, const Packet& elsePacket);
173
174 template<typename Packet>
175 EIGEN_STRONG_INLINE Packet pgather(const int* addr, const Packet& index);
176
177 template<typename Packet>
178 EIGEN_STRONG_INLINE auto pgather(const float* addr, const Packet& index) -> decltype(reinterpret_to_float(std::declval<Packet>()));
179
180 template<typename Packet>
181 EIGEN_STRONG_INLINE auto pgather(const double* addr, const Packet& index, bool upperhalf = false) -> decltype(reinterpret_to_double(std::declval<Packet>()));
182
183 template<typename Packet>
184 EIGEN_STRONG_INLINE Packet ptruncate(const Packet& a);
185
186 template<typename Packet>
187 EIGEN_STRONG_INLINE Packet pcmpeq64(const Packet& a, const Packet& b);
188
189 template<typename Packet>
190 EIGEN_STRONG_INLINE Packet pcmplt64(const Packet& a, const Packet& b);
191
192 template<typename Packet>
193 EIGEN_STRONG_INLINE Packet pmuluadd64(const Packet& a, uint64_t b, uint64_t c);
194
195 template<typename IntPacket>
196 EIGEN_STRONG_INLINE auto bit_to_ur_float(const IntPacket& x) -> decltype(reinterpret_to_float(x))
197 {
198 using FloatPacket = decltype(reinterpret_to_float(x));
199
200 const IntPacket lower = pset1<IntPacket>(0x7FFFFF),
201 upper = pset1<IntPacket>(127 << 23);
202 const FloatPacket one = pset1<FloatPacket>(1);
203
204 return psub(reinterpret_to_float(por(pand(x, lower), upper)), one);
205 }
206
207 template<typename IntPacket>
208 EIGEN_STRONG_INLINE auto bit_to_ur_double(const IntPacket& x) -> decltype(reinterpret_to_double(x))
209 {
210 using DoublePacket = decltype(reinterpret_to_double(x));
211
212 const IntPacket lower = pseti64<IntPacket>(0xFFFFFFFFFFFFFull),
213 upper = pseti64<IntPacket>(1023ull << 52);
214 const DoublePacket one = pset1<DoublePacket>(1);
215
216 return psub(reinterpret_to_double(por(pand(x, lower), upper)), one);
217 }
218
219 template<typename Packet>
220 EIGEN_STRONG_INLINE Packet pnew_andnot(const Packet& a, const Packet& b)
221 {
222#if defined(EIGEN_VECTORIZE_NEON) || defined(EIGENRAND_EIGEN_34_MODE)
223 return pandnot(a, b);
224#else
225 return pandnot(b, a);
226#endif
227 }
228
229 template<typename _Scalar>
230 struct BitScalar;
231
232 template<>
233 struct BitScalar<float>
234 {
235 float to_ur(uint32_t x)
236 {
237 union
238 {
239 uint32_t u;
240 float f;
241 };
242 u = (x & 0x7FFFFF) | (127 << 23);
243 return f - 1.f;
244 }
245
246 float to_nzur(uint32_t x)
247 {
248 return to_ur(x) + std::numeric_limits<float>::epsilon() / 8;
249 }
250 };
251
252 template<>
253 struct BitScalar<double>
254 {
255 double to_ur(uint64_t x)
256 {
257 union
258 {
259 uint64_t u;
260 double f;
261 };
262 u = (x & 0xFFFFFFFFFFFFFull) | (1023ull << 52);
263 return f - 1.;
264 }
265
266 double to_nzur(uint64_t x)
267 {
268 return to_ur(x) + std::numeric_limits<double>::epsilon() / 8;
269 }
270 };
271
272
273 struct float2
274 {
275 float f[2];
276 };
277
278 EIGEN_STRONG_INLINE float2 bit_to_ur_float(uint64_t x)
279 {
280 BitScalar<float> bs;
281 float2 ret;
282 ret.f[0] = bs.to_ur(x & 0xFFFFFFFF);
283 ret.f[1] = bs.to_ur(x >> 32);
284 return ret;
285 }
286
287 template<typename Packet>
288 EIGEN_STRONG_INLINE typename std::enable_if<
289 IsFloatPacket<Packet>::value
290 >::type psincos(Packet x, Packet& s, Packet& c)
291 {
292 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
293 using IntPacket = decltype(reinterpret_to_int(x));
294 IntPacket emm0, emm2, emm4;
295
296 sign_bit_sin = x;
297 /* take the absolute value */
298 x = pabs(x);
299 /* extract the sign bit (upper one) */
300 sign_bit_sin = pext_sign(sign_bit_sin);
301
302 /* scale by 4/Pi */
303 y = pmul(x, pset1<Packet>(1.27323954473516));
304
305 /* store the integer part of y in emm2 */
306 emm2 = pcast<Packet, IntPacket>(y);
307
308 /* j=(j+1) & (~1) (see the cephes sources) */
309 emm2 = padd(emm2, pset1<IntPacket>(1));
310 emm2 = pand(emm2, pset1<IntPacket>(~1));
311 y = pcast<IntPacket, Packet>(emm2);
312
313 emm4 = emm2;
314
315 /* get the swap sign flag for the sine */
316 emm0 = pand(emm2, pset1<IntPacket>(4));
317 emm0 = psll<29>(emm0);
318 Packet swap_sign_bit_sin = reinterpret_to_float(emm0);
319
320 /* get the polynom selection mask for the sine*/
321 emm2 = pand(emm2, pset1<IntPacket>(2));
322
323 emm2 = pcmpeq(emm2, pset1<IntPacket>(0));
324 Packet poly_mask = reinterpret_to_float(emm2);
325
326 /* The magic pass: "Extended precision modular arithmetic"
327 x = ((x - y * DP1) - y * DP2) - y * DP3; */
328 xmm1 = pset1<Packet>(-0.78515625);
329 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
330 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
331 xmm1 = pmul(y, xmm1);
332 xmm2 = pmul(y, xmm2);
333 xmm3 = pmul(y, xmm3);
334 x = padd(x, xmm1);
335 x = padd(x, xmm2);
336 x = padd(x, xmm3);
337
338 emm4 = psub(emm4, pset1<IntPacket>(2));
339 emm4 = pnew_andnot(pset1<IntPacket>(4), emm4);
340 emm4 = psll<29>(emm4);
341 Packet sign_bit_cos = reinterpret_to_float(emm4);
342 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
343
344
345 /* Evaluate the first polynom (0 <= x <= Pi/4) */
346 Packet z = pmul(x, x);
347 y = pset1<Packet>(2.443315711809948E-005);
348
349 y = pmul(y, z);
350 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
351 y = pmul(y, z);
352 y = padd(y, pset1<Packet>(4.166664568298827E-002));
353 y = pmul(y, z);
354 y = pmul(y, z);
355 Packet tmp = pmul(z, pset1<Packet>(0.5));
356 y = psub(y, tmp);
357 y = padd(y, pset1<Packet>(1));
358
359 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
360
361 Packet y2 = pset1<Packet>(-1.9515295891E-4);
362 y2 = pmul(y2, z);
363 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
364 y2 = pmul(y2, z);
365 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
366 y2 = pmul(y2, z);
367 y2 = pmul(y2, x);
368 y2 = padd(y2, x);
369
370 /* select the correct result from the two polynoms */
371 xmm3 = poly_mask;
372 Packet ysin2 = pand(xmm3, y2);
373 Packet ysin1 = pnew_andnot(y, xmm3);
374 y2 = psub(y2, ysin2);
375 y = psub(y, ysin1);
376
377 xmm1 = padd(ysin1, ysin2);
378 xmm2 = padd(y, y2);
379
380 /* update the sign */
381 s = pxor(xmm1, sign_bit_sin);
382 c = pxor(xmm2, sign_bit_cos);
383 }
384
385 template<typename Packet>
386 EIGEN_STRONG_INLINE typename std::enable_if<
387 IsDoublePacket<Packet>::value
388 >::type psincos(Packet x, Packet& s, Packet& c)
389 {
390 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
391 using IntPacket = decltype(reinterpret_to_int(x));
392 IntPacket emm0, emm2, emm4;
393
394 sign_bit_sin = x;
395 /* take the absolute value */
396 x = pabs(x);
397 /* extract the sign bit (upper one) */
398 sign_bit_sin = pext_sign(sign_bit_sin);
399
400 /* scale by 4/Pi */
401 y = pmul(x, pset1<Packet>(1.27323954473516));
402
403 /* store the integer part of y in emm2 */
404 emm2 = pcast64<Packet, IntPacket>(y);
405
406 /* j=(j+1) & (~1) (see the cephes sources) */
407 emm2 = padd64(emm2, pseti64<IntPacket>(1));
408 emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
409 y = pcast64<IntPacket, Packet>(emm2);
410
411 emm4 = emm2;
412
413 /* get the swap sign flag for the sine */
414 emm0 = pand(emm2, pseti64<IntPacket>(4));
415 emm0 = psll64<61>(emm0);
416 Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
417
418 /* get the polynom selection mask for the sine*/
419 emm2 = pand(emm2, pseti64<IntPacket>(2));
420
421 emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
422 Packet poly_mask = reinterpret_to_double(emm2);
423
424 /* The magic pass: "Extended precision modular arithmetic"
425 x = ((x - y * DP1) - y * DP2) - y * DP3; */
426 xmm1 = pset1<Packet>(-0.78515625);
427 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
428 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
429 xmm1 = pmul(y, xmm1);
430 xmm2 = pmul(y, xmm2);
431 xmm3 = pmul(y, xmm3);
432 x = padd(x, xmm1);
433 x = padd(x, xmm2);
434 x = padd(x, xmm3);
435
436 emm4 = psub64(emm4, pseti64<IntPacket>(2));
437 emm4 = pnew_andnot(pseti64<IntPacket>(4), emm4);
438 emm4 = psll64<61>(emm4);
439 Packet sign_bit_cos = reinterpret_to_double(emm4);
440 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
441
442
443 /* Evaluate the first polynom (0 <= x <= Pi/4) */
444 Packet z = pmul(x, x);
445 y = pset1<Packet>(2.443315711809948E-005);
446
447 y = pmul(y, z);
448 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
449 y = pmul(y, z);
450 y = padd(y, pset1<Packet>(4.166664568298827E-002));
451 y = pmul(y, z);
452 y = pmul(y, z);
453 Packet tmp = pmul(z, pset1<Packet>(0.5));
454 y = psub(y, tmp);
455 y = padd(y, pset1<Packet>(1));
456
457 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
458
459 Packet y2 = pset1<Packet>(-1.9515295891E-4);
460 y2 = pmul(y2, z);
461 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
462 y2 = pmul(y2, z);
463 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
464 y2 = pmul(y2, z);
465 y2 = pmul(y2, x);
466 y2 = padd(y2, x);
467
468 /* select the correct result from the two polynoms */
469 xmm3 = poly_mask;
470 Packet ysin2 = pand(xmm3, y2);
471 Packet ysin1 = pnew_andnot(y, xmm3);
472 y2 = psub(y2, ysin2);
473 y = psub(y, ysin1);
474
475 xmm1 = padd(ysin1, ysin2);
476 xmm2 = padd(y, y2);
477
478 /* update the sign */
479 s = pxor(xmm1, sign_bit_sin);
480 c = pxor(xmm2, sign_bit_cos);
481 }
482
483 template<typename Packet>
484 EIGEN_STRONG_INLINE typename std::enable_if<
485 IsDoublePacket<Packet>::value, Packet
486 >::type _psin(Packet x)
487 {
488 Packet xmm1, xmm2, xmm3 = pset1<Packet>(0), sign_bit_sin, y;
489 using IntPacket = decltype(reinterpret_to_int(x));
490 IntPacket emm0, emm2;
491
492 sign_bit_sin = x;
493 /* take the absolute value */
494 x = pabs(x);
495 /* extract the sign bit (upper one) */
496 sign_bit_sin = pext_sign(sign_bit_sin);
497
498 /* scale by 4/Pi */
499 y = pmul(x, pset1<Packet>(1.27323954473516));
500
501 /* store the integer part of y in emm2 */
502 emm2 = pcast64<Packet, IntPacket>(y);
503
504 /* j=(j+1) & (~1) (see the cephes sources) */
505 emm2 = padd64(emm2, pseti64<IntPacket>(1));
506 emm2 = pand(emm2, pseti64<IntPacket>(~1ll));
507 y = pcast64<IntPacket, Packet>(emm2);
508
509 /* get the swap sign flag for the sine */
510 emm0 = pand(emm2, pseti64<IntPacket>(4));
511 emm0 = psll64<61>(emm0);
512 Packet swap_sign_bit_sin = reinterpret_to_double(emm0);
513
514 /* get the polynom selection mask for the sine*/
515 emm2 = pand(emm2, pseti64<IntPacket>(2));
516
517 emm2 = pcmpeq64(emm2, pseti64<IntPacket>(0));
518 Packet poly_mask = reinterpret_to_double(emm2);
519
520 /* The magic pass: "Extended precision modular arithmetic"
521 x = ((x - y * DP1) - y * DP2) - y * DP3; */
522 xmm1 = pset1<Packet>(-0.78515625);
523 xmm2 = pset1<Packet>(-2.4187564849853515625e-4);
524 xmm3 = pset1<Packet>(-3.77489497744594108e-8);
525 xmm1 = pmul(y, xmm1);
526 xmm2 = pmul(y, xmm2);
527 xmm3 = pmul(y, xmm3);
528 x = padd(x, xmm1);
529 x = padd(x, xmm2);
530 x = padd(x, xmm3);
531
532 sign_bit_sin = pxor(sign_bit_sin, swap_sign_bit_sin);
533
534
535 /* Evaluate the first polynom (0 <= x <= Pi/4) */
536 Packet z = pmul(x, x);
537 y = pset1<Packet>(2.443315711809948E-005);
538
539 y = pmul(y, z);
540 y = padd(y, pset1<Packet>(-1.388731625493765E-003));
541 y = pmul(y, z);
542 y = padd(y, pset1<Packet>(4.166664568298827E-002));
543 y = pmul(y, z);
544 y = pmul(y, z);
545 Packet tmp = pmul(z, pset1<Packet>(0.5));
546 y = psub(y, tmp);
547 y = padd(y, pset1<Packet>(1));
548
549 /* Evaluate the second polynom (Pi/4 <= x <= 0) */
550
551 Packet y2 = pset1<Packet>(-1.9515295891E-4);
552 y2 = pmul(y2, z);
553 y2 = padd(y2, pset1<Packet>(8.3321608736E-3));
554 y2 = pmul(y2, z);
555 y2 = padd(y2, pset1<Packet>(-1.6666654611E-1));
556 y2 = pmul(y2, z);
557 y2 = pmul(y2, x);
558 y2 = padd(y2, x);
559
560 /* select the correct result from the two polynoms */
561 xmm3 = poly_mask;
562 Packet ysin2 = pand(xmm3, y2);
563 Packet ysin1 = pnew_andnot(y, xmm3);
564
565 xmm1 = padd(ysin1, ysin2);
566
567 /* update the sign */
568 return pxor(xmm1, sign_bit_sin);
569 }
570 }
571}
572
573#ifdef EIGEN_VECTORIZE_AVX512
575#endif
576
577#ifdef EIGEN_VECTORIZE_AVX
579#endif
580
581#ifdef EIGEN_VECTORIZE_SSE2
583#endif
584
585#ifdef EIGEN_VECTORIZE_NEON
587#endif
588
589namespace Eigen
590{
591 namespace internal
592 {
593 template<int b, typename Packet>
594 EIGEN_STRONG_INLINE Packet psll(const Packet& a)
595 {
596 return BitShifter<Packet>{}.template sll<b>(a);
597 }
598
599 template<int _b, typename Packet>
600 EIGEN_STRONG_INLINE Packet psrl(const Packet& a, int b)
601 {
602 return BitShifter<Packet>{}.template srl<_b>(a, b);
603 }
604
605 template<int b, typename Packet>
606 EIGEN_STRONG_INLINE Packet psll64(const Packet& a)
607 {
608 return BitShifter<Packet>{}.template sll64<b>(a);
609 }
610
611 template<int b, typename Packet>
612 EIGEN_STRONG_INLINE Packet psrl64(const Packet& a)
613 {
614 return BitShifter<Packet>{}.template srl64<b>(a);
615 }
616 }
617}
618
619#endif