EigenRand 0.5.0

arch/NEON/MorePacketMath.h
#ifndef EIGENRAND_MORE_PACKET_MATH_NEON_H
#define EIGENRAND_MORE_PACKET_MATH_NEON_H

#include <arm_neon.h>

// device-function pcast specializations for Eigen ~3.3.9
#ifdef EIGENRAND_EIGEN_33_MODE
namespace Eigen
{
    namespace internal
    {
        template<>
        EIGEN_DEVICE_FUNC inline Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a)
        {
            return vcvtq_f32_s32(a);
        }

        template<>
        EIGEN_DEVICE_FUNC inline Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a)
        {
            return vcvtq_s32_f32(a);
        }
    }
}
#endif

namespace Eigen
{
    namespace internal
    {
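        // Packet-type traits: tag the NEON packet types so that EigenRand's
        // generic routines can distinguish integer, float and double packets.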
        template<>
        struct IsIntPacket<Packet4i> : std::true_type {};

        template<>
        struct IsFloatPacket<Packet4f> : std::true_type {};

        template<>
        struct IsDoublePacket<Packet2d> : std::true_type {};

        template<>
        struct HalfPacket<Packet4i>
        {
            using type = uint64_t;
        };

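        // reinterpreter: bit-for-bit reinterpretation between packet types,
        // built on vreinterpretq_*; no value conversion is performed.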
        template<>
        struct reinterpreter<Packet4i>
        {
            EIGEN_STRONG_INLINE Packet4f to_float(const Packet4i& x)
            {
                return (Packet4f)vreinterpretq_f32_s32(x);
            }

            EIGEN_STRONG_INLINE Packet4i to_int(const Packet4i& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet2d to_double(const Packet4i& x)
            {
                return (Packet2d)vreinterpretq_f64_s32(x);
            }
        };

        template<>
        struct reinterpreter<Packet4f>
        {
            EIGEN_STRONG_INLINE Packet4f to_float(const Packet4f& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet4i to_int(const Packet4f& x)
            {
                return (Packet4i)vreinterpretq_s32_f32(x);
            }

            EIGEN_STRONG_INLINE Packet2d to_double(const Packet4f& x)
            {
                return (Packet2d)vreinterpretq_f64_f32(x);
            }
        };

        template<>
        struct reinterpreter<Packet2d>
        {
            EIGEN_STRONG_INLINE Packet4f to_float(const Packet2d& x)
            {
                return vreinterpretq_f32_f64(x);
            }

            EIGEN_STRONG_INLINE Packet2d to_double(const Packet2d& x)
            {
                return x;
            }

            EIGEN_STRONG_INLINE Packet4i to_int(const Packet2d& x)
            {
                return vreinterpretq_s32_f64(x);
            }
        };

        template<>
        EIGEN_STRONG_INLINE Packet4i pcmpeq<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return vreinterpretq_s32_u32(vceqq_s32(a, b));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pcmpeq<Packet4f>(const Packet4f& a, const Packet4f& b)
        {
            return vreinterpretq_f32_u32(vceqq_f32(a, b));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pbitnot<Packet4i>(const Packet4i& a)
        {
            return vmvnq_s32(a);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pbitnot<Packet4f>(const Packet4f& a)
        {
            return (Packet4f)vreinterpretq_f32_s32(pbitnot((Packet4i)vreinterpretq_s32_f32(a)));
        }

        template<>
        struct BitShifter<Packet4i>
        {
            template<int b>
            EIGEN_STRONG_INLINE Packet4i sll(const Packet4i& a)
            {
                return vreinterpretq_s32_u32(vshlq_n_u32(vreinterpretq_u32_s32(a), b));
            }

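            // vshrq_n_u32 requires a compile-time shift count, so a
            // non-positive template argument b falls through to the runtime
            // path below, which dispatches on _b with a switch.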
            template<int b>
            EIGEN_STRONG_INLINE Packet4i srl(const Packet4i& a, int _b = b)
            {
                if (b > 0)
                {
                    return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), b > 0 ? b : 1));
                }
                else
                {
                    switch (_b)
                    {
                    case 0: return a;
                    case 1: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 1));
                    case 2: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 2));
                    case 3: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 3));
                    case 4: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 4));
                    case 5: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 5));
                    case 6: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 6));
                    case 7: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 7));
                    case 8: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 8));
                    case 9: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 9));
                    case 10: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 10));
                    case 11: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 11));
                    case 12: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 12));
                    case 13: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 13));
                    case 14: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 14));
                    case 15: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 15));
                    case 16: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 16));
                    case 17: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 17));
                    case 18: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 18));
                    case 19: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 19));
                    case 20: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 20));
                    case 21: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 21));
                    case 22: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 22));
                    case 23: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 23));
                    case 24: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 24));
                    case 25: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 25));
                    case 26: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 26));
                    case 27: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 27));
                    case 28: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 28));
                    case 29: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 29));
                    case 30: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 30));
                    case 31: return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
                    }
                    return vdupq_n_s32(0);
                }
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet4i sll64(const Packet4i& a)
            {
                return vreinterpretq_s32_u64(vshlq_n_u64(vreinterpretq_u64_s32(a), b));
            }

            template<int b>
            EIGEN_STRONG_INLINE Packet4i srl64(const Packet4i& a)
            {
                return vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(a), b));
            }
        };

        template<>
        EIGEN_STRONG_INLINE Packet4i pcmplt<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return vreinterpretq_s32_u32(vcltq_s32(a, b));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pcmplt<Packet4f>(const Packet4f& a, const Packet4f& b)
        {
            return vreinterpretq_f32_u32(vcltq_f32(a, b));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pcmple<Packet4f>(const Packet4f& a, const Packet4f& b)
        {
            return vreinterpretq_f32_u32(vcleq_f32(a, b));
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pcmplt<Packet2d>(const Packet2d& a, const Packet2d& b)
        {
            return vreinterpretq_f64_u64(vcltq_f64(a, b));
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pcmple<Packet2d>(const Packet2d& a, const Packet2d& b)
        {
            return vreinterpretq_f64_u64(vcleq_f64(a, b));
        }

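        // pblendv: bitwise select between thenPacket and elsePacket under the
        // ifPacket mask, mapped onto NEON's vbslq_* instructions.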
        template<>
        EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4f& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
        {
            return vbslq_f32(vreinterpretq_u32_f32(ifPacket), thenPacket, elsePacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pblendv(const Packet4i& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket)
        {
            return vbslq_f32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pblendv(const Packet4i& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket)
        {
            return vbslq_s32(vreinterpretq_u32_s32(ifPacket), thenPacket, elsePacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pblendv(const Packet2d& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
        {
            return vbslq_f64(vreinterpretq_u64_f64(ifPacket), thenPacket, elsePacket);
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pblendv(const Packet4i& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket)
        {
            return vbslq_f64(vreinterpretq_u64_s32(ifPacket), thenPacket, elsePacket);
        }

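        // NEON has no gather instruction, so pgather spills the indices to a
        // temporary array, loads the addressed elements one by one, and
        // reloads them as a packet.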
        template<>
        EIGEN_STRONG_INLINE Packet4i pgather<Packet4i>(const int* addr, const Packet4i& index)
        {
            int32_t u[4];
            vst1q_s32(u, index);
            int32_t t[4];
            t[0] = addr[u[0]];
            t[1] = addr[u[1]];
            t[2] = addr[u[2]];
            t[3] = addr[u[3]];
            return vld1q_s32(t);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f pgather<Packet4i>(const float* addr, const Packet4i& index)
        {
            int32_t u[4];
            vst1q_s32(u, index);
            float t[4];
            t[0] = addr[u[0]];
            t[1] = addr[u[1]];
            t[2] = addr[u[2]];
            t[3] = addr[u[3]];
            return vld1q_f32(t);
        }

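        // NEON has no counterpart to SSE's movemask; emulate it by selecting
        // the per-lane weights 1/2/4/8 through the mask and summing them
        // horizontally.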
        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet4f>(const Packet4f& a)
        {
            int32_t bits[4] = { 1, 2, 4, 8 };
            auto r = vbslq_s32(vreinterpretq_u32_f32(a), vld1q_s32(bits), vdupq_n_s32(0));
            auto s = vadd_s32(vget_low_s32(r), vget_high_s32(r));
            return vget_lane_s32(vpadd_s32(s, s), 0);
        }

        template<>
        EIGEN_STRONG_INLINE int pmovemask<Packet4i>(const Packet4i& a)
        {
            return pmovemask((Packet4f)vreinterpretq_f32_s32(a));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f ptruncate<Packet4f>(const Packet4f& a)
        {
            return vrndq_f32(a);
        }

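        // 64-bit helpers: Packet4i doubles as a pair of 64-bit lanes here, so
        // these routines reinterpret it as a 64-bit vector before operating.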
        template<>
        EIGEN_STRONG_INLINE Packet4i pcast64<Packet2d, Packet4i>(const Packet2d& a)
        {
            return (Packet4i)vcvtq_s64_f64(a);
        }

        template<>
        EIGEN_STRONG_INLINE Packet2d pcast64<Packet4i, Packet2d>(const Packet4i& a)
        {
            return vcvtq_f64_s64((int64x2_t)a);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i padd64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return (Packet4i)vaddq_s64((int64x2_t)a, (int64x2_t)b);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i psub64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return (Packet4i)vsubq_s64((int64x2_t)a, (int64x2_t)b);
        }

        template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
        Packet2d psin<Packet2d>(const Packet2d& x)
        {
            return _psin(x);
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pseti64<Packet4i>(uint64_t a)
        {
            return vreinterpretq_s32_u64(vdupq_n_u64(a));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pcmpeq64<Packet4i>(const Packet4i& a, const Packet4i& b)
        {
            return vreinterpretq_s32_u64(vceqq_s64(vreinterpretq_s64_s32(a), vreinterpretq_s64_s32(b)));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4i pmuluadd64<Packet4i>(const Packet4i& a, uint64_t b, uint64_t c)
        {
            uint64_t u[2];
            vst1q_u64(u, vreinterpretq_u64_s32(a));
            u[0] = u[0] * b + c;
            u[1] = u[1] * b + c;
            return vreinterpretq_s32_u64(vld1q_u64(u));
        }

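        // predux_all/predux_any: reduce a lane mask to a single bool by
        // AND-ing (resp. OR-ing) the low and high halves and taking a
        // pairwise min (resp. max).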
        template<>
        EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
        {
            uint32x2_t tmp = vand_u32(vget_low_u32(vreinterpretq_u32_f32(x)),
                vget_high_u32(vreinterpretq_u32_f32(x)));
            return vget_lane_u32(vpmin_u32(tmp, tmp), 0);
        }

        template<>
        EIGEN_STRONG_INLINE bool predux_all(const Packet4i& x)
        {
            return predux_all((Packet4f)vreinterpretq_f32_s32(x));
        }

#ifdef EIGENRAND_EIGEN_33_MODE
        template<>
        EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
        {
            uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)),
                vget_high_u32(vreinterpretq_u32_f32(x)));
            return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
        }

        template<>
        EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
        {
            return predux_any((Packet4f)vreinterpretq_f32_s32(x));
        }

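        // plog: Cephes-style polynomial approximation of the natural
        // logarithm, evaluated on all four float lanes at once.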
        template<>
        EIGEN_STRONG_INLINE Packet4f plog<Packet4f>(const Packet4f& _x)
        {
            Packet4f x = _x;
            _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
            _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
            _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);

            const Packet4f p4f_inv_mant_mask = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(~0x7f800000));

            /* the smallest non denormalized float number */
            const Packet4f p4f_min_norm_pos = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0x00800000));
            const Packet4f p4f_minus_inf = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0xff800000));

            /* natural logarithm computed for 4 simultaneous float
               return NaN for x <= 0
            */
            _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);

            Packet4i emm0;

            Packet4f invalid_mask = pbitnot(pcmple(pset1<Packet4f>(0), x)); // not greater equal is true if x is NaN
            Packet4f iszero_mask = pcmpeq(x, pset1<Packet4f>(0));

            x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */
            emm0 = BitShifter<Packet4i>{}.template srl<23>((Packet4i)vreinterpretq_s32_f32(x));

            /* keep only the fractional part */
            x = pand(x, p4f_inv_mant_mask);
            x = por(x, p4f_half);

            emm0 = psub(emm0, p4i_0x7f);
            Packet4f e = padd(Packet4f(vcvtq_f32_s32(emm0)), p4f_1);

            /* part2:
               if( x < SQRTHF ) {
                 e -= 1;
                 x = x + x - 1.0;
               } else { x = x - 1.0; }
            */
            Packet4f mask = pcmplt(x, p4f_cephes_SQRTHF);
            Packet4f tmp = pand(x, mask);
            x = psub(x, p4f_1);
            e = psub(e, pand(p4f_1, mask));
            x = padd(x, tmp);

            Packet4f x2 = pmul(x, x);
            Packet4f x3 = pmul(x2, x);

            Packet4f y, y1, y2;
            y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
            y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
            y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
            y = pmadd(y, x, p4f_cephes_log_p2);
            y1 = pmadd(y1, x, p4f_cephes_log_p5);
            y2 = pmadd(y2, x, p4f_cephes_log_p8);
            y = pmadd(y, x3, y1);
            y = pmadd(y, x3, y2);
            y = pmul(y, x3);

            y1 = pmul(e, p4f_cephes_log_q1);
            tmp = pmul(x2, p4f_half);
            y = padd(y, y1);
            x = psub(x, tmp);
            y2 = pmul(e, p4f_cephes_log_q2);
            x = padd(x, y);
            x = padd(x, y2);
            // negative arg will be NAN, 0 will be -INF
            return pblendv(iszero_mask, p4f_minus_inf, por(x, invalid_mask));
        }

        template<>
        EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x)
        {
            return vsqrtq_f32(x);
        }

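        // psin: Cephes-style sine approximation; the argument is scaled by
        // 4/pi, range-reduced with extended-precision modular arithmetic, and
        // the sine/cosine polynomials are selected per lane via poly_mask.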
        template<>
        EIGEN_STRONG_INLINE Packet4f psin<Packet4f>(const Packet4f& _x)
        {
            Packet4f x = _x;
            _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
            _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);

            _EIGEN_DECLARE_CONST_Packet4i(1, 1);
            _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
            _EIGEN_DECLARE_CONST_Packet4i(2, 2);
            _EIGEN_DECLARE_CONST_Packet4i(4, 4);

            const Packet4f p4f_sign_mask = (Packet4f)vreinterpretq_f32_s32(pset1<Packet4i>(0x80000000));

            _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
            _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
            _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
            _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
            _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f);
            _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
            _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f);
            _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
            _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f);
            _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

            Packet4f xmm1, xmm2, xmm3, sign_bit, y;

            Packet4i emm0, emm2;
            sign_bit = x;
            /* take the absolute value */
            x = pabs(x);

            /* take the modulo */

            /* extract the sign bit (upper one) */
            sign_bit = pand(sign_bit, p4f_sign_mask);

            /* scale by 4/Pi */
            y = pmul(x, p4f_cephes_FOPI);

            /* store the integer part of y in mm0 */
            emm2 = vcvtq_s32_f32(y);
            /* j=(j+1) & (~1) (see the cephes sources) */
            emm2 = padd(emm2, p4i_1);
            emm2 = pand(emm2, p4i_not1);
            y = vcvtq_f32_s32(emm2);
            /* get the swap sign flag */
            emm0 = pand(emm2, p4i_4);
            emm0 = BitShifter<Packet4i>{}.template sll<29>(emm0);
            /* get the polynom selection mask
               there is one polynom for 0 <= x <= Pi/4
               and another one for Pi/4 < x <= Pi/2

               Both branches will be computed.
            */
            emm2 = pand(emm2, p4i_2);
            emm2 = pcmpeq(emm2, pset1<Packet4i>(0));

            Packet4f swap_sign_bit = (Packet4f)vreinterpretq_f32_s32(emm0);
            Packet4f poly_mask = (Packet4f)vreinterpretq_f32_s32(emm2);
            sign_bit = pxor(sign_bit, swap_sign_bit);

            /* The magic pass: "Extended precision modular arithmetic"
               x = ((x - y * DP1) - y * DP2) - y * DP3; */
            xmm1 = pmul(y, p4f_minus_cephes_DP1);
            xmm2 = pmul(y, p4f_minus_cephes_DP2);
            xmm3 = pmul(y, p4f_minus_cephes_DP3);
            x = padd(x, xmm1);
            x = padd(x, xmm2);
            x = padd(x, xmm3);

            /* Evaluate the first polynom (0 <= x <= Pi/4) */
            y = p4f_coscof_p0;
            Packet4f z = pmul(x, x);

            y = pmadd(y, z, p4f_coscof_p1);
            y = pmadd(y, z, p4f_coscof_p2);
            y = pmul(y, z);
            y = pmul(y, z);
            Packet4f tmp = pmul(z, p4f_half);
            y = psub(y, tmp);
            y = padd(y, p4f_1);

            /* Evaluate the second polynom (Pi/4 <= x <= 0) */

            Packet4f y2 = p4f_sincof_p0;
            y2 = pmadd(y2, z, p4f_sincof_p1);
            y2 = pmadd(y2, z, p4f_sincof_p2);
            y2 = pmul(y2, z);
            y2 = pmul(y2, x);
            y2 = padd(y2, x);

            /* select the correct result from the two polynoms */
            y = pblendv(poly_mask, y2, y);
            /* update the sign */
            return pxor(y, sign_bit);
        }
#endif
    }
}

#endif