EigenRand  0.5.0
 
Loading...
Searching...
No Matches
arch/NEON/PacketFilter.h
Go to the documentation of this file.
1
12#ifndef EIGENRAND_PACKET_FILTER_NEON_H
13#define EIGENRAND_PACKET_FILTER_NEON_H
14
15#include <arm_neon.h>
16
17namespace Eigen
18{
19 namespace Rand
20 {
21 namespace detail
22 {
23 template<>
24 class CompressMask<16>
25 {
26 std::array<std::array<uint8_t, 16>, 7> idx;
27 std::array<internal::Packet4f, 4> selector;
28 std::array<uint8_t, 16> cnt;
29
30 static uint8_t make_compress(int mask, int offset = 0)
31 {
32 uint8_t ret = 0;
33 int n = offset;
34 for (int i = 0; i < 4; ++i)
35 {
36 int l = mask & 1;
37 mask >>= 1;
38 if (l)
39 {
40 if (n >= 0) ret |= (i & 3) << (2 * n);
41 if (++n >= 4) break;
42 }
43 }
44 return ret;
45 }
46
47 static uint8_t count(int mask)
48 {
49 uint8_t ret = 0;
50 for (int i = 0; i < 4; ++i)
51 {
52 ret += mask & 1;
53 mask >>= 1;
54 }
55 return ret;
56 }
57
58 CompressMask()
59 {
60 for (int i = 0; i < 16; ++i)
61 {
62 for (int o = 0; o < 7; ++o)
63 {
64 idx[o][i] = make_compress(i, o < 4 ? o : o - 7);
65 }
66
67 cnt[i] = count(i);
68 }
69
70 uint32_t v[4] = { 0, };
71
72 selector[0] = (internal::Packet4f)vreinterpretq_f32_u32(vld1q_u32(v));
73 v[0] = -1;
74 selector[1] = (internal::Packet4f)vreinterpretq_f32_u32(vld1q_u32(v));
75 v[1] = -1;
76 selector[2] = (internal::Packet4f)vreinterpretq_f32_u32(vld1q_u32(v));
77 v[2] = -1;
78 selector[3] = (internal::Packet4f)vreinterpretq_f32_u32(vld1q_u32(v));
79 }
80
81 static EIGEN_STRONG_INLINE internal::Packet4f permute(const internal::Packet4f& p, uint8_t i)
82 {
83 float u[4];
84 vst1q_f32(u, p);
85 float t[4];
86 t[0] = u[i & 3];
87 t[1] = u[(i >> 2) & 3];
88 t[2] = u[(i >> 4) & 3];
89 t[3] = u[(i >> 6) & 3];
90 return vld1q_f32(t);
91 }
92
93 public:
94
95 enum { full_size = 4 };
96
97 static const CompressMask& get_inst()
98 {
99 static CompressMask cm;
100 return cm;
101 }
102
103 template<typename Packet>
104 EIGEN_STRONG_INLINE int compress_append(Packet& _value, const Packet& _mask,
105 Packet& _rest, int rest_cnt, bool& full) const
106 {
107 auto& value = reinterpret_cast<internal::Packet4f&>(_value);
108 auto& mask = reinterpret_cast<const internal::Packet4f&>(_mask);
109 auto& rest = reinterpret_cast<internal::Packet4f&>(_rest);
110
111 int m = internal::pmovemask(mask);
112 if (cnt[m] == full_size)
113 {
114 full = true;
115 return rest_cnt;
116 }
117 auto p1 = permute(value, idx[rest_cnt][m]);
118 p1 = internal::pblendv(selector[rest_cnt], rest, p1);
119
120 auto new_cnt = rest_cnt + cnt[m];
121 if (new_cnt >= full_size)
122 {
123 if (new_cnt > full_size)
124 {
125 rest = permute(value, idx[new_cnt - cnt[m] + full_size - 1][m]);
126 }
127 value = p1;
128 full = true;
129 return new_cnt - full_size;
130 }
131 else
132 {
133 rest = p1;
134 full = false;
135 return new_cnt;
136 }
137 }
138 };
139 }
140 }
141}
142#endif