EigenRand  0.5.0
 
Loading...
Searching...
No Matches
arch/AVX/PacketFilter.h
Go to the documentation of this file.
1
12#ifndef EIGENRAND_PACKET_FILTER_AVX_H
13#define EIGENRAND_PACKET_FILTER_AVX_H
14
15#include <immintrin.h>
16
17namespace Eigen
18{
19 namespace Rand
20 {
21 namespace detail
22 {
23 template<>
24 class CompressMask<32>
25 {
26 std::array<std::array<internal::Packet8i, 256>, 15> idx;
27 std::array<internal::Packet8f, 8> selector;
28 std::array<uint8_t, 256> cnt;
29
30 static internal::Packet8i make_compress(int mask, int offset = 0)
31 {
32 int32_t ret[8] = { 0, };
33 int n = offset;
34 for (int i = 0; i < 8; ++i)
35 {
36 int l = mask & 1;
37 mask >>= 1;
38 if (l)
39 {
40 if (n >= 0) ret[n] = i;
41 if (++n >= 8) break;
42 }
43 }
44 return _mm256_loadu_si256((internal::Packet8i*)ret);
45 }
46
47 static uint8_t count(int mask)
48 {
49 uint8_t ret = 0;
50 for (int i = 0; i < 8; ++i)
51 {
52 ret += mask & 1;
53 mask >>= 1;
54 }
55 return ret;
56 }
57
58 CompressMask()
59 {
60 for (int i = 0; i < 256; ++i)
61 {
62 for (int o = 0; o < 15; ++o)
63 {
64 idx[o][i] = make_compress(i, o < 8 ? o : o - 15);
65 }
66
67 cnt[i] = count(i);
68 }
69
70 selector[0] = _mm256_castsi256_ps(_mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0));
71 selector[1] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0));
72 selector[2] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0));
73 selector[3] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0));
74 selector[4] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0));
75 selector[5] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0));
76 selector[6] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0));
77 selector[7] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
78 }
79
80 static EIGEN_STRONG_INLINE internal::Packet8f permute(const internal::Packet8f& p, const internal::Packet8i& i)
81 {
82#ifdef EIGEN_VECTORIZE_AVX2
83 return _mm256_permutevar8x32_ps(p, i);
84#else
85 auto l = _mm256_permutevar_ps(p, i);
86 auto h = _mm256_permutevar_ps(_mm256_permute2f128_ps(p, p, 0x01), i);
87 internal::Packet4i i1, i2;
88 internal::split_two(i, i1, i2);
89 i1 = _mm_slli_epi32(i1, 29);
90 i2 = _mm_slli_epi32(i2, 29);
91 auto c = _mm256_castsi256_ps(
92 internal::combine_two(
93 _mm_cmplt_epi32(i1, internal::pset1<internal::Packet4i>(0)),
94 _mm_cmplt_epi32(internal::pset1<internal::Packet4i>(-1), i2)
95 )
96 );
97 return internal::pblendv(c, h, l);
98#endif
99 }
100
101 public:
102 enum { full_size = 8 };
103 static const CompressMask& get_inst()
104 {
105 static CompressMask cm;
106 return cm;
107 }
108
109 template<typename Packet>
110 EIGEN_STRONG_INLINE int compress_append(Packet& _value, const Packet& _mask,
111 Packet& _rest, int rest_cnt, bool& full) const
112 {
113 auto& value = reinterpret_cast<internal::Packet8f&>(_value);
114 auto& mask = reinterpret_cast<const internal::Packet8f&>(_mask);
115 auto& rest = reinterpret_cast<internal::Packet8f&>(_rest);
116
117 int m = _mm256_movemask_ps(mask);
118 if (cnt[m] == full_size)
119 {
120 full = true;
121 return rest_cnt;
122 }
123
124 auto p1 = permute(value, idx[rest_cnt][m]);
125 p1 = internal::pblendv(selector[rest_cnt], rest, p1);
126
127 auto new_cnt = rest_cnt + cnt[m];
128 if (new_cnt >= full_size)
129 {
130 if (new_cnt > full_size)
131 {
132 rest = permute(value, idx[new_cnt - cnt[m] + full_size - 1][m]);
133 }
134 value = p1;
135 full = true;
136 return new_cnt - full_size;
137 }
138 else
139 {
140 rest = p1;
141 full = false;
142 return new_cnt;
143 }
144 }
145 };
146 }
147 }
148}
149#endif