12 #ifndef EIGENRAND_PACKET_FILTER_H
13 #define EIGENRAND_PACKET_FILTER_H
24 template<
size_t PacketSize>
29 #ifdef EIGEN_VECTORIZE_AVX
30 #include <immintrin.h>
// Specialization for PacketSize == 32. This section sits under the EIGEN_VECTORIZE_AVX
// guard and works on 8-lane packets (internal::Packet8f / internal::Packet8i).
// The members below are lookup tables read by compress_append.
38 class CompressMask<32>
// idx[o][m]: permutation index vector for lane mask m (0..255) and table slot o (0..14),
// filled from make_compress(m, o < 8 ? o : o - 15).
40 std::array<std::array<internal::Packet8i, 256>, 15> idx;
// selector[k]: blend mask whose first k lanes are all-ones (-1) and whose other lanes are zero.
41 std::array<internal::Packet8f, 8> selector;
// cnt[m]: per-mask count read by compress_append. The line that fills it is not shown in
// this view (presumably cnt[m] = count(m) -- confirm).
42 std::array<uint8_t, 256> cnt;
// Builds the Packet8i permutation index vector for one lane mask.
//   mask   - lane bitmask to translate.
//   offset - starting output slot; the table-setup code in this file passes 0..7 and -7..-1.
// Returns the eight 32-bit indices held in `ret`, loaded with an unaligned load.
// NOTE(review): the lines that declare and advance `n` are not shown in this view; it
// presumably starts at `offset` and moves forward once per set bit of `mask` -- confirm.
44 static internal::Packet8i make_compress(
int mask,
int offset = 0)
// Output slots that are never written keep index 0.
46 int32_t ret[8] = { 0, };
48 for (
int i = 0; i < 8; ++i)
// A negative slot position is skipped instead of stored; otherwise slot n takes source lane i.
54 if (n >= 0) ret[n] = i;
58 return _mm256_loadu_si256((internal::Packet8i*)ret);
// Computes a uint8_t from `mask` using an 8-iteration loop.
// NOTE(review): the loop body and return are not shown in this view; presumably this
// counts the set bits among the 8 low bits of `mask` -- confirm.
61 static uint8_t count(
int mask)
64 for (
int i = 0; i < 8; ++i)
// Table setup. The enclosing function header is not shown in this view (presumably the
// constructor -- confirm).
// For every 8-bit lane mask i and every table slot o, store the index vector from make_compress.
74 for (
int i = 0; i < 256; ++i)
76 for (
int o = 0; o < 15; ++o)
// Slots 0..7 use offset o directly; slots 8..14 map to the negative offsets -7..-1.
78 idx[o][i] = make_compress(i, o < 8 ? o : o - 15);
// selector[k] has all bits set in lanes 0..k-1 and zero in the remaining lanes.
84 selector[0] = _mm256_castsi256_ps(_mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0));
85 selector[1] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0));
86 selector[2] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0));
87 selector[3] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0));
88 selector[4] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0));
89 selector[5] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0));
90 selector[6] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0));
91 selector[7] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
// Rearranges the eight float lanes of `p` according to the per-lane indices in `i`.
// Two implementations are selected by EIGEN_VECTORIZE_AVX2; the #else / #endif lines
// separating them are not shown in this view.
94 static EIGEN_STRONG_INLINE internal::Packet8f permute(
const internal::Packet8f& p,
const internal::Packet8i& i)
96 #ifdef EIGEN_VECTORIZE_AVX2
// AVX2: a single permute that can move elements across the two 128-bit halves.
97 return _mm256_permutevar8x32_ps(p, i);
// Fallback: _mm256_permutevar_ps only permutes inside each 128-bit half, using the low
// two bits of every index. `l` permutes p as-is.
99 auto l = _mm256_permutevar_ps(p, i);
// `h` applies the same permute to p with its two 128-bit halves swapped (imm 0x01).
100 auto h = _mm256_permutevar_ps(_mm256_permute2f128_ps(p, p, 0x01), i);
101 internal::Packet4i i1, i2;
// NOTE(review): split_two / combine_two are project helpers; presumably i1 is the low
// half of `i` and i2 the high half -- confirm.
102 internal::split_two(i, i1, i2);
// Shifting left by 29 moves bit 2 of each index into the sign bit.
103 i1 = _mm_slli_epi32(i1, 29);
104 i2 = _mm_slli_epi32(i2, 29);
// Build the blend mask: for i1 the compare is true when bit 2 was set (value < 0);
// for i2 it is true when bit 2 was clear (-1 < value).
105 auto c = _mm256_castsi256_ps(
106 internal::combine_two(
107 _mm_cmplt_epi32(i1, internal::pset1<internal::Packet4i>(0)),
108 _mm_cmplt_epi32(internal::pset1<internal::Packet4i>(-1), i2)
// NOTE(review): pblendv is a project helper; presumably it takes `h` where `c` is set and
// `l` elsewhere -- confirm.
111 return internal::pblendv(c, h, l);
// Value that compress_append compares lane counts against for this specialization.
116 enum { full_size = 8 };
// Accessor built around a function-local static CompressMask instance.
// The return statement is not shown in this view.
117 static const CompressMask& get_inst()
119 static CompressMask cm;
// Uses the lookup tables to permute `_value` by the lane mask taken from `_mask` and to
// blend the result with `_rest`.
//   _value   - input packet, viewed as Packet8f.
//   _mask    - per-lane mask packet, viewed as Packet8f; only its movemask bits are used here.
//   _rest    - packet carrying lanes from earlier calls; overwritten on the overflow path.
//   rest_cnt - current fill level; used as an index into idx and selector.
//   full     - bool passed by reference; the lines that write it are not shown in this view.
// The visible return yields new_cnt - full_size; other return paths are not shown.
123 template<
typename Packet>
124 EIGEN_STRONG_INLINE
int compress_append(Packet& _value,
const Packet& _mask,
125 Packet& _rest,
int rest_cnt,
bool& full)
const
// View the generic Packet arguments as Packet8f without converting them.
127 auto& value =
reinterpret_cast<internal::Packet8f&
>(_value);
128 auto& mask =
reinterpret_cast<const internal::Packet8f&
>(_mask);
129 auto& rest =
reinterpret_cast<internal::Packet8f&
>(_rest);
// m holds one bit per lane, taken from the sign bit of each lane of `mask`.
131 int m = _mm256_movemask_ps(mask);
// Special case when the count for this mask equals full_size (branch body not shown).
132 if (cnt[m] == full_size)
// Permute `value` with the table for the current fill level.
138 auto p1 = permute(value, idx[rest_cnt][m]);
// selector[rest_cnt] has its first rest_cnt lanes set.
// NOTE(review): presumably pblendv keeps `rest` in those lanes and p1 elsewhere -- confirm.
139 p1 = internal::pblendv(selector[rest_cnt], rest, p1);
141 auto new_cnt = rest_cnt + cnt[m];
142 if (new_cnt >= full_size)
144 if (new_cnt > full_size)
// new_cnt - cnt[m] equals rest_cnt, so this reads slot rest_cnt + full_size - 1, which
// for rest_cnt >= 1 is one of the slots filled with a negative offset.
146 rest = permute(value, idx[new_cnt - cnt[m] + full_size - 1][m]);
// Amount by which the combined count exceeds full_size.
150 return new_cnt - full_size;
165 #ifdef EIGEN_VECTORIZE_SSE2
166 #include <xmmintrin.h>
// Specialization for PacketSize == 16. This section sits under the EIGEN_VECTORIZE_SSE2
// guard and works on 4-lane internal::Packet4f values.
// The members below are lookup tables read by compress_append.
175 class CompressMask<16>
// idx[o][m]: one byte packing four 2-bit source-lane indices, for lane mask m (0..15)
// and table slot o (0..6); filled from make_compress(m, o < 4 ? o : o - 7).
177 std::array<std::array<uint8_t, 16>, 7> idx;
// selector[k]: blend mask whose first k lanes are all-ones (-1) and whose other lanes are zero.
178 std::array<internal::Packet4f, 4> selector;
// cnt[m]: per-mask count read by compress_append; the line that fills it is not shown here.
// NOTE(review): 64 entries are declared, but the visible table-setup loop iterates masks
// 0..15 and compress_append indexes with a 4-lane movemask -- the extra entries look
// unused; confirm whether 16 was intended.
179 std::array<uint8_t, 64> cnt;
// Builds the packed permutation byte for one lane mask: four 2-bit fields, one per output slot.
//   mask   - lane bitmask to translate.
//   offset - starting output slot; the table-setup code in this file passes 0..3 and -3..-1.
// NOTE(review): the lines that declare `ret` and `n`, advance `n`, and return are not shown
// in this view; `n` presumably starts at `offset` and moves forward per set bit -- confirm.
181 static uint8_t make_compress(
int mask,
int offset = 0)
185 for (
int i = 0; i < 4; ++i)
// Store source lane i in the 2-bit field of output slot n; negative slot positions are skipped.
191 if (n >= 0) ret |= (i & 3) << (2 * n);
// Computes a uint8_t from `mask` using a 4-iteration loop.
// NOTE(review): the loop body and return are not shown in this view; presumably this
// counts the set bits among the 4 low bits of `mask` -- confirm.
198 static uint8_t count(
int mask)
201 for (
int i = 0; i < 4; ++i)
// Table setup. The enclosing function header is not shown in this view (presumably the
// constructor -- confirm).
// For every 4-bit lane mask i and every table slot o, store the packed permutation byte.
211 for (
int i = 0; i < 16; ++i)
213 for (
int o = 0; o < 7; ++o)
// Slots 0..3 use offset o directly; slots 4..6 map to the negative offsets -3..-1.
215 idx[o][i] = make_compress(i, o < 4 ? o : o - 7);
// selector[k] has all bits set in lanes 0..k-1 and zero in the remaining lanes.
// NOTE(review): _mm_setr_epi32 and _mm_castsi128_ps are SSE2 intrinsics declared in
// <emmintrin.h>; only <xmmintrin.h> is included in the visible lines -- presumably the
// declarations arrive through another header; confirm.
221 selector[0] = _mm_castsi128_ps(_mm_setr_epi32(0, 0, 0, 0));
222 selector[1] = _mm_castsi128_ps(_mm_setr_epi32(-1, 0, 0, 0));
223 selector[2] = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, 0, 0));
224 selector[3] = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
// Builds a Packet4f whose output lane k is element (i >> 2k) & 3 of `u`, i.e. `i` is read
// as four packed 2-bit indices.
// NOTE(review): `u` is defined on lines not shown in this view; presumably it exposes the
// four floats of `p` -- confirm.
227 static EIGEN_STRONG_INLINE internal::Packet4f permute(
const internal::Packet4f& p, uint8_t i)
231 return _mm_setr_ps(u[i & 3], u[(i >> 2) & 3], u[(i >> 4) & 3], u[(i >> 6) & 3]);
// Value that compress_append compares lane counts against for this specialization.
236 enum { full_size = 4 };
// Accessor built around a function-local static CompressMask instance.
// The return statement is not shown in this view.
238 static const CompressMask& get_inst()
240 static CompressMask cm;
// Uses the lookup tables to permute `_value` by the lane mask taken from `_mask` and to
// blend the result with `_rest`.
//   _value   - input packet, viewed as Packet4f.
//   _mask    - per-lane mask packet, viewed as Packet4f; only its movemask bits are used here.
//   _rest    - packet carrying lanes from earlier calls; overwritten on the overflow path.
//   rest_cnt - current fill level; used as an index into idx and selector.
//   full     - bool passed by reference; the lines that write it are not shown in this view.
// The visible return yields new_cnt - full_size; other return paths are not shown.
244 template<
typename Packet>
245 EIGEN_STRONG_INLINE
int compress_append(Packet& _value,
const Packet& _mask,
246 Packet& _rest,
int rest_cnt,
bool& full)
const
// View the generic Packet arguments as Packet4f without converting them.
248 auto& value =
reinterpret_cast<internal::Packet4f&
>(_value);
249 auto& mask =
reinterpret_cast<const internal::Packet4f&
>(_mask);
250 auto& rest =
reinterpret_cast<internal::Packet4f&
>(_rest);
// m holds one bit per lane, taken from the sign bit of each lane of `mask`.
252 int m = _mm_movemask_ps(mask);
// Special case when the count for this mask equals full_size (branch body not shown).
253 if (cnt[m] == full_size)
// Permute `value` with the packed index byte for the current fill level.
259 auto p1 = permute(value, idx[rest_cnt][m]);
// selector[rest_cnt] has its first rest_cnt lanes set.
// NOTE(review): presumably pblendv keeps `rest` in those lanes and p1 elsewhere -- confirm.
260 p1 = internal::pblendv(selector[rest_cnt], rest, p1);
262 auto new_cnt = rest_cnt + cnt[m];
263 if (new_cnt >= full_size)
265 if (new_cnt > full_size)
// new_cnt - cnt[m] equals rest_cnt, so this reads slot rest_cnt + full_size - 1, which
// for rest_cnt >= 1 is one of the slots filled with a negative offset.
267 rest = permute(value, idx[new_cnt - cnt[m] + full_size - 1][m]);
// Amount by which the combined count exceeds full_size.
271 return new_cnt - full_size;