12 #ifndef EIGENRAND_PACKET_FILTER_H 
   13 #define EIGENRAND_PACKET_FILTER_H 
   24             template<
size_t PacketSize>
 
   29 #ifdef EIGEN_VECTORIZE_AVX 
   30 #include <immintrin.h> 
   // Specialization of CompressMask for a PacketSize of 32; its tables hold
   // internal::Packet8i / internal::Packet8f values (8 lanes, cf. full_size = 8).
   38             class CompressMask<32>
 
   // idx[o][m]: permutation index packet for offset slot o (15 slots) and lane mask m
   // (256 values). Filled from make_compress(m, o < 8 ? o : o - 15) and read by
   // compress_append as the index argument of permute().
   40                 std::array<std::array<internal::Packet8i, 256>, 15> idx;
 
   // selector[k]: blend mask whose first k 32-bit lanes are -1 (all bits set) and whose
   // remaining lanes are 0; compress_append passes selector[rest_cnt] to pblendv.
   41                 std::array<internal::Packet8f, 8> selector;
 
   // cnt[m]: per-mask count that compress_append compares with full_size and adds to rest_cnt.
   // NOTE(review): presumably the number of set bits in m as produced by count() -- confirm.
   42                 std::array<uint8_t, 256> cnt;
 
   // Builds one permutation index packet for the lane mask `mask`.
   // The result starts as eight zero int32 entries; entry n is set to the source lane
   // index i whenever the write position n is non-negative, and the array is then
   // loaded (unaligned) into a Packet8i.
   // NOTE(review): n presumably starts at `offset` and advances once per selected lane,
   // so a negative offset drops the first selected lanes -- verify against the full body.
   44                 static internal::Packet8i make_compress(
int mask, 
int offset = 0)
 
   // Index table, zero-initialised so unwritten slots refer to lane 0.
   46                     int32_t ret[8] = { 0, };
 
   // Visit the 8 lane positions in ascending order.
   48                     for (
int i = 0; i < 8; ++i)
 
   // Only store once the write position is inside the output packet (n >= 0).
   54                             if (n >= 0) ret[n] = i;
 
   // Unaligned load: `ret` is a plain stack array with no 32-byte alignment guarantee.
   58                     return _mm256_loadu_si256((internal::Packet8i*)ret);
 
   // Derives a uint8_t from `mask` by iterating over 8 bit positions.
   // NOTE(review): presumably the population count of the low 8 bits of mask -- confirm.
   61                 static uint8_t count(
int mask)
 
   // One iteration per lane of an 8-lane packet.
   64                     for (
int i = 0; i < 8; ++i)
 
   // Table construction: enumerate every possible 8-bit lane mask (0..255).
   74                     for (
int i = 0; i < 256; ++i)
 
   // Each mask gets 15 precomputed index packets, one per offset slot.
   76                         for (
int o = 0; o < 15; ++o)
 
   // Slots 0..7 use the non-negative offset o; slots 8..14 map to the negative
   // offsets -7..-1 (o - 15).
   78                             idx[o][i] = make_compress(i, o < 8 ? o : o - 15);
 
   // selector[k]: the first k 32-bit lanes are -1 (all bits set), the remaining lanes are 0.
   // The integer vectors are reinterpreted (not converted) as float packets.
   84                     selector[0] = _mm256_castsi256_ps(_mm256_setr_epi32(0, 0, 0, 0, 0, 0, 0, 0));
 
   85                     selector[1] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, 0, 0, 0, 0, 0, 0, 0));
 
   86                     selector[2] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, 0, 0, 0, 0, 0, 0));
 
   87                     selector[3] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0));
 
   88                     selector[4] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0));
 
   89                     selector[5] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, 0, 0, 0));
 
   90                     selector[6] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0));
 
   91                     selector[7] = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0));
 
   // Rearranges the eight float lanes of `p` according to the 32-bit lane indices in `i`.
   94                 static EIGEN_STRONG_INLINE internal::Packet8f permute(
const internal::Packet8f& p, 
const internal::Packet8i& i)
 
   // AVX2 offers a single full-width variable permute.
   96 #ifdef EIGEN_VECTORIZE_AVX2 
   97                     return _mm256_permutevar8x32_ps(p, i);
 
   // NOTE(review): the statements below are presumably the non-AVX2 fallback -- confirm
   // against the preprocessor structure of the full file.
   // _mm256_permutevar_ps only shuffles within each 128-bit half, so build two candidates:
   // l picks from the element's own half ...
   99                     auto l = _mm256_permutevar_ps(p, i);
 
   // ... and h picks from the opposite half (0x01 swaps the two 128-bit halves of p).
  100                     auto h = _mm256_permutevar_ps(_mm256_permute2f128_ps(p, p, 0x01), i);
 
  101                     internal::Packet4i i1, i2;
 
   // NOTE(review): presumably i1 / i2 receive the low / high 128-bit halves of i -- confirm.
  102                     internal::split_two(i, i1, i2);
 
   // Shifting left by 29 moves bit 2 of each index (the "index >= 4" bit for indices 0..7)
   // into the sign bit.
  103                     i1 = _mm_slli_epi32(i1, 29);
 
  104                     i2 = _mm_slli_epi32(i2, 29);
 
   // Per-lane choice mask c: for i1 it is set where the shifted index is negative (bit 2 set);
   // for i2 it is set where the shifted index is greater than -1 (bit 2 clear).
  105                     auto c = _mm256_castsi256_ps(
 
  106                         internal::combine_two(
 
  107                             _mm_cmplt_epi32(i1, internal::pset1<internal::Packet4i>(0)),
 
  108                             _mm_cmplt_epi32(internal::pset1<internal::Packet4i>(-1), i2)
 
   // NOTE(review): presumably takes h where c is set and l elsewhere -- confirm pblendv's
   // argument order.
  111                     return internal::pblendv(c, h, l);
 
  // Number of 32-bit lanes in the packets handled by this specialization.
  116                 enum { full_size = 8 };
 
  // Accessor for a shared CompressMask instance.
  117                 static const CompressMask& get_inst()
 
  // Function-local static: the tables are built once, on first use.
  // NOTE(review): presumably `cm` is what gets returned -- the return line is not shown here.
  119                     static CompressMask cm;
 
  // Packs the lanes of _value selected by _mask and combines them with the lanes already
  // held in _rest (rest_cnt of them), using the precomputed idx / selector / cnt tables.
  // Returns an int lane count; in the visible ">= full_size" branch this is
  // new_cnt - full_size.
  // NOTE(review): `full` is presumably set to tell the caller whether a complete packet
  // is available -- the assignments are not visible here; confirm.
  123                 template<
typename Packet>
 
  124                 EIGEN_STRONG_INLINE 
int compress_append(Packet& _value, 
const Packet& _mask,
 
  125                     Packet& _rest, 
int rest_cnt, 
bool& full)
 const 
  // View the generic Packet arguments as 8-lane float packets.
  127                     auto& value = 
reinterpret_cast<internal::Packet8f&
>(_value);
 
  128                     auto& mask = 
reinterpret_cast<const internal::Packet8f&
>(_mask);
 
  129                     auto& rest = 
reinterpret_cast<internal::Packet8f&
>(_rest);
 
  // Collect the sign bit of every lane into an 8-bit integer used as the table index.
  131                     int m = _mm256_movemask_ps(mask);
 
  // Special case for masks whose count equals a whole packet.
  132                     if (cnt[m] == full_size)
 
  // Permute value with the index packet for offset slot rest_cnt and mask m ...
  138                     auto p1 = permute(value, idx[rest_cnt][m]);
 
  // ... then blend with `rest` under selector[rest_cnt] (first rest_cnt lanes set).
  // NOTE(review): presumably keeps the first rest_cnt lanes from rest -- confirm pblendv order.
  139                     p1 = internal::pblendv(selector[rest_cnt], rest, p1);
 
  141                     auto new_cnt = rest_cnt + cnt[m];
 
  142                     if (new_cnt >= full_size)
 
  144                         if (new_cnt > full_size)
 
  // new_cnt - cnt[m] equals rest_cnt, so this reads slot rest_cnt + full_size - 1, i.e. one
  // of the negative-offset slots (8..14) when rest_cnt is 1..7.
  146                             rest = permute(value, idx[new_cnt - cnt[m] + full_size - 1][m]);
 
  // Lane count in excess of one full packet.
  150                         return new_cnt - full_size;
 
  165 #ifdef EIGEN_VECTORIZE_SSE2 
  166 #include <xmmintrin.h> 
  // Specialization of CompressMask for a PacketSize of 16; works on internal::Packet4f
  // values (4 lanes, cf. full_size = 4).
  175             class CompressMask<16>
 
  // idx[o][m]: byte-encoded permutation (four 2-bit lane indices, decoded by permute())
  // for offset slot o (7 slots) and lane mask m (16 values).
  177                 std::array<std::array<uint8_t, 16>, 7> idx;
 
  // selector[k]: blend mask whose first k 32-bit lanes are -1 and whose remaining lanes are 0.
  178                 std::array<internal::Packet4f, 4> selector;
 
  // cnt[m]: per-mask count that compress_append compares with full_size and adds to rest_cnt.
  // NOTE(review): sized 64, while the table loop and a 4-lane movemask only cover
  // indices 0..15 -- confirm whether the extra capacity is intentional.
  179                 std::array<uint8_t, 64> cnt;
 
  // Builds the byte-encoded permutation for the lane mask `mask`.
  // NOTE(review): n presumably starts at `offset` and advances once per selected lane --
  // verify against the full body.
  181                 static uint8_t make_compress(
int mask, 
int offset = 0)
 
  // Visit the 4 lane positions in ascending order.
  185                     for (
int i = 0; i < 4; ++i)
 
  // Store source lane i (2 bits) in the 2-bit field for output position n; skipped while n < 0.
  191                             if (n >= 0) ret |= (i & 3) << (2 * n);
 
  // Derives a uint8_t from `mask` by iterating over 4 bit positions.
  // NOTE(review): presumably the population count of the low 4 bits of mask -- confirm.
  198                 static uint8_t count(
int mask)
 
  // One iteration per lane of a 4-lane packet.
  201                     for (
int i = 0; i < 4; ++i)
 
  // Table construction: enumerate every possible 4-bit lane mask (0..15).
  211                     for (
int i = 0; i < 16; ++i)
 
  // Each mask gets 7 precomputed permutations, one per offset slot.
  213                         for (
int o = 0; o < 7; ++o)
 
  // Slots 0..3 use the non-negative offset o; slots 4..6 map to the negative
  // offsets -3..-1 (o - 7).
  215                             idx[o][i] = make_compress(i, o < 4 ? o : o - 7);
 
  // selector[k]: the first k 32-bit lanes are -1 (all bits set), the remaining lanes are 0.
  // The integer vectors are reinterpreted (not converted) as float packets.
  221                     selector[0] = _mm_castsi128_ps(_mm_setr_epi32(0, 0, 0, 0));
 
  222                     selector[1] = _mm_castsi128_ps(_mm_setr_epi32(-1, 0, 0, 0));
 
  223                     selector[2] = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, 0, 0));
 
  224                     selector[3] = _mm_castsi128_ps(_mm_setr_epi32(-1, -1, -1, 0));
 
  // Rearranges the four float lanes of `p` according to the byte-encoded permutation `i`.
  227                 static EIGEN_STRONG_INLINE internal::Packet4f permute(
const internal::Packet4f& p, uint8_t i)
 
  // Output lane k takes u[(i >> 2k) & 3], i.e. the k-th 2-bit field of i.
  // NOTE(review): `u` is presumably a float view of the lanes of p -- its definition is not shown.
  231                     return _mm_setr_ps(u[i & 3], u[(i >> 2) & 3], u[(i >> 4) & 3], u[(i >> 6) & 3]);
 
  // Number of 32-bit lanes in the packets handled by this specialization.
  236                 enum { full_size = 4 };
 
  // Accessor for a shared CompressMask instance.
  238                 static const CompressMask& get_inst()
 
  // Function-local static: the tables are built once, on first use.
  // NOTE(review): presumably `cm` is what gets returned -- the return line is not shown here.
  240                     static CompressMask cm;
 
  // Packs the lanes of _value selected by _mask and combines them with the lanes already
  // held in _rest (rest_cnt of them), using the precomputed idx / selector / cnt tables.
  // Returns an int lane count; in the visible ">= full_size" branch this is
  // new_cnt - full_size.
  // NOTE(review): `full` is presumably set to tell the caller whether a complete packet
  // is available -- the assignments are not visible here; confirm.
  244                 template<
typename Packet>
 
  245                 EIGEN_STRONG_INLINE 
int compress_append(Packet& _value, 
const Packet& _mask,
 
  246                     Packet& _rest, 
int rest_cnt, 
bool& full)
 const 
  // View the generic Packet arguments as 4-lane float packets.
  248                     auto& value = 
reinterpret_cast<internal::Packet4f&
>(_value);
 
  249                     auto& mask = 
reinterpret_cast<const internal::Packet4f&
>(_mask);
 
  250                     auto& rest = 
reinterpret_cast<internal::Packet4f&
>(_rest);
 
  // Collect the sign bit of every lane into a 4-bit integer used as the table index.
  252                     int m = _mm_movemask_ps(mask);
 
  // Special case for masks whose count equals a whole packet.
  253                     if (cnt[m] == full_size)
 
  // Permute value with the encoded permutation for offset slot rest_cnt and mask m ...
  259                     auto p1 = permute(value, idx[rest_cnt][m]);
 
  // ... then blend with `rest` under selector[rest_cnt] (first rest_cnt lanes set).
  // NOTE(review): presumably keeps the first rest_cnt lanes from rest -- confirm pblendv order.
  260                     p1 = internal::pblendv(selector[rest_cnt], rest, p1);
 
  262                     auto new_cnt = rest_cnt + cnt[m];
 
  263                     if (new_cnt >= full_size)
 
  265                         if (new_cnt > full_size)
 
  // new_cnt - cnt[m] equals rest_cnt, so this reads slot rest_cnt + full_size - 1, i.e. one
  // of the negative-offset slots (4..6) when rest_cnt is 1..3.
  267                             rest = permute(value, idx[new_cnt - cnt[m] + full_size - 1][m]);
 
  // Lane count in excess of one full packet.
  271                         return new_cnt - full_size;