12 #ifndef EIGENRAND_PACKET_RANDOM_ENGINE_H 
   13 #define EIGENRAND_PACKET_RANDOM_ENGINE_H 
   17 #include <type_traits> 
   26         struct IsIntPacket : std::false_type {};
 
   31 #ifdef EIGEN_VECTORIZE_AVX2 
   33         struct IsIntPacket<Packet8i> : std::true_type {};
 
   36         struct HalfPacket<Packet8i>
 
   38             using type = Packet4i;
 
   41 #ifdef EIGEN_VECTORIZE_SSE2 
   43         struct IsIntPacket<Packet4i> : std::true_type {};
 
   46         struct HalfPacket<Packet4i>
 
   48             using type = uint64_t;
 
   58             auto test_integral_result_type(
int)->std::integral_constant<bool, std::is_integral<typename T::result_type>::value>;
 
   61             auto test_integral_result_type(...)->std::false_type;
 
   64             auto test_intpacket_result_type(
int)->std::integral_constant<bool, internal::IsIntPacket<typename T::result_type>::value>;
 
   67             auto test_intpacket_result_type(...)->std::false_type;
 
   71         struct IsScalarRandomEngine : decltype(detail::test_integral_result_type<Ty>(0))
 
   76         struct IsPacketRandomEngine : decltype(detail::test_intpacket_result_type<Ty>(0))
 
   80         enum class RandomEngineType
 
   86         struct GetRandomEngineType : std::integral_constant <
 
   88             IsPacketRandomEngine<Ty>::value ? RandomEngineType::packet :
 
   89             (IsScalarRandomEngine<Ty>::value ? RandomEngineType::scalar : RandomEngineType::none)
 
   94         template<
typename Ty, 
size_t length, 
size_t alignment = 64>
 
  101                 for (
size_t i = 0; i < length; ++i)
 
  103                     new (&aligned[i]) Ty();
 
  107             AlignedArray(
const AlignedArray& o)
 
  110                 for (
size_t i = 0; i < length; ++i)
 
  116             AlignedArray(AlignedArray&& o)
 
  118                 std::swap(memory, o.memory);
 
  119                 std::swap(aligned, o.aligned);
 
  122             AlignedArray& operator=(
const AlignedArray& o)
 
  124                 for (
size_t i = 0; i < length; ++i)
 
  131             AlignedArray& operator=(AlignedArray&& o)
 
  133                 std::swap(memory, o.memory);
 
  134                 std::swap(aligned, o.aligned);
 
  143             Ty& operator[](
size_t i)
 
  148             const Ty& operator[](
size_t i)
 const 
  163             const Ty* data()
 const 
  171                 memory = std::malloc(
sizeof(Ty) * length + alignment);
 
  172                 aligned = (Ty*)(((
size_t)memory + alignment) & ~(alignment - 1));
 
  179                     for (
size_t i = 0; i < length; ++i)
 
  189             void* memory = 
nullptr;
 
  190             Ty* aligned = 
nullptr;
 
  193 #ifndef EIGEN_DONT_VECTORIZE 
  214         template<
typename Packet,
 
  216             int _Rx, uint64_t _Px,
 
  217             int _Ux, uint64_t _Dx,
 
  218             int _Sx, uint64_t _Bx,
 
  219             int _Tx, uint64_t _Cx,
 
  220             int _Lx, uint64_t _Fx>
 
  224             using result_type = Packet;
 
  226             static constexpr 
int word_size = 64;
 
  227             static constexpr 
int state_size = _Nx;
 
  228             static constexpr 
int shift_size = _Mx;
 
  229             static constexpr 
int mask_bits = _Rx;
 
  230             static constexpr uint64_t parameter_a = _Px;
 
  231             static constexpr 
int output_u = _Ux;
 
  232             static constexpr 
int output_s = _Sx;
 
  233             static constexpr uint64_t output_b = _Bx;
 
  234             static constexpr 
int output_t = _Tx;
 
  235             static constexpr uint64_t output_c = _Cx;
 
  236             static constexpr 
int output_l = _Lx;
 
  238             static constexpr uint64_t default_seed = 5489U;
 
  250                 using namespace Eigen::internal;
 
  251                 std::array<uint64_t, unpacket_traits<Packet>::size / 2> seeds;
 
  252                 for (uint64_t i = 0; i < seeds.size(); ++i)
 
  256                 seed(ploadu<Packet>((
int*)seeds.data()));
 
  276                 using namespace Eigen::internal;
 
  277                 Packet prev = state[0] = x0;
 
  278                 for (
int i = 1; i < _Nx; ++i)
 
  280                     prev = state[i] = pmuluadd64(pxor(prev, psrl64(prev, word_size - 2)), _Fx, i);
 
  317                 else if (2 * _Nx <= stateIdx)
 
  320                 using namespace Eigen::internal;
 
  322                 Packet res = state[stateIdx++];
 
  323                 res = pxor(res, pand(psrl64(res, _Ux), pseti64<Packet>(_Dx)));
 
  324                 res = pxor(res, pand(psll64(res, _Sx), pseti64<Packet>(_Bx)));
 
  325                 res = pxor(res, pand(psll64(res, _Tx), pseti64<Packet>(_Cx)));
 
  326                 res = pxor(res, psrl64(res, _Lx));
 
  337                 for (; 0 < num; --num)
 
  343             typename internal::HalfPacket<Packet>::type half()
 
  350                 typename internal::HalfPacket<Packet>::type a;
 
  351                 internal::split_two(
operator()(), a, cache);
 
  360                 using namespace Eigen::internal;
 
  362                 auto hmask = pseti64<Packet>(_hMask),
 
  363                     lmask = pseti64<Packet>(_lMask),
 
  364                     px = pseti64<Packet>(_Px),
 
  365                     one = pseti64<Packet>(1);
 
  368                 for (i = 0; i < _Nx - _Mx; ++i)
 
  370                     Packet tmp = por(pand(state[i + _Nx], hmask),
 
  371                         pand(state[i + _Nx + 1], lmask));
 
  373                     state[i] = pxor(pxor(
 
  375                         pand(pcmpeq64(pand(tmp, one), one), px)),
 
  380                 for (; i < _Nx - 1; ++i)
 
  382                     Packet tmp = por(pand(state[i + _Nx], hmask),
 
  383                         pand(state[i + _Nx + 1], lmask));
 
  385                     state[i] = pxor(pxor(
 
  387                         pand(pcmpeq64(pand(tmp, one), one), px)),
 
  392                 Packet tmp = por(pand(state[i + _Nx], hmask),
 
  393                     pand(state[0], lmask));
 
  394                 state[i] = pxor(pxor(
 
  396                     pand(pcmpeq64(pand(tmp, one), one), px)),
 
  404                 using namespace Eigen::internal;
 
  406                 auto hmask = pseti64<Packet>(_hMask),
 
  407                     lmask = pseti64<Packet>(_lMask),
 
  408                     px = pseti64<Packet>(_Px),
 
  409                     one = pseti64<Packet>(1);
 
  411                 for (
int i = _Nx; i < 2 * _Nx; ++i)
 
  413                     Packet tmp = por(pand(state[i - _Nx], hmask),
 
  414                         pand(state[i - _Nx + 1], lmask));
 
  416                     state[i] = pxor(pxor(
 
  418                         pand(pcmpeq64(pand(tmp, one), one), px)),
 
  424             AlignedArray<Packet, _Nx * 2> state;
 
  426             typename internal::HalfPacket<Packet>::type cache;
 
  429             static constexpr uint64_t _wMask = (uint64_t)-1;
 
  430             static constexpr uint64_t _hMask = (_wMask << _Rx) & _wMask;
 
  431             static constexpr uint64_t _lMask = ~_hMask & _wMask;
 
  439         template<
typename Packet>
 
  441             0xb5026f5aa96619e9, 29,
 
  442             0x5555555555555555, 17,
 
  443             0x71d67fffeda60000, 37,
 
  444             0xfff7eee000000000, 43, 6364136223846793005>;
 
  447         template<
typename UIntType, 
typename BaseRng, 
int numU64>
 
  448         class ParallelRandomEngineAdaptor
 
  450             static_assert(GetRandomEngineType<BaseRng>::value != RandomEngineType::none, 
"BaseRng must be a kind of Random Engine.");
 
  452             using result_type = UIntType;
 
  454             ParallelRandomEngineAdaptor(
size_t seed = BaseRng::default_seed)
 
  456                 for (
int i = 0; i < num_parallel; ++i)
 
  459                     new (&rngs[i]) BaseRng{ 
seed + i * u64_stride };
 
  463             ParallelRandomEngineAdaptor(
const BaseRng& o)
 
  465                 for (
int i = 0; i < num_parallel; ++i)
 
  468                     new (&rngs[i]) BaseRng{ o };
 
  472             ParallelRandomEngineAdaptor(
const ParallelRandomEngineAdaptor&) = 
default;
 
  473             ParallelRandomEngineAdaptor(ParallelRandomEngineAdaptor&&) = 
default;
 
  475             static constexpr result_type 
min()
 
  477                 return std::numeric_limits<result_type>::min();
 
  480             static constexpr result_type 
max()
 
  482                 return std::numeric_limits<result_type>::max();
 
  496                 if (fcnt >= fbuf_size)
 
  507                 for (
size_t i = 0; i < num_parallel; ++i)
 
  509                     reinterpret_cast<typename BaseRng::result_type&
>(buf[i * result_type_stride]) = rngs[i]();
 
  513             void refill_fbuffer()
 
  516                 for (
size_t i = 0; i < num_parallel; ++i)
 
  518                     auto urf = internal::bit_to_ur_float(rngs[i]());
 
  519                     reinterpret_cast<decltype(urf)&
>(fbuf[i * u64_stride * 2]) = urf;
 
  523             static constexpr 
int u64_stride = 
sizeof(
typename BaseRng::result_type) / 
sizeof(uint64_t);
 
  524             static constexpr 
int result_type_stride = 
sizeof(
typename BaseRng::result_type) / 
sizeof(result_type);
 
  525             static constexpr 
int num_parallel = numU64 / u64_stride;
 
  526             static constexpr 
int byte_size = 
sizeof(uint64_t) * numU64;
 
  527             static constexpr 
size_t buf_size = byte_size / 
sizeof(result_type);
 
  528             static constexpr 
size_t fbuf_size = byte_size / 
sizeof(float);
 
  530             std::array<BaseRng, num_parallel> rngs;
 
  531             AlignedArray<result_type, buf_size> buf;
 
  532             AlignedArray<float, fbuf_size> fbuf;
 
  533             size_t cnt = buf_size, fcnt = fbuf_size;
 
  542         template<
typename UIntType, 
typename BaseRng>
 
  544             sizeof(
typename BaseRng::result_type) / 
sizeof(uint64_t)>;
 
  546         template<
typename BaseRng>
 
  547         class RandomEngineWrapper : 
public BaseRng
 
  550             using BaseRng::BaseRng;
 
  552             RandomEngineWrapper(
const BaseRng& o) : BaseRng{ o }
 
  556             RandomEngineWrapper(BaseRng&& o) : BaseRng{ o }
 
  560             RandomEngineWrapper(
size_t seed) : BaseRng{ 
seed }
 
  564             RandomEngineWrapper() = 
default;
 
  565             RandomEngineWrapper(
const RandomEngineWrapper&) = 
default;
 
  566             RandomEngineWrapper(RandomEngineWrapper&&) = 
default;
 
  570                 internal::bit_scalar<float> bs;
 
  571                 return bs.to_ur(this->
operator()());
 
  575         template<
typename UIntType, 
typename Rng>
 
  576         using UniversalRandomEngine = 
typename std::conditional<
 
  577             IsPacketRandomEngine<typename std::remove_reference<Rng>::type>::value,
 
  578             PacketRandomEngineAdaptor<UIntType, typename std::remove_reference<Rng>::type>,
 
  579             typename std::conditional<
 
  580             IsScalarRandomEngine<typename std::remove_reference<Rng>::type>::value,
 
  581             RandomEngineWrapper<typename std::remove_reference<Rng>::type>,
 
  594         template<
typename UIntType, 
typename Rng>
 
  597             return { std::forward<Rng>(rng) };
 
  600 #ifdef EIGEN_VECTORIZE_AVX2 
  601         using Vmt19937_64 = Pmt19937_64<internal::Packet8i>;
 
  602 #elif defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_SSE2) 
  603         using Vmt19937_64 = Pmt19937_64<internal::Packet4i>;
 
  619         template<
typename UIntType = u
int64_t>
 
  620         using P8_mt19937_64 = ParallelRandomEngineAdaptor<UIntType, Vmt19937_64, 8>;