diff --git a/src/include/clRNG/private/Random123/MicroURNG.hpp b/src/include/clRNG/private/Random123/MicroURNG.hpp new file mode 100644 index 0000000..beb0341 --- /dev/null +++ b/src/include/clRNG/private/Random123/MicroURNG.hpp @@ -0,0 +1,146 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __MicroURNG_dot_hpp__ +#define __MicroURNG_dot_hpp__ + +#include +#include + +namespace r123{ +/** + Given a CBRNG whose ctr_type has an unsigned integral value_type, + MicroURNG(c, k) is a type that satisfies the + requirements of a C++0x Uniform Random Number Generator. + + The intended purpose is for a MicroURNG to be passed + as an argument to a C++0x Distribution, e.g., + std::normal_distribution. See examples/MicroURNG.cpp. + + The MicroURNG functor has a period of "only" + + ctr_type.size()*2^32, + + after which it will silently repeat. + + The high 32 bits of the highest word in the counter c, passed to + the constructor must be zero. MicroURNG uses these bits to + "count". + + Older versions of the library permitted a second template + parameter by which the caller could control the number of + bits devoted to the URNG's internal counter. This flexibility + has been disabled because URNGs created with different + numbers of counter bits could, conceivably "collide". + +\code + typedef ?someCBRNG? RNG; + RNG::ctr_type c = ...; // under application control + RNG::key_type k = ...; // + std::normal_distribution nd; + MicroURNG urng(c, k); + for(???){ + ... + nd(urng); // may be called several hundred times with BITS=10 + ... + } +\endcode +*/ + +template +class MicroURNG{ + // According to C++0x, a URNG requires only a result_type, + // operator()(), min() and max() methods. Everything else + // (ctr_type, key_type, reset() method, etc.) is "value added" + // for the benefit of users that "know" that they're dealing with + // a MicroURNG. 
+public: + typedef CBRNG cbrng_type; + static const int BITS = 32; + typedef typename cbrng_type::ctr_type ctr_type; + typedef typename cbrng_type::key_type key_type; + typedef typename cbrng_type::ukey_type ukey_type; + typedef typename ctr_type::value_type result_type; + + R123_STATIC_ASSERT( std::numeric_limits::digits >= BITS, "The result_type must have at least 32 bits" ); + + result_type operator()(){ + if(last_elem == 0){ + // jam n into the high bits of c + const size_t W = std::numeric_limits::digits; + ctr_type c = c0; + c[c0.size()-1] |= n<<(W-BITS); + rdata = b(c,k); + n++; + last_elem = rdata.size(); + } + return rdata[--last_elem]; + } + MicroURNG(cbrng_type _b, ctr_type _c0, ukey_type _uk) : b(_b), c0(_c0), k(_uk), n(0), last_elem(0) { + chkhighbits(); + } + MicroURNG(ctr_type _c0, ukey_type _uk) : b(), c0(_c0), k(_uk), n(0), last_elem(0) { + chkhighbits(); + } + + // _Min and _Max work around a bug in the library shipped with MacOS Xcode 4.5.2. + // See the commment in conventional/Engine.hpp. 
+ const static result_type _Min = 0; + const static result_type _Max = ~((result_type)0); + + static R123_CONSTEXPR result_type min R123_NO_MACRO_SUBST () { return _Min; } + static R123_CONSTEXPR result_type max R123_NO_MACRO_SUBST () { return _Max; } + // extra methods: + const ctr_type& counter() const{ return c0; } + void reset(ctr_type _c0, ukey_type _uk){ + c0 = _c0; + chkhighbits(); + k = _uk; + n = 0; + last_elem = 0; + } + +private: + cbrng_type b; + ctr_type c0; + key_type k; + R123_ULONG_LONG n; + size_t last_elem; + ctr_type rdata; + void chkhighbits(){ + result_type r = c0[c0.size()-1]; + result_type mask = ((uint64_t)std::numeric_limits::max R123_NO_MACRO_SUBST ())>>BITS; + if((r&mask) != r) + throw std::runtime_error("MicroURNG: c0, does not have high bits clear"); + } +}; +} // namespace r123 +#endif diff --git a/src/include/clRNG/private/Random123/ReinterpretCtr.hpp b/src/include/clRNG/private/Random123/ReinterpretCtr.hpp new file mode 100644 index 0000000..164a38b --- /dev/null +++ b/src/include/clRNG/private/Random123/ReinterpretCtr.hpp @@ -0,0 +1,88 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __ReinterpretCtr_dot_hpp__ +#define __ReinterpretCtr_dot_hpp__ + +#include "features/compilerfeatures.h" +#include + +namespace r123{ +/*! + ReinterpretCtr uses memcpy to map back and forth + between a CBRNG's ctr_type and the specified ToType. For example, + after: + + typedef ReinterpretCtr G; + + G is a bona fide CBRNG with ctr_type r123array4x32. + + WARNING: ReinterpretCtr is endian dependent. The + values returned by G, declared as above, + will depend on the endianness of the machine on which it runs. + */ + +template +struct ReinterpretCtr{ + typedef ToType ctr_type; + typedef typename CBRNG::key_type key_type; + typedef typename CBRNG::ctr_type bctype; + typedef typename CBRNG::ukey_type ukey_type; + R123_STATIC_ASSERT(sizeof(ToType) == sizeof(bctype) && sizeof(typename bctype::value_type) != 16, + "ReinterpretCtr: sizeof(ToType) is not the same as sizeof(CBRNG::ctr_type) or CBRNG::ctr_type::value_type looks like it might be __m128i"); + // It's amazingly difficult to safely do conversions with __m128i. 
+ // If we use the operator() implementation below with a CBRNG + // whose ctr_type is r123array1xm128i, gcc4.6 optimizes away the + // memcpys, inlines the operator()(c,k), and produces assembly + // language that ends with an aesenclast instruction with a + // destination operand pointing to an unaligned memory address ... + // Segfault! See: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50444 + // MSVC also produces code that crashes. We suspect a + // similar mechanism but haven't done the debugging necessary to + // be sure. We were able to 'fix' gcc4.6 by making bc a mutable + // data member rather than declaring it in the scope of + // operator(). That didn't fix the MSVC problems, though. + // + // Conclusion - don't touch __m128i, at least for now. The + // easiest (but highly imprecise) way to do that is the static + // assertion above that rejects bctype::value_types of size 16. - + // Sep 2011. + ctr_type operator()(ctr_type c, key_type k){ + bctype bc; + std::memcpy(&bc, &c, sizeof(c)); + CBRNG b; + bc = b(bc, k); + std::memcpy(&c, &bc, sizeof(bc)); + return c; + } +}; +} // namespace r123 +#endif diff --git a/src/include/clRNG/private/Random123/aes.h b/src/include/clRNG/private/Random123/aes.h new file mode 100644 index 0000000..96e3c9c --- /dev/null +++ b/src/include/clRNG/private/Random123/aes.h @@ -0,0 +1,344 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __Random123_aes_dot_hpp__ +#define __Random123_aes_dot_hpp__ + +#include "features/compilerfeatures.h" +#include "array.h" + +/* Implement a bona fide AES block cipher. It's minimally +// checked against the test vector in FIPS-197 in ut_aes.cpp. 
*/
+#if R123_USE_AES_NI
+
+/** @ingroup AESNI */
+typedef struct r123array1xm128i aesni1xm128i_ctr_t;
+/** @ingroup AESNI */
+typedef struct r123array1xm128i aesni1xm128i_ukey_t;
+/** @ingroup AESNI */
+typedef struct r123array4x32 aesni4x32_ukey_t;
+/** @ingroup AESNI */
+enum r123_enum_aesni1xm128i { aesni1xm128i_rounds = 10 };
+
+/** \cond HIDDEN_FROM_DOXYGEN */
+R123_STATIC_INLINE __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2) { /* one key-schedule step: derives the next round key from temp1 and the keygenassist result temp2 */
+    __m128i temp3;
+    temp2 = _mm_shuffle_epi32 (temp2 ,0xff); /* broadcast the top 32-bit lane of temp2 to all four lanes */
+    temp3 = _mm_slli_si128 (temp1, 0x4);
+    temp1 = _mm_xor_si128 (temp1, temp3);
+    temp3 = _mm_slli_si128 (temp3, 0x4);
+    temp1 = _mm_xor_si128 (temp1, temp3);
+    temp3 = _mm_slli_si128 (temp3, 0x4);
+    temp1 = _mm_xor_si128 (temp1, temp3); /* temp1 is now the input xored with itself shifted left by 4, 8 and 12 bytes */
+    temp1 = _mm_xor_si128 (temp1, temp2);
+    return temp1;
+}
+
+R123_STATIC_INLINE void aesni1xm128iexpand(aesni1xm128i_ukey_t uk, __m128i ret[11]) /* fills ret[0..10] with the round keys derived from uk; ret[0] is uk itself */
+{
+    __m128i rkey = uk.v[0].m;
+    __m128i tmp2;
+
+    ret[0] = rkey;
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x1); /* second argument must be a compile-time constant, hence the unrolled sequence */
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[1] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x2);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[2] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x4);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[3] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x8);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[4] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x10);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[5] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x20);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[6] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x40);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[7] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x80);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[8] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x1b);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[9] = rkey;
+
+    tmp2 = _mm_aeskeygenassist_si128(rkey, 0x36);
+    rkey = AES_128_ASSIST(rkey, tmp2);
+    ret[10] = rkey;
+}
+/**
\endcond */
+
+#ifdef __cplusplus
+/** @ingroup AESNI */
+struct aesni1xm128i_key_t{ /* expanded key: every constructor and assignment below runs the full key expansion */
+    __m128i k[11];
+    aesni1xm128i_key_t(){ /* default: schedule derived from an all-zero user key */
+        aesni1xm128i_ukey_t uk;
+        uk.v[0].m = _mm_setzero_si128();
+        aesni1xm128iexpand(uk, k);
+    }
+    aesni1xm128i_key_t(const aesni1xm128i_ukey_t& uk){
+        aesni1xm128iexpand(uk, k);
+    }
+    aesni1xm128i_key_t(const aesni4x32_ukey_t& uk){
+        aesni1xm128i_ukey_t uk128;
+        uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); /* uk.v[0] goes into the lowest 32-bit lane */
+        aesni1xm128iexpand(uk128, k);
+    }
+    aesni1xm128i_key_t& operator=(const aesni1xm128i_ukey_t& uk){
+        aesni1xm128iexpand(uk, k);
+        return *this;
+    }
+    aesni1xm128i_key_t& operator=(const aesni4x32_ukey_t& uk){
+        aesni1xm128i_ukey_t uk128;
+        uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]);
+        aesni1xm128iexpand(uk128, k);
+        return *this;
+    }
+};
+#else
+typedef struct {
+    __m128i k[11];
+}aesni1xm128i_key_t;
+
+/** @ingroup AESNI */
+R123_STATIC_INLINE aesni1xm128i_key_t aesni1xm128ikeyinit(aesni1xm128i_ukey_t uk){ /* C-only counterpart of the C++ constructors above */
+    aesni1xm128i_key_t ret;
+    aesni1xm128iexpand(uk, ret.k);
+    return ret;
+}
+#endif
+
+/** @ingroup AESNI */
+R123_STATIC_INLINE aesni1xm128i_ctr_t aesni1xm128i(aesni1xm128i_ctr_t in, aesni1xm128i_key_t k) { /* encrypts one 128-bit block: xor with k[0], nine aesenc rounds, one aesenclast round */
+    __m128i x = _mm_xor_si128(k.k[0], in.v[0].m);
+    x = _mm_aesenc_si128(x, k.k[1]);
+    x = _mm_aesenc_si128(x, k.k[2]);
+    x = _mm_aesenc_si128(x, k.k[3]);
+    x = _mm_aesenc_si128(x, k.k[4]);
+    x = _mm_aesenc_si128(x, k.k[5]);
+    x = _mm_aesenc_si128(x, k.k[6]);
+    x = _mm_aesenc_si128(x, k.k[7]);
+    x = _mm_aesenc_si128(x, k.k[8]);
+    x = _mm_aesenc_si128(x, k.k[9]);
+    x = _mm_aesenclast_si128(x, k.k[10]);
+    {
+        aesni1xm128i_ctr_t ret;
+        ret.v[0].m = x;
+        return ret;
+    }
+}
+
+/** @ingroup AESNI */
+R123_STATIC_INLINE aesni1xm128i_ctr_t aesni1xm128i_R(unsigned R, aesni1xm128i_ctr_t in, aesni1xm128i_key_t k){
+    R123_ASSERT(R==10); /* R is only checked, never used: the 10-round form is the only one implemented */
+    return aesni1xm128i(in, k);
+}
+
+
+/** @ingroup AESNI */
+typedef struct r123array4x32 aesni4x32_ctr_t;
+/** @ingroup AESNI */
+typedef aesni1xm128i_key_t aesni4x32_key_t;
+/** @ingroup AESNI */
+enum r123_enum_aesni4x32 { aesni4x32_rounds = 10 };
+/** @ingroup AESNI */
+R123_STATIC_INLINE aesni4x32_key_t aesni4x32keyinit(aesni4x32_ukey_t uk){
+    aesni1xm128i_ukey_t uk128;
+    aesni4x32_key_t ret;
+    uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]);
+    aesni1xm128iexpand(uk128, ret.k);
+    return ret;
+}
+
+/** @ingroup AESNI */
+/** The aesni4x32_R function provides a C API to the @ref AESNI "AESNI" CBRNG, allowing the number of rounds to be specified explicitly (it is forwarded to aesni1xm128i_R, which asserts that it equals 10) **/
+R123_STATIC_INLINE aesni4x32_ctr_t aesni4x32_R(unsigned int Nrounds, aesni4x32_ctr_t c, aesni4x32_key_t k){
+    aesni1xm128i_ctr_t c128;
+    c128.v[0].m = _mm_set_epi32(c.v[3], c.v[2], c.v[1], c.v[0]);
+    c128 = aesni1xm128i_R(Nrounds, c128, k);
+    _mm_storeu_si128((__m128i*)&c.v[0], c128.v[0].m); /* unaligned store: c is an array of four 32-bit words */
+    return c;
+}
+
+#define aesni4x32_rounds aesni1xm128i_rounds /* from here on this macro hides the enumerator of the same name; both evaluate to 10 */
+
+/** The aesni4x32 macro provides a C API to the @ref AESNI "AESNI" CBRNG, uses the default number of rounds i.e. \c aesni4x32_rounds **/
+/** @ingroup AESNI */
+#define aesni4x32(c,k) aesni4x32_R(aesni4x32_rounds, c, k)
+
+#ifdef __cplusplus
+namespace r123{
+/**
+@defgroup AESNI ARS and AESNI Classes and Typedefs
+
+The ARS4x32, ARS1xm128i, AESNI4x32 and AESNI1xm128i classes export the member functions, typedefs and
+operator overloads required by a @ref CBRNG "CBRNG" class.
+
+ARS1xm128i and AESNI1xm128i are based on the AES block cipher and rely on the AES-NI hardware instructions
+available on some new (2011) CPUs.
+
+The ARS1xm128i CBRNG and the use of AES for random number generation are described in
+Parallel Random Numbers:  As Easy as 1, 2, 3.
+Although it uses some cryptographic primitives, ARS1xm128i uses a cryptographically weak key schedule and is \b not suitable for cryptographic use.
+
+@class AESNI1xm128i
+@ingroup AESNI
+AESNI exports the member functions, typedefs and operator overloads required by a @ref CBRNG class.
+ +AESNI1xm128i uses the crypotgraphic AES round function, including the cryptographic key schedule. + +In contrast to the other CBRNGs in the Random123 library, the AESNI1xm128i_R::key_type is opaque +and is \b not identical to the AESNI1xm128i_R::ukey_type. Creating a key_type, using either the constructor +or assignment operator, is significantly more time-consuming than running the bijection (hundreds +of clock cycles vs. tens of clock cycles). + +AESNI1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which +should occur only when the compiler is configured to generate AES-NI instructions (or +when defaults are overridden by compile-time, compiler-command-line options). + +As of September 2011, the authors know of no statistical flaws with AESNI1xm128i. It +would be an event of major cryptographic note if any such flaws were ever found. +*/ +struct AESNI1xm128i{ + typedef aesni1xm128i_ctr_t ctr_type; + typedef aesni1xm128i_ukey_t ukey_type; + typedef aesni1xm128i_key_t key_type; + static const unsigned int rounds=10; + ctr_type operator()(ctr_type ctr, key_type key) const{ + return aesni1xm128i(ctr, key); + } +}; + +/* @class AESNI4x32 */ +struct AESNI4x32{ + typedef aesni4x32_ctr_t ctr_type; + typedef aesni4x32_ukey_t ukey_type; + typedef aesni4x32_key_t key_type; + static const unsigned int rounds=10; + ctr_type operator()(ctr_type ctr, key_type key) const{ + return aesni4x32(ctr, key); + } +}; + +/** @ingroup AESNI + @class AESNI1xm128i_R + +AESNI1xm128i_R is provided for completeness, but is only instantiable with ROUNDS=10, in +which case it is identical to AESNI1xm128i */ +template +struct AESNI1xm128i_R : public AESNI1xm128i{ + R123_STATIC_ASSERT(ROUNDS==10, "AESNI1xm128i_R is only valid with R=10"); +}; + +/** @class AESNI4x32_R **/ +template +struct AESNI4x32_R : public AESNI4x32{ + R123_STATIC_ASSERT(ROUNDS==10, "AESNI4x32_R is only valid with R=10"); +}; +} // namespace r123 +#endif /* __cplusplus */ + +#endif /* 
R123_USE_AES_NI */ + +#if R123_USE_AES_OPENSSL +#include +typedef struct r123array16x8 aesopenssl16x8_ctr_t; +typedef struct r123array16x8 aesopenssl16x8_ukey_t; +#ifdef __cplusplus +struct aesopenssl16x8_key_t{ + AES_KEY k; + aesopenssl16x8_key_t(){ + aesopenssl16x8_ukey_t ukey={{}}; + AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + } + aesopenssl16x8_key_t(const aesopenssl16x8_ukey_t& ukey){ + AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + } + aesopenssl16x8_key_t& operator=(const aesopenssl16x8_ukey_t& ukey){ + AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + return *this; + } +}; +#else +typedef struct aesopenssl16x8_key_t{ + AES_KEY k; +}aesopenssl16x8_key_t; +R123_STATIC_INLINE struct aesopenssl16x8_key_t aesopenssl16x8keyinit(aesopenssl16x8_ukey_t uk){ + aesopenssl16x8_key_t ret; + AES_set_encrypt_key((const unsigned char *)&uk.v[0], 128, &ret.k); + return ret; +} +#endif + +R123_STATIC_INLINE R123_FORCE_INLINE(aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key)); +R123_STATIC_INLINE +aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key){ + aesopenssl16x8_ctr_t ret; + AES_encrypt((const unsigned char*)&ctr.v[0], (unsigned char *)&ret.v[0], &key.k); + return ret; +} + +#define aesopenssl16x8_rounds aesni4x32_rounds +#define aesopenssl16x8(c,k) aesopenssl16x8_R(aesopenssl16x8_rounds) + +#ifdef __cplusplus +namespace r123{ +struct AESOpenSSL16x8{ + typedef aesopenssl16x8_ctr_t ctr_type; + typedef aesopenssl16x8_key_t key_type; + typedef aesopenssl16x8_ukey_t ukey_type; + static const unsigned int rounds=10; + ctr_type operator()(const ctr_type& in, const key_type& k){ + ctr_type out; + AES_encrypt((const unsigned char *)&in[0], (unsigned char *)&out[0], &k.k); + return out; + } +}; +} // namespace r123 +#endif /* __cplusplus */ +#endif /* R123_USE_AES_OPENSSL */ + +#endif diff --git a/src/include/clRNG/private/Random123/ars.h 
b/src/include/clRNG/private/Random123/ars.h new file mode 100644 index 0000000..a027b6f --- /dev/null +++ b/src/include/clRNG/private/Random123/ars.h @@ -0,0 +1,204 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+#ifndef __Random123_ars_dot_hpp__
+#define __Random123_ars_dot_hpp__
+
+#include "features/compilerfeatures.h"
+#include "array.h"
+
+#if R123_USE_AES_NI
+
+#ifndef ARS1xm128i_DEFAULT_ROUNDS
+#define ARS1xm128i_DEFAULT_ROUNDS 7
+#endif
+
+/** @ingroup AESNI */
+enum r123_enum_ars1xm128i {ars1xm128i_rounds = ARS1xm128i_DEFAULT_ROUNDS};
+
+/* ARS1xm128i with Weyl keys.  Fast, and Crush-resistant, but NOT CRYPTO. */
+/** @ingroup AESNI */
+typedef struct r123array1xm128i ars1xm128i_ctr_t;
+/** @ingroup AESNI */
+typedef struct r123array1xm128i ars1xm128i_key_t;
+/** @ingroup AESNI */
+typedef struct r123array1xm128i ars1xm128i_ukey_t;
+/** @ingroup AESNI */
+R123_STATIC_INLINE ars1xm128i_key_t ars1xm128ikeyinit(ars1xm128i_ukey_t uk) { return uk; } /* key and user key are the same type: no expansion step */
+/** @ingroup AESNI */
+R123_STATIC_INLINE ars1xm128i_ctr_t ars1xm128i_R(unsigned int Nrounds, ars1xm128i_ctr_t in, ars1xm128i_key_t k){ /* initial xor with k, then Nrounds-1 aesenc rounds (for Nrounds>1), then one aesenclast; Nrounds of 0 and 1 give the same result */
+    __m128i kweyl = _mm_set_epi64x(R123_64BIT(0xBB67AE8584CAA73B), /* sqrt(3) - 1.0 */
+                                   R123_64BIT(0x9E3779B97F4A7C15)); /* golden ratio */
+    /* N.B.  the aesenc instructions do the xor *after*
+    // so if we want to follow the AES pattern, we
+    // have to do the initial xor explicitly */
+    __m128i kk = k.v[0].m;
+    __m128i v = _mm_xor_si128(in.v[0].m, kk);
+    ars1xm128i_ctr_t ret;
+    R123_ASSERT(Nrounds<=10);
+    if( Nrounds>1 ){
+        kk = _mm_add_epi64(kk, kweyl); /* each round key is the previous one plus kweyl, added per 64-bit lane */
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>2 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>3 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>4 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>5 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>6 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>7 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>8 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    if( Nrounds>9 ){
+        kk = _mm_add_epi64(kk, kweyl);
+        v = _mm_aesenc_si128(v, kk);
+    }
+    kk = _mm_add_epi64(kk, kweyl);
+    v = _mm_aesenclast_si128(v, kk); /* the final round is always applied, whatever Nrounds is */
+    ret.v[0].m = v;
+    return ret;
+}
+
+/** @def ars1xm128i
+@ingroup AESNI
+The ars1xm128i macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number of rounds i.e.
\c ars1xm128i_rounds **/ +#define ars1xm128i(c,k) ars1xm128i_R(ars1xm128i_rounds, c, k) + +/** @ingroup AESNI */ +typedef struct r123array4x32 ars4x32_ctr_t; +/** @ingroup AESNI */ +typedef struct r123array4x32 ars4x32_key_t; +/** @ingroup AESNI */ +typedef struct r123array4x32 ars4x32_ukey_t; +/** @ingroup AESNI */ +enum r123_enum_ars4x32 {ars4x32_rounds = ARS1xm128i_DEFAULT_ROUNDS}; +/** @ingroup AESNI */ +R123_STATIC_INLINE ars4x32_key_t ars4x32keyinit(ars4x32_ukey_t uk) { return uk; } +/** @ingroup AESNI */ +R123_STATIC_INLINE ars4x32_ctr_t ars4x32_R(unsigned int Nrounds, ars4x32_ctr_t c, ars4x32_key_t k){ + ars1xm128i_ctr_t c128; + ars1xm128i_key_t k128; + c128.v[0].m = _mm_set_epi32(c.v[3], c.v[2], c.v[1], c.v[0]); + k128.v[0].m = _mm_set_epi32(k.v[3], k.v[2], k.v[1], k.v[0]); + c128 = ars1xm128i_R(Nrounds, c128, k128); + _mm_storeu_si128((__m128i*)&c.v[0], c128.v[0].m); + return c; +} + +/** @def ars4x32 +@ingroup AESNI +The ars4x32 macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number of rounds i.e. \c ars4x32_rounds **/ +#define ars4x32(c,k) ars4x32_R(ars4x32_rounds, c, k) + +#ifdef __cplusplus +namespace r123{ +/** +@ingroup AESNI + +ARS1xm128i_R exports the member functions, typedefs and operator overloads required by a @ref CBRNG class. + +ARS1xm128i uses the crypotgraphic AES round function, but a @b non-cryptographc key schedule +to save time and space. + +ARS1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which +should occur only when the compiler is configured to generate AES-NI instructions (or +when defaults are overridden by compile-time, compiler-command-line options). + +The template argument, ROUNDS, is the number of times the ARS round +functions will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=5 or more. 
+ +@class ARS1xm128i_R + +*/ +template +struct ARS1xm128i_R{ + typedef ars1xm128i_ctr_t ctr_type; + typedef ars1xm128i_key_t key_type; + typedef ars1xm128i_key_t ukey_type; + static const unsigned int rounds=ROUNDS; + R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ + return ars1xm128i_R(ROUNDS, ctr, key); + } +}; + +/** @class ARS4x32_R + @ingroup AESNI +*/ + +template +struct ARS4x32_R{ + typedef ars4x32_ctr_t ctr_type; + typedef ars4x32_key_t key_type; + typedef ars4x32_key_t ukey_type; + static const unsigned int rounds=ROUNDS; + R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ + return ars4x32_R(ROUNDS, ctr, key); + } +}; +/** +@ingroup AESNI + +@class ARS1xm128i_R + ARS1xm128i is equivalent to ARS1xm128i_R<7>. With 7 rounds, + the ARS1xm128i CBRNG has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. */ +typedef ARS1xm128i_R ARS1xm128i; +typedef ARS4x32_R ARS4x32; +} // namespace r123 + +#endif /* __cplusplus */ + +#endif /* R123_USE_AES_NI */ + +#endif diff --git a/src/include/clRNG/private/Random123/conventional/Engine.hpp b/src/include/clRNG/private/Random123/conventional/Engine.hpp new file mode 100644 index 0000000..2b8ee7a --- /dev/null +++ b/src/include/clRNG/private/Random123/conventional/Engine.hpp @@ -0,0 +1,250 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. 
Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __Engine_dot_hpp_ +#define __Engine_dot_hpp_ + +#include "../features/compilerfeatures.h" +#include "../array.h" +#include +#include +#include +#include +#include +#if R123_USE_CXX11_TYPE_TRAITS +#include +#endif + +namespace r123{ +/** + If G satisfies the requirements of a CBRNG, and has a ctr_type whose + value_type is an unsigned integral type, then Engine satisfies + the requirements of a C++0x "Uniform Random Number Engine" and can + be used in any context where such an object is expected. + + Note that wrapping a counter based RNG with a traditional API in + this way obscures much of the power of counter based PRNGs. + Nevertheless, it may be of value in applications that are already + coded to work with the C++0x random number engines. + + The MicroURNG template in MicroURNG.hpp + provides the more limited functionality of a C++0x "Uniform + Random Number Generator", but leaves the application in control + of counters and keys and hence may be preferable to the Engine template. 
+ For example, a MicroURNG allows one to use C++0x "Random Number + Distributions" without giving up control over the counters + and keys. +*/ + +template +struct Engine { + typedef CBRNG cbrng_type; + typedef typename CBRNG::ctr_type ctr_type; + typedef typename CBRNG::key_type key_type; + typedef typename CBRNG::ukey_type ukey_type; + typedef typename ctr_type::value_type result_type; + typedef size_t elem_type; + +protected: + cbrng_type b; + key_type key; + ukey_type ukey; + ctr_type c; + elem_type elem; + ctr_type v; + + void fix_invariant(){ + if( elem != 0 ) { + v = b(c, key); + } + } +public: + explicit Engine() : b(), c(), elem() { + ukey_type x = {{}}; + ukey = x; + key = ukey; + } + explicit Engine(result_type r) : b(), c(), elem() { + ukey_type x = {{typename ukey_type::value_type(r)}}; + ukey = x; + key = ukey; + } + // 26.5.3 says that the SeedSeq templates shouldn't particpate in + // overload resolution unless the type qualifies as a SeedSeq. + // How that is determined is unspecified, except that "as a + // minimum a type shall not qualify as a SeedSeq if it is + // implicitly convertible to a result_type." + // + // First, we make sure that even the non-const copy constructor + // works as expected. In addition, if we've got C++0x + // type_traits, we use enable_if and is_convertible to implement + // the convertible-to-result_type restriction. Otherwise, the + // template is unconditional and will match in some surpirsing + // and undesirable situations. 
+ Engine(Engine& e) : b(e.b), ukey(e.ukey), c(e.c), elem(e.elem){ + key = ukey; + fix_invariant(); + } + Engine(const Engine& e) : b(e.b), ukey(e.ukey), c(e.c), elem(e.elem){ + key = ukey; + fix_invariant(); + } + + template + explicit Engine(SeedSeq &s +#if R123_USE_CXX11_TYPE_TRAITS + , typename std::enable_if::value>::type* =0 +#endif + ) + : b(), c(), elem() { + ukey = ukey_type::seed(s); + key = ukey; + } + void seed(result_type r){ + *this = Engine(r); + } + template + void seed(SeedSeq &s +#if R123_USE_CXX11_TYPE_TRAITS + , typename std::enable_if::value>::type* =0 +#endif + ){ + *this = Engine(s); + } + void seed(){ + *this = Engine(); + } + friend bool operator==(const Engine& lhs, const Engine& rhs){ + return lhs.c==rhs.c && lhs.elem == rhs.elem && lhs.ukey == rhs.ukey; + } + friend bool operator!=(const Engine& lhs, const Engine& rhs){ + return lhs.c!=rhs.c || lhs.elem != rhs.elem || lhs.ukey!=rhs.ukey; + } + + friend std::ostream& operator<<(std::ostream& os, const Engine& be){ + return os << be.c << " " << be.ukey << " " << be.elem; + } + + friend std::istream& operator>>(std::istream& is, Engine& be){ + is >> be.c >> be.ukey >> be.elem; + be.key = be.ukey; + be.fix_invariant(); + return is; + } + + // The shipped with MacOS Xcode 4.5.2 imposes a + // non-standard requirement that URNGs also have static data + // members: _Min and _Max. Later versions of libc++ impose the + // requirement only when constexpr isn't supported. Although the + // Xcode 4.5.2 requirement is clearly non-standard, it is unlikely + // to be fixed and it is very easy work around. We certainly + // don't want to go to great lengths to accommodate every buggy + // library we come across, but in this particular case, the effort + // is low and the benefit is high, so it's worth doing. Thanks to + // Yan Zhou for pointing this out to us. 
See similar code in + // ../MicroURNG.hpp + const static result_type _Min = 0; + const static result_type _Max = ~((result_type)0); + + static R123_CONSTEXPR result_type min R123_NO_MACRO_SUBST () { return _Min; } + static R123_CONSTEXPR result_type max R123_NO_MACRO_SUBST () { return _Max; } + + // Return the next value; when v is exhausted, generate a new block from the incremented counter. + result_type operator()(){ + if( c.size() == 1 ) // short-circuit the scalar case. Compilers aren't mind-readers. + return b(c.incr(), key)[0]; + if( elem == 0 ){ + v = b(c.incr(), key); + elem = c.size(); + } + return v[--elem]; + } + + // Skip ahead: whole blocks advance the counter, the remainder adjusts elem. + void discard(R123_ULONG_LONG skip){ + // don't forget: elem counts down + size_t nelem = c.size(); + size_t sub = skip % nelem; + skip /= nelem; + if (elem < sub) { + elem += nelem; + skip++; + } + elem -= sub; + c.incr(skip); + fix_invariant(); + } + + //-------------------------- + // Some bonus methods, not required for a Random Number + // Engine + + // Constructors and seed() method for ukey_type seem useful + // We need const and non-const to supersede the SeedSeq template. + explicit Engine(const ukey_type &uk) : key(uk), ukey(uk), c(), elem(){} + explicit Engine(ukey_type &uk) : key(uk), ukey(uk), c(), elem(){} + void seed(const ukey_type& uk){ + *this = Engine(uk); + } + void seed(ukey_type& uk){ + *this = Engine(uk); + } + + // Forward the e(counter) to the CBRNG we are templated + // on, using the current value of the key. + ctr_type operator()(const ctr_type& c) const{ + return b(c, key); + } + + // Since you can seed *this with a ukey_type, it seems reasonable + // to allow the caller to know what seed/ukey *this is using. + ukey_type getseed() const{ + return ukey; + } + + // Maybe the caller wants to know the details of + // the internal state, e.g., so it can call a different + // bijection with the same counter. + // std::make_pair is qualified because the argument types need not bring namespace std into ADL. + std::pair<ctr_type, elem_type> getcounter() const { + return std::make_pair(c, elem); + } + + // And the inverse.
+ void setcounter(const ctr_type& _c, elem_type _elem){ + // Validate the requested position (_elem), not the current member elem, + // so an out-of-range argument is rejected before any state changes. + const size_t nelem = c.size(); + if( _elem > nelem ) + throw std::range_error("Engine::setcounter called with elem out of range"); + c = _c; + elem = _elem; + fix_invariant(); + } +}; +} // namespace r123 + +#endif diff --git a/src/include/clRNG/private/Random123/conventional/gsl_cbrng.h b/src/include/clRNG/private/Random123/conventional/gsl_cbrng.h new file mode 100644 index 0000000..44457d0 --- /dev/null +++ b/src/include/clRNG/private/Random123/conventional/gsl_cbrng.h @@ -0,0 +1,128 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_compat_gslrng_dot_h__ +#define __r123_compat_gslrng_dot_h__ + +#include +#include + +/** + The macro: GSL_CBRNG(NAME, CBRNGNAME) + declares the necessary structs and constants that define a + gsl_rng_NAME type based on the counter-based RNG CBRNGNAME. For example: + + Usage: + + @code + #include + #include // this file + GSL_CBRNG(cbrng, threefry4x32); // creates gsl_rng_cbrng + + int main(int argc, char **argv){ + gsl_rng *r = gsl_rng_alloc(gsl_rng_cbrng); + ... use r as you would use any other gsl_rng ... + } + @endcode + + It requires that NAME be the name of a CBRNG that follows the + naming and stylistic conventions of the Random123 library. + + Note that wrapping a \ref CBRNG "counter-based PRNG" with a traditional API in + this way obscures much of the power of the CBRNG API. + Nevertheless, it may be of value to applications that are already + coded to work with GSL random number generators, and that wish + to use the RNGs in the Random123 library. 
+ + */ + +#define GSL_CBRNG(NAME, CBRNGNAME) \ +const gsl_rng_type *gsl_rng_##NAME; \ + \ +typedef struct{ \ + CBRNGNAME##_ctr_t ctr; \ + CBRNGNAME##_ctr_t r; \ + CBRNGNAME##_key_t key; \ + int elem; \ +} NAME##_state; \ + \ +static unsigned long int NAME##_get(void *vstate){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + const int N=sizeof(st->ctr.v)/sizeof(st->ctr.v[0]); \ + if( st->elem == 0 ){ \ + /* Buffer exhausted: advance the counter and generate N new words. */ \ + ++st->ctr.v[0]; \ + if( N>1 && st->ctr.v[0] == 0 ) ++st->ctr.v[1]; \ + /* NOTE(review): the next two tests fire whenever v[1] (resp. v[2]) is \ + zero, not only right after a carry out of the lower word - confirm \ + this is intended before treating ctr as a plain multiword counter. */ \ + if( N>2 && st->ctr.v[1] == 0 ) ++st->ctr.v[2]; \ + if( N>3 && st->ctr.v[2] == 0 ) ++st->ctr.v[3]; \ + st->r = CBRNGNAME(st->ctr, st->key); \ + st->elem = N; \ + } \ + /* elem counts down; only the low 32 bits of each word are returned. */ \ + return 0xffffffffUL & st->r.v[--st->elem]; \ +} \ + \ +static double \ +NAME##_get_double (void * vstate) \ +{ \ + /* Divide the 32-bit result by 2^32. */ \ + return NAME##_get (vstate)/4294967296.0; \ +} \ + \ +static void NAME##_set(void *vstate, unsigned long int s){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + st->elem = 0; \ + /* Assume that key and ctr have an array member, v, \ + as if they are r123arrayNxW. If not, this will fail \ + to compile. In particular, this macro fails to compile \ + when the underlying CBRNG requires use of keyinit */ \ + memset(&st->ctr.v[0], 0, sizeof(st->ctr.v)); \ + memset(&st->key.v[0], 0, sizeof(st->key.v)); \ + /* GSL 1.15 documentation says this about gsl_rng_set: \ + Note that the most generators only accept 32-bit seeds, with higher \ + values being reduced modulo 2^32. For generators with smaller \ + ranges the maximum seed value will typically be lower. \ + so we won't jump through any hoops here to deal with \ + high bits if sizeof(unsigned long) > sizeof(uint32_t).
*/ \ + st->key.v[0] = s; \ +} \ + \ +static const gsl_rng_type NAME##_type = { \ + #NAME, \ + 0xffffffffUL, \ + 0, \ + sizeof(NAME##_state), \ + &NAME##_set, \ + &NAME##_get, \ + &NAME##_get_double \ +}; \ + \ +const gsl_rng_type *gsl_rng_##NAME = &NAME##_type + +#endif + diff --git a/src/include/clRNG/private/Random123/features/clangfeatures.h b/src/include/clRNG/private/Random123/features/clangfeatures.h index 908aee8..7138eb0 100644 --- a/src/include/clRNG/private/Random123/features/clangfeatures.h +++ b/src/include/clRNG/private/Random123/features/clangfeatures.h @@ -1,5 +1,5 @@ /* -Copyright 2010-2011, D. E. Shaw Research. +Copyright 2010-2016, D. E. Shaw Research. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -44,6 +44,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define R123_USE_CXX11_STATIC_ASSERT __has_feature(cxx_static_assert) #endif +// With clang-3.6, -Wall warns about unused-local-typedefs. +// The "obvious" thing to do is to ignore -Wunused-local-typedefs, +// but that doesn't work because earlier versions of clang blow +// up on an 'unknown warning group'. So we briefly ignore -Wall... +// It's tempting to just give up on static assertions in pre-c++11 code. 
+#if !R123_USE_CXX11_STATIC_ASSERT && !defined(R123_STATIC_ASSERT) +#define R123_STATIC_ASSERT(expr, msg) \ +_Pragma("clang diagnostic push") \ +_Pragma("clang diagnostic ignored \"-Wall\"") \ +typedef char static_assertion[(!!(expr))*2-1] \ +_Pragma("clang diagnostic pop") +#endif + #ifndef R123_USE_CXX11_CONSTEXPR #define R123_USE_CXX11_CONSTEXPR __has_feature(cxx_constexpr) #endif diff --git a/src/include/clRNG/private/Random123/features/compilerfeatures.h b/src/include/clRNG/private/Random123/features/compilerfeatures.h index a2a56bf..4039790 100644 --- a/src/include/clRNG/private/Random123/features/compilerfeatures.h +++ b/src/include/clRNG/private/Random123/features/compilerfeatures.h @@ -195,12 +195,22 @@ added to each of the *features.h files, AND to examples/ut_features.cpp. /* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */ #if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0 #include "openclfeatures.h" +#elif defined(__CUDACC__) +#include "nvccfeatures.h" +#elif defined(__ICC) +#include "iccfeatures.h" +#elif defined(__xlC__) +#include "xlcfeatures.h" +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#include "sunprofeatures.h" #elif defined(__OPEN64__) #include "open64features.h" #elif defined(__clang__) #include "clangfeatures.h" #elif defined(__GNUC__) #include "gccfeatures.h" +#elif defined(__PGI) +#include "pgccfeatures.h" #elif defined(_MSC_FULL_VER) #include "msvcfeatures.h" #else diff --git a/src/include/clRNG/private/Random123/features/iccfeatures.h b/src/include/clRNG/private/Random123/features/iccfeatures.h new file mode 100644 index 0000000..b64e5c2 --- /dev/null +++ b/src/include/clRNG/private/Random123/features/iccfeatures.h @@ -0,0 +1,208 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __icpcfeatures_dot_hpp +#define __icpcfeatures_dot_hpp + +// icc relies on gcc libraries and other toolchain components. +#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." 
+{ // maybe an unbalanced brace will terminate the compilation +// You are invited to try Easy123 on other architectures, by changing +// the conditions that reach this error, but you should consider it a +// porting exercise and expect to encounter bugs and deficiencies. +// Please let the authors know of any successes (or failures). +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((always_inline)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols. + +#ifndef R123_USE_SSE4_2 +#ifdef __SSE4_2__ +#define R123_USE_SSE4_2 1 +#else +#define R123_USE_SSE4_2 0 +#endif +#endif + +#ifndef R123_USE_SSE4_1 +#ifdef __SSE4_1__ +#define R123_USE_SSE4_1 1 +#else +#define R123_USE_SSE4_1 0 +#endif +#endif + +#ifndef R123_USE_SSE +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_NI +// Unlike gcc, icc (version 12) does not pre-define an __AES__ +// pp-symbol when -maes or -xHost is on the command line.
This feels +// like a defect in icc (it defines __SSE4_2__ in analogous +// circumstances), but until Intel fixes it, we're better off erring +// on the side of caution and not generating instructions that are +// going to raise SIGILL when executed. To get the AES-NI +// instructions with icc, the caller must put something like +// -DR123_USE_AES_NI=1 or -D__AES__ on the command line. FWIW, the +// AES-NI Whitepaper by Gueron says that icc has supported AES-NI from +// 11.1 onwards. +// +#define R123_USE_AES_NI ((__ICC>=1101) && defined(__AES__)) +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 1 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 1 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 1 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define
R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/src/include/clRNG/private/Random123/features/nvccfeatures.h b/src/include/clRNG/private/Random123/features/nvccfeatures.h new file mode 100644 index 0000000..d1ff8bf --- /dev/null +++ b/src/include/clRNG/private/Random123/features/nvccfeatures.h @@ -0,0 +1,125 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __r123_nvcc_features_dot_h__ +#define __r123_nvcc_features_dot_h__ + +#if !defined(CUDART_VERSION) +#error "why are we in nvccfeatures.h if CUDART_VERSION is not defined" +#endif + +#if CUDART_VERSION < 4010 +#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h" +// This test was added in Random123-1.08 (August, 2013) because we +// discovered that Ftype(maxTvalue()) with Ftype=double and +// T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and +// earlier. We can't be sure this bug doesn't also affect invocations +// of other templated functions, e.g., essentially all of Random123. +// Thus, we no longer trust CUDA versions earlier than 4.1 even though +// we had previously tested and timed Random123 with CUDA 3.x and 4.0. +// If you feel lucky or desperate, you can change #error to #warning, but +// please take extra care to be sure that you are getting correct +// results. +#endif + +// nvcc falls through to gcc or msvc. 
So first define +// a couple of things and then include either gccfeatures.h +// or msvcfeatures.h + +//#ifdef __CUDA_ARCH__ allows Philox32 and Philox64 to be compiled +//for both device and host functions in CUDA by setting compiler flags +//for the device function +#ifdef __CUDA_ARCH__ +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE __device__ +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 1 +#endif + +#ifndef R123_THROW +// No exceptions in CUDA, at least upto 4.0 +#define R123_THROW(x) R123_ASSERT(0) +#endif + +#ifndef R123_ASSERT +#define R123_ASSERT(x) if((x)) ; else asm("trap;") +#endif + +#else // ! __CUDA_ARCH__ +// If we're using nvcc not compiling for the CUDA architecture, +// then we must be compiling for the host. In that case, +// tell the philox code to use the mulhilo64 asm because +// nvcc doesn't grok uint128_t. +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#endif // __CUDA_ARCH__ + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_ULONG_LONG +// uint64_t, which is what we'd get without this, is +// not the same as unsigned long long +#define R123_ULONG_LONG unsigned long long +#endif + +#if defined(__GNUC__) +#include "gccfeatures.h" +#elif defined(_MSC_FULL_VER) +#include "msvcfeatures.h" +#endif + +#endif diff --git a/src/include/clRNG/private/Random123/features/pgccfeatures.h b/src/include/clRNG/private/Random123/features/pgccfeatures.h new file mode 100644 index 0000000..18ace13 --- /dev/null +++ b/src/include/clRNG/private/Random123/features/pgccfeatures.h @@ -0,0 +1,194 @@ +/* +Copyright 2010-2011, D. E. 
Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. 
NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __pgccfeatures_dot_hpp +#define __pgccfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) +# error "This code has only been tested on x86 platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +/* Found this example in PGI's emmintrin.h. */ +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) (expr) +#endif + +/* PGI through 13.2 doesn't appear to support AES-NI. */ +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +/* PGI through 13.2 appears to support MMX, SSE, SSE3, SSE3, SSSE3, SSE4a, and + ABM, but not SSE4.1 or SSE4.2. */ +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +/* There's no point in trying to compile SSE code in Random123 + unless SSE2 is available. 
*/ +#ifdef __SSE2__ +#define R123_USE_SSE 1 +#else +#define R123_USE_SSE 0 +#endif +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available. Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +/* emmintrin.h from PGI #includes xmmintrin.h but then complains at link time + about undefined references to _mm_castsi128_ps(__m128i). Why? */ +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 1 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 1 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 1 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error 
UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/src/include/clRNG/private/Random123/features/sunprofeatures.h b/src/include/clRNG/private/Random123/features/sunprofeatures.h new file mode 100644 index 0000000..c9cdc00 --- /dev/null +++ b/src/include/clRNG/private/Random123/features/sunprofeatures.h @@ -0,0 +1,172 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef __sunprofeatures_dot_hpp +#define __sunprofeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) expr +#endif + +// The basic idiom is: +// #ifndef R123_SOMETHING +// #if some condition +// #define R123_SOMETHING 1 +// #else +// #define R123_SOMETHING 0 +// #endif +// #endif +// This idiom allows an external user to override any decision +// in this file with a command-line -DR123_SOMETHING=1 or -DR123_SOMETHING=0 + +// An alternative idiom is: +// #ifndef R123_SOMETHING +// #define R123_SOMETHING (some boolean expression) +// #endif +// where the boolean expression might contain previously-defined R123_SOMETHING_ELSE +// pp-symbols.
+ +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 0 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO16_ASM +#define R123_USE_MULHILO16_ASM 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_PHILOX_64BIT +#define R123_USE_PHILOX_64BIT 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. 
You must define __STDC_CONSTANT_MACROS before you #include +#endif + +// If you add something, it must go in all the other XXfeatures.hpp +// and in ../ut_features.cpp +#endif diff --git a/src/include/clRNG/private/Random123/features/xlcfeatures.h b/src/include/clRNG/private/Random123/features/xlcfeatures.h new file mode 100644 index 0000000..a5c8412 --- /dev/null +++ b/src/include/clRNG/private/Random123/features/xlcfeatures.h @@ -0,0 +1,215 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +Copyright (c) 2013, Los Alamos National Security, LLC +All rights reserved. + +Copyright 2013. Los Alamos National Security, LLC. This software was produced +under U.S. Government contract DE-AC52-06NA25396 for Los Alamos National +Laboratory (LANL), which is operated by Los Alamos National Security, LLC for +the U.S. Department of Energy. The U.S. Government has rights to use, +reproduce, and distribute this software. NEITHER THE GOVERNMENT NOR LOS +ALAMOS NATIONAL SECURITY, LLC MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR +ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. If software is modified +to produce derivative works, such modified software should be clearly marked, +so as not to confuse it with the version available from LANL. +*/ +#ifndef __xlcfeatures_dot_hpp +#define __xlcfeatures_dot_hpp + +#if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) +# error "This code has only been tested on x86 and PowerPC platforms." +#include +{ /* maybe an unbalanced brace will terminate the compilation */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ +#endif + +#ifdef __cplusplus +/* builtins are automatically available to xlc. To use them with xlc++, + one must include builtins.h.
c.f + http://publib.boulder.ibm.com/infocenter/cellcomp/v101v121/index.jsp?topic=/com.ibm.xlcpp101.cell.doc/compiler_ref/compiler_builtins.html +*/ +#include +#endif + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl __attribute__((__always_inline__)) +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#endif + +#ifndef R123_USE_AES_NI +#define R123_USE_AES_NI 0 +#endif + +#ifndef R123_USE_SSE4_2 +#define R123_USE_SSE4_2 0 +#endif + +#ifndef R123_USE_SSE4_1 +#define R123_USE_SSE4_1 0 +#endif + +#ifndef R123_USE_SSE +#define R123_USE_SSE 0 +#endif + +#ifndef R123_USE_AES_OPENSSL +/* There isn't really a good way to tell at compile time whether + openssl is available. Without a pre-compilation configure-like + tool, it's less error-prone to guess that it isn't available.
Add + -DR123_USE_AES_OPENSSL=1 and any necessary LDFLAGS or LDLIBS to + play with openssl */ +#define R123_USE_AES_OPENSSL 0 +#endif + +#ifndef R123_USE_GNU_UINT128 +#define R123_USE_GNU_UINT128 0 +#endif + +#ifndef R123_USE_ASM_GNU +#define R123_USE_ASM_GNU 1 +#endif + +#ifndef R123_USE_CPUID_MSVC +#define R123_USE_CPUID_MSVC 0 +#endif + +#ifndef R123_USE_X86INTRIN_H +#define R123_USE_X86INTRIN_H 0 +#endif + +#ifndef R123_USE_IA32INTRIN_H +#define R123_USE_IA32INTRIN_H 0 +#endif + +#ifndef R123_USE_XMMINTRIN_H +#define R123_USE_XMMINTRIN_H 0 +#endif + +#ifndef R123_USE_EMMINTRIN_H +#define R123_USE_EMMINTRIN_H 0 +#endif + +#ifndef R123_USE_SMMINTRIN_H +#define R123_USE_SMMINTRIN_H 0 +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#ifdef __ABM__ +#define R123_USE_INTRIN_H 1 +#else +#define R123_USE_INTRIN_H 0 +#endif +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +/* Test defined() directly in #if: a `defined` token produced by macro + expansion inside a later #if has undefined behavior (ISO C 6.10.1), so + this macro must expand to a plain 0 or 1. */ +#if defined(__powerpc64__) +#define R123_USE_MULHILO64_MULHI_INTRIN 1 +#else +#define R123_USE_MULHILO64_MULHI_INTRIN 0 +#endif +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef R123_USE_MULHILO64_ASM +/* Enabled only when __powerpc64__ is defined and R123_USE_MULHILO64_MULHI_INTRIN + is 0; evaluated here so the macro is a plain 0 or 1 (no `defined` in its expansion). */ +#if defined(__powerpc64__) && !(R123_USE_MULHILO64_MULHI_INTRIN) +#define R123_USE_MULHILO64_ASM 1 +#else +#define R123_USE_MULHILO64_ASM 0 +#endif +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined.
You must define __STDC_CONSTANT_MACROS before you #include +#endif + +/* If you add something, it must go in all the other XXfeatures.hpp + and in ../ut_features.cpp */ +#endif diff --git a/src/include/clRNG/private/Random123/gsl_microrng.h b/src/include/clRNG/private/Random123/gsl_microrng.h new file mode 100644 index 0000000..4f09412 --- /dev/null +++ b/src/include/clRNG/private/Random123/gsl_microrng.h @@ -0,0 +1,136 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#ifndef __r123_gslmicrorng_dot_h__ +#define __r123_gslmicrorng_dot_h__ + + +#include +#include + +/** The macro: GSL_MICRORNG(NAME, CBRNGNAME) is the GSL + analog of the C++ r123::MicroURNG template. It declares a gsl_rng + type named gsl_rng_NAME which uses the underlying CBRNGNAME + and can be invoked a limited number of times between calls to NAME_reset. + + When the underlying CBRNG's \c ctr_t is an \ref arrayNxW "r123arrayNxW", + the gsl_rng_NAME may be called up to \c N*2^32 times + between calls to \c NAME_reset. + + \c NAME_reset takes a gsl_rng_NAME type, a counter and a key as arguments. + It restarts the micro-rng with a new base counter and key. + + Note that you must call NAME_reset before the first use + of a gsl_rng. NAME_reset is not called automatically by + gsl_rng_alloc(). + + @code + #include + #include // this file + GSL_MICRORNG(microcbrng, threefry4x64); // creates gsl_rng_microcbrng (the macro takes exactly two arguments) + + int main(int argc, char** argv) { + gsl_rng *r = gsl_rng_alloc(gsl_rng_microcbrng); + threefry4x64_ctr_t c = {{}}; + threefry4x64_key_t k = {{}}; + + for (...) { + c.v[0] = ??; // some application variable + microcbrng_reset(r, c, k); + for (...) { + // gaussian calls r several times.
It is safe for + // r to be used up to N*2^32 times in this loop + something[i] = gsl_ran_gaussian(r, 1.5); + } + } + } + @endcode + +*/ + +#define GSL_MICRORNG(NAME, CBRNGNAME) \ +const gsl_rng_type *gsl_rng_##NAME; \ + \ +typedef struct{ \ + CBRNGNAME##_ctr_t ctr; /* base counter, set by NAME##_reset */ \ + CBRNGNAME##_ctr_t r; /* most recent CBRNG output block */ \ + CBRNGNAME##_key_t key; /* key, set by NAME##_reset */ \ + R123_ULONG_LONG n; /* blocks generated since the last reset */ \ + int elem; /* words of r not yet handed out; 0 forces a new block */ \ +} NAME##_state; \ + \ +static unsigned long int NAME##_get(void *vstate){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + const int N=sizeof(st->ctr.v)/sizeof(st->ctr.v[0]); \ + if( st->elem == 0 ){ \ + CBRNGNAME##_ctr_t c = st->ctr; \ + c.v[N-1] |= st->n<<(R123_W(CBRNGNAME##_ctr_t)-32); /* OR the block index into the last counter word; presumably R123_W is the word width in bits, so n lands in the top 32 bits */ \ + st->n++; \ + st->r = CBRNGNAME(c, st->key); \ + st->elem = N; \ + } \ + return 0xffffffff & st->r.v[--st->elem]; /* words are returned last-to-first, low 32 bits only */ \ +} \ + \ +static double \ +NAME##_get_double (void * vstate) \ +{ \ + return NAME##_get (vstate)/4294967296.; /* divide by 2^32 */ \ +} \ + \ +static void NAME##_set(void *vstate, unsigned long int s){ \ + NAME##_state *st = (NAME##_state *)vstate; \ + (void)s; /* ignored */ \ + st->elem = 0; \ + st->n = ~0; /* sentinel until NAME##_reset is called. NOTE(review): no abort or check on n is visible in NAME##_get, so a missing _reset is not detected here */ \ +} \ + \ +static const gsl_rng_type NAME##_type = { \ + #NAME, \ + 0xffffffffUL, \ + 0, \ + sizeof(NAME##_state), \ + &NAME##_set, \ + &NAME##_get, \ + &NAME##_get_double \ +}; \ + \ +R123_STATIC_INLINE void NAME##_reset(const gsl_rng* gr, CBRNGNAME##_ctr_t c, CBRNGNAME##_key_t k) { \ + NAME##_state* state = (NAME##_state *)gr->state; \ + state->ctr = c; \ + state->key = k; \ + state->n = 0; \ + state->elem = 0; /* discard any words left over from the previous counter/key */ \ +} \ + \ +const gsl_rng_type *gsl_rng_##NAME = &NAME##_type + +#endif diff --git a/src/include/clRNG/private/Random123/threefry.h b/src/include/clRNG/private/Random123/threefry.h new file mode 100644 index 0000000..da2de97 --- /dev/null +++ b/src/include/clRNG/private/Random123/threefry.h @@ -0,0 +1,864 @@ +/* +Copyright 2010-2011, D. E. Shaw Research. +All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions, and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +* Neither the name of D. E. Shaw Research nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#ifndef _threefry_dot_h_ +#define _threefry_dot_h_ +#include "features/compilerfeatures.h" +#include "array.h" + +/** \cond HIDDEN_FROM_DOXYGEN */ +/* Significant parts of this file were copied + from: + Skein_FinalRnd/ReferenceImplementation/skein.h + Skein_FinalRnd/ReferenceImplementation/skein_block.c + + in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip + + This file has been modified so that it may no longer perform its originally + intended function.
If you're looking for a Skein or Threefish source code, + please consult the original file. + + The original file had the following header: +************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** + +*/ + +/* See comment at the top of philox.h for the macro pre-process + strategy. */ + +/* Rotation constants: */ +enum r123_enum_threefry64x4 { + /* These are the R_256 constants from the Threefish reference sources + with names changed to R_64x4... */ + R_64x4_0_0=14, R_64x4_0_1=16, + R_64x4_1_0=52, R_64x4_1_1=57, + R_64x4_2_0=23, R_64x4_2_1=40, + R_64x4_3_0= 5, R_64x4_3_1=37, + R_64x4_4_0=25, R_64x4_4_1=33, + R_64x4_5_0=46, R_64x4_5_1=12, + R_64x4_6_0=58, R_64x4_6_1=22, + R_64x4_7_0=32, R_64x4_7_1=32 +}; + +enum r123_enum_threefry64x2 { + /* + // Output from skein_rot_search: (srs64_B64-X1000) + // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 + // Start: Tue Mar 1 10:07:48 2011 + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + */ + R_64x2_0_0=16, + R_64x2_1_0=42, + R_64x2_2_0=12, + R_64x2_3_0=31, + R_64x2_4_0=16, + R_64x2_5_0=32, + R_64x2_6_0=24, + R_64x2_7_0=21 + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 8 [ 8 8 8 8 ] + // 6 rounds: minHW = 16 [ 16 16 16 16 ] + // 7 rounds: minHW = 32 [ 32 32 32 32 ] + // 8 rounds: minHW = 64 [ 64 64 64 64 ] + // 9 rounds: minHW = 64 [ 64 64 64 64 ] + //10 rounds: minHW = 64 [ 64 64 64 64 ] + //11 rounds: minHW = 64 [ 64 64 64 64 ] */ +}; + +enum r123_enum_threefry32x4 { + /* Output from skein_rot_search: (srs-B128-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024.
rounds = 8, minHW_or=28 + // Start: Mon Aug 24 22:41:36 2009 + // ... + // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ + R_32x4_0_0=10, R_32x4_0_1=26, + R_32x4_1_0=11, R_32x4_1_1=21, + R_32x4_2_0=13, R_32x4_2_1=27, + R_32x4_3_0=23, R_32x4_3_1= 5, + R_32x4_4_0= 6, R_32x4_4_1=20, + R_32x4_5_0=17, R_32x4_5_1=11, + R_32x4_6_0=25, R_32x4_6_1=10, + R_32x4_7_0=18, R_32x4_7_1=20 + + /* 4 rounds: minHW = 3 [ 3 3 3 3 ] + // 5 rounds: minHW = 7 [ 7 7 7 7 ] + // 6 rounds: minHW = 12 [ 13 12 13 12 ] + // 7 rounds: minHW = 22 [ 22 23 22 23 ] + // 8 rounds: minHW = 31 [ 31 31 31 31 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + +}; + +enum r123_enum_threefry32x2 { + /* Output from skein_rot_search (srs32x2-X5000.out) + // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 + // Start: Tue Jul 12 11:11:33 2011 + // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384.
blkSize= 64].format */ + R_32x2_0_0=13, + R_32x2_1_0=15, + R_32x2_2_0=26, + R_32x2_3_0= 6, + R_32x2_4_0=17, + R_32x2_5_0=29, + R_32x2_6_0=16, + R_32x2_7_0=24 + + /* 4 rounds: minHW = 4 [ 4 4 4 4 ] + // 5 rounds: minHW = 6 [ 6 8 6 8 ] + // 6 rounds: minHW = 9 [ 9 12 9 12 ] + // 7 rounds: minHW = 16 [ 16 24 16 24 ] + // 8 rounds: minHW = 32 [ 32 32 32 32 ] + // 9 rounds: minHW = 32 [ 32 32 32 32 ] + //10 rounds: minHW = 32 [ 32 32 32 32 ] + //11 rounds: minHW = 32 [ 32 32 32 32 ] */ + }; + +enum r123_enum_threefry_wcnt { /* word counts of the 2-word and 4-word variants; in the visible code these names appear only in comments */ + WCNT2=2, + WCNT4=4 +}; +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); /* declaration wrapped in R123_FORCE_INLINE; the definition follows */ +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) /* rotate x left by N bits; both shift counts are masked to 0..63, so N=0 never shifts by 64 */ +{ + return (x << (N & 63)) | (x >> ((64-N) & 63)); +} + +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); /* declaration wrapped in R123_FORCE_INLINE; the definition follows */ +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) /* rotate x left by N bits; both shift counts are masked to 0..31, so N=0 never shifts by 32 */ +{ + return (x << (N & 31)) | (x >> ((32-N) & 31)); +} + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) /* combine two 32-bit halves into one 64-bit value */ +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA + +#ifndef THREEFRY2x32_DEFAULT_ROUNDS +#define THREEFRY2x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY2x64_DEFAULT_ROUNDS +#define THREEFRY2x64_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x32_DEFAULT_ROUNDS +#define THREEFRY4x32_DEFAULT_ROUNDS 20 +#endif + +#ifndef THREEFRY4x64_DEFAULT_ROUNDS +#define THREEFRY4x64_DEFAULT_ROUNDS 20 +#endif + +#define _threefry2x_tpl(W) /* for word width W: declares the threefry2x##W ctr/key/ukey types, keyinit, the explicit-round-count _R function and the default-round entry point */ \ +typedef struct r123array2x##W threefry2x##W##_ctr_t; \ +typedef struct r123array2x##W threefry2x##W##_key_t; \ +typedef struct r123array2x##W threefry2x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds,
threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + threefry2x##W##_ctr_t X; \ + uint##W##_t ks[2+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=32); /* only 32 rounds are unrolled below */ \ + ks[2] = SKEIN_KS_PARITY##W; /* after the loop below, ks[2] is the parity constant XOR k.v[0] XOR k.v[1] */ \ + for (i=0;i < 2; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[2] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0; a further key injection follows every 4 rounds */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + \ + if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 1; /* X.v[2-1] += r */ \ + } \ + if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 2; \ + } \ + if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>11){
\ + /* InjectKey(r=3) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 3; \ + } \ + if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>15){ \ + /* InjectKey(r=4) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 4; \ + } \ + if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>19){ \ + /* InjectKey(r=5) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 5; \ + } \ + if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>23){ \ + /* InjectKey(r=6) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; \ + X.v[1] += 6; \ + } \ + if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ +
if(Nrounds>27){ \ + /* InjectKey(r=7) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; \ + X.v[1] += 7; \ + } \ + if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ + if(Nrounds>31){ \ + /* InjectKey(r=8) */ \ + X.v[0] += ks[2]; X.v[1] += ks[0]; \ + X.v[1] += 8; \ + } \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); /* run with the default round count, THREEFRY2x##W##_DEFAULT_ROUNDS */ \ +} + + +#define _threefry4x_tpl(W) \ +typedef struct r123array4x##W threefry4x##W##_ctr_t; \ +typedef struct r123array4x##W threefry4x##W##_key_t; \ +typedef struct r123array4x##W threefry4x##W##_ukey_t; \ +R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + threefry4x##W##_ctr_t X; \ + uint##W##_t ks[4+1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds<=72); \ + ks[4] = SKEIN_KS_PARITY##W; \ + for (i=0;i < 4; i++) \ + { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ +
ks[4] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + \ + if(Nrounds>0){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>1){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>2){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>3){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>4){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>5){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>6){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>7){ \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>8){ 
\ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>9){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>10){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>11){ \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>12){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>13){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>14){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>15){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>16){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= 
X.v[2]; \ + } \ + if(Nrounds>17){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>18){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>19){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>20){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>21){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>22){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>23){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>24){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>25){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = 
RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>26){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>27){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>28){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>29){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>30){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>31){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>32){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>33){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>34){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= 
X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>35){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>36){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>37){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>38){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>39){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>40){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>41){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>42){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + X.v[0] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>43){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>44){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>45){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>46){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>47){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>48){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>49){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>50){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>51){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + 
if(Nrounds>51){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>52){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>53){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>54){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>55){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ + X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>56){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>57){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>58){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>59){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ + X.v[4-1] += 15; /* X.v[WCNT4-1] += r 
*/ \ + } \ + \ + if(Nrounds>60){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>61){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>62){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>63){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ + X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>64){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>65){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>66){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>67){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ + X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if(Nrounds>68){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = 
RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>69){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>70){ \ + X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ + } \ + if(Nrounds>71){ \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ + X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \ + } \ + \ + return X; \ +} \ + /** @ingroup ThreefryNxW */ \ +enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ +R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ +R123_CUDA_DEVICE R123_STATIC_INLINE \ +threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ +} +/** \endcond */ + +_threefry2x_tpl(64) +_threefry2x_tpl(32) +_threefry4x_tpl(64) +_threefry4x_tpl(32) + +/* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better + than a static inline function. Why? 
*/ +#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) + +#ifdef __cplusplus +/** \cond HIDDEN_FROM_DOXYGEN */ +#define _threefryNxWclass_tpl(NxW) \ +namespace r123{ \ +template \ + struct Threefry##NxW##_R{ \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const unsigned int rounds=R; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ + R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(R, ctr, key); \ + } \ +}; \ + typedef Threefry##NxW##_R Threefry##NxW; \ +} // namespace r123 + +/** \endcond */ + +_threefryNxWclass_tpl(2x32) +_threefryNxWclass_tpl(4x32) +_threefryNxWclass_tpl(2x64) +_threefryNxWclass_tpl(4x64) + +/* The _tpl macros don't quite work to do string-pasting inside comments. + so we just write out the boilerplate documentation four times... */ + +/** +@defgroup ThreefryNxW Threefry Classes and Typedefs + +The ThreefryNxW classes export the member functions, typedefs and +operator overloads required by a @ref CBRNG "CBRNG" class. + +As described in +Parallel Random Numbers: As Easy as 1, 2, 3 , +the Threefry family is closely related to the Threefish block cipher from + Skein Hash Function. +Threefry is \b not suitable for cryptographic use. + +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. + +@class r123::Threefry2x32_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. 
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=13 or more for Threefry2x32.
+
+@typedef r123::Threefry2x32
+@ingroup ThreefryNxW
+  Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds,
+  Threefry2x32 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+  performance.
+
+@class r123::Threefry2x64_R
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+In November 2011, the authors discovered that 13 rounds of
+Threefry2x64 sequenced by strided, interleaved key and counter
+increments failed a very long (longer than the default BigCrush
+length) WeightDistrib test. At the same time, it was confirmed that
+14 rounds pass much longer tests (up to 5x10^12 samples) of a
+similar nature. The authors know of no statistical flaws with
+ROUNDS=14 or more for Threefry2x64.
+
+@typedef r123::Threefry2x64
+@ingroup ThreefryNxW
+  Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds,
+  Threefry2x64 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+  performance.
+
+
+
+@class r123::Threefry4x32_R
+@ingroup ThreefryNxW
+
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class.
+
+The template argument, ROUNDS, is the number of times the Threefry round
+function will be applied.
+
+As of September 2011, the authors know of no statistical flaws with
+ROUNDS=12 or more for Threefry4x32.
+
+@typedef r123::Threefry4x32
+@ingroup ThreefryNxW
+  Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds,
+  Threefry4x32 has a considerable safety margin over the minimum number
+  of rounds with no known statistical flaws, but still has excellent
+  performance.
+ + + +@class r123::Threefry4x64_R +@ingroup ThreefryNxW + +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. + +The template argument, ROUNDS, is the number of times the Threefry round +function will be applied. + +As of September 2011, the authors know of no statistical flaws with +ROUNDS=12 or more for Threefry4x64. + +@typedef r123::Threefry4x64 +@ingroup ThreefryNxW + Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, + Threefry4x64 has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. +*/ + +#endif + +#endif