00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 #ifndef _Random123_sse_dot_h__
00033 #define _Random123_sse_dot_h__
00034 
00035 #if R123_USE_SSE
00036 
00037 #if R123_USE_X86INTRIN_H
00038 #include <x86intrin.h>
00039 #endif
00040 #if R123_USE_IA32INTRIN_H
00041 #include <ia32intrin.h>
00042 #endif
00043 #if R123_USE_EMMINTRIN_H
00044 #include <emmintrin.h>
00045 #endif
00046 #if R123_USE_SMMINTRIN_H
00047 #include <smmintrin.h>
00048 #endif
00049 #if R123_USE_WMMINTRIN_H
00050 #include <wmmintrin.h>
00051 #endif
00052 #if R123_USE_INTRIN_H
00053 #include <intrin.h>
00054 #endif
00055 #ifdef __cplusplus
00056 #include <iostream>
00057 #include <limits>
00058 #include <stdexcept>
00059 #endif
00060 
00061 #if R123_USE_ASM_GNU
00062 
00063 
00064 R123_STATIC_INLINE int haveAESNI(){
00065     unsigned int eax, ebx, ecx, edx;
00066     __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) :
00067                       "a" (1));
00068     return (ecx>>25) & 1;
00069 }
00070 #elif R123_USE_CPUID_MSVC
00071 R123_STATIC_INLINE int haveAESNI(){
00072     int CPUInfo[4];
00073     __cpuid(CPUInfo, 1);
00074     return (CPUInfo[2]>>25)&1;
00075 }
00076 #else 
00077 #warning "No R123_USE_CPUID_XXX method chosen.  haveAESNI will always return false"
00078 R123_STATIC_INLINE int haveAESNI(){
00079     return 0;
00080 }
00081 #endif 
00082 
00083 
00084 
00085 
00086 
00087 
00088 
00089 
00090 #if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64))
00091 
00092 
00093 
00094 
00095 R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){
00096     union{
00097         uint64_t u64;
00098         uint32_t u32[2];
00099     } u1, u0;
00100     u1.u64 = v1;
00101     u0.u64 = v0;
00102     return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
00103 }
00104 #endif
00105 
00106 
00107 
00108 
00109 
00110 
00111 
00112 
00113 
00114 
00115 #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
00116 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00117     union{
00118         uint64_t u64[2];
00119         __m128i m;
00120     }u;
00121     _mm_store_si128(&u.m, si);
00122     return u.u64[0];
00123 }
00124 #elif defined(__llvm__) || defined(__ICC)
00125 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00126     return (uint64_t)_mm_cvtsi128_si64(si);
00127 }
00128 #else 
00129 
00130 
00131 
00132 R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){
00133     return (uint64_t)_mm_cvtsi128_si64x(si);
00134 }
00135 #endif
00136 #if defined(__GNUC__) && __GNUC__ < 4
00137 
00138 R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){
00139     return (__m128)si;
00140 }
00141 #endif
00142 
00143 #ifdef __cplusplus
00144 
00145 struct r123m128i{
00146     __m128i m;
00147 #if R123_USE_CXX0X
00148     
00149     
00150     
00151     
00152     
00153     
00154     
00155     
00156     
00157     r123m128i() = default;
00158     r123m128i(__m128i _m): m(_m){}
00159 #endif
00160     r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;}
00161     r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;}
00162 #if R123_USE_CXX0X
00163     
00164     
00165     
00166     explicit operator bool() const {return _bool();}
00167 #else
00168     
00169     
00170     operator const void*() const{return _bool()?this:0;}
00171 #endif
00172     operator __m128i() const {return m;}
00173 
00174 private:
00175 #if R123_USE_SSE4_1
00176     bool _bool() const{ return !_mm_testz_si128(m,m); }
00177 #else
00178     bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
00179 #endif
00180 };
00181 
00182 R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){
00183     __m128i& c = v.m;
00184     __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
00185     c = _mm_add_epi64(c, zeroone);
00186     
00187 #if R123_USE_SSE4_1
00188     __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
00189     if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){
00190         __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
00191         c = _mm_add_epi64(c, onezero);
00192     }
00193 #else
00194     unsigned mask  = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
00195     
00196     
00197     if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){
00198         __m128i onezero = _mm_set_epi64x(1,0);
00199         c = _mm_add_epi64(c, onezero);
00200     }
00201 #endif
00202     return v;
00203 }
00204 
00205 R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ 
00206     __m128i c = lhs.m;
00207     __m128i incr128 = _mm_set_epi64x(0, n);
00208     c = _mm_add_epi64(c, incr128);
00209     
00210 
00211     int64_t lo64 = _mm_extract_lo64(c);
00212     if((uint64_t)lo64 < n)
00213         c = _mm_add_epi64(c, _mm_set_epi64x(1,0));
00214     lhs.m = c;
00215     return lhs; 
00216 }
00217 
00218 
00219 R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00220     throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");}
00221 
00222 
00223 
00224 
00225 R123_STATIC_INLINE bool operator<(const r123m128i& lhs, const r123m128i& rhs){
00226     throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");}
00227 R123_STATIC_INLINE bool operator<=(const r123m128i& lhs, const r123m128i& rhs){
00228     throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");}
00229 R123_STATIC_INLINE bool operator>(const r123m128i& lhs, const r123m128i& rhs){
00230     throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");}
00231 R123_STATIC_INLINE bool operator>=(const r123m128i& lhs, const r123m128i& rhs){
00232     throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");}
00233 
00234 R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ 
00235     return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
00236 R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ 
00237     return !(lhs==rhs);}
00238 R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){
00239     r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; }
00240 R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){
00241     return !(lhs==rhs);}
00242 R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){
00243     union{
00244         uint64_t u64[2];
00245         __m128i m;
00246     }u;
00247     _mm_storeu_si128(&u.m, m.m);
00248     return os << u.u64[0] << " " << u.u64[1];
00249 }
00250 
00251 R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){
00252     uint64_t u64[2];
00253     is >> u64[0] >> u64[1];
00254     m.m = _mm_set_epi64x(u64[1], u64[0]);
00255     return is;
00256 }
00257 
00258 template<typename T> inline T assemble_from_u32(uint32_t *p32); 
00259 
00260 template <>
00261 inline r123m128i assemble_from_u32<r123m128i>(uint32_t *p32){
00262     r123m128i ret;
00263     ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
00264     return ret;
00265 }
00266 
00267 #else
00268 
/* Plain-C form of r123m128i: a struct holding a single 128-bit SSE
   integer value in member m. */
typedef struct { __m128i m; } r123m128i;
00272 
00273 #endif 
00274 
00275 #else 
00276 R123_STATIC_INLINE int haveAESNI(){
00277     return 0;
00278 }
00279 #endif 
00280 
00281 #endif