00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 #ifndef _philox_dot_h_
00033 #define _philox_dot_h_
00034 
00037 #include "features/compilerfeatures.h"
00038 #include "array.h"
00039 
00040 
00041 
00042 
00043 
00044 
00045 
00046 
00047 
00048 
00049 
00050 
00051 
00052 
00053 
00054 
00055 
00056 
00057 
00058 
00059 
00060 /* _mulhilo_dword_tpl(W, Word, Dword) generates mulhilo<W>(a, b, hip):       */
00061 /* a widening multiply carried out in the wider integer type Dword.          */
00062 /*   W     - pasted into the function name and used as the right-shift       */
00063 /*           count that extracts the high part of the product.               */
00064 /*   Word  - type of a, b, *hip and of the returned low part.                */
00065 /*   Dword - type of the intermediate product. Presumably at least 2*W       */
00066 /*           bits wide; the macro itself does not verify this.               */
00067 #define _mulhilo_dword_tpl(W, Word, Dword)                              \
00068 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
00069     Dword product = ((Dword)a)*((Dword)b);                              \
00070     *hip = product>>W;                                                  \
00071     return (Word)product;                                               \
00072 }
00073 
00074 
00075 
00076 /* _mulhilo_asm_tpl(W, Word, INSN) generates mulhilo<W>(ax, b, hip) around a  */
00077 /* single GNU-style inline-asm instruction; INSN is a string literal naming   */
00078 /* it and b is its only explicit operand (%2). ax is tied to output 0, which  */
00079 /* uses the "=a" constraint; the "=d" output is stored to *hip.               */
00080 #define _mulhilo_asm_tpl(W, Word, INSN)                         \
00081 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){      \
00082     Word dx;                                                    \
00083     __asm__("\n\t"                                              \
00084         INSN " %2\n\t"                                          \
00085         : "=a"(ax), "=d"(dx)                                    \
00086         : "r"(b), "0"(ax)                                       \
00087         );                                                      \
00088     *hip = dx;                                                  \
00089     return ax;                                                  \
00090 }
00091 
00092 
00093 
00094 /* _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) generates mulhilo<W>(a, b, hip)  */
00095 /* as a thin wrapper that returns INTRIN(a, b, hip) unchanged. Presumably     */
00096 /* INTRIN writes the high part through hip and returns the low part - confirm. */
00097 #define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN)               \
00098 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){       \
00099     return INTRIN(a, b, hip);                                   \
00100 }
00101 /* Generates mulhilo<W>(a, b, hip): *hip receives INTRIN(a, b); the returned low part is the plain Word product a*b. */
00102 #define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN)                       \
00103 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \
00104     *hip = INTRIN(a, b);                                                \
00105     return a*b;                                                         \
00106 }
00107 
00108 
00109 
00110 
00111 
00112 
00113 
00114 /* _mulhilo_c99_tpl(W, Word) generates mulhilo<W>(a, b, hip) using only       */
00115 /* Word-sized arithmetic. Each operand is split into two W/2-bit halves       */
00116 /* (ahi/alo, bhi/blo). The high part is assembled from ahi*bhi, the upper     */
00117 /* halves of the cross products ahi*blo and alo*bhi, and two carries: one     */
00118 /* from adding the cross products' lower halves, one from adding alo*blo.     */
00119 /* The low part is simply a*b in Word. Assumes Word is an unsigned type       */
00120 /* whose arithmetic wraps at W bits - TODO confirm for each instantiation.    */
00121 #define _mulhilo_c99_tpl(W, Word) \
00122 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \
00123     const unsigned WHALF = W/2;                                    \
00124     const Word LOMASK = ((((Word)1)<<WHALF)-1);                    \
00125     Word lo = a*b; /* low part of the full product */              \
00126     Word ahi = a>>WHALF;                                           \
00127     Word alo = a& LOMASK;                                          \
00128     Word bhi = b>>WHALF;                                           \
00129     Word blo = b& LOMASK;                                          \
00130                                                                    \
00131     Word ahbl = ahi*blo;                                           \
00132     Word albh = alo*bhi;                                           \
00133                                                                    \
00134     Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK));                   \
00135     Word hi = ahi*bhi + (ahbl>>WHALF) +  (albh>>WHALF);                 \
00136     hi += ahbl_albh >> WHALF; /* carry out of lo(ahbl) + lo(albh) */ \
00137     /* carry from adding alo*blo: upper half of lo wrapped below the partial sum */ \
00138     hi += ((lo >> WHALF) < (ahbl_albh&LOMASK));                         \
00139     *hip = hi;                                                          \
00140     return lo;                                                          \
00141 }
00142 
00143 
00144 
00145 /* _mulhilo_fail_tpl(W, Word) generates a mulhilo<W> whose body is only        */
00146 /* R123_STATIC_ASSERT(0, ...); there is no return statement. Presumably this   */
00147 /* aborts compilation when no real implementation is selected - confirm.       */
00148 #define _mulhilo_fail_tpl(W, Word)                                      \
00149 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){               \
00150     R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \
00151 }
00152 
00153 
00154 
00155 /* Backend selection. mulhilo32: inline asm ("mull") if R123_USE_MULHILO32_ASM, else the uint64_t widening template. */
00156 /* mulhilo64 exists only under R123_USE_PHILOX_64BIT; the first true flag wins, in the order asm, MSVC intrinsic,   */
00157 /* CUDA intrinsic, OpenCL intrinsic (reuses the CUDA template), __uint128_t, C99 fallback, else the failing stub.   */
00158 #if R123_USE_MULHILO32_ASM
00159 _mulhilo_asm_tpl(32, uint32_t, "mull")
00160 #else
00161 _mulhilo_dword_tpl(32, uint32_t, uint64_t)
00162 #endif
00163 
00164 #if R123_USE_PHILOX_64BIT
00165 #if R123_USE_MULHILO64_ASM
00166 _mulhilo_asm_tpl(64, uint64_t, "mulq")
00167 #elif R123_USE_MULHILO64_MSVC_INTRIN
00168 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
00169 #elif R123_USE_MULHILO64_CUDA_INTRIN
00170 _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
00171 #elif R123_USE_MULHILO64_OPENCL_INTRIN
00172 _mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
00173 #elif R123_USE_GNU_UINT128
00174 _mulhilo_dword_tpl(64, uint64_t, __uint128_t)
00175 #elif R123_USE_MULHILO64_C99
00176 _mulhilo_c99_tpl(64, uint64_t)
00177 #else
00178 _mulhilo_fail_tpl(64, uint64_t)
00179 #endif
00180 #endif
00181 
00182 
00183 
00184 
00185 
00186 
00187 /* Tunable constants; every one is wrapped in #ifndef so it can be predefined before this header is included. */
00188 /* PHILOX_M<N>x<W>_<i>: multipliers passed to mulhilo<W> by the round templates.                               */
00189 /* PHILOX_W<W>_<i>: increments added to the key words by the bumpkey templates.                                */
00190 #ifndef PHILOX_M2x64_0
00191 #define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
00192 #endif
00193 
00194 #ifndef PHILOX_M4x64_0
00195 #define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
00196 #endif
00197 
00198 #ifndef PHILOX_M4x64_1
00199 #define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
00200 #endif
00201 
00202 #ifndef PHILOX_M2x32_0
00203 #define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
00204 #endif
00205 
00206 #ifndef PHILOX_M4x32_0
00207 #define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
00208 #endif
00209 #ifndef PHILOX_M4x32_1
00210 #define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
00211 #endif
00212 
00213 #ifndef PHILOX_W64_0
00214 #define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)  /* NOTE(review): looks like the golden-ratio constant - confirm */
00215 #endif
00216 #ifndef PHILOX_W64_1
00217 #define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)  /* NOTE(review): looks like frac(sqrt(3)) - confirm */
00218 #endif
00219 
00220 #ifndef PHILOX_W32_0
00221 #define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
00222 #endif
00223 #ifndef PHILOX_W32_1
00224 #define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
00225 #endif
00226 
00227 #ifndef PHILOX2x32_DEFAULT_ROUNDS
00228 #define PHILOX2x32_DEFAULT_ROUNDS 10
00229 #endif
00230 
00231 #ifndef PHILOX2x64_DEFAULT_ROUNDS
00232 #define PHILOX2x64_DEFAULT_ROUNDS 10
00233 #endif
00234 
00235 #ifndef PHILOX4x32_DEFAULT_ROUNDS
00236 #define PHILOX4x32_DEFAULT_ROUNDS 10
00237 #endif
00238 
00239 #ifndef PHILOX4x64_DEFAULT_ROUNDS
00240 #define PHILOX4x64_DEFAULT_ROUNDS 10 /* each *_DEFAULT_ROUNDS becomes the matching philox<N>x<W>_rounds enum value */
00241 #endif
00242 
00243 /* _philox2xWround_tpl(W, T): one round on a 2-word counter. Emits a prototype wrapped in R123_FORCE_INLINE, then the body. */
00244 /* With (hi, lo) = mulhilo<W>(PHILOX_M2x<W>_0, ctr.v[0]) the result is { hi ^ key.v[0] ^ ctr.v[1], lo }. T is the word type. */
00245 #define _philox2xWround_tpl(W, T)                                       \
00246 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
00247 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \
00248     T hi;                                                               \
00249     T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi);                \
00250     struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}};               \
00251     return out;                                                         \
00252 }
00253 #define _philox2xWbumpkey_tpl(W) /* key step for 2x<W>: returns key with PHILOX_W<W>_0 added to its only word */ \
00254 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \
00255     key.v[0] += PHILOX_W##W##_0; /* key is passed by value, so the caller's copy is untouched */ \
00256     return key;                                                         \
00257 }
00258 /* _philox4xWround_tpl(W, T): one round on a 4-word counter; two mulhilo<W> calls whose results are cross-wired into the output. */
00259 #define _philox4xWround_tpl(W, T)                                       \
00260 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
00261 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \
00262     T hi0;                                                              \
00263     T hi1;                                                              \
00264     T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0);              \
00265     T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1);              \
00266     struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, /* words 0,1 come from the ctr.v[2] product */ \
00267                               hi0^ctr.v[3]^key.v[1], lo0}}; /* words 2,3 come from the ctr.v[0] product */ \
00268     return out;                                                         \
00269 }
00270 /* _philox4xWbumpkey_tpl(W): key step for 4x<W>; returns key with PHILOX_W<W>_0 added to v[0] and PHILOX_W<W>_1 added to v[1]. */
00271 #define _philox4xWbumpkey_tpl(W)                                        \
00272 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \
00273     key.v[0] += PHILOX_W##W##_0;                                        \
00274     key.v[1] += PHILOX_W##W##_1;                                        \
00275     return key;                                                         \
00276 }
00277 /* _philoxNxW_tpl(N, Nhalf, W, T): public C API for one variant - rounds enum, ctr/key/ukey typedefs, keyinit and philox<N>x<W>_R. */
00278 #define _philoxNxW_tpl(N, Nhalf, W, T)                         \
00279     /* keyinit returns its argument unchanged; _R runs R rounds and bumps the key before every round except the first */ \
00280 enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \
00281 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t;                  \
00282 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t;              \
00283 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t;              \
00284 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
00285 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
00286 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
00287     R123_ASSERT(R<=16); /* only 16 rounds are written out below */      \
00288     if(R>0){                                       ctr = _philox##N##x##W##round(ctr, key); } \
00289     if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00290     if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00291     if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00292     if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00293     if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00294     if(R>6){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00295     if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00296     if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00297     if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00298     if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00299     if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00300     if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00301     if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00302     if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00303     if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \
00304     return ctr;                                                         \
00305 }
00306 /* Instantiate the templates. The bumpkey/round helpers are emitted before _philoxNxW_tpl because philox<N>x<W>_R calls them. */
00307 _philox2xWbumpkey_tpl(32)
00308 _philox4xWbumpkey_tpl(32)
00309 _philox2xWround_tpl(32, uint32_t) /* _philox2x32round */
00310 _philox4xWround_tpl(32, uint32_t) /* _philox4x32round */
00312 _philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32_R and its typedefs; key is 1 word */
00313 _philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32_R and its typedefs; key is 2 words */
00314 #if R123_USE_PHILOX_64BIT
00315 
00316 _philox2xWbumpkey_tpl(64)
00317 _philox4xWbumpkey_tpl(64)
00318 _philox2xWround_tpl(64, uint64_t) /* _philox2x64round */
00319 _philox4xWround_tpl(64, uint64_t) /* _philox4x64round */
00321 _philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64_R and its typedefs; key is 1 word */
00322 _philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64_R and its typedefs; key is 2 words */
00323 #endif /* R123_USE_PHILOX_64BIT */
00324 /* Convenience wrappers: philox<N>x<W>(c,k) calls philox<N>x<W>_R with the default round count philox<N>x<W>_rounds. */
00325 #define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k)
00326 #define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k)
00327 #if R123_USE_PHILOX_64BIT
00328 #define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k)
00329 #define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k)
00330 #endif /* R123_USE_PHILOX_64BIT */
00331 
00332 #ifdef __cplusplus
00333 #include <stdexcept>
00334 /* _PhiloxNxW_base_tpl(CType, KType, N, W): C++ functor r123::Philox<N>x<W>_R<ROUNDS> that forwards to the C function philox<N>x<W>_R. */
00337 #define _PhiloxNxW_base_tpl(CType, KType, N, W)                         \
00338 namespace r123{                                                          \
00339 template<unsigned int ROUNDS>                                             \
00340 struct Philox##N##x##W##_R{                                             \
00341     typedef CType ctr_type;                                         \
00342     typedef KType key_type;                                             \
00343     typedef KType ukey_type; /* same type as key_type */             \
00344     static const unsigned int rounds=ROUNDS;                                 \
00345     inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \
00346         R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \
00347         return philox##N##x##W##_R(ROUNDS, ctr, key);                       \
00348     }                                                                   \
00349 };                                                                      \
00350 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; /* default-round alias */ \
00351  } // namespace r123
00352 /* Instantiate the C++ functors; the counter type has N words and the key type has N/2 words. */
00354 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) /* r123::Philox2x32_R<>, r123::Philox2x32 */
00355 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) /* r123::Philox4x32_R<>, r123::Philox4x32 */
00356 #if R123_USE_PHILOX_64BIT
00357 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) /* r123::Philox2x64_R<>, r123::Philox2x64 */
00358 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) /* r123::Philox4x64_R<>, r123::Philox4x64 */
00359 #endif /* R123_USE_PHILOX_64BIT */
00360 
00361 
00362 
00363 
00458 #endif 
00459 
00460 #endif