Geant4 Cross Reference |
1 // 1 2 // -*- C++ -*- 3 // 4 // ------------------------------------------- 5 // HEP Random 6 // --- RanluxppEngine -- 7 // helper implementation f 8 // ------------------------------------------- 9 10 #ifndef RANLUXPP_MULMOD_H 11 #define RANLUXPP_MULMOD_H 12 13 #include "helpers.h" 14 15 #include <cstdint> 16 17 /// Multiply two 576 bit numbers, stored as 9 18 /// 19 /// \param[in] in1 first factor as 9 numbers o 20 /// \param[in] in2 second factor as 9 numbers 21 /// \param[out] out result with 18 numbers of 22 static void multiply9x9(const uint64_t *in1, c 23 uint64_t *out) { 24 uint64_t next = 0; 25 unsigned nextCarry = 0; 26 27 #if defined(__clang__) || defined(__INTEL_COMP 28 #pragma unroll 29 #elif defined(__GNUC__) && __GNUC__ >= 8 30 // This pragma was introduced in GCC version 8 31 #pragma GCC unroll 18 32 #endif 33 for (int i = 0; i < 18; i++) { 34 uint64_t current = next; 35 unsigned carry = nextCarry; 36 37 next = 0; 38 nextCarry = 0; 39 40 #if defined(__clang__) || defined(__INTEL_COMP 41 #pragma unroll 42 #elif defined(__GNUC__) && __GNUC__ >= 8 43 // This pragma was introduced in GCC version 8 44 #pragma GCC unroll 9 45 #endif 46 for (int j = 0; j < 9; j++) { 47 int k = i - j; 48 if (k < 0 || k >= 9) 49 continue; 50 51 uint64_t fac1 = in1[j]; 52 uint64_t fac2 = in2[k]; 53 #if defined(__SIZEOF_INT128__) && !defined(CLH 54 #ifdef __GNUC__ 55 #pragma GCC diagnostic push 56 #pragma GCC diagnostic ignored "-Wpedantic" 57 #endif 58 unsigned __int128 prod = fac1; 59 prod = prod * fac2; 60 61 uint64_t upper = prod >> 64; 62 uint64_t lower = static_cast<uint64_t>(p 63 #ifdef __GNUC__ 64 #pragma GCC diagnostic pop 65 #endif 66 #else 67 uint64_t upper1 = fac1 >> 32; 68 uint64_t lower1 = static_cast<uint32_t>( 69 70 uint64_t upper2 = fac2 >> 32; 71 uint64_t lower2 = static_cast<uint32_t>( 72 73 // Multiply 32-bit parts, each product h 74 // (2 ** 32 - 1) ** 2 = 2 ** 64 - 2 * 2 75 uint64_t upper = upper1 * upper2; 76 uint64_t middle1 = upper1 * lower2; 77 uint64_t middle2 = lower1 * upper2; 78 uint64_t lower = lower1 * lower2; 79 80 // When adding the two products, the max 81 // 2 * 2 ** 64 - 4 * 2 ** 32 + 2, which 82 unsigned overflow; 83 uint64_t middle = add_overflow(middle1, 84 // Handling the overflow by a multiplica 85 // than branching with an if statement, 86 // optimize to this equivalent code. Not 87 // without this overflow handling when s 88 // products differently as described in 89 // https://stackoverflow.com/a/515872 90 // However, this approach takes at least 91 // why a) the code gives the same result 92 // to the mixture of 32 bit arithmetic. 93 // the scheme implemented here is actual 94 uint64_t overflow_add = overflow * (uint 95 // This addition can never overflow beca 96 // is 2 ** 64 - 2 * 2 ** 32 + 1 (see abo 97 // 2 ** 32, the result is 2 ** 64 - 2 ** 98 // the maximum 2 ** 64 - 1 that can be s 99 upper += overflow_add; 100 101 uint64_t middle_upper = middle >> 32; 102 uint64_t middle_lower = middle << 32; 103 104 lower = add_overflow(lower, middle_lower 105 upper += overflow; 106 107 // This still can't overflow since the m 108 // - 2 ** 32 - 4 if there was an overfl 109 // the maximum value of upper to 2 ** 110 // - otherwise upper still has the init 111 // and the addition of a value smalle 112 // a maximum value of 2 ** 64 - 2 ** 113 // (Both cases include the increment to 114 // 115 // All the reasoning makes perfect sense 116 // 64 bit numbers is smaller than or equ 117 // (2 ** 64 - 1) ** 2 = 2 ** 128 - 2 118 // with the upper bits matching the 2 ** 119 upper += middle_upper; 120 #endif 121 122 // Add to current, remember carry. 123 current = add_carry(current, lower, carr 124 125 // Add to next, remember nextCarry. 126 next = add_carry(next, upper, nextCarry) 127 } 128 129 next = add_carry(next, carry, nextCarry); 130 131 out[i] = current; 132 } 133 } 134 135 /// Compute a value congruent to mul modulo m 136 /// 137 /// \param[in] mul product from multiply9x9 wi 138 /// \param[out] out result with 9 numbers of 6 139 /// 140 /// \f$ m = 2^{576} - 2^{240} + 1 \f$ 141 /// 142 /// The result in out is guaranteed to be smal 143 static void mod_m(const uint64_t *mul, uint64_ 144 uint64_t r[9]; 145 // Assign r = t0 146 for (int i = 0; i < 9; i++) { 147 r[i] = mul[i]; 148 } 149 150 int64_t c = compute_r(mul + 9, r); 151 152 // To update r = r - c * m, it suffices to k 153 // because the 2 ** 576 will cancel out. Als 154 // the operation is still performed to avoid 155 156 // c * (-2 ** 240 + 1) in 576 bits looks as 157 // - if c = 0, the number is zero. 158 // - if c = 1: bits 576 to 240 are set, 159 // bits 239 to 1 are zero, and 160 // the last one is set 161 // - if c = -1, which corresponds to all bi 162 // bits 576 to 240 are zero and 163 // Note that all bits except the last are ex 164 // and the last byte is conveniently represe 165 // Now construct the three bit patterns from 166 // assembly implementation by Alexei Sibidan 167 168 // c = 0 -> t0 = 0; c = 1 -> t0 = 0; c = -1 169 // (The assembly implementation shifts by 63 170 int64_t t0 = c >> 1; 171 172 // Left shifting negative values is undefine 173 // unsigned. 174 uint64_t c_unsigned = static_cast<uint64_t>( 175 176 // c = 0 -> t2 = 0; c = 1 -> upper 16 bits s 177 int64_t t2 = t0 - (c_unsigned << 48); 178 179 // c = 0 -> t1 = 0; c = 1 -> all bits set (s 180 // (The assembly implementation shifts by 63 181 int64_t t1 = t2 >> 48; 182 183 unsigned carry = 0; 184 { 185 uint64_t r_0 = r[0]; 186 187 uint64_t out_0 = sub_carry(r_0, c, carry); 188 out[0] = out_0; 189 } 190 for (int i = 1; i < 3; i++) { 191 uint64_t r_i = r[i]; 192 r_i = sub_overflow(r_i, carry, carry); 193 194 uint64_t out_i = sub_carry(r_i, t0, carry) 195 out[i] = out_i; 196 } 197 { 198 uint64_t r_3 = r[3]; 199 r_3 = sub_overflow(r_3, carry, carry); 200 201 uint64_t out_3 = sub_carry(r_3, t2, carry) 202 out[3] = out_3; 203 } 204 for (int i = 4; i < 9; i++) { 205 uint64_t r_i = r[i]; 206 r_i = sub_overflow(r_i, carry, carry); 207 208 uint64_t out_i = sub_carry(r_i, t1, carry) 209 out[i] = out_i; 210 } 211 } 212 213 /// Combine multiply9x9 and mod_m with interna 214 /// 215 /// \param[in] in1 first factor with 9 numbers 216 /// \param[inout] inout second factor and also 217 /// 218 /// The result in inout is guaranteed to be sm 219 static void mulmod(const uint64_t *in1, uint64 220 uint64_t mul[2 * 9] = {0}; 221 multiply9x9(in1, inout, mul); 222 mod_m(mul, inout); 223 } 224 225 /// Compute base to the n modulo m 226 /// 227 /// \param[in] base with 9 numbers of 64 bits 228 /// \param[out] res output with 9 numbers of 6 229 /// \param[in] n exponent 230 /// 231 /// The arguments base and res may point to th 232 static void powermod(const uint64_t *base, uin 233 uint64_t fac[9] = {0}; 234 fac[0] = base[0]; 235 res[0] = 1; 236 for (int i = 1; i < 9; i++) { 237 fac[i] = base[i]; 238 res[i] = 0; 239 } 240 241 uint64_t mul[18] = {0}; 242 while (n) { 243 if (n & 1) { 244 multiply9x9(res, fac, mul); 245 mod_m(mul, res); 246 } 247 n >>= 1; 248 if (!n) 249 break; 250 multiply9x9(fac, fac, mul); 251 mod_m(mul, fac); 252 } 253 } 254 255 #endif 256