accum_generic.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
  1. package xxh3
// avx512Switch is the size at which the avx512 code is used.
// Bigger blocks benefit more, so smaller sizes stay below this threshold.
// The value is 1 << 10 (1024).
// NOTE(review): the unit is presumably bytes of input; confirm at the
// dispatch site that compares against this constant.
const avx512Switch = 1 << 10
  5. func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
  6. if secret != key {
  7. accumScalarSeed(accs, p, secret, l)
  8. return
  9. }
  10. for l > _block {
  11. k := secret
  12. // accs
  13. for i := 0; i < 16; i++ {
  14. dv0 := readU64(p, 8*0)
  15. dk0 := dv0 ^ readU64(k, 8*0)
  16. accs[1] += dv0
  17. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  18. dv1 := readU64(p, 8*1)
  19. dk1 := dv1 ^ readU64(k, 8*1)
  20. accs[0] += dv1
  21. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  22. dv2 := readU64(p, 8*2)
  23. dk2 := dv2 ^ readU64(k, 8*2)
  24. accs[3] += dv2
  25. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  26. dv3 := readU64(p, 8*3)
  27. dk3 := dv3 ^ readU64(k, 8*3)
  28. accs[2] += dv3
  29. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  30. dv4 := readU64(p, 8*4)
  31. dk4 := dv4 ^ readU64(k, 8*4)
  32. accs[5] += dv4
  33. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  34. dv5 := readU64(p, 8*5)
  35. dk5 := dv5 ^ readU64(k, 8*5)
  36. accs[4] += dv5
  37. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  38. dv6 := readU64(p, 8*6)
  39. dk6 := dv6 ^ readU64(k, 8*6)
  40. accs[7] += dv6
  41. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  42. dv7 := readU64(p, 8*7)
  43. dk7 := dv7 ^ readU64(k, 8*7)
  44. accs[6] += dv7
  45. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  46. l -= _stripe
  47. if l > 0 {
  48. p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
  49. }
  50. }
  51. // scramble accs
  52. accs[0] ^= accs[0] >> 47
  53. accs[0] ^= key64_128
  54. accs[0] *= prime32_1
  55. accs[1] ^= accs[1] >> 47
  56. accs[1] ^= key64_136
  57. accs[1] *= prime32_1
  58. accs[2] ^= accs[2] >> 47
  59. accs[2] ^= key64_144
  60. accs[2] *= prime32_1
  61. accs[3] ^= accs[3] >> 47
  62. accs[3] ^= key64_152
  63. accs[3] *= prime32_1
  64. accs[4] ^= accs[4] >> 47
  65. accs[4] ^= key64_160
  66. accs[4] *= prime32_1
  67. accs[5] ^= accs[5] >> 47
  68. accs[5] ^= key64_168
  69. accs[5] *= prime32_1
  70. accs[6] ^= accs[6] >> 47
  71. accs[6] ^= key64_176
  72. accs[6] *= prime32_1
  73. accs[7] ^= accs[7] >> 47
  74. accs[7] ^= key64_184
  75. accs[7] *= prime32_1
  76. }
  77. if l > 0 {
  78. t, k := (l-1)/_stripe, secret
  79. for i := u64(0); i < t; i++ {
  80. dv0 := readU64(p, 8*0)
  81. dk0 := dv0 ^ readU64(k, 8*0)
  82. accs[1] += dv0
  83. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  84. dv1 := readU64(p, 8*1)
  85. dk1 := dv1 ^ readU64(k, 8*1)
  86. accs[0] += dv1
  87. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  88. dv2 := readU64(p, 8*2)
  89. dk2 := dv2 ^ readU64(k, 8*2)
  90. accs[3] += dv2
  91. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  92. dv3 := readU64(p, 8*3)
  93. dk3 := dv3 ^ readU64(k, 8*3)
  94. accs[2] += dv3
  95. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  96. dv4 := readU64(p, 8*4)
  97. dk4 := dv4 ^ readU64(k, 8*4)
  98. accs[5] += dv4
  99. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  100. dv5 := readU64(p, 8*5)
  101. dk5 := dv5 ^ readU64(k, 8*5)
  102. accs[4] += dv5
  103. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  104. dv6 := readU64(p, 8*6)
  105. dk6 := dv6 ^ readU64(k, 8*6)
  106. accs[7] += dv6
  107. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  108. dv7 := readU64(p, 8*7)
  109. dk7 := dv7 ^ readU64(k, 8*7)
  110. accs[6] += dv7
  111. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  112. l -= _stripe
  113. if l > 0 {
  114. p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
  115. }
  116. }
  117. if l > 0 {
  118. p = ptr(ui(p) - uintptr(_stripe-l))
  119. dv0 := readU64(p, 8*0)
  120. dk0 := dv0 ^ key64_121
  121. accs[1] += dv0
  122. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  123. dv1 := readU64(p, 8*1)
  124. dk1 := dv1 ^ key64_129
  125. accs[0] += dv1
  126. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  127. dv2 := readU64(p, 8*2)
  128. dk2 := dv2 ^ key64_137
  129. accs[3] += dv2
  130. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  131. dv3 := readU64(p, 8*3)
  132. dk3 := dv3 ^ key64_145
  133. accs[2] += dv3
  134. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  135. dv4 := readU64(p, 8*4)
  136. dk4 := dv4 ^ key64_153
  137. accs[5] += dv4
  138. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  139. dv5 := readU64(p, 8*5)
  140. dk5 := dv5 ^ key64_161
  141. accs[4] += dv5
  142. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  143. dv6 := readU64(p, 8*6)
  144. dk6 := dv6 ^ key64_169
  145. accs[7] += dv6
  146. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  147. dv7 := readU64(p, 8*7)
  148. dk7 := dv7 ^ key64_177
  149. accs[6] += dv7
  150. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  151. }
  152. }
  153. }
  154. func accumBlockScalar(accs *[8]u64, p, secret ptr) {
  155. if secret != key {
  156. accumBlockScalarSeed(accs, p, secret)
  157. return
  158. }
  159. // accs
  160. for i := 0; i < 16; i++ {
  161. dv0 := readU64(p, 8*0)
  162. dk0 := dv0 ^ readU64(secret, 8*0)
  163. accs[1] += dv0
  164. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  165. dv1 := readU64(p, 8*1)
  166. dk1 := dv1 ^ readU64(secret, 8*1)
  167. accs[0] += dv1
  168. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  169. dv2 := readU64(p, 8*2)
  170. dk2 := dv2 ^ readU64(secret, 8*2)
  171. accs[3] += dv2
  172. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  173. dv3 := readU64(p, 8*3)
  174. dk3 := dv3 ^ readU64(secret, 8*3)
  175. accs[2] += dv3
  176. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  177. dv4 := readU64(p, 8*4)
  178. dk4 := dv4 ^ readU64(secret, 8*4)
  179. accs[5] += dv4
  180. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  181. dv5 := readU64(p, 8*5)
  182. dk5 := dv5 ^ readU64(secret, 8*5)
  183. accs[4] += dv5
  184. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  185. dv6 := readU64(p, 8*6)
  186. dk6 := dv6 ^ readU64(secret, 8*6)
  187. accs[7] += dv6
  188. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  189. dv7 := readU64(p, 8*7)
  190. dk7 := dv7 ^ readU64(secret, 8*7)
  191. accs[6] += dv7
  192. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  193. p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
  194. }
  195. // scramble accs
  196. accs[0] ^= accs[0] >> 47
  197. accs[0] ^= key64_128
  198. accs[0] *= prime32_1
  199. accs[1] ^= accs[1] >> 47
  200. accs[1] ^= key64_136
  201. accs[1] *= prime32_1
  202. accs[2] ^= accs[2] >> 47
  203. accs[2] ^= key64_144
  204. accs[2] *= prime32_1
  205. accs[3] ^= accs[3] >> 47
  206. accs[3] ^= key64_152
  207. accs[3] *= prime32_1
  208. accs[4] ^= accs[4] >> 47
  209. accs[4] ^= key64_160
  210. accs[4] *= prime32_1
  211. accs[5] ^= accs[5] >> 47
  212. accs[5] ^= key64_168
  213. accs[5] *= prime32_1
  214. accs[6] ^= accs[6] >> 47
  215. accs[6] ^= key64_176
  216. accs[6] *= prime32_1
  217. accs[7] ^= accs[7] >> 47
  218. accs[7] ^= key64_184
  219. accs[7] *= prime32_1
  220. }
  221. // accumScalarSeed should be used with custom key.
  222. func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
  223. for l > _block {
  224. k := secret
  225. // accs
  226. for i := 0; i < 16; i++ {
  227. dv0 := readU64(p, 8*0)
  228. dk0 := dv0 ^ readU64(k, 8*0)
  229. accs[1] += dv0
  230. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  231. dv1 := readU64(p, 8*1)
  232. dk1 := dv1 ^ readU64(k, 8*1)
  233. accs[0] += dv1
  234. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  235. dv2 := readU64(p, 8*2)
  236. dk2 := dv2 ^ readU64(k, 8*2)
  237. accs[3] += dv2
  238. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  239. dv3 := readU64(p, 8*3)
  240. dk3 := dv3 ^ readU64(k, 8*3)
  241. accs[2] += dv3
  242. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  243. dv4 := readU64(p, 8*4)
  244. dk4 := dv4 ^ readU64(k, 8*4)
  245. accs[5] += dv4
  246. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  247. dv5 := readU64(p, 8*5)
  248. dk5 := dv5 ^ readU64(k, 8*5)
  249. accs[4] += dv5
  250. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  251. dv6 := readU64(p, 8*6)
  252. dk6 := dv6 ^ readU64(k, 8*6)
  253. accs[7] += dv6
  254. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  255. dv7 := readU64(p, 8*7)
  256. dk7 := dv7 ^ readU64(k, 8*7)
  257. accs[6] += dv7
  258. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  259. l -= _stripe
  260. if l > 0 {
  261. p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
  262. }
  263. }
  264. // scramble accs
  265. accs[0] ^= accs[0] >> 47
  266. accs[0] ^= readU64(secret, 128)
  267. accs[0] *= prime32_1
  268. accs[1] ^= accs[1] >> 47
  269. accs[1] ^= readU64(secret, 136)
  270. accs[1] *= prime32_1
  271. accs[2] ^= accs[2] >> 47
  272. accs[2] ^= readU64(secret, 144)
  273. accs[2] *= prime32_1
  274. accs[3] ^= accs[3] >> 47
  275. accs[3] ^= readU64(secret, 152)
  276. accs[3] *= prime32_1
  277. accs[4] ^= accs[4] >> 47
  278. accs[4] ^= readU64(secret, 160)
  279. accs[4] *= prime32_1
  280. accs[5] ^= accs[5] >> 47
  281. accs[5] ^= readU64(secret, 168)
  282. accs[5] *= prime32_1
  283. accs[6] ^= accs[6] >> 47
  284. accs[6] ^= readU64(secret, 176)
  285. accs[6] *= prime32_1
  286. accs[7] ^= accs[7] >> 47
  287. accs[7] ^= readU64(secret, 184)
  288. accs[7] *= prime32_1
  289. }
  290. if l > 0 {
  291. t, k := (l-1)/_stripe, secret
  292. for i := u64(0); i < t; i++ {
  293. dv0 := readU64(p, 8*0)
  294. dk0 := dv0 ^ readU64(k, 8*0)
  295. accs[1] += dv0
  296. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  297. dv1 := readU64(p, 8*1)
  298. dk1 := dv1 ^ readU64(k, 8*1)
  299. accs[0] += dv1
  300. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  301. dv2 := readU64(p, 8*2)
  302. dk2 := dv2 ^ readU64(k, 8*2)
  303. accs[3] += dv2
  304. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  305. dv3 := readU64(p, 8*3)
  306. dk3 := dv3 ^ readU64(k, 8*3)
  307. accs[2] += dv3
  308. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  309. dv4 := readU64(p, 8*4)
  310. dk4 := dv4 ^ readU64(k, 8*4)
  311. accs[5] += dv4
  312. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  313. dv5 := readU64(p, 8*5)
  314. dk5 := dv5 ^ readU64(k, 8*5)
  315. accs[4] += dv5
  316. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  317. dv6 := readU64(p, 8*6)
  318. dk6 := dv6 ^ readU64(k, 8*6)
  319. accs[7] += dv6
  320. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  321. dv7 := readU64(p, 8*7)
  322. dk7 := dv7 ^ readU64(k, 8*7)
  323. accs[6] += dv7
  324. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  325. l -= _stripe
  326. if l > 0 {
  327. p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
  328. }
  329. }
  330. if l > 0 {
  331. p = ptr(ui(p) - uintptr(_stripe-l))
  332. dv0 := readU64(p, 8*0)
  333. dk0 := dv0 ^ readU64(secret, 121)
  334. accs[1] += dv0
  335. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  336. dv1 := readU64(p, 8*1)
  337. dk1 := dv1 ^ readU64(secret, 129)
  338. accs[0] += dv1
  339. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  340. dv2 := readU64(p, 8*2)
  341. dk2 := dv2 ^ readU64(secret, 137)
  342. accs[3] += dv2
  343. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  344. dv3 := readU64(p, 8*3)
  345. dk3 := dv3 ^ readU64(secret, 145)
  346. accs[2] += dv3
  347. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  348. dv4 := readU64(p, 8*4)
  349. dk4 := dv4 ^ readU64(secret, 153)
  350. accs[5] += dv4
  351. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  352. dv5 := readU64(p, 8*5)
  353. dk5 := dv5 ^ readU64(secret, 161)
  354. accs[4] += dv5
  355. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  356. dv6 := readU64(p, 8*6)
  357. dk6 := dv6 ^ readU64(secret, 169)
  358. accs[7] += dv6
  359. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  360. dv7 := readU64(p, 8*7)
  361. dk7 := dv7 ^ readU64(secret, 177)
  362. accs[6] += dv7
  363. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  364. }
  365. }
  366. }
  367. // accumBlockScalarSeed should be used with custom key.
  368. func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
  369. // accs
  370. {
  371. secret := secret
  372. for i := 0; i < 16; i++ {
  373. dv0 := readU64(p, 8*0)
  374. dk0 := dv0 ^ readU64(secret, 8*0)
  375. accs[1] += dv0
  376. accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
  377. dv1 := readU64(p, 8*1)
  378. dk1 := dv1 ^ readU64(secret, 8*1)
  379. accs[0] += dv1
  380. accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
  381. dv2 := readU64(p, 8*2)
  382. dk2 := dv2 ^ readU64(secret, 8*2)
  383. accs[3] += dv2
  384. accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
  385. dv3 := readU64(p, 8*3)
  386. dk3 := dv3 ^ readU64(secret, 8*3)
  387. accs[2] += dv3
  388. accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
  389. dv4 := readU64(p, 8*4)
  390. dk4 := dv4 ^ readU64(secret, 8*4)
  391. accs[5] += dv4
  392. accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
  393. dv5 := readU64(p, 8*5)
  394. dk5 := dv5 ^ readU64(secret, 8*5)
  395. accs[4] += dv5
  396. accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
  397. dv6 := readU64(p, 8*6)
  398. dk6 := dv6 ^ readU64(secret, 8*6)
  399. accs[7] += dv6
  400. accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
  401. dv7 := readU64(p, 8*7)
  402. dk7 := dv7 ^ readU64(secret, 8*7)
  403. accs[6] += dv7
  404. accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
  405. p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
  406. }
  407. }
  408. // scramble accs
  409. accs[0] ^= accs[0] >> 47
  410. accs[0] ^= readU64(secret, 128)
  411. accs[0] *= prime32_1
  412. accs[1] ^= accs[1] >> 47
  413. accs[1] ^= readU64(secret, 136)
  414. accs[1] *= prime32_1
  415. accs[2] ^= accs[2] >> 47
  416. accs[2] ^= readU64(secret, 144)
  417. accs[2] *= prime32_1
  418. accs[3] ^= accs[3] >> 47
  419. accs[3] ^= readU64(secret, 152)
  420. accs[3] *= prime32_1
  421. accs[4] ^= accs[4] >> 47
  422. accs[4] ^= readU64(secret, 160)
  423. accs[4] *= prime32_1
  424. accs[5] ^= accs[5] >> 47
  425. accs[5] ^= readU64(secret, 168)
  426. accs[5] *= prime32_1
  427. accs[6] ^= accs[6] >> 47
  428. accs[6] ^= readU64(secret, 176)
  429. accs[6] *= prime32_1
  430. accs[7] ^= accs[7] >> 47
  431. accs[7] ^= readU64(secret, 184)
  432. accs[7] *= prime32_1
  433. }