| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542 |
- package xxh3
- // avx512Switch is the size at which the avx512 code is used.
- // Bigger blocks benefit more.
- const avx512Switch = 1 << 10
- func accumScalar(accs *[8]u64, p, secret ptr, l u64) {
- if secret != key {
- accumScalarSeed(accs, p, secret, l)
- return
- }
- for l > _block {
- k := secret
- // accs
- for i := 0; i < 16; i++ {
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(k, 8*0)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(k, 8*1)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(k, 8*2)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(k, 8*3)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(k, 8*4)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(k, 8*5)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(k, 8*6)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(k, 8*7)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- l -= _stripe
- if l > 0 {
- p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
- }
- }
- // scramble accs
- accs[0] ^= accs[0] >> 47
- accs[0] ^= key64_128
- accs[0] *= prime32_1
- accs[1] ^= accs[1] >> 47
- accs[1] ^= key64_136
- accs[1] *= prime32_1
- accs[2] ^= accs[2] >> 47
- accs[2] ^= key64_144
- accs[2] *= prime32_1
- accs[3] ^= accs[3] >> 47
- accs[3] ^= key64_152
- accs[3] *= prime32_1
- accs[4] ^= accs[4] >> 47
- accs[4] ^= key64_160
- accs[4] *= prime32_1
- accs[5] ^= accs[5] >> 47
- accs[5] ^= key64_168
- accs[5] *= prime32_1
- accs[6] ^= accs[6] >> 47
- accs[6] ^= key64_176
- accs[6] *= prime32_1
- accs[7] ^= accs[7] >> 47
- accs[7] ^= key64_184
- accs[7] *= prime32_1
- }
- if l > 0 {
- t, k := (l-1)/_stripe, secret
- for i := u64(0); i < t; i++ {
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(k, 8*0)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(k, 8*1)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(k, 8*2)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(k, 8*3)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(k, 8*4)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(k, 8*5)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(k, 8*6)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(k, 8*7)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- l -= _stripe
- if l > 0 {
- p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
- }
- }
- if l > 0 {
- p = ptr(ui(p) - uintptr(_stripe-l))
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ key64_121
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ key64_129
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ key64_137
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ key64_145
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ key64_153
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ key64_161
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ key64_169
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ key64_177
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- }
- }
- }
- func accumBlockScalar(accs *[8]u64, p, secret ptr) {
- if secret != key {
- accumBlockScalarSeed(accs, p, secret)
- return
- }
- // accs
- for i := 0; i < 16; i++ {
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(secret, 8*0)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(secret, 8*1)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(secret, 8*2)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(secret, 8*3)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(secret, 8*4)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(secret, 8*5)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(secret, 8*6)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(secret, 8*7)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
- }
- // scramble accs
- accs[0] ^= accs[0] >> 47
- accs[0] ^= key64_128
- accs[0] *= prime32_1
- accs[1] ^= accs[1] >> 47
- accs[1] ^= key64_136
- accs[1] *= prime32_1
- accs[2] ^= accs[2] >> 47
- accs[2] ^= key64_144
- accs[2] *= prime32_1
- accs[3] ^= accs[3] >> 47
- accs[3] ^= key64_152
- accs[3] *= prime32_1
- accs[4] ^= accs[4] >> 47
- accs[4] ^= key64_160
- accs[4] *= prime32_1
- accs[5] ^= accs[5] >> 47
- accs[5] ^= key64_168
- accs[5] *= prime32_1
- accs[6] ^= accs[6] >> 47
- accs[6] ^= key64_176
- accs[6] *= prime32_1
- accs[7] ^= accs[7] >> 47
- accs[7] ^= key64_184
- accs[7] *= prime32_1
- }
- // accumScalarSeed should be used with custom key.
- func accumScalarSeed(accs *[8]u64, p, secret ptr, l u64) {
- for l > _block {
- k := secret
- // accs
- for i := 0; i < 16; i++ {
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(k, 8*0)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(k, 8*1)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(k, 8*2)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(k, 8*3)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(k, 8*4)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(k, 8*5)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(k, 8*6)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(k, 8*7)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- l -= _stripe
- if l > 0 {
- p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
- }
- }
- // scramble accs
- accs[0] ^= accs[0] >> 47
- accs[0] ^= readU64(secret, 128)
- accs[0] *= prime32_1
- accs[1] ^= accs[1] >> 47
- accs[1] ^= readU64(secret, 136)
- accs[1] *= prime32_1
- accs[2] ^= accs[2] >> 47
- accs[2] ^= readU64(secret, 144)
- accs[2] *= prime32_1
- accs[3] ^= accs[3] >> 47
- accs[3] ^= readU64(secret, 152)
- accs[3] *= prime32_1
- accs[4] ^= accs[4] >> 47
- accs[4] ^= readU64(secret, 160)
- accs[4] *= prime32_1
- accs[5] ^= accs[5] >> 47
- accs[5] ^= readU64(secret, 168)
- accs[5] *= prime32_1
- accs[6] ^= accs[6] >> 47
- accs[6] ^= readU64(secret, 176)
- accs[6] *= prime32_1
- accs[7] ^= accs[7] >> 47
- accs[7] ^= readU64(secret, 184)
- accs[7] *= prime32_1
- }
- if l > 0 {
- t, k := (l-1)/_stripe, secret
- for i := u64(0); i < t; i++ {
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(k, 8*0)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(k, 8*1)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(k, 8*2)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(k, 8*3)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(k, 8*4)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(k, 8*5)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(k, 8*6)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(k, 8*7)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- l -= _stripe
- if l > 0 {
- p, k = ptr(ui(p)+_stripe), ptr(ui(k)+8)
- }
- }
- if l > 0 {
- p = ptr(ui(p) - uintptr(_stripe-l))
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(secret, 121)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(secret, 129)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(secret, 137)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(secret, 145)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(secret, 153)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(secret, 161)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(secret, 169)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(secret, 177)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- }
- }
- }
- // accumBlockScalarSeed should be used with custom key.
- func accumBlockScalarSeed(accs *[8]u64, p, secret ptr) {
- // accs
- {
- secret := secret
- for i := 0; i < 16; i++ {
- dv0 := readU64(p, 8*0)
- dk0 := dv0 ^ readU64(secret, 8*0)
- accs[1] += dv0
- accs[0] += (dk0 & 0xffffffff) * (dk0 >> 32)
- dv1 := readU64(p, 8*1)
- dk1 := dv1 ^ readU64(secret, 8*1)
- accs[0] += dv1
- accs[1] += (dk1 & 0xffffffff) * (dk1 >> 32)
- dv2 := readU64(p, 8*2)
- dk2 := dv2 ^ readU64(secret, 8*2)
- accs[3] += dv2
- accs[2] += (dk2 & 0xffffffff) * (dk2 >> 32)
- dv3 := readU64(p, 8*3)
- dk3 := dv3 ^ readU64(secret, 8*3)
- accs[2] += dv3
- accs[3] += (dk3 & 0xffffffff) * (dk3 >> 32)
- dv4 := readU64(p, 8*4)
- dk4 := dv4 ^ readU64(secret, 8*4)
- accs[5] += dv4
- accs[4] += (dk4 & 0xffffffff) * (dk4 >> 32)
- dv5 := readU64(p, 8*5)
- dk5 := dv5 ^ readU64(secret, 8*5)
- accs[4] += dv5
- accs[5] += (dk5 & 0xffffffff) * (dk5 >> 32)
- dv6 := readU64(p, 8*6)
- dk6 := dv6 ^ readU64(secret, 8*6)
- accs[7] += dv6
- accs[6] += (dk6 & 0xffffffff) * (dk6 >> 32)
- dv7 := readU64(p, 8*7)
- dk7 := dv7 ^ readU64(secret, 8*7)
- accs[6] += dv7
- accs[7] += (dk7 & 0xffffffff) * (dk7 >> 32)
- p, secret = ptr(ui(p)+_stripe), ptr(ui(secret)+8)
- }
- }
- // scramble accs
- accs[0] ^= accs[0] >> 47
- accs[0] ^= readU64(secret, 128)
- accs[0] *= prime32_1
- accs[1] ^= accs[1] >> 47
- accs[1] ^= readU64(secret, 136)
- accs[1] *= prime32_1
- accs[2] ^= accs[2] >> 47
- accs[2] ^= readU64(secret, 144)
- accs[2] *= prime32_1
- accs[3] ^= accs[3] >> 47
- accs[3] ^= readU64(secret, 152)
- accs[3] *= prime32_1
- accs[4] ^= accs[4] >> 47
- accs[4] ^= readU64(secret, 160)
- accs[4] *= prime32_1
- accs[5] ^= accs[5] >> 47
- accs[5] ^= readU64(secret, 168)
- accs[5] *= prime32_1
- accs[6] ^= accs[6] >> 47
- accs[6] ^= readU64(secret, 176)
- accs[6] *= prime32_1
- accs[7] ^= accs[7] >> 47
- accs[7] ^= readU64(secret, 184)
- accs[7] *= prime32_1
- }
|