compress_amd64.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
package blake3

import "unsafe"

//go:generate go run avo/gen.go -out blake3_amd64.s

// The functions below are implemented in assembly (blake3_amd64.s, produced
// by the go:generate line above). //go:noescape asserts to the compiler that
// the pointer arguments do not escape through the assembly.

// compressChunksAVX512 compresses 16 chunks in parallel into 16 chaining
// values. The counter is presumably incremented per lane by the assembly —
// TODO confirm against blake3_amd64.s.
//
//go:noescape
func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)

// compressChunksAVX2 is the 8-lane AVX2 counterpart of compressChunksAVX512.
//
//go:noescape
func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)

// compressBlocksAVX512 compresses one block at 16 successive counter values,
// writing 16 x 64 output bytes.
//
//go:noescape
func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)

// compressBlocksAVX2 compresses one block at 8 successive counter values,
// writing 8 x 64 output bytes.
//
//go:noescape
func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)

// compressParentsAVX2 compresses 8 pairs of chaining values from cvs into 8
// parent chaining values.
//
//go:noescape
func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
  14. func compressNode(n node) (out [16]uint32) {
  15. compressNodeGeneric(&out, n)
  16. return
  17. }
  18. func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
  19. var cvs [maxSIMD][8]uint32
  20. compressChunksAVX512(&cvs, buf, key, counter, flags)
  21. numChunks := uint64(buflen / chunkSize)
  22. if buflen%chunkSize != 0 {
  23. // use non-asm for remainder
  24. partialChunk := buf[buflen-buflen%chunkSize : buflen]
  25. cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
  26. numChunks++
  27. }
  28. return mergeSubtrees(&cvs, numChunks, key, flags)
  29. }
  30. func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
  31. var cvs [maxSIMD][8]uint32
  32. cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs))
  33. bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf))
  34. compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags)
  35. numChunks := uint64(buflen / chunkSize)
  36. if numChunks > 8 {
  37. compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags)
  38. }
  39. if buflen%chunkSize != 0 {
  40. // use non-asm for remainder
  41. partialChunk := buf[buflen-buflen%chunkSize : buflen]
  42. cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
  43. numChunks++
  44. }
  45. return mergeSubtrees(&cvs, numChunks, key, flags)
  46. }
  47. func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
  48. switch {
  49. case haveAVX512 && buflen >= chunkSize*2:
  50. return compressBufferAVX512(buf, buflen, key, counter, flags)
  51. case haveAVX2 && buflen >= chunkSize*2:
  52. return compressBufferAVX2(buf, buflen, key, counter, flags)
  53. default:
  54. return compressBufferGeneric(buf, buflen, key, counter, flags)
  55. }
  56. }
// compressChunk compresses a single (possibly partial) chunk with the scalar
// implementation, returning the final node of the chunk with flagChunkEnd
// set. counter is the chunk's index in the overall input.
func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
	n := node{
		cv:       *key,
		counter:  counter,
		blockLen: blockSize,
		flags:    flags | flagChunkStart,
	}
	// blockBytes aliases n.block, so copying input bytes into it fills the
	// block words in place (native byte order; this file is amd64-only).
	blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:]
	// Chain every full block except the last; ">" (not ">=") ensures the
	// final block — even an exactly-full one — is left for the code below.
	for len(chunk) > blockSize {
		copy(blockBytes, chunk)
		chunk = chunk[blockSize:]
		n.cv = chainingValue(n)
		// only the first block of a chunk carries flagChunkStart
		n.flags &^= flagChunkStart
	}
	// pad last block with zeros
	n.block = [16]uint32{}
	copy(blockBytes, chunk)
	n.blockLen = uint32(len(chunk))
	n.flags |= flagChunkEnd
	return n
}
  78. func hashBlock(out *[64]byte, buf []byte) {
  79. var block [16]uint32
  80. copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
  81. compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{
  82. cv: iv,
  83. block: block,
  84. blockLen: uint32(len(buf)),
  85. flags: flagChunkStart | flagChunkEnd | flagRoot,
  86. })
  87. }
  88. func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
  89. switch {
  90. case haveAVX512:
  91. compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
  92. case haveAVX2:
  93. outs := (*[2][512]byte)(unsafe.Pointer(out))
  94. compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags)
  95. compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags)
  96. default:
  97. outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out))
  98. compressBlocksGeneric(outs, n)
  99. }
  100. }
// mergeSubtrees merges the first numCVs chaining values in cvs pairwise into
// parent nodes, repeating until two remain, and returns their parent node.
// When AVX2 is available the parent rounds run 8 pairs at a time, writing
// results back into the front of cvs in place.
func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
	if !haveAVX2 {
		return mergeSubtreesGeneric(cvs, numCVs, key, flags)
	}
	for numCVs > 2 {
		if numCVs%2 == 0 {
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
		} else {
			// Odd count: the last CV has no sibling this round. Save it,
			// compress the even prefix, then reattach it after the new
			// parents and count it (numCVs++ before the halving below).
			keep := cvs[numCVs-1]
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
			cvs[numCVs/2] = keep
			numCVs++
		}
		numCVs /= 2
	}
	return parentNode(cvs[0], cvs[1], *key, flags)
}
  118. func wordsToBytes(words [16]uint32, block *[64]byte) {
  119. *block = *(*[64]byte)(unsafe.Pointer(&words))
  120. }