// chacha_ppc64x.s
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.
// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91
// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.
//go:build gc && !purego && (ppc64 || ppc64le)

#include "textflag.h"
// Register aliases. OUT/INP/LEN/KEY/CNT are loaded from the Go stack
// frame arguments; the rest are scratch/persistent helpers.
#define OUT R3        // destination buffer pointer (out+0(FP))
#define INP R4        // source buffer pointer (inp+8(FP))
#define LEN R5        // remaining byte count (len+16(FP))
#define KEY R6        // key pointer (key+24(FP)); reused as a byte scratch in the tail loop
#define CNT R7        // counter pointer (counter+32(FP))
#define TMP R15       // byte scratch in the tail copy loop
#define CONSTBASE R16 // base address of the consts<> table
#define BLOCKS R17    // LEN >> 6: number of whole 64-byte blocks processed
// for VPERMXOR
#define MASK R18      // address of the VPERMXOR mask constants (consts<>+0xa0)
// ChaCha20 "expand 32-byte k" sigma constants (one 16-byte row).
DATA consts<>+0x00(SB)/4, $0x61707865
DATA consts<>+0x04(SB)/4, $0x3320646e
DATA consts<>+0x08(SB)/4, $0x79622d32
DATA consts<>+0x0c(SB)/4, $0x6b206574
// {1,0,0,0}: single-block counter increment.
DATA consts<>+0x10(SB)/4, $0x00000001
DATA consts<>+0x14(SB)/4, $0x00000000
DATA consts<>+0x18(SB)/4, $0x00000000
DATA consts<>+0x1c(SB)/4, $0x00000000
// {4,0,0,0}: four-block counter increment.
DATA consts<>+0x20(SB)/4, $0x00000004
DATA consts<>+0x24(SB)/4, $0x00000000
DATA consts<>+0x28(SB)/4, $0x00000000
DATA consts<>+0x2c(SB)/4, $0x00000000
// Byte-permutation tables — presumably rotate-by-16 and rotate-by-8
// word-rotation masks kept from the CRYPTOGAMS original; not referenced
// by the VSX path below (TODO confirm against other build variants).
DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
DATA consts<>+0x34(SB)/4, $0x0a0b0809
DATA consts<>+0x38(SB)/4, $0x06070405
DATA consts<>+0x3c(SB)/4, $0x02030001
DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
DATA consts<>+0x44(SB)/4, $0x090a0b08
DATA consts<>+0x48(SB)/4, $0x05060704
DATA consts<>+0x4c(SB)/4, $0x01020300
// Sigma words pre-splatted across all four lanes for the 4-way
// interleaved state (loaded at the top of loop_outer_vsx).
DATA consts<>+0x50(SB)/4, $0x61707865
DATA consts<>+0x54(SB)/4, $0x61707865
DATA consts<>+0x58(SB)/4, $0x61707865
DATA consts<>+0x5c(SB)/4, $0x61707865
DATA consts<>+0x60(SB)/4, $0x3320646e
DATA consts<>+0x64(SB)/4, $0x3320646e
DATA consts<>+0x68(SB)/4, $0x3320646e
DATA consts<>+0x6c(SB)/4, $0x3320646e
DATA consts<>+0x70(SB)/4, $0x79622d32
DATA consts<>+0x74(SB)/4, $0x79622d32
DATA consts<>+0x78(SB)/4, $0x79622d32
DATA consts<>+0x7c(SB)/4, $0x79622d32
DATA consts<>+0x80(SB)/4, $0x6b206574
DATA consts<>+0x84(SB)/4, $0x6b206574
DATA consts<>+0x88(SB)/4, $0x6b206574
DATA consts<>+0x8c(SB)/4, $0x6b206574
// {0,1,2,3}: per-lane counter offsets for the four parallel blocks.
DATA consts<>+0x90(SB)/4, $0x00000000
DATA consts<>+0x94(SB)/4, $0x00000001
DATA consts<>+0x98(SB)/4, $0x00000002
DATA consts<>+0x9c(SB)/4, $0x00000003
// Mask constants for VPERMXOR (combined XOR + byte rotate), loaded
// into V20/V21 via MASK.
DATA consts<>+0xa0(SB)/4, $0x11223300
DATA consts<>+0xa4(SB)/4, $0x55667744
DATA consts<>+0xa8(SB)/4, $0x99aabb88
DATA consts<>+0xac(SB)/4, $0xddeeffcc
DATA consts<>+0xb0(SB)/4, $0x22330011
DATA consts<>+0xb4(SB)/4, $0x66774455
DATA consts<>+0xb8(SB)/4, $0xaabb8899
DATA consts<>+0xbc(SB)/4, $0xeeffccdd
GLOBL consts<>(SB), RODATA, $0xc0
  78. #ifdef GOARCH_ppc64
  79. #define BE_XXBRW_INIT() \
  80. LVSL (R0)(R0), V24 \
  81. VSPLTISB $3, V25 \
  82. VXOR V24, V25, V24 \
  83. #define BE_XXBRW(vr) VPERM vr, vr, V24, vr
  84. #else
  85. #define BE_XXBRW_INIT()
  86. #define BE_XXBRW(vr)
  87. #endif
  88. //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
  89. TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
  90. MOVD out+0(FP), OUT
  91. MOVD inp+8(FP), INP
  92. MOVD len+16(FP), LEN
  93. MOVD key+24(FP), KEY
  94. MOVD counter+32(FP), CNT
  95. // Addressing for constants
  96. MOVD $consts<>+0x00(SB), CONSTBASE
  97. MOVD $16, R8
  98. MOVD $32, R9
  99. MOVD $48, R10
  100. MOVD $64, R11
  101. SRD $6, LEN, BLOCKS
  102. // for VPERMXOR
  103. MOVD $consts<>+0xa0(SB), MASK
  104. MOVD $16, R20
  105. // V16
  106. LXVW4X (CONSTBASE)(R0), VS48
  107. ADD $80,CONSTBASE
  108. // Load key into V17,V18
  109. LXVW4X (KEY)(R0), VS49
  110. LXVW4X (KEY)(R8), VS50
  111. // Load CNT, NONCE into V19
  112. LXVW4X (CNT)(R0), VS51
  113. // Clear V27
  114. VXOR V27, V27, V27
  115. BE_XXBRW_INIT()
  116. // V28
  117. LXVW4X (CONSTBASE)(R11), VS60
  118. // Load mask constants for VPERMXOR
  119. LXVW4X (MASK)(R0), V20
  120. LXVW4X (MASK)(R20), V21
  121. // splat slot from V19 -> V26
  122. VSPLTW $0, V19, V26
  123. VSLDOI $4, V19, V27, V19
  124. VSLDOI $12, V27, V19, V19
  125. VADDUWM V26, V28, V26
  126. MOVD $10, R14
  127. MOVD R14, CTR
  128. PCALIGN $16
  129. loop_outer_vsx:
  130. // V0, V1, V2, V3
  131. LXVW4X (R0)(CONSTBASE), VS32
  132. LXVW4X (R8)(CONSTBASE), VS33
  133. LXVW4X (R9)(CONSTBASE), VS34
  134. LXVW4X (R10)(CONSTBASE), VS35
  135. // splat values from V17, V18 into V4-V11
  136. VSPLTW $0, V17, V4
  137. VSPLTW $1, V17, V5
  138. VSPLTW $2, V17, V6
  139. VSPLTW $3, V17, V7
  140. VSPLTW $0, V18, V8
  141. VSPLTW $1, V18, V9
  142. VSPLTW $2, V18, V10
  143. VSPLTW $3, V18, V11
  144. // VOR
  145. VOR V26, V26, V12
  146. // splat values from V19 -> V13, V14, V15
  147. VSPLTW $1, V19, V13
  148. VSPLTW $2, V19, V14
  149. VSPLTW $3, V19, V15
  150. // splat const values
  151. VSPLTISW $-16, V27
  152. VSPLTISW $12, V28
  153. VSPLTISW $8, V29
  154. VSPLTISW $7, V30
  155. PCALIGN $16
  156. loop_vsx:
  157. VADDUWM V0, V4, V0
  158. VADDUWM V1, V5, V1
  159. VADDUWM V2, V6, V2
  160. VADDUWM V3, V7, V3
  161. VPERMXOR V12, V0, V21, V12
  162. VPERMXOR V13, V1, V21, V13
  163. VPERMXOR V14, V2, V21, V14
  164. VPERMXOR V15, V3, V21, V15
  165. VADDUWM V8, V12, V8
  166. VADDUWM V9, V13, V9
  167. VADDUWM V10, V14, V10
  168. VADDUWM V11, V15, V11
  169. VXOR V4, V8, V4
  170. VXOR V5, V9, V5
  171. VXOR V6, V10, V6
  172. VXOR V7, V11, V7
  173. VRLW V4, V28, V4
  174. VRLW V5, V28, V5
  175. VRLW V6, V28, V6
  176. VRLW V7, V28, V7
  177. VADDUWM V0, V4, V0
  178. VADDUWM V1, V5, V1
  179. VADDUWM V2, V6, V2
  180. VADDUWM V3, V7, V3
  181. VPERMXOR V12, V0, V20, V12
  182. VPERMXOR V13, V1, V20, V13
  183. VPERMXOR V14, V2, V20, V14
  184. VPERMXOR V15, V3, V20, V15
  185. VADDUWM V8, V12, V8
  186. VADDUWM V9, V13, V9
  187. VADDUWM V10, V14, V10
  188. VADDUWM V11, V15, V11
  189. VXOR V4, V8, V4
  190. VXOR V5, V9, V5
  191. VXOR V6, V10, V6
  192. VXOR V7, V11, V7
  193. VRLW V4, V30, V4
  194. VRLW V5, V30, V5
  195. VRLW V6, V30, V6
  196. VRLW V7, V30, V7
  197. VADDUWM V0, V5, V0
  198. VADDUWM V1, V6, V1
  199. VADDUWM V2, V7, V2
  200. VADDUWM V3, V4, V3
  201. VPERMXOR V15, V0, V21, V15
  202. VPERMXOR V12, V1, V21, V12
  203. VPERMXOR V13, V2, V21, V13
  204. VPERMXOR V14, V3, V21, V14
  205. VADDUWM V10, V15, V10
  206. VADDUWM V11, V12, V11
  207. VADDUWM V8, V13, V8
  208. VADDUWM V9, V14, V9
  209. VXOR V5, V10, V5
  210. VXOR V6, V11, V6
  211. VXOR V7, V8, V7
  212. VXOR V4, V9, V4
  213. VRLW V5, V28, V5
  214. VRLW V6, V28, V6
  215. VRLW V7, V28, V7
  216. VRLW V4, V28, V4
  217. VADDUWM V0, V5, V0
  218. VADDUWM V1, V6, V1
  219. VADDUWM V2, V7, V2
  220. VADDUWM V3, V4, V3
  221. VPERMXOR V15, V0, V20, V15
  222. VPERMXOR V12, V1, V20, V12
  223. VPERMXOR V13, V2, V20, V13
  224. VPERMXOR V14, V3, V20, V14
  225. VADDUWM V10, V15, V10
  226. VADDUWM V11, V12, V11
  227. VADDUWM V8, V13, V8
  228. VADDUWM V9, V14, V9
  229. VXOR V5, V10, V5
  230. VXOR V6, V11, V6
  231. VXOR V7, V8, V7
  232. VXOR V4, V9, V4
  233. VRLW V5, V30, V5
  234. VRLW V6, V30, V6
  235. VRLW V7, V30, V7
  236. VRLW V4, V30, V4
  237. BDNZ loop_vsx
  238. VADDUWM V12, V26, V12
  239. VMRGEW V0, V1, V27
  240. VMRGEW V2, V3, V28
  241. VMRGOW V0, V1, V0
  242. VMRGOW V2, V3, V2
  243. VMRGEW V4, V5, V29
  244. VMRGEW V6, V7, V30
  245. XXPERMDI VS32, VS34, $0, VS33
  246. XXPERMDI VS32, VS34, $3, VS35
  247. XXPERMDI VS59, VS60, $0, VS32
  248. XXPERMDI VS59, VS60, $3, VS34
  249. VMRGOW V4, V5, V4
  250. VMRGOW V6, V7, V6
  251. VMRGEW V8, V9, V27
  252. VMRGEW V10, V11, V28
  253. XXPERMDI VS36, VS38, $0, VS37
  254. XXPERMDI VS36, VS38, $3, VS39
  255. XXPERMDI VS61, VS62, $0, VS36
  256. XXPERMDI VS61, VS62, $3, VS38
  257. VMRGOW V8, V9, V8
  258. VMRGOW V10, V11, V10
  259. VMRGEW V12, V13, V29
  260. VMRGEW V14, V15, V30
  261. XXPERMDI VS40, VS42, $0, VS41
  262. XXPERMDI VS40, VS42, $3, VS43
  263. XXPERMDI VS59, VS60, $0, VS40
  264. XXPERMDI VS59, VS60, $3, VS42
  265. VMRGOW V12, V13, V12
  266. VMRGOW V14, V15, V14
  267. VSPLTISW $4, V27
  268. VADDUWM V26, V27, V26
  269. XXPERMDI VS44, VS46, $0, VS45
  270. XXPERMDI VS44, VS46, $3, VS47
  271. XXPERMDI VS61, VS62, $0, VS44
  272. XXPERMDI VS61, VS62, $3, VS46
  273. VADDUWM V0, V16, V0
  274. VADDUWM V4, V17, V4
  275. VADDUWM V8, V18, V8
  276. VADDUWM V12, V19, V12
  277. BE_XXBRW(V0)
  278. BE_XXBRW(V4)
  279. BE_XXBRW(V8)
  280. BE_XXBRW(V12)
  281. CMPU LEN, $64
  282. BLT tail_vsx
  283. // Bottom of loop
  284. LXVW4X (INP)(R0), VS59
  285. LXVW4X (INP)(R8), VS60
  286. LXVW4X (INP)(R9), VS61
  287. LXVW4X (INP)(R10), VS62
  288. VXOR V27, V0, V27
  289. VXOR V28, V4, V28
  290. VXOR V29, V8, V29
  291. VXOR V30, V12, V30
  292. STXVW4X VS59, (OUT)(R0)
  293. STXVW4X VS60, (OUT)(R8)
  294. ADD $64, INP
  295. STXVW4X VS61, (OUT)(R9)
  296. ADD $-64, LEN
  297. STXVW4X VS62, (OUT)(R10)
  298. ADD $64, OUT
  299. BEQ done_vsx
  300. VADDUWM V1, V16, V0
  301. VADDUWM V5, V17, V4
  302. VADDUWM V9, V18, V8
  303. VADDUWM V13, V19, V12
  304. BE_XXBRW(V0)
  305. BE_XXBRW(V4)
  306. BE_XXBRW(V8)
  307. BE_XXBRW(V12)
  308. CMPU LEN, $64
  309. BLT tail_vsx
  310. LXVW4X (INP)(R0), VS59
  311. LXVW4X (INP)(R8), VS60
  312. LXVW4X (INP)(R9), VS61
  313. LXVW4X (INP)(R10), VS62
  314. VXOR V27, V0, V27
  315. VXOR V28, V4, V28
  316. VXOR V29, V8, V29
  317. VXOR V30, V12, V30
  318. STXVW4X VS59, (OUT)(R0)
  319. STXVW4X VS60, (OUT)(R8)
  320. ADD $64, INP
  321. STXVW4X VS61, (OUT)(R9)
  322. ADD $-64, LEN
  323. STXVW4X VS62, (OUT)(V10)
  324. ADD $64, OUT
  325. BEQ done_vsx
  326. VADDUWM V2, V16, V0
  327. VADDUWM V6, V17, V4
  328. VADDUWM V10, V18, V8
  329. VADDUWM V14, V19, V12
  330. BE_XXBRW(V0)
  331. BE_XXBRW(V4)
  332. BE_XXBRW(V8)
  333. BE_XXBRW(V12)
  334. CMPU LEN, $64
  335. BLT tail_vsx
  336. LXVW4X (INP)(R0), VS59
  337. LXVW4X (INP)(R8), VS60
  338. LXVW4X (INP)(R9), VS61
  339. LXVW4X (INP)(R10), VS62
  340. VXOR V27, V0, V27
  341. VXOR V28, V4, V28
  342. VXOR V29, V8, V29
  343. VXOR V30, V12, V30
  344. STXVW4X VS59, (OUT)(R0)
  345. STXVW4X VS60, (OUT)(R8)
  346. ADD $64, INP
  347. STXVW4X VS61, (OUT)(R9)
  348. ADD $-64, LEN
  349. STXVW4X VS62, (OUT)(R10)
  350. ADD $64, OUT
  351. BEQ done_vsx
  352. VADDUWM V3, V16, V0
  353. VADDUWM V7, V17, V4
  354. VADDUWM V11, V18, V8
  355. VADDUWM V15, V19, V12
  356. BE_XXBRW(V0)
  357. BE_XXBRW(V4)
  358. BE_XXBRW(V8)
  359. BE_XXBRW(V12)
  360. CMPU LEN, $64
  361. BLT tail_vsx
  362. LXVW4X (INP)(R0), VS59
  363. LXVW4X (INP)(R8), VS60
  364. LXVW4X (INP)(R9), VS61
  365. LXVW4X (INP)(R10), VS62
  366. VXOR V27, V0, V27
  367. VXOR V28, V4, V28
  368. VXOR V29, V8, V29
  369. VXOR V30, V12, V30
  370. STXVW4X VS59, (OUT)(R0)
  371. STXVW4X VS60, (OUT)(R8)
  372. ADD $64, INP
  373. STXVW4X VS61, (OUT)(R9)
  374. ADD $-64, LEN
  375. STXVW4X VS62, (OUT)(R10)
  376. ADD $64, OUT
  377. MOVD $10, R14
  378. MOVD R14, CTR
  379. BNE loop_outer_vsx
  380. done_vsx:
  381. // Increment counter by number of 64 byte blocks
  382. MOVWZ (CNT), R14
  383. ADD BLOCKS, R14
  384. MOVWZ R14, (CNT)
  385. RET
  386. tail_vsx:
  387. ADD $32, R1, R11
  388. MOVD LEN, CTR
  389. // Save values on stack to copy from
  390. STXVW4X VS32, (R11)(R0)
  391. STXVW4X VS36, (R11)(R8)
  392. STXVW4X VS40, (R11)(R9)
  393. STXVW4X VS44, (R11)(R10)
  394. ADD $-1, R11, R12
  395. ADD $-1, INP
  396. ADD $-1, OUT
  397. PCALIGN $16
  398. looptail_vsx:
  399. // Copying the result to OUT
  400. // in bytes.
  401. MOVBZU 1(R12), KEY
  402. MOVBZU 1(INP), TMP
  403. XOR KEY, TMP, KEY
  404. MOVBU KEY, 1(OUT)
  405. BDNZ looptail_vsx
  406. // Clear the stack values
  407. STXVW4X VS48, (R11)(R0)
  408. STXVW4X VS48, (R11)(R8)
  409. STXVW4X VS48, (R11)(R9)
  410. STXVW4X VS48, (R11)(R10)
  411. BR done_vsx