// setutil_arm64.s
// +build arm64,!gccgo,!appengine

#include "textflag.h"

// This implements union2by2 using golang's version of arm64 assembly.
// The algorithm is very similar to the generic one,
// but makes better use of arm64 features so is notably faster.
// The basic algorithm structure is as follows:
// 1. If either set is empty, copy the other set into the buffer and return the length.
// 2. Otherwise, load the first element of each set into a variable (s1 and s2).
// 3. a. Compare the values of s1 and s2.
//    b. Add the smaller one to the buffer.
//    c. Perform a bounds check before incrementing.
//       If one set is finished, copy the rest of the other set over.
//    d. Update s1 and/or s2 to the next value, and continue the loop.
//
// Beyond the algorithm itself, this code makes use of several arm64 features.
//
// Condition codes:
// arm64's CMP operation sets 4 bits that can be used for branching,
// rather than just true or false.
// As a consequence, a single comparison gives enough information to
// distinguish the three cases (less than, equal, greater than).
//
// Post-increment pointers after load/store:
// Instructions like `MOVHU.P 2(R0), R6`
// increment the address register by a specified amount, in this example 2.
// Because uint16's are exactly 2 bytes and the length of the slices
// is part of the slice header,
// there is no need to separately track the index into the slice.
// Instead, the code can calculate the final read address and compare against that,
// using the post-increment reads to move the pointers along.
//
// TODO: CALL out to memmove once one list is exhausted.
// Right now it moves the necessary shorts so that the remaining count
// is a multiple of 4 and then copies 64 bits at a time.
  33. TEXT ·union2by2(SB), NOSPLIT, $0-80
  34. // R0, R1, and R2 for the pointers to the three slices
  35. MOVD set1+0(FP), R0
  36. MOVD set2+24(FP), R1
  37. MOVD buffer+48(FP), R2
  38. //R3 and R4 will be the values at which we will have finished reading set1 and set2.
  39. // R3 should be R0 + 2 * set1_len+8(FP)
  40. MOVD set1_len+8(FP), R3
  41. MOVD set2_len+32(FP), R4
  42. ADD R3<<1, R0, R3
  43. ADD R4<<1, R1, R4
  44. //Rather than counting the number of elements added separately
  45. //Save the starting register of buffer.
  46. MOVD buffer+48(FP), R5
  47. // set1 is empty, just flush set2
  48. CMP R0, R3
  49. BEQ flush_right
  50. // set2 is empty, just flush set1
  51. CMP R1, R4
  52. BEQ flush_left
  53. // R6, R7 are the working space for s1 and s2
  54. MOVD ZR, R6
  55. MOVD ZR, R7
  56. MOVHU.P 2(R0), R6
  57. MOVHU.P 2(R1), R7
  58. loop:
  59. CMP R6, R7
  60. BEQ pop_both // R6 == R7
  61. BLS pop_right // R6 > R7
  62. //pop_left: // R6 < R7
  63. MOVHU.P R6, 2(R2)
  64. CMP R0, R3
  65. BEQ pop_then_flush_right
  66. MOVHU.P 2(R0), R6
  67. JMP loop
  68. pop_both:
  69. MOVHU.P R6, 2(R2) //could also use R7, since they are equal
  70. CMP R0, R3
  71. BEQ flush_right
  72. CMP R1, R4
  73. BEQ flush_left
  74. MOVHU.P 2(R0), R6
  75. MOVHU.P 2(R1), R7
  76. JMP loop
  77. pop_right:
  78. MOVHU.P R7, 2(R2)
  79. CMP R1, R4
  80. BEQ pop_then_flush_left
  81. MOVHU.P 2(R1), R7
  82. JMP loop
  83. pop_then_flush_right:
  84. MOVHU.P R7, 2(R2)
  85. flush_right:
  86. MOVD R1, R0
  87. MOVD R4, R3
  88. JMP flush_left
  89. pop_then_flush_left:
  90. MOVHU.P R6, 2(R2)
  91. flush_left:
  92. CMP R0, R3
  93. BEQ return
  94. //figure out how many bytes to slough off. Must be a multiple of two
  95. SUB R0, R3, R4
  96. ANDS $6, R4
  97. BEQ long_flush //handles the 0 mod 8 case
  98. SUBS $4, R4, R4 // since possible values are 2, 4, 6, this splits evenly
  99. BLT pop_single // exactly the 2 case
  100. MOVW.P 4(R0), R6
  101. MOVW.P R6, 4(R2)
  102. BEQ long_flush // we're now aligned by 64 bits, as R4==4, otherwise 2 more
  103. pop_single:
  104. MOVHU.P 2(R0), R6
  105. MOVHU.P R6, 2(R2)
  106. long_flush:
  107. // at this point we know R3 - R0 is a multiple of 8.
  108. CMP R0, R3
  109. BEQ return
  110. MOVD.P 8(R0), R6
  111. MOVD.P R6, 8(R2)
  112. JMP long_flush
  113. return:
  114. // number of shorts written is (R5 - R2) >> 1
  115. SUB R5, R2
  116. LSR $1, R2, R2
  117. MOVD R2, size+72(FP)
  118. RET