grapheme.go 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. package uniseg
  2. import "unicode/utf8"
  3. // The states of the grapheme cluster parser.
  4. const (
  5. grAny = iota
  6. grCR
  7. grControlLF
  8. grL
  9. grLVV
  10. grLVTT
  11. grPrepend
  12. grExtendedPictographic
  13. grExtendedPictographicZWJ
  14. grRIOdd
  15. grRIEven
  16. )
  17. // The grapheme cluster parser's breaking instructions.
  18. const (
  19. grNoBoundary = iota
  20. grBoundary
  21. )
  22. // The grapheme cluster parser's state transitions. Maps (state, property) to
  23. // (new state, breaking instruction, rule number). The breaking instruction
  24. // always refers to the boundary between the last and next code point.
  25. //
  26. // This map is queried as follows:
  27. //
  28. // 1. Find specific state + specific property. Stop if found.
  29. // 2. Find specific state + any property.
  30. // 3. Find any state + specific property.
  31. // 4. If only (2) or (3) (but not both) was found, stop.
  32. // 5. If both (2) and (3) were found, use state and breaking instruction from
  33. // the transition with the lower rule number, prefer (3) if rule numbers
  34. // are equal. Stop.
  35. // 6. Assume grAny and grBoundary.
  36. var grTransitions = map[[2]int][3]int{
  37. // GB5
  38. {grAny, prCR}: {grCR, grBoundary, 50},
  39. {grAny, prLF}: {grControlLF, grBoundary, 50},
  40. {grAny, prControl}: {grControlLF, grBoundary, 50},
  41. // GB4
  42. {grCR, prAny}: {grAny, grBoundary, 40},
  43. {grControlLF, prAny}: {grAny, grBoundary, 40},
  44. // GB3.
  45. {grCR, prLF}: {grAny, grNoBoundary, 30},
  46. // GB6.
  47. {grAny, prL}: {grL, grBoundary, 9990},
  48. {grL, prL}: {grL, grNoBoundary, 60},
  49. {grL, prV}: {grLVV, grNoBoundary, 60},
  50. {grL, prLV}: {grLVV, grNoBoundary, 60},
  51. {grL, prLVT}: {grLVTT, grNoBoundary, 60},
  52. // GB7.
  53. {grAny, prLV}: {grLVV, grBoundary, 9990},
  54. {grAny, prV}: {grLVV, grBoundary, 9990},
  55. {grLVV, prV}: {grLVV, grNoBoundary, 70},
  56. {grLVV, prT}: {grLVTT, grNoBoundary, 70},
  57. // GB8.
  58. {grAny, prLVT}: {grLVTT, grBoundary, 9990},
  59. {grAny, prT}: {grLVTT, grBoundary, 9990},
  60. {grLVTT, prT}: {grLVTT, grNoBoundary, 80},
  61. // GB9.
  62. {grAny, prExtend}: {grAny, grNoBoundary, 90},
  63. {grAny, prZWJ}: {grAny, grNoBoundary, 90},
  64. // GB9a.
  65. {grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
  66. // GB9b.
  67. {grAny, prPreprend}: {grPrepend, grBoundary, 9990},
  68. {grPrepend, prAny}: {grAny, grNoBoundary, 92},
  69. // GB11.
  70. {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
  71. {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
  72. {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
  73. {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
  74. // GB12 / GB13.
  75. {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
  76. {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
  77. {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
  78. }
  79. // Graphemes implements an iterator over Unicode extended grapheme clusters,
  80. // specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
  81. // "user-perceived characters". These characters often consist of multiple
  82. // code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
  83. // woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
  84. // woman) and the rules described in Annex #29 must be applied to group those
  85. // code points into clusters perceived by the user as one character.
  86. type Graphemes struct {
  87. // The code points over which this class iterates.
  88. codePoints []rune
  89. // The (byte-based) indices of the code points into the original string plus
  90. // len(original string). Thus, len(indices) = len(codePoints) + 1.
  91. indices []int
  92. // The current grapheme cluster to be returned. These are indices into
  93. // codePoints/indices. If start == end, we either haven't started iterating
  94. // yet (0) or the iteration has already completed (1).
  95. start, end int
  96. // The index of the next code point to be parsed.
  97. pos int
  98. // The current state of the code point parser.
  99. state int
  100. }
  101. // NewGraphemes returns a new grapheme cluster iterator.
  102. func NewGraphemes(s string) *Graphemes {
  103. l := utf8.RuneCountInString(s)
  104. codePoints := make([]rune, l)
  105. indices := make([]int, l+1)
  106. i := 0
  107. for pos, r := range s {
  108. codePoints[i] = r
  109. indices[i] = pos
  110. i++
  111. }
  112. indices[l] = len(s)
  113. g := &Graphemes{
  114. codePoints: codePoints,
  115. indices: indices,
  116. }
  117. g.Next() // Parse ahead.
  118. return g
  119. }
  120. // Next advances the iterator by one grapheme cluster and returns false if no
  121. // clusters are left. This function must be called before the first cluster is
  122. // accessed.
  123. func (g *Graphemes) Next() bool {
  124. g.start = g.end
  125. // The state transition gives us a boundary instruction BEFORE the next code
  126. // point so we always need to stay ahead by one code point.
  127. // Parse the next code point.
  128. for g.pos <= len(g.codePoints) {
  129. // GB2.
  130. if g.pos == len(g.codePoints) {
  131. g.end = g.pos
  132. g.pos++
  133. break
  134. }
  135. // Determine the property of the next character.
  136. nextProperty := property(g.codePoints[g.pos])
  137. g.pos++
  138. // Find the applicable transition.
  139. var boundary bool
  140. transition, ok := grTransitions[[2]int{g.state, nextProperty}]
  141. if ok {
  142. // We have a specific transition. We'll use it.
  143. g.state = transition[0]
  144. boundary = transition[1] == grBoundary
  145. } else {
  146. // No specific transition found. Try the less specific ones.
  147. transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
  148. transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
  149. if okAnyProp && okAnyState {
  150. // Both apply. We'll use a mix (see comments for grTransitions).
  151. g.state = transAnyState[0]
  152. boundary = transAnyState[1] == grBoundary
  153. if transAnyProp[2] < transAnyState[2] {
  154. g.state = transAnyProp[0]
  155. boundary = transAnyProp[1] == grBoundary
  156. }
  157. } else if okAnyProp {
  158. // We only have a specific state.
  159. g.state = transAnyProp[0]
  160. boundary = transAnyProp[1] == grBoundary
  161. // This branch will probably never be reached because okAnyState will
  162. // always be true given the current transition map. But we keep it here
  163. // for future modifications to the transition map where this may not be
  164. // true anymore.
  165. } else if okAnyState {
  166. // We only have a specific property.
  167. g.state = transAnyState[0]
  168. boundary = transAnyState[1] == grBoundary
  169. } else {
  170. // No known transition. GB999: Any x Any.
  171. g.state = grAny
  172. boundary = true
  173. }
  174. }
  175. // If we found a cluster boundary, let's stop here. The current cluster will
  176. // be the one that just ended.
  177. if g.pos-1 == 0 /* GB1 */ || boundary {
  178. g.end = g.pos - 1
  179. break
  180. }
  181. }
  182. return g.start != g.end
  183. }
  184. // Runes returns a slice of runes (code points) which corresponds to the current
  185. // grapheme cluster. If the iterator is already past the end or Next() has not
  186. // yet been called, nil is returned.
  187. func (g *Graphemes) Runes() []rune {
  188. if g.start == g.end {
  189. return nil
  190. }
  191. return g.codePoints[g.start:g.end]
  192. }
  193. // Str returns a substring of the original string which corresponds to the
  194. // current grapheme cluster. If the iterator is already past the end or Next()
  195. // has not yet been called, an empty string is returned.
  196. func (g *Graphemes) Str() string {
  197. if g.start == g.end {
  198. return ""
  199. }
  200. return string(g.codePoints[g.start:g.end])
  201. }
  202. // Bytes returns a byte slice which corresponds to the current grapheme cluster.
  203. // If the iterator is already past the end or Next() has not yet been called,
  204. // nil is returned.
  205. func (g *Graphemes) Bytes() []byte {
  206. if g.start == g.end {
  207. return nil
  208. }
  209. return []byte(string(g.codePoints[g.start:g.end]))
  210. }
  211. // Positions returns the interval of the current grapheme cluster as byte
  212. // positions into the original string. The first returned value "from" indexes
  213. // the first byte and the second returned value "to" indexes the first byte that
  214. // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
  215. // the original string "str". If Next() has not yet been called, both values are
  216. // 0. If the iterator is already past the end, both values are 1.
  217. func (g *Graphemes) Positions() (int, int) {
  218. return g.indices[g.start], g.indices[g.end]
  219. }
  220. // Reset puts the iterator into its initial state such that the next call to
  221. // Next() sets it to the first grapheme cluster again.
  222. func (g *Graphemes) Reset() {
  223. g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
  224. g.Next() // Parse ahead again.
  225. }
  226. // GraphemeClusterCount returns the number of user-perceived characters
  227. // (grapheme clusters) for the given string. To calculate this number, it
  228. // iterates through the string using the Graphemes iterator.
  229. func GraphemeClusterCount(s string) (n int) {
  230. g := NewGraphemes(s)
  231. for g.Next() {
  232. n++
  233. }
  234. return
  235. }