translate.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. // Copyright 2015 Huan Du. All rights reserved.
  2. // Licensed under the MIT license that can be found in the LICENSE file.
  3. package xstrings
  4. import (
  5. "unicode"
  6. "unicode/utf8"
  7. )
  8. type runeRangeMap struct {
  9. FromLo rune // Lower bound of range map.
  10. FromHi rune // An inclusive higher bound of range map.
  11. ToLo rune
  12. ToHi rune
  13. }
  14. type runeDict struct {
  15. Dict [unicode.MaxASCII + 1]rune
  16. }
  17. type runeMap map[rune]rune
  18. // Translator can translate string with pre-compiled from and to patterns.
  19. // If a from/to pattern pair needs to be used more than once, it's recommended
  20. // to create a Translator and reuse it.
  21. type Translator struct {
  22. quickDict *runeDict // A quick dictionary to look up rune by index. Only available for latin runes.
  23. runeMap runeMap // Rune map for translation.
  24. ranges []*runeRangeMap // Ranges of runes.
  25. mappedRune rune // If mappedRune >= 0, all matched runes are translated to the mappedRune.
  26. reverted bool // If to pattern is empty, all matched characters will be deleted.
  27. hasPattern bool
  28. }
  29. // NewTranslator creates new Translator through a from/to pattern pair.
  30. func NewTranslator(from, to string) *Translator {
  31. tr := &Translator{}
  32. if from == "" {
  33. return tr
  34. }
  35. reverted := from[0] == '^'
  36. deletion := len(to) == 0
  37. if reverted {
  38. from = from[1:]
  39. }
  40. var fromStart, fromEnd, fromRangeStep rune
  41. var toStart, toEnd, toRangeStep rune
  42. var fromRangeSize, toRangeSize rune
  43. var singleRunes []rune
  44. // Update the to rune range.
  45. updateRange := func() {
  46. // No more rune to read in the to rune pattern.
  47. if toEnd == utf8.RuneError {
  48. return
  49. }
  50. if toRangeStep == 0 {
  51. to, toStart, toEnd, toRangeStep = nextRuneRange(to, toEnd)
  52. return
  53. }
  54. // Current range is not empty. Consume 1 rune from start.
  55. if toStart != toEnd {
  56. toStart += toRangeStep
  57. return
  58. }
  59. // No more rune. Repeat the last rune.
  60. if to == "" {
  61. toEnd = utf8.RuneError
  62. return
  63. }
  64. // Both start and end are used. Read two more runes from the to pattern.
  65. to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
  66. }
  67. if deletion {
  68. toStart = utf8.RuneError
  69. toEnd = utf8.RuneError
  70. } else {
  71. // If from pattern is reverted, only the last rune in the to pattern will be used.
  72. if reverted {
  73. var size int
  74. for len(to) > 0 {
  75. toStart, size = utf8.DecodeRuneInString(to)
  76. to = to[size:]
  77. }
  78. toEnd = utf8.RuneError
  79. } else {
  80. to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
  81. }
  82. }
  83. fromEnd = utf8.RuneError
  84. for len(from) > 0 {
  85. from, fromStart, fromEnd, fromRangeStep = nextRuneRange(from, fromEnd)
  86. // fromStart is a single character. Just map it with a rune in the to pattern.
  87. if fromRangeStep == 0 {
  88. singleRunes = tr.addRune(fromStart, toStart, singleRunes)
  89. updateRange()
  90. continue
  91. }
  92. for toEnd != utf8.RuneError && fromStart != fromEnd {
  93. // If mapped rune is a single character instead of a range, simply shift first
  94. // rune in the range.
  95. if toRangeStep == 0 {
  96. singleRunes = tr.addRune(fromStart, toStart, singleRunes)
  97. updateRange()
  98. fromStart += fromRangeStep
  99. continue
  100. }
  101. fromRangeSize = (fromEnd - fromStart) * fromRangeStep
  102. toRangeSize = (toEnd - toStart) * toRangeStep
  103. // Not enough runes in the to pattern. Need to read more.
  104. if fromRangeSize > toRangeSize {
  105. fromStart, toStart = tr.addRuneRange(fromStart, fromStart+toRangeSize*fromRangeStep, toStart, toEnd, singleRunes)
  106. fromStart += fromRangeStep
  107. updateRange()
  108. // Edge case: If fromRangeSize == toRangeSize + 1, the last fromStart value needs be considered
  109. // as a single rune.
  110. if fromStart == fromEnd {
  111. singleRunes = tr.addRune(fromStart, toStart, singleRunes)
  112. updateRange()
  113. }
  114. continue
  115. }
  116. fromStart, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart+fromRangeSize*toRangeStep, singleRunes)
  117. updateRange()
  118. break
  119. }
  120. if fromStart == fromEnd {
  121. fromEnd = utf8.RuneError
  122. continue
  123. }
  124. _, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart, singleRunes)
  125. fromEnd = utf8.RuneError
  126. }
  127. if fromEnd != utf8.RuneError {
  128. tr.addRune(fromEnd, toStart, singleRunes)
  129. }
  130. tr.reverted = reverted
  131. tr.mappedRune = -1
  132. tr.hasPattern = true
  133. // Translate RuneError only if in deletion or reverted mode.
  134. if deletion || reverted {
  135. tr.mappedRune = toStart
  136. }
  137. return tr
  138. }
  139. func (tr *Translator) addRune(from, to rune, singleRunes []rune) []rune {
  140. if from <= unicode.MaxASCII {
  141. if tr.quickDict == nil {
  142. tr.quickDict = &runeDict{}
  143. }
  144. tr.quickDict.Dict[from] = to
  145. } else {
  146. if tr.runeMap == nil {
  147. tr.runeMap = make(runeMap)
  148. }
  149. tr.runeMap[from] = to
  150. }
  151. singleRunes = append(singleRunes, from)
  152. return singleRunes
  153. }
  154. func (tr *Translator) addRuneRange(fromLo, fromHi, toLo, toHi rune, singleRunes []rune) (rune, rune) {
  155. var r rune
  156. var rrm *runeRangeMap
  157. if fromLo < fromHi {
  158. rrm = &runeRangeMap{
  159. FromLo: fromLo,
  160. FromHi: fromHi,
  161. ToLo: toLo,
  162. ToHi: toHi,
  163. }
  164. } else {
  165. rrm = &runeRangeMap{
  166. FromLo: fromHi,
  167. FromHi: fromLo,
  168. ToLo: toHi,
  169. ToHi: toLo,
  170. }
  171. }
  172. // If there is any single rune conflicts with this rune range, clear single rune record.
  173. for _, r = range singleRunes {
  174. if rrm.FromLo <= r && r <= rrm.FromHi {
  175. if r <= unicode.MaxASCII {
  176. tr.quickDict.Dict[r] = 0
  177. } else {
  178. delete(tr.runeMap, r)
  179. }
  180. }
  181. }
  182. tr.ranges = append(tr.ranges, rrm)
  183. return fromHi, toHi
  184. }
  185. func nextRuneRange(str string, last rune) (remaining string, start, end rune, rangeStep rune) {
  186. var r rune
  187. var size int
  188. remaining = str
  189. escaping := false
  190. isRange := false
  191. for len(remaining) > 0 {
  192. r, size = utf8.DecodeRuneInString(remaining)
  193. remaining = remaining[size:]
  194. // Parse special characters.
  195. if !escaping {
  196. if r == '\\' {
  197. escaping = true
  198. continue
  199. }
  200. if r == '-' {
  201. // Ignore slash at beginning of string.
  202. if last == utf8.RuneError {
  203. continue
  204. }
  205. start = last
  206. isRange = true
  207. continue
  208. }
  209. }
  210. escaping = false
  211. if last != utf8.RuneError {
  212. // This is a range which start and end are the same.
  213. // Considier it as a normal character.
  214. if isRange && last == r {
  215. isRange = false
  216. continue
  217. }
  218. start = last
  219. end = r
  220. if isRange {
  221. if start < end {
  222. rangeStep = 1
  223. } else {
  224. rangeStep = -1
  225. }
  226. }
  227. return
  228. }
  229. last = r
  230. }
  231. start = last
  232. end = utf8.RuneError
  233. return
  234. }
  235. // Translate str with a from/to pattern pair.
  236. //
  237. // See comment in Translate function for usage and samples.
  238. func (tr *Translator) Translate(str string) string {
  239. if !tr.hasPattern || str == "" {
  240. return str
  241. }
  242. var r rune
  243. var size int
  244. var needTr bool
  245. orig := str
  246. var output *stringBuilder
  247. for len(str) > 0 {
  248. r, size = utf8.DecodeRuneInString(str)
  249. r, needTr = tr.TranslateRune(r)
  250. if needTr && output == nil {
  251. output = allocBuffer(orig, str)
  252. }
  253. if r != utf8.RuneError && output != nil {
  254. output.WriteRune(r)
  255. }
  256. str = str[size:]
  257. }
  258. // No character is translated.
  259. if output == nil {
  260. return orig
  261. }
  262. return output.String()
  263. }
  264. // TranslateRune return translated rune and true if r matches the from pattern.
  265. // If r doesn't match the pattern, original r is returned and translated is false.
  266. func (tr *Translator) TranslateRune(r rune) (result rune, translated bool) {
  267. switch {
  268. case tr.quickDict != nil:
  269. if r <= unicode.MaxASCII {
  270. result = tr.quickDict.Dict[r]
  271. if result != 0 {
  272. translated = true
  273. if tr.mappedRune >= 0 {
  274. result = tr.mappedRune
  275. }
  276. break
  277. }
  278. }
  279. fallthrough
  280. case tr.runeMap != nil:
  281. var ok bool
  282. if result, ok = tr.runeMap[r]; ok {
  283. translated = true
  284. if tr.mappedRune >= 0 {
  285. result = tr.mappedRune
  286. }
  287. break
  288. }
  289. fallthrough
  290. default:
  291. var rrm *runeRangeMap
  292. ranges := tr.ranges
  293. for i := len(ranges) - 1; i >= 0; i-- {
  294. rrm = ranges[i]
  295. if rrm.FromLo <= r && r <= rrm.FromHi {
  296. translated = true
  297. if tr.mappedRune >= 0 {
  298. result = tr.mappedRune
  299. break
  300. }
  301. if rrm.ToLo < rrm.ToHi {
  302. result = rrm.ToLo + r - rrm.FromLo
  303. } else if rrm.ToLo > rrm.ToHi {
  304. // ToHi can be smaller than ToLo if range is from higher to lower.
  305. result = rrm.ToLo - r + rrm.FromLo
  306. } else {
  307. result = rrm.ToLo
  308. }
  309. break
  310. }
  311. }
  312. }
  313. if tr.reverted {
  314. if !translated {
  315. result = tr.mappedRune
  316. }
  317. translated = !translated
  318. }
  319. if !translated {
  320. result = r
  321. }
  322. return
  323. }
  324. // HasPattern returns true if Translator has one pattern at least.
  325. func (tr *Translator) HasPattern() bool {
  326. return tr.hasPattern
  327. }
  328. // Translate str with the characters defined in from replaced by characters defined in to.
  329. //
  330. // From and to are patterns representing a set of characters. Pattern is defined as following.
  331. //
  332. // * Special characters
  333. // * '-' means a range of runes, e.g.
  334. // * "a-z" means all characters from 'a' to 'z' inclusive;
  335. // * "z-a" means all characters from 'z' to 'a' inclusive.
  336. // * '^' as first character means a set of all runes excepted listed, e.g.
  337. // * "^a-z" means all characters except 'a' to 'z' inclusive.
  338. // * '\' escapes special characters.
  339. // * Normal character represents itself, e.g. "abc" is a set including 'a', 'b' and 'c'.
  340. //
  341. // Translate will try to find a 1:1 mapping from from to to.
  342. // If to is smaller than from, last rune in to will be used to map "out of range" characters in from.
  343. //
  344. // Note that '^' only works in the from pattern. It will be considered as a normal character in the to pattern.
  345. //
  346. // If the to pattern is an empty string, Translate works exactly the same as Delete.
  347. //
  348. // Samples:
  349. // Translate("hello", "aeiou", "12345") => "h2ll4"
  350. // Translate("hello", "a-z", "A-Z") => "HELLO"
  351. // Translate("hello", "z-a", "a-z") => "svool"
  352. // Translate("hello", "aeiou", "*") => "h*ll*"
  353. // Translate("hello", "^l", "*") => "**ll*"
  354. // Translate("hello ^ world", `\^lo`, "*") => "he*** * w*r*d"
  355. func Translate(str, from, to string) string {
  356. tr := NewTranslator(from, to)
  357. return tr.Translate(str)
  358. }
  359. // Delete runes in str matching the pattern.
  360. // Pattern is defined in Translate function.
  361. //
  362. // Samples:
  363. // Delete("hello", "aeiou") => "hll"
  364. // Delete("hello", "a-k") => "llo"
  365. // Delete("hello", "^a-k") => "he"
  366. func Delete(str, pattern string) string {
  367. tr := NewTranslator(pattern, "")
  368. return tr.Translate(str)
  369. }
  370. // Count how many runes in str match the pattern.
  371. // Pattern is defined in Translate function.
  372. //
  373. // Samples:
  374. // Count("hello", "aeiou") => 3
  375. // Count("hello", "a-k") => 3
  376. // Count("hello", "^a-k") => 2
  377. func Count(str, pattern string) int {
  378. if pattern == "" || str == "" {
  379. return 0
  380. }
  381. var r rune
  382. var size int
  383. var matched bool
  384. tr := NewTranslator(pattern, "")
  385. cnt := 0
  386. for len(str) > 0 {
  387. r, size = utf8.DecodeRuneInString(str)
  388. str = str[size:]
  389. if _, matched = tr.TranslateRune(r); matched {
  390. cnt++
  391. }
  392. }
  393. return cnt
  394. }
  395. // Squeeze deletes adjacent repeated runes in str.
  396. // If pattern is not empty, only runes matching the pattern will be squeezed.
  397. //
  398. // Samples:
  399. // Squeeze("hello", "") => "helo"
  400. // Squeeze("hello", "m-z") => "hello"
  401. // Squeeze("hello world", " ") => "hello world"
  402. func Squeeze(str, pattern string) string {
  403. var last, r rune
  404. var size int
  405. var skipSqueeze, matched bool
  406. var tr *Translator
  407. var output *stringBuilder
  408. orig := str
  409. last = -1
  410. if len(pattern) > 0 {
  411. tr = NewTranslator(pattern, "")
  412. }
  413. for len(str) > 0 {
  414. r, size = utf8.DecodeRuneInString(str)
  415. // Need to squeeze the str.
  416. if last == r && !skipSqueeze {
  417. if tr != nil {
  418. if _, matched = tr.TranslateRune(r); !matched {
  419. skipSqueeze = true
  420. }
  421. }
  422. if output == nil {
  423. output = allocBuffer(orig, str)
  424. }
  425. if skipSqueeze {
  426. output.WriteRune(r)
  427. }
  428. } else {
  429. if output != nil {
  430. output.WriteRune(r)
  431. }
  432. last = r
  433. skipSqueeze = false
  434. }
  435. str = str[size:]
  436. }
  437. if output == nil {
  438. return orig
  439. }
  440. return output.String()
  441. }