levenshtein.go 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. // This package implements the Levenshtein algorithm for computing the
  2. // similarity between two strings. The central function is MatrixForStrings,
  3. // which computes the Levenshtein matrix. The functions DistanceForMatrix,
  4. // EditScriptForMatrix and RatioForMatrix read various interesting properties
  5. // off the matrix. The package also provides the convenience functions
  6. // DistanceForStrings, EditScriptForStrings and RatioForStrings for going
  7. // directly from two strings to the property of interest.
  8. package levenshtein
  9. import (
  10. "fmt"
  11. "io"
  12. "os"
  13. )
  14. type EditOperation int
  15. const (
  16. Ins = iota
  17. Del
  18. Sub
  19. Match
  20. )
  21. type EditScript []EditOperation
  22. type MatchFunction func(rune, rune) bool
  23. type Options struct {
  24. InsCost int
  25. DelCost int
  26. SubCost int
  27. Matches MatchFunction
  28. }
  29. // DefaultOptions is the default options: insertion cost is 1, deletion cost is
  30. // 1, substitution cost is 2, and two runes match iff they are the same.
  31. var DefaultOptions Options = Options{
  32. InsCost: 1,
  33. DelCost: 1,
  34. SubCost: 2,
  35. Matches: func(sourceCharacter rune, targetCharacter rune) bool {
  36. return sourceCharacter == targetCharacter
  37. },
  38. }
  39. func (operation EditOperation) String() string {
  40. if operation == Match {
  41. return "match"
  42. } else if operation == Ins {
  43. return "ins"
  44. } else if operation == Sub {
  45. return "sub"
  46. }
  47. return "del"
  48. }
  49. // DistanceForStrings returns the edit distance between source and target.
  50. //
  51. // It has a runtime proportional to len(source) * len(target) and memory use
  52. // proportional to len(target).
  53. func DistanceForStrings(source []rune, target []rune, op Options) int {
  54. // Note: This algorithm is a specialization of MatrixForStrings.
  55. // MatrixForStrings returns the full edit matrix. However, we only need a
  56. // single value (see DistanceForMatrix) and the main loop of the algorithm
  57. // only uses the current and previous row. As such we create a 2D matrix,
  58. // but with height 2 (enough to store current and previous row).
  59. height := len(source) + 1
  60. width := len(target) + 1
  61. matrix := make([][]int, 2)
  62. // Initialize trivial distances (from/to empty string). That is, fill
  63. // the left column and the top row with row/column indices.
  64. for i := 0; i < 2; i++ {
  65. matrix[i] = make([]int, width)
  66. matrix[i][0] = i
  67. }
  68. for j := 1; j < width; j++ {
  69. matrix[0][j] = j
  70. }
  71. // Fill in the remaining cells: for each prefix pair, choose the
  72. // (edit history, operation) pair with the lowest cost.
  73. for i := 1; i < height; i++ {
  74. cur := matrix[i%2]
  75. prev := matrix[(i-1)%2]
  76. cur[0] = i
  77. for j := 1; j < width; j++ {
  78. delCost := prev[j] + op.DelCost
  79. matchSubCost := prev[j-1]
  80. if !op.Matches(source[i-1], target[j-1]) {
  81. matchSubCost += op.SubCost
  82. }
  83. insCost := cur[j-1] + op.InsCost
  84. cur[j] = min(delCost, min(matchSubCost, insCost))
  85. }
  86. }
  87. return matrix[(height-1)%2][width-1]
  88. }
  89. // DistanceForMatrix reads the edit distance off the given Levenshtein matrix.
  90. func DistanceForMatrix(matrix [][]int) int {
  91. return matrix[len(matrix)-1][len(matrix[0])-1]
  92. }
  93. // RatioForStrings returns the Levenshtein ratio for the given strings. The
  94. // ratio is computed as follows:
  95. //
  96. // (sourceLength + targetLength - distance) / (sourceLength + targetLength)
  97. func RatioForStrings(source []rune, target []rune, op Options) float64 {
  98. matrix := MatrixForStrings(source, target, op)
  99. return RatioForMatrix(matrix)
  100. }
  101. // RatioForMatrix returns the Levenshtein ratio for the given matrix. The ratio
  102. // is computed as follows:
  103. //
  104. // (sourceLength + targetLength - distance) / (sourceLength + targetLength)
  105. func RatioForMatrix(matrix [][]int) float64 {
  106. sourcelength := len(matrix) - 1
  107. targetlength := len(matrix[0]) - 1
  108. sum := sourcelength + targetlength
  109. if sum == 0 {
  110. return 0
  111. }
  112. dist := DistanceForMatrix(matrix)
  113. return float64(sum-dist) / float64(sum)
  114. }
  115. // MatrixForStrings generates a 2-D array representing the dynamic programming
  116. // table used by the Levenshtein algorithm, as described e.g. here:
  117. // http://www.let.rug.nl/kleiweg/lev/
  118. // The reason for putting the creation of the table into a separate function is
  119. // that it cannot only be used for reading of the edit distance between two
  120. // strings, but also e.g. to backtrace an edit script that provides an
  121. // alignment between the characters of both strings.
  122. func MatrixForStrings(source []rune, target []rune, op Options) [][]int {
  123. // Make a 2-D matrix. Rows correspond to prefixes of source, columns to
  124. // prefixes of target. Cells will contain edit distances.
  125. // Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html
  126. height := len(source) + 1
  127. width := len(target) + 1
  128. matrix := make([][]int, height)
  129. // Initialize trivial distances (from/to empty string). That is, fill
  130. // the left column and the top row with row/column indices.
  131. for i := 0; i < height; i++ {
  132. matrix[i] = make([]int, width)
  133. matrix[i][0] = i
  134. }
  135. for j := 1; j < width; j++ {
  136. matrix[0][j] = j
  137. }
  138. // Fill in the remaining cells: for each prefix pair, choose the
  139. // (edit history, operation) pair with the lowest cost.
  140. for i := 1; i < height; i++ {
  141. for j := 1; j < width; j++ {
  142. delCost := matrix[i-1][j] + op.DelCost
  143. matchSubCost := matrix[i-1][j-1]
  144. if !op.Matches(source[i-1], target[j-1]) {
  145. matchSubCost += op.SubCost
  146. }
  147. insCost := matrix[i][j-1] + op.InsCost
  148. matrix[i][j] = min(delCost, min(matchSubCost,
  149. insCost))
  150. }
  151. }
  152. //LogMatrix(source, target, matrix)
  153. return matrix
  154. }
  155. // EditScriptForStrings returns an optimal edit script to turn source into
  156. // target.
  157. func EditScriptForStrings(source []rune, target []rune, op Options) EditScript {
  158. return backtrace(len(source), len(target),
  159. MatrixForStrings(source, target, op), op)
  160. }
  161. // EditScriptForMatrix returns an optimal edit script based on the given
  162. // Levenshtein matrix.
  163. func EditScriptForMatrix(matrix [][]int, op Options) EditScript {
  164. return backtrace(len(matrix)-1, len(matrix[0])-1, matrix, op)
  165. }
  166. // WriteMatrix writes a visual representation of the given matrix for the given
  167. // strings to the given writer.
  168. func WriteMatrix(source []rune, target []rune, matrix [][]int, writer io.Writer) {
  169. fmt.Fprintf(writer, " ")
  170. for _, targetRune := range target {
  171. fmt.Fprintf(writer, " %c", targetRune)
  172. }
  173. fmt.Fprintf(writer, "\n")
  174. fmt.Fprintf(writer, " %2d", matrix[0][0])
  175. for j, _ := range target {
  176. fmt.Fprintf(writer, " %2d", matrix[0][j+1])
  177. }
  178. fmt.Fprintf(writer, "\n")
  179. for i, sourceRune := range source {
  180. fmt.Fprintf(writer, "%c %2d", sourceRune, matrix[i+1][0])
  181. for j, _ := range target {
  182. fmt.Fprintf(writer, " %2d", matrix[i+1][j+1])
  183. }
  184. fmt.Fprintf(writer, "\n")
  185. }
  186. }
  187. // LogMatrix writes a visual representation of the given matrix for the given
  188. // strings to os.Stderr. This function is deprecated, use
  189. // WriteMatrix(source, target, matrix, os.Stderr) instead.
  190. func LogMatrix(source []rune, target []rune, matrix [][]int) {
  191. WriteMatrix(source, target, matrix, os.Stderr)
  192. }
  193. func backtrace(i int, j int, matrix [][]int, op Options) EditScript {
  194. if i > 0 && matrix[i-1][j]+op.DelCost == matrix[i][j] {
  195. return append(backtrace(i-1, j, matrix, op), Del)
  196. }
  197. if j > 0 && matrix[i][j-1]+op.InsCost == matrix[i][j] {
  198. return append(backtrace(i, j-1, matrix, op), Ins)
  199. }
  200. if i > 0 && j > 0 && matrix[i-1][j-1]+op.SubCost == matrix[i][j] {
  201. return append(backtrace(i-1, j-1, matrix, op), Sub)
  202. }
  203. if i > 0 && j > 0 && matrix[i-1][j-1] == matrix[i][j] {
  204. return append(backtrace(i-1, j-1, matrix, op), Match)
  205. }
  206. return []EditOperation{}
  207. }
  208. func min(a int, b int) int {
  209. if b < a {
  210. return b
  211. }
  212. return a
  213. }
  214. func max(a int, b int) int {
  215. if b > a {
  216. return b
  217. }
  218. return a
  219. }