hashtable.go 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. // Copyright 2014-2022 Ulrich Kunitz. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package lzma
  5. import (
  6. "errors"
  7. "fmt"
  8. "github.com/ulikunitz/xz/internal/hash"
  9. )
  10. /* For compression we need to find byte sequences that match the byte
  11. * sequence at the dictionary head. A hash table is a simple method to
  12. * provide this capability.
  13. */
  14. // maxMatches limits the number of matches requested from the Matches
  15. // function. This controls the speed of the overall encoding.
  16. const maxMatches = 16
  17. // shortDists defines the number of short distances supported by the
  18. // implementation.
  19. const shortDists = 8
  20. // The minimum is somehow arbitrary but the maximum is limited by the
  21. // memory requirements of the hash table.
  22. const (
  23. minTableExponent = 9
  24. maxTableExponent = 20
  25. )
  26. // newRoller contains the function used to create an instance of the
  27. // hash.Roller.
  28. var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }
  29. // hashTable stores the hash table including the rolling hash method.
  30. //
  31. // We implement chained hashing into a circular buffer. Each entry in
  32. // the circular buffer stores the delta distance to the next position with a
  33. // word that has the same hash value.
  34. type hashTable struct {
  35. dict *encoderDict
  36. // actual hash table
  37. t []int64
  38. // circular list data with the offset to the next word
  39. data []uint32
  40. front int
  41. // mask for computing the index for the hash table
  42. mask uint64
  43. // hash offset; initial value is -int64(wordLen)
  44. hoff int64
  45. // length of the hashed word
  46. wordLen int
  47. // hash roller for computing the hash values for the Write
  48. // method
  49. wr hash.Roller
  50. // hash roller for computing arbitrary hashes
  51. hr hash.Roller
  52. // preallocated slices
  53. p [maxMatches]int64
  54. distances [maxMatches + shortDists]int
  55. }
  56. // hashTableExponent derives the hash table exponent from the dictionary
  57. // capacity.
  58. func hashTableExponent(n uint32) int {
  59. e := 30 - nlz32(n)
  60. switch {
  61. case e < minTableExponent:
  62. e = minTableExponent
  63. case e > maxTableExponent:
  64. e = maxTableExponent
  65. }
  66. return e
  67. }
  68. // newHashTable creates a new hash table for words of length wordLen
  69. func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
  70. if !(0 < capacity) {
  71. return nil, errors.New(
  72. "newHashTable: capacity must not be negative")
  73. }
  74. exp := hashTableExponent(uint32(capacity))
  75. if !(1 <= wordLen && wordLen <= 4) {
  76. return nil, errors.New("newHashTable: " +
  77. "argument wordLen out of range")
  78. }
  79. n := 1 << uint(exp)
  80. if n <= 0 {
  81. panic("newHashTable: exponent is too large")
  82. }
  83. t = &hashTable{
  84. t: make([]int64, n),
  85. data: make([]uint32, capacity),
  86. mask: (uint64(1) << uint(exp)) - 1,
  87. hoff: -int64(wordLen),
  88. wordLen: wordLen,
  89. wr: newRoller(wordLen),
  90. hr: newRoller(wordLen),
  91. }
  92. return t, nil
  93. }
  94. func (t *hashTable) SetDict(d *encoderDict) { t.dict = d }
  95. // buffered returns the number of bytes that are currently hashed.
  96. func (t *hashTable) buffered() int {
  97. n := t.hoff + 1
  98. switch {
  99. case n <= 0:
  100. return 0
  101. case n >= int64(len(t.data)):
  102. return len(t.data)
  103. }
  104. return int(n)
  105. }
  106. // addIndex adds n to an index ensuring that is stays inside the
  107. // circular buffer for the hash chain.
  108. func (t *hashTable) addIndex(i, n int) int {
  109. i += n - len(t.data)
  110. if i < 0 {
  111. i += len(t.data)
  112. }
  113. return i
  114. }
  115. // putDelta puts the delta instance at the current front of the circular
  116. // chain buffer.
  117. func (t *hashTable) putDelta(delta uint32) {
  118. t.data[t.front] = delta
  119. t.front = t.addIndex(t.front, 1)
  120. }
  121. // putEntry puts a new entry into the hash table. If there is already a
  122. // value stored it is moved into the circular chain buffer.
  123. func (t *hashTable) putEntry(h uint64, pos int64) {
  124. if pos < 0 {
  125. return
  126. }
  127. i := h & t.mask
  128. old := t.t[i] - 1
  129. t.t[i] = pos + 1
  130. var delta int64
  131. if old >= 0 {
  132. delta = pos - old
  133. if delta > 1<<32-1 || delta > int64(t.buffered()) {
  134. delta = 0
  135. }
  136. }
  137. t.putDelta(uint32(delta))
  138. }
  139. // WriteByte converts a single byte into a hash and puts them into the hash
  140. // table.
  141. func (t *hashTable) WriteByte(b byte) error {
  142. h := t.wr.RollByte(b)
  143. t.hoff++
  144. t.putEntry(h, t.hoff)
  145. return nil
  146. }
  147. // Write converts the bytes provided into hash tables and stores the
  148. // abbreviated offsets into the hash table. The method will never return an
  149. // error.
  150. func (t *hashTable) Write(p []byte) (n int, err error) {
  151. for _, b := range p {
  152. // WriteByte doesn't generate an error.
  153. t.WriteByte(b)
  154. }
  155. return len(p), nil
  156. }
  157. // getMatches the matches for a specific hash. The functions returns the
  158. // number of positions found.
  159. //
  160. // TODO: Make a getDistances because that we are actually interested in.
  161. func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
  162. if t.hoff < 0 || len(positions) == 0 {
  163. return 0
  164. }
  165. buffered := t.buffered()
  166. tailPos := t.hoff + 1 - int64(buffered)
  167. rear := t.front - buffered
  168. if rear >= 0 {
  169. rear -= len(t.data)
  170. }
  171. // get the slot for the hash
  172. pos := t.t[h&t.mask] - 1
  173. delta := pos - tailPos
  174. for {
  175. if delta < 0 {
  176. return n
  177. }
  178. positions[n] = tailPos + delta
  179. n++
  180. if n >= len(positions) {
  181. return n
  182. }
  183. i := rear + int(delta)
  184. if i < 0 {
  185. i += len(t.data)
  186. }
  187. u := t.data[i]
  188. if u == 0 {
  189. return n
  190. }
  191. delta -= int64(u)
  192. }
  193. }
  194. // hash computes the rolling hash for the word stored in p. For correct
  195. // results its length must be equal to t.wordLen.
  196. func (t *hashTable) hash(p []byte) uint64 {
  197. var h uint64
  198. for _, b := range p {
  199. h = t.hr.RollByte(b)
  200. }
  201. return h
  202. }
  203. // Matches fills the positions slice with potential matches. The
  204. // functions returns the number of positions filled into positions. The
  205. // byte slice p must have word length of the hash table.
  206. func (t *hashTable) Matches(p []byte, positions []int64) int {
  207. if len(p) != t.wordLen {
  208. panic(fmt.Errorf(
  209. "byte slice must have length %d", t.wordLen))
  210. }
  211. h := t.hash(p)
  212. return t.getMatches(h, positions)
  213. }
  214. // NextOp identifies the next operation using the hash table.
  215. //
  216. // TODO: Use all repetitions to find matches.
  217. func (t *hashTable) NextOp(rep [4]uint32) operation {
  218. // get positions
  219. data := t.dict.data[:maxMatchLen]
  220. n, _ := t.dict.buf.Peek(data)
  221. data = data[:n]
  222. var p []int64
  223. if n < t.wordLen {
  224. p = t.p[:0]
  225. } else {
  226. p = t.p[:maxMatches]
  227. n = t.Matches(data[:t.wordLen], p)
  228. p = p[:n]
  229. }
  230. // convert positions in potential distances
  231. head := t.dict.head
  232. dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
  233. for _, pos := range p {
  234. dis := int(head - pos)
  235. if dis > shortDists {
  236. dists = append(dists, dis)
  237. }
  238. }
  239. // check distances
  240. var m match
  241. dictLen := t.dict.DictLen()
  242. for _, dist := range dists {
  243. if dist > dictLen {
  244. continue
  245. }
  246. // Here comes a trick. We are only interested in matches
  247. // that are longer than the matches we have been found
  248. // before. So before we test the whole byte sequence at
  249. // the given distance, we test the first byte that would
  250. // make the match longer. If it doesn't match the byte
  251. // to match, we don't to care any longer.
  252. i := t.dict.buf.rear - dist + m.n
  253. if i < 0 {
  254. i += len(t.dict.buf.data)
  255. }
  256. if t.dict.buf.data[i] != data[m.n] {
  257. // We can't get a longer match. Jump to the next
  258. // distance.
  259. continue
  260. }
  261. n := t.dict.buf.matchLen(dist, data)
  262. switch n {
  263. case 0:
  264. continue
  265. case 1:
  266. if uint32(dist-minDistance) != rep[0] {
  267. continue
  268. }
  269. }
  270. if n > m.n {
  271. m = match{int64(dist), n}
  272. if n == len(data) {
  273. // No better match will be found.
  274. break
  275. }
  276. }
  277. }
  278. if m.n == 0 {
  279. return lit{data[0]}
  280. }
  281. return m
  282. }