| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309 |
- // Copyright 2014-2022 Ulrich Kunitz. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package lzma
- import (
- "errors"
- "fmt"
- "github.com/ulikunitz/xz/internal/hash"
- )
- /* For compression we need to find byte sequences that match the byte
- * sequence at the dictionary head. A hash table is a simple method to
- * provide this capability.
- */
- // maxMatches limits the number of matches requested from the Matches
- // function. This controls the speed of the overall encoding.
- const maxMatches = 16
- // shortDists defines the number of short distances supported by the
- // implementation.
- const shortDists = 8
- // The minimum is somehow arbitrary but the maximum is limited by the
- // memory requirements of the hash table.
- const (
- minTableExponent = 9
- maxTableExponent = 20
- )
- // newRoller contains the function used to create an instance of the
- // hash.Roller.
- var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }
- // hashTable stores the hash table including the rolling hash method.
- //
- // We implement chained hashing into a circular buffer. Each entry in
- // the circular buffer stores the delta distance to the next position with a
- // word that has the same hash value.
- type hashTable struct {
- dict *encoderDict
- // actual hash table
- t []int64
- // circular list data with the offset to the next word
- data []uint32
- front int
- // mask for computing the index for the hash table
- mask uint64
- // hash offset; initial value is -int64(wordLen)
- hoff int64
- // length of the hashed word
- wordLen int
- // hash roller for computing the hash values for the Write
- // method
- wr hash.Roller
- // hash roller for computing arbitrary hashes
- hr hash.Roller
- // preallocated slices
- p [maxMatches]int64
- distances [maxMatches + shortDists]int
- }
- // hashTableExponent derives the hash table exponent from the dictionary
- // capacity.
- func hashTableExponent(n uint32) int {
- e := 30 - nlz32(n)
- switch {
- case e < minTableExponent:
- e = minTableExponent
- case e > maxTableExponent:
- e = maxTableExponent
- }
- return e
- }
- // newHashTable creates a new hash table for words of length wordLen
- func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
- if !(0 < capacity) {
- return nil, errors.New(
- "newHashTable: capacity must not be negative")
- }
- exp := hashTableExponent(uint32(capacity))
- if !(1 <= wordLen && wordLen <= 4) {
- return nil, errors.New("newHashTable: " +
- "argument wordLen out of range")
- }
- n := 1 << uint(exp)
- if n <= 0 {
- panic("newHashTable: exponent is too large")
- }
- t = &hashTable{
- t: make([]int64, n),
- data: make([]uint32, capacity),
- mask: (uint64(1) << uint(exp)) - 1,
- hoff: -int64(wordLen),
- wordLen: wordLen,
- wr: newRoller(wordLen),
- hr: newRoller(wordLen),
- }
- return t, nil
- }
- func (t *hashTable) SetDict(d *encoderDict) { t.dict = d }
- // buffered returns the number of bytes that are currently hashed.
- func (t *hashTable) buffered() int {
- n := t.hoff + 1
- switch {
- case n <= 0:
- return 0
- case n >= int64(len(t.data)):
- return len(t.data)
- }
- return int(n)
- }
- // addIndex adds n to an index ensuring that is stays inside the
- // circular buffer for the hash chain.
- func (t *hashTable) addIndex(i, n int) int {
- i += n - len(t.data)
- if i < 0 {
- i += len(t.data)
- }
- return i
- }
- // putDelta puts the delta instance at the current front of the circular
- // chain buffer.
- func (t *hashTable) putDelta(delta uint32) {
- t.data[t.front] = delta
- t.front = t.addIndex(t.front, 1)
- }
- // putEntry puts a new entry into the hash table. If there is already a
- // value stored it is moved into the circular chain buffer.
- func (t *hashTable) putEntry(h uint64, pos int64) {
- if pos < 0 {
- return
- }
- i := h & t.mask
- old := t.t[i] - 1
- t.t[i] = pos + 1
- var delta int64
- if old >= 0 {
- delta = pos - old
- if delta > 1<<32-1 || delta > int64(t.buffered()) {
- delta = 0
- }
- }
- t.putDelta(uint32(delta))
- }
- // WriteByte converts a single byte into a hash and puts them into the hash
- // table.
- func (t *hashTable) WriteByte(b byte) error {
- h := t.wr.RollByte(b)
- t.hoff++
- t.putEntry(h, t.hoff)
- return nil
- }
- // Write converts the bytes provided into hash tables and stores the
- // abbreviated offsets into the hash table. The method will never return an
- // error.
- func (t *hashTable) Write(p []byte) (n int, err error) {
- for _, b := range p {
- // WriteByte doesn't generate an error.
- t.WriteByte(b)
- }
- return len(p), nil
- }
- // getMatches the matches for a specific hash. The functions returns the
- // number of positions found.
- //
- // TODO: Make a getDistances because that we are actually interested in.
- func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
- if t.hoff < 0 || len(positions) == 0 {
- return 0
- }
- buffered := t.buffered()
- tailPos := t.hoff + 1 - int64(buffered)
- rear := t.front - buffered
- if rear >= 0 {
- rear -= len(t.data)
- }
- // get the slot for the hash
- pos := t.t[h&t.mask] - 1
- delta := pos - tailPos
- for {
- if delta < 0 {
- return n
- }
- positions[n] = tailPos + delta
- n++
- if n >= len(positions) {
- return n
- }
- i := rear + int(delta)
- if i < 0 {
- i += len(t.data)
- }
- u := t.data[i]
- if u == 0 {
- return n
- }
- delta -= int64(u)
- }
- }
- // hash computes the rolling hash for the word stored in p. For correct
- // results its length must be equal to t.wordLen.
- func (t *hashTable) hash(p []byte) uint64 {
- var h uint64
- for _, b := range p {
- h = t.hr.RollByte(b)
- }
- return h
- }
- // Matches fills the positions slice with potential matches. The
- // functions returns the number of positions filled into positions. The
- // byte slice p must have word length of the hash table.
- func (t *hashTable) Matches(p []byte, positions []int64) int {
- if len(p) != t.wordLen {
- panic(fmt.Errorf(
- "byte slice must have length %d", t.wordLen))
- }
- h := t.hash(p)
- return t.getMatches(h, positions)
- }
- // NextOp identifies the next operation using the hash table.
- //
- // TODO: Use all repetitions to find matches.
- func (t *hashTable) NextOp(rep [4]uint32) operation {
- // get positions
- data := t.dict.data[:maxMatchLen]
- n, _ := t.dict.buf.Peek(data)
- data = data[:n]
- var p []int64
- if n < t.wordLen {
- p = t.p[:0]
- } else {
- p = t.p[:maxMatches]
- n = t.Matches(data[:t.wordLen], p)
- p = p[:n]
- }
- // convert positions in potential distances
- head := t.dict.head
- dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
- for _, pos := range p {
- dis := int(head - pos)
- if dis > shortDists {
- dists = append(dists, dis)
- }
- }
- // check distances
- var m match
- dictLen := t.dict.DictLen()
- for _, dist := range dists {
- if dist > dictLen {
- continue
- }
- // Here comes a trick. We are only interested in matches
- // that are longer than the matches we have been found
- // before. So before we test the whole byte sequence at
- // the given distance, we test the first byte that would
- // make the match longer. If it doesn't match the byte
- // to match, we don't to care any longer.
- i := t.dict.buf.rear - dist + m.n
- if i < 0 {
- i += len(t.dict.buf.data)
- }
- if t.dict.buf.data[i] != data[m.n] {
- // We can't get a longer match. Jump to the next
- // distance.
- continue
- }
- n := t.dict.buf.matchLen(dist, data)
- switch n {
- case 0:
- continue
- case 1:
- if uint32(dist-minDistance) != rep[0] {
- continue
- }
- }
- if n > m.n {
- m = match{int64(dist), n}
- if n == len(data) {
- // No better match will be found.
- break
- }
- }
- }
- if m.n == 0 {
- return lit{data[0]}
- }
- return m
- }
|