LZ77压缩算法

LZ77是无损压损算法

组成结构如下

  • 字符序列字典

    从字符序列S1…Sn中组成n个字符序列字典。比如字符序列(A, B, C)可以组合为{(A), (A, B), (A, B, C), (B),(B, C),©}

  • 前向缓冲区

    每次读取数据的时候,先将一部分数据预载到该缓冲区,并从搜索缓冲区找到缓冲区的最大匹配字符序列

  • 搜索缓冲区

    缓存用于匹配的字符序列

大致的压缩过程如下

  1. 读取前向缓冲区数据在搜索缓冲区进行匹配
  2. 若匹配到,则编码相同字符序列到当前字符偏移量以及长度
  3. 若未匹配到则直接放入原数据
package lz77

import (
	"bytes"
	"encoding/binary"
)

const (
	defaultWindowSize          = 4
	defaultMinMatchSize        = 2
	defaultLookAheadBufferSize = 4
	matchFlag                  = 1
	missMatchFlag              = 0
)

type lz77Compressor struct {
	winSize                int
	maxLookAheadBufferSize int
	minMatchSize           int
}

func NewLz77Compressor(lookAheadBufferSize, minMatchSize, winSize int) *lz77Compressor {
	return &lz77Compressor{
		winSize:                winSize,
		maxLookAheadBufferSize: lookAheadBufferSize,
		minMatchSize:           minMatchSize,
	}
}

func DefaultLz77Compressor() *lz77Compressor {
	return NewLz77Compressor(defaultLookAheadBufferSize, defaultMinMatchSize, defaultWindowSize)
}

func (c *lz77Compressor) compress(data string) []byte {
	bitBuffer := bytes.Buffer{}
	var pos int
	for pos < len(data) {
		match, offset, l := c.findLongestMatch(data, pos)
		if match {
			bitBuffer.WriteByte(matchFlag)
			binary.Write(&bitBuffer, binary.BigEndian, uint16(offset))
			bitBuffer.WriteByte(byte(l))
			pos += l
		} else {
			bitBuffer.WriteByte(missMatchFlag)
			bitBuffer.WriteByte(data[pos])
			pos++
		}
	}

	return bitBuffer.Bytes()
}

func (c *lz77Compressor) decompress(data []byte) string {
	output := bytes.Buffer{}
	var pos int
	var offset uint16
	var l byte
	var flag byte

	for pos < len(data) {
		flag = data[pos]
		pos++
		if flag == matchFlag {
			dataBuffer := bytes.NewBuffer(data[pos : pos+2])
			binary.Read(dataBuffer, binary.BigEndian, &offset)
			pos += 2
			l = data[pos]
			ob := output.Bytes()
			output.Write(ob[len(ob)-int(offset) : len(ob)-int(offset)+int(l)])
		} else if flag == missMatchFlag {
			output.WriteByte(data[pos])
		} else {
			panic("wrong flag")
		}
		pos++
	}

	return output.String()
}

func (c *lz77Compressor) findLongestMatch(data string, currentPos int) (bool, int, int) {
	endPos := minInt(currentPos+c.maxLookAheadBufferSize, len(data)+1)
	var bestMatchDistance int
	var bestMatchLength int
	// 遍历前向缓冲区
	for i := currentPos + c.minMatchSize; i < endPos; i++ {
		startIndex := maxInt(0, currentPos-c.winSize)
		// 获取当前前向缓冲区
		lookAheadBuffer := data[currentPos:i]

		// 遍历搜索缓冲区
		for j := startIndex; j < currentPos; j++ {
			repetitions := len(lookAheadBuffer) / (currentPos - j)
			last := len(lookAheadBuffer) % (currentPos - j)
			var matchedStr string
			// 在前向缓冲区大于搜索缓冲区的时候可以重复组合搜索缓冲区尝试是否等于前向缓冲区
			for k := 0; k < repetitions; k++ {
				matchedStr += data[j:currentPos]
			}
			matchedStr += data[j : j+last]

			// 在搜索缓冲区组合与前向缓冲区匹配且大于之前匹配的长度,那么更新匹配偏移量与长度
			if matchedStr == lookAheadBuffer && len(lookAheadBuffer) > bestMatchLength {
				bestMatchDistance = currentPos - j
				bestMatchLength = len(lookAheadBuffer)
			}
		}
	}

	if bestMatchDistance > 0 && bestMatchLength > 0 {
		return true, bestMatchDistance, bestMatchLength
	}
	return false, 0, 0
}

func minInt(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func maxInt(x, y int) int {
	if x > y {
		return x
	}
	return y
}