// contentcoder.go

//  Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ice

import (
	"bytes"
	"encoding/binary"
	"errors"
	"io"

	"github.com/blugelabs/ice/compress"
)

// minTermLenWithEscape is the minimum number of bytes that must remain in
// the input when decodeTerm encounters termEscape: the escape byte itself
// plus the byte it escapes.
const minTermLenWithEscape = 2

// Framing bytes used by encodeTerm and decodeTerm.
var (
	termSeparator byte = 0xff // terminates every encoded term
	termEscape    byte = '\\' // precedes a literal termSeparator or termEscape byte inside a term
)

// chunkedContentCoder accumulates per-document byte values into chunks
// selected by docNum / chunkSize. Each flushed chunk is encoded as a
// metadata header followed by the compressed values, and the encoded length
// of every chunk is recorded so Write can emit the chunk offset table.
type chunkedContentCoder struct {
	final     []byte   // encoded chunks awaiting output; reused as scratch space by Write
	chunkSize uint64   // divisor mapping a docNum to its chunk (docNum / chunkSize)
	currChunk uint64   // index of the chunk currently being filled
	chunkLens []uint64 // encoded length (metadata + compressed data) per chunk index

	w                io.Writer // destination for the encoded output
	progressiveWrite bool      // when true, each chunk is written to w as soon as it is flushed

	chunkMetaBuf bytes.Buffer // encoded metadata header of the current chunk
	chunkBuf     bytes.Buffer // uncompressed values of the current chunk

	chunkMeta []metaData // one entry per Add call in the current chunk

	compressed []byte // temp buf for compression, reused across flushes
}

// metaData describes the values stored for a single document inside a
// chunk.
type metaData struct {
	DocNum      uint64 // docNum passed to Add for this entry
	DocDvOffset uint64 // end offset of this doc's values in the uncompressed chunk buffer (see Add)
}

// newChunkedContentCoder builds a coder that groups document values into
// chunks of chunkSize doc numbers each and emits its output to w.
//
// maxDocNum sizes the per-chunk length table (maxDocNum/chunkSize + 1
// entries). chunkSize must be non-zero, otherwise the integer division
// panics. When progressiveWrite is true, every chunk is written to w as
// soon as it is flushed instead of being buffered until Write.
func newChunkedContentCoder(chunkSize, maxDocNum uint64,
	w io.Writer, progressiveWrite bool) *chunkedContentCoder {
	numChunks := maxDocNum/chunkSize + 1

	coder := new(chunkedContentCoder)
	coder.chunkSize = chunkSize
	coder.chunkLens = make([]uint64, numChunks)
	coder.chunkMeta = make([]metaData, 0, numChunks)
	coder.w = w
	coder.progressiveWrite = progressiveWrite
	return coder
}

// Reset returns the coder to its initial, empty state so it can be reused.
// All buffers keep their allocated capacity; only their contents and
// lengths are cleared. The chunk size is left untouched (use SetChunkSize
// to change it).
func (c *chunkedContentCoder) Reset() {
	// per-chunk scratch state
	c.chunkMetaBuf.Reset()
	c.chunkBuf.Reset()
	c.chunkMeta = c.chunkMeta[:0]

	// accumulated output and chunk bookkeeping
	c.final = c.final[:0]
	c.currChunk = 0
	lens := c.chunkLens
	for idx := range lens {
		lens[idx] = 0
	}
}

// SetChunkSize changes the chunk size and resizes the per-chunk length
// table to hold maxDocNum/chunkSize + 1 entries, reallocating only when the
// existing capacity is too small. chunkSize must be non-zero, otherwise the
// integer division panics.
//
// NOTE(review): when the existing backing array of chunkLens is reused, its
// entries are not zeroed here; callers presumably pair this with Reset —
// confirm against call sites.
func (c *chunkedContentCoder) SetChunkSize(chunkSize, maxDocNum uint64) {
	numChunks := int(maxDocNum/chunkSize + 1)
	c.chunkSize = chunkSize

	if numChunks <= cap(c.chunkLens) {
		// enough room already: just adjust the visible length
		c.chunkLens = c.chunkLens[:numChunks]
	} else {
		c.chunkLens = make([]uint64, numChunks)
	}

	if numChunks > cap(c.chunkMeta) {
		c.chunkMeta = make([]metaData, 0, numChunks)
	}
}

// Close signals that no further Add() calls will be made. It flushes the
// chunk currently being built so that the final chunk is encoded, and
// returns any error reported by that flush.
func (c *chunkedContentCoder) Close() error {
	return c.flushContents()
}

// flushContents encodes the chunk currently being built and appends it to
// c.final. The encoded chunk consists of:
//
//   - a uvarint count of metadata entries,
//   - for each entry, the delta-encoded DocNum and DocDvOffset (each relative
//     to the previous entry, starting from zero),
//   - the compressed contents of chunkBuf.
//
// The total encoded length is stored in chunkLens[currChunk]. In progressive
// mode the accumulated bytes are written to c.w immediately and c.final is
// emptied. The per-chunk buffers are NOT cleared here; Add does that when it
// moves on to the next chunk.
func (c *chunkedContentCoder) flushContents() error {
	// header: how many metadata entries follow
	var scratch [binary.MaxVarintLen64]byte
	width := binary.PutUvarint(scratch[:], uint64(len(c.chunkMeta)))
	if _, err := c.chunkMetaBuf.Write(scratch[:width]); err != nil {
		return err
	}

	// metadata entries, delta-encoded against the previous entry
	var prevDocNum, prevOffset uint64
	for i := range c.chunkMeta {
		entry := c.chunkMeta[i]
		if err := writeUvarints(&c.chunkMetaBuf,
			entry.DocNum-prevDocNum, entry.DocDvOffset-prevOffset); err != nil {
			return err
		}
		prevDocNum, prevOffset = entry.DocNum, entry.DocDvOffset
	}

	// metadata goes first in the output ...
	metaBytes := c.chunkMetaBuf.Bytes()
	c.final = append(c.final, metaBytes...)

	// ... followed by the compressed values; the scratch buffer is handed
	// over at full capacity so the compressor can reuse it
	var err error
	c.compressed, err = compress.Compress(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
	if err != nil {
		return err
	}
	c.final = append(c.final, c.compressed...)

	c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaBytes))

	if !c.progressiveWrite {
		return nil
	}

	// progressive mode: emit right away and start over with an empty buffer
	if _, err = c.w.Write(c.final); err != nil {
		return err
	}
	c.final = c.final[:0]
	return nil
}

// Add appends vals as the stored value for docNum, placing it in the chunk
// docNum / chunkSize. When docNum belongs to a different chunk than the one
// being built, the current chunk is flushed first and the per-chunk state is
// cleared.
//
// Callers MUST invoke Add with increasing docNums. The metadata recorded for
// the document holds the END offset of its values within the uncompressed
// chunk buffer.
func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
	if target := docNum / c.chunkSize; target != c.currChunk {
		// finish the chunk in progress before switching
		if err := c.flushContents(); err != nil {
			return err
		}
		// start the new chunk from a clean slate
		c.chunkMeta = c.chunkMeta[:0]
		c.chunkMetaBuf.Reset()
		c.chunkBuf.Reset()
		c.currChunk = target
	}

	// remember where this doc's bytes begin, then append them
	begin := c.chunkBuf.Len()
	written, err := c.chunkBuf.Write(vals)
	if err != nil {
		return err
	}

	entry := metaData{DocNum: docNum, DocDvOffset: uint64(begin + written)}
	c.chunkMeta = append(c.chunkMeta, entry)
	return nil
}

// Write emits the buffered chunk data (if any) followed by the chunk offset
// table and a fixed-size trailer to the coder's writer. It returns the total
// number of bytes written.
//
// Layout:
//
//	| ..... data ..... | chunk end offsets (uvarints) |
//	| byte length of the chunk offsets section (uint64, big endian) |
//	| number of chunks (uint64, big endian) |
//
// c.final is reused as scratch space for the encoding and is left empty on
// success.
func (c *chunkedContentCoder) Write() (int, error) {
	total := 0

	// data section
	if c.final != nil {
		n, err := c.w.Write(c.final)
		total += n
		if err != nil {
			return total, err
		}
	}

	offsetsStart := uint64(total)

	// make sure the scratch buffer can hold the largest possible uvarint
	if cap(c.final) >= binary.MaxVarintLen64 {
		c.final = c.final[:binary.MaxVarintLen64]
	} else {
		c.final = make([]byte, binary.MaxVarintLen64)
	}

	// chunk offsets, one uvarint each
	for _, endOffset := range modifyLengthsToEndOffsets(c.chunkLens) {
		size := binary.PutUvarint(c.final, endOffset)
		n, err := c.w.Write(c.final[:size])
		total += n
		if err != nil {
			return total, err
		}
	}

	offsetsLen := uint64(total) - offsetsStart

	// trailer: length of the offsets section, then the number of chunks
	c.final = c.final[:8]
	for _, field := range [...]uint64{offsetsLen, uint64(len(c.chunkLens))} {
		binary.BigEndian.PutUint64(c.final, field)
		n, err := c.w.Write(c.final)
		total += n
		if err != nil {
			return total, err
		}
	}

	c.final = c.final[:0]

	return total, nil
}

// readDocValueBoundary returns the [start, end) byte range described by
// entry number chunk of metaHeaders. end is that entry's DocDvOffset; start
// is the DocDvOffset of the preceding entry, or 0 for the first entry.
// An out-of-range index panics.
func readDocValueBoundary(chunk int, metaHeaders []metaData) (start, end uint64) {
	if chunk >= 1 {
		// the previous entry's end offset is where this one begins
		start = metaHeaders[chunk-1].DocDvOffset
	}
	end = metaHeaders[chunk].DocDvOffset
	return start, end
}

// encodeTerm appends the encoded form of src to dest and returns the
// extended slice. Every termSeparator or termEscape byte inside src is
// prefixed with termEscape, and the term is terminated by a single
// termSeparator. A nil or empty src encodes to just the terminator.
func encodeTerm(dest, src []byte) []byte {
	needsEscaping := bytes.IndexByte(src, termSeparator) >= 0 ||
		bytes.IndexByte(src, termEscape) >= 0

	if !needsEscaping {
		// fast path: copy the term verbatim and terminate it
		return append(append(dest, src...), termSeparator)
	}

	for i := 0; i < len(src); i++ {
		switch src[i] {
		case termSeparator, termEscape:
			dest = append(dest, termEscape)
		}
		dest = append(dest, src[i])
	}
	return append(dest, termSeparator)
}

// decodeTerm reads one encoded term from the front of src, appends its
// unescaped bytes to dest, and returns the extended dest together with the
// remainder of src following the term's separator.
//
// An error (with both slices nil) is returned when src is empty, when a
// termEscape is the last byte of src, or when src ends before a
// termSeparator is found.
// nolint: gocritic, nolintlint
func decodeTerm(dest, src []byte) ([]byte, []byte, error) {
	if len(src) == 0 {
		return nil, nil, errors.New("empty term values")
	}

	for pos := 0; pos < len(src); pos++ {
		cur := src[pos]

		if cur == termSeparator {
			// unescaped separator: the term ends here
			return dest, src[pos+1:], nil
		}

		if cur == termEscape {
			// an escape must be followed by the byte it protects
			if len(src)-pos < minTermLenWithEscape {
				return nil, nil, errors.New("invalid termEscape character")
			}
			pos++
			cur = src[pos]
		}

		dest = append(dest, cur)
	}

	return nil, nil, errors.New("invalid term values")
}