// Copyright 2019 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package raczlib

// TODO: API for shared dictionaries.

import (
	"bytes"
	"compress/zlib"
	"errors"
	"io"

	"github.com/google/wuffs/lib/rac"
	"github.com/google/wuffs/lib/zlibcut"
)

const (
	defaultDChunkSize = 65536 // 64 KiB.

	maxCChunkSize       = 1 << 30 // 1 GiB.
	maxTargetDChunkSize = 1 << 31 // 2 GiB.
)

var (
	errCChunkSizeIsTooSmall = errors.New("raczlib: CChunkSize is too small")
	errInvalidWriter        = errors.New("raczlib: invalid Writer")
	errWriterIsClosed       = errors.New("raczlib: Writer is closed")

	errInternalShortCSize = errors.New("raczlib: internal error: short CSize")
)

func startingTargetDChunkSize(cChunkSize uint64) uint64 { return 2 * cChunkSize }

// stripTrailingZeroes returns b shorn of any trailing '\x00' bytes. The RAC
// file format will implicitly supply any trailing zeroes.
func stripTrailingZeroes(b []byte) []byte {
	n := len(b)
	for (n > 0) && (b[n-1] == 0) {
		n--
	}
	return b[:n]
}

// writeBuffer is a sequence of bytes, split into two slices: prev[p:] and
// curr. Together, they hold the tail (so far) of the stream of bytes given to
// an io.Writer.
//
// Conceptually, it is the concatenation of those two slices, but the struct
// holds two slices, not one, to minimize copying and memory allocation.
//
// The first slice, prev[p:], is those bytes from all previous Write calls that
// have been copied to an internal buffer but not yet fully processed.
//
// The second slice, curr, is those bytes from the current Write call that have
// not been processed.
//
// The underlying array that backs the prev slice is owned by the writeBuffer,
// and is re-used to minimize memory allocations. The curr slice is a sub-slice
// of the []byte passed to the Write method, and is owned by the caller, not by
// the writeBuffer.
//
// As bytes are fully processed, prev[p:] shrinks (by incrementing "p += etc")
// and after len(prev[p:]) hits zero, curr shrinks (by re-assigning "curr =
// curr[etc:]"). The two mechanisms differ (e.g. there is a p int that offsets
// prev but not a corresponding c int that offsets curr) because of the
// difference in backing-array memory ownership, mentioned above.
//
// The intended usage is for an io.Writer's Write(data) method to first call
// writeBuffer.extend(data), then a mixture of multiple writeBuffer.peek(n) and
// writeBuffer.advance(n), and finally writeBuffer.compact(). Thus, before and
// after every Write call, writeBuffer.curr is empty and writeBuffer.p is 0.
type writeBuffer struct {
	prev []byte
	curr []byte
	p    int
}
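
// A minimal sketch of that intended usage (illustrative only: exampleWriter,
// its buf and chunkSize fields and its process method are hypothetical, not
// part of this package):
//
//	func (x *exampleWriter) Write(data []byte) (int, error) {
//		x.buf.extend(data)
//		for x.buf.length() >= x.chunkSize {
//			p0, p1 := x.buf.peek(x.chunkSize)
//			x.process(p0, p1)
//			x.buf.advance(uint64(len(p0)) + uint64(len(p1)))
//		}
//		x.buf.compact()
//		return len(data), nil
//	}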

// extend extends b's view of an io.Writer's incoming byte stream.
func (b *writeBuffer) extend(curr []byte) {
	if len(b.curr) != 0 {
		panic("inconsistent writeBuffer state")
	}
	b.curr = curr
}

// length returns the total number of bytes available in b's buffers.
func (b *writeBuffer) length() uint64 {
	return uint64(len(b.prev)-b.p) + uint64(len(b.curr))
}

// peek returns the next m bytes in b's buffers, where m is the minimum of n
// and b.length().
//
// The bytes are returned in two slices, not a single contiguous slice, in
// order to minimize copying and memory allocation.
func (b *writeBuffer) peek(n uint64) ([]byte, []byte) {
	available := uint64(len(b.prev) - b.p)
	if n <= available {
		return b.prev[b.p : b.p+int(n)], nil
	}
	n -= available
	if n <= uint64(len(b.curr)) {
		return b.prev[b.p:], b.curr[:n]
	}
	return b.prev[b.p:], b.curr
}

// advance consumes the next n bytes.
func (b *writeBuffer) advance(n uint64) {
	available := uint64(len(b.prev) - b.p)
	if n <= available {
		b.p += int(n)
		return
	}
	n -= available
	b.curr = b.curr[n:]
	b.p = len(b.prev)
}

// compact moves and copies any unprocessed bytes in b.prev and b.curr to be at
// the start of b.prev.
func (b *writeBuffer) compact() {
	// Move any remaining bytes in prev to the front.
	n := copy(b.prev, b.prev[b.p:])
	b.prev = b.prev[:n]
	b.p = 0
	// Move any remaining bytes in curr to prev.
	b.prev = append(b.prev, b.curr...)
	b.curr = nil
}

// Writer provides a relatively simple way to write a RAC file (with the Zlib
// compression codec) - one that is created starting from nothing, as opposed
// to incrementally modifying an existing RAC file.
//
// Other packages may provide a more flexible (and more complicated) way to
// write or append to RAC files, but that is out of scope of this package.
//
// Do not modify its exported fields after calling any of its methods.
//
// Writer implements the io.Writer interface.
type Writer struct {
	// Writer is where the RAC-encoded data is written to.
	//
	// Nil is an invalid value.
	Writer io.Writer

	// IndexLocation is whether the index is at the start or end of the RAC
	// file.
	//
	// See the RAC specification for further discussion.
	//
	// The zero value of this field is IndexLocationAtEnd: a one pass encoding.
	IndexLocation IndexLocation

	// TempFile is a temporary file to use for a two pass encoding. The field
	// name says "file" but it need not be a real file, in the operating system
	// sense.
	//
	// For IndexLocationAtEnd, this must be nil. For IndexLocationAtStart, this
	// must be non-nil.
	//
	// It does not need to implement io.Seeker, if it supports separate read
	// and write positions (e.g. it is a bytes.Buffer). If it does implement
	// io.Seeker (e.g. it is an os.File), its current position will be noted
	// (via SeekCurrent), then it will be written to (via the
	// raczlib.Writer.Write method), then its position will be reset (via
	// SeekSet), then it will be read from (via the raczlib.Writer.Close
	// method).
	//
	// The raczlib.Writer does not call TempFile.Close even if that method
	// exists (e.g. the TempFile is an os.File). In that case, closing the
	// temporary file (and deleting it) after the raczlib.Writer is closed is
	// the responsibility of the raczlib.Writer caller, not the raczlib.Writer
	// itself.
	TempFile io.ReadWriter

	// CPageSize guides padding the output to minimize the number of pages that
	// each chunk occupies (in what the RAC spec calls CSpace).
	//
	// It must either be zero (which means no padding is inserted) or a power
	// of 2, and no larger than MaxSize.
	//
	// For example, suppose that CSpace is partitioned into 1024-byte pages,
	// that 1000 bytes have already been written to the output, and the next
	// chunk is 1500 bytes long.
	//
	//   - With no padding (i.e. with CPageSize set to 0), this chunk will
	//     occupy the half-open range [1000, 2500) in CSpace, which straddles
	//     three 1024-byte pages: the page [0, 1024), the page [1024, 2048) and
	//     the page [2048, 3072). Call those pages p0, p1 and p2.
	//
	//   - With padding (i.e. with CPageSize set to 1024), 24 bytes of zeroes
	//     are first inserted so that this chunk occupies the half-open range
	//     [1024, 2524) in CSpace, which straddles only two pages (p1 and p2).
	//
	// This concept is similar, but not identical, to alignment. Even with a
	// non-zero CPageSize, chunk start offsets are not necessarily aligned to
	// page boundaries. For example, suppose that the chunk size was only 1040
	// bytes, not 1500 bytes. No padding will be inserted, as both [1000, 2040)
	// and [1024, 2064) straddle two pages: either pages p0 and p1, or pages p1
	// and p2.
	//
	// Nonetheless, if all chunks (or all but the final chunk) have a size
	// equal to (or just under) a multiple of the page size, then in practice,
	// each chunk's starting offset will be aligned to a page boundary.
	CPageSize uint64

	// CChunkSize is the compressed size (i.e. size in CSpace) of each chunk.
	// The final chunk might be smaller than this.
	//
	// This field is ignored if DChunkSize is non-zero.
	CChunkSize uint64

	// DChunkSize is the uncompressed size (i.e. size in DSpace) of each chunk.
	// The final chunk might be smaller than this.
	//
	// If both CChunkSize and DChunkSize are non-zero, DChunkSize takes
	// precedence and CChunkSize is ignored.
	//
	// If both CChunkSize and DChunkSize are zero, a default DChunkSize value
	// will be used.
	DChunkSize uint64

	// cChunkSize and dChunkSize are copies of CChunkSize and DChunkSize,
	// validated during the initialize method. For example, if both CChunkSize
	// and DChunkSize are zero, dChunkSize will be defaultDChunkSize.
	cChunkSize uint64
	dChunkSize uint64

	// err is the first error encountered. It is sticky: once a non-nil error
	// occurs, all public methods will return that error.
	err error

	racWriter  rac.Writer
	zlibWriter *zlib.Writer

	compressed   bytes.Buffer
	uncompressed writeBuffer
}
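
// A minimal sketch of one pass usage (illustrative only: src and dst are
// hypothetical io.Reader and io.Writer values supplied by the caller, and
// error handling is abbreviated):
//
//	w := &raczlib.Writer{Writer: dst}
//	if _, err := io.Copy(w, src); err != nil {
//		// handle err
//	}
//	if err := w.Close(); err != nil {
//		// handle err
//	}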

func (w *Writer) initialize() error {
	if w.err != nil {
		return w.err
	}
	if w.racWriter.Writer != nil {
		// We're already initialized.
		return nil
	}
	if w.Writer == nil {
		w.err = errInvalidWriter
		return w.err
	}

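	// Exactly one of dChunkSize and cChunkSize ends up non-zero. Per the field
	// comments above, DChunkSize takes precedence over CChunkSize, and if both
	// are zero, a default DChunkSize is used.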
	if w.DChunkSize > 0 {
		w.dChunkSize = w.DChunkSize
	} else if w.CChunkSize > 0 {
		w.cChunkSize = w.CChunkSize
		if w.cChunkSize > maxCChunkSize {
			w.cChunkSize = maxCChunkSize
		}
	} else {
		w.dChunkSize = defaultDChunkSize
	}

	w.racWriter.Writer = w.Writer
	w.racWriter.Codec = rac.CodecZlib
	w.racWriter.IndexLocation = w.IndexLocation
	w.racWriter.TempFile = w.TempFile
	w.racWriter.CPageSize = w.CPageSize

	w.zlibWriter, w.err = zlib.NewWriterLevel(&w.compressed, zlib.BestCompression)
	return w.err
}

func (w *Writer) compress(dBytes0 []byte, dBytes1 []byte) ([]byte, error) {
	w.compressed.Reset()
	w.zlibWriter.Reset(&w.compressed)
	if len(dBytes0) > 0 {
		if _, err := w.zlibWriter.Write(dBytes0); err != nil {
			w.err = err
			return nil, err
		}
	}
	if len(dBytes1) > 0 {
		if _, err := w.zlibWriter.Write(dBytes1); err != nil {
			w.err = err
			return nil, err
		}
	}
	if err := w.zlibWriter.Close(); err != nil {
		w.err = err
		return nil, err
	}
	return w.compressed.Bytes(), nil
}

// Write implements io.Writer.
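//
// Incoming data is buffered internally. A compressed chunk is only emitted
// once enough input has accumulated to fill it; any remainder is flushed by
// Close.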
func (w *Writer) Write(p []byte) (int, error) {
	if err := w.initialize(); err != nil {
		return 0, err
	}
	w.uncompressed.extend(p)
	n, err := len(p), w.write(false)
	w.uncompressed.compact()
	if err != nil {
		return 0, err
	}
	return n, nil
}

func (w *Writer) write(eof bool) error {
	if w.dChunkSize > 0 {
		return w.writeDChunks(eof)
	}
	return w.writeCChunks(eof)
}

func (w *Writer) writeDChunks(eof bool) error {
	for {
		peek0, peek1 := w.uncompressed.peek(w.dChunkSize)
		dSize := uint64(len(peek0)) + uint64(len(peek1))
		if dSize == 0 {
			return nil
		}
		if !eof && (dSize < w.dChunkSize) {
			return nil
		}

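		// Trailing zero bytes don't need to be compressed: dSize (computed
		// above, before stripping) is still passed to AddChunk, and the RAC
		// format supplies the stripped zeroes implicitly.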
		peek1 = stripTrailingZeroes(peek1)
		if len(peek1) == 0 {
			peek0 = stripTrailingZeroes(peek0)
		}
		cBytes, err := w.compress(peek0, peek1)
		if err != nil {
			return err
		}

		if err := w.racWriter.AddChunk(dSize, cBytes, 0, 0); err != nil {
			w.err = err
			return err
		}
		w.uncompressed.advance(dSize)
	}
}

func (w *Writer) writeCChunks(eof bool) error {
	// Each outer loop iteration tries to write exactly one chunk.
outer:
	for {
		// We don't know, a priori, how many w.uncompressed bytes are needed to
		// produce a compressed chunk of size cChunkSize. We use a
		// startingTargetDChunkSize that is a small multiple of cChunkSize, and
		// keep doubling that until we build something at least as large as
		// cChunkSize, then zlibcut it back to be exactly cChunkSize.
		targetDChunkSize := uint64(maxTargetDChunkSize)
		if !eof {
			targetDChunkSize = startingTargetDChunkSize(w.cChunkSize)
		}

		if n := w.uncompressed.length(); n == 0 {
			return nil
		} else if !eof && (n < targetDChunkSize) {
			return nil
		}

		// The inner loop keeps doubling the targetDChunkSize until it's big
		// enough to produce at least cChunkSize bytes of compressed data.
		for {
			next := targetDChunkSize * 2
			if next > maxTargetDChunkSize {
				next = maxTargetDChunkSize
			}
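			// force means that targetDChunkSize cannot grow any further, so
			// tryCChunk must emit a chunk even if its compressed form is
			// smaller than cChunkSize.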
			force := next <= targetDChunkSize

			if err := w.tryCChunk(targetDChunkSize, force); err == nil {
				continue outer
			} else if err != errInternalShortCSize {
				return err
			}

			if w.uncompressed.length() <= targetDChunkSize {
				// Growing the targetDChunkSize isn't going to change anything.
				return nil
			}
			targetDChunkSize = next
		}
	}
}

func (w *Writer) tryCChunk(targetDChunkSize uint64, force bool) error {
	peek0, peek1 := w.uncompressed.peek(targetDChunkSize)
	dSize := uint64(len(peek0)) + uint64(len(peek1))
	cBytes, err := w.compress(peek0, peek1)
	if err != nil {
		return err
	}

	switch {
	case uint64(len(cBytes)) < w.cChunkSize:
		if !force {
			return errInternalShortCSize
		}
		fallthrough
	case uint64(len(cBytes)) == w.cChunkSize:
		if err := w.racWriter.AddChunk(dSize, cBytes, 0, 0); err != nil {
			w.err = err
			return err
		}
		w.uncompressed.advance(dSize)
		return nil
	}

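	// The compressed form is too large. Cut it so that the encoding fits in
	// cChunkSize bytes: eLen is the length of that truncated encoding and
	// dLen is the number of decompressed bytes it represents.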
	eLen, dLen, err := zlibcut.Cut(nil, cBytes, int(w.cChunkSize))
	if err != nil {
		w.err = err
		return err
	}
	if dLen == 0 {
		w.err = errCChunkSizeIsTooSmall
		return w.err
	}
	if err := w.racWriter.AddChunk(uint64(dLen), cBytes[:eLen], 0, 0); err != nil {
		w.err = err
		return err
	}
	w.uncompressed.advance(uint64(dLen))
	return nil
}

// Close writes the RAC index to w.Writer and marks that w accepts no further
// method calls.
//
// For a one pass encoding, no further action is taken. For a two pass encoding
// (i.e. IndexLocationAtStart), it then copies w.TempFile to w.Writer. Either
// way, if this method returns a nil error, the entirety of what was written to
// w.Writer constitutes a valid RAC file.
//
// Closing the underlying w.Writer, w.TempFile or both is the responsibility of
// the raczlib.Writer caller, not the raczlib.Writer itself.
//
// It is not necessary to call Close, e.g. if an earlier Write call returned a
// non-nil error. Unlike an os.File, failing to call raczlib.Writer.Close will
// not leak resources such as file descriptors.
func (w *Writer) Close() error {
	if err := w.initialize(); err != nil {
		return err
	}
	if err := w.write(true); err != nil {
		return err
	}
	if err := w.racWriter.Close(); err != nil {
		w.err = err
		return err
	}
	w.err = errWriterIsClosed
	return nil
}