// Copyright 2019 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package rac

import (
	"errors"
	"hash/crc32"
	"io"
	"sort"
)

func btoi(b bool) int {
	if b {
		return 1
	}
	return 0
}

func isZeroOrAPowerOf2(x uint64) bool {
	return (x & (x - 1)) == 0
}

func putU64LE(b []byte, v uint64) {
	_ = b[7] // Early bounds check to guarantee safety of writes below.
	b[0] = byte(v)
	b[1] = byte(v >> 8)
	b[2] = byte(v >> 16)
	b[3] = byte(v >> 24)
	b[4] = byte(v >> 32)
	b[5] = byte(v >> 40)
	b[6] = byte(v >> 48)
	b[7] = byte(v >> 56)
}

// Writer provides a relatively simple way to write a RAC file - one that is
// created starting from nothing, as opposed to incrementally modifying an
// existing RAC file.
//
// Other packages may provide a more flexible (and more complicated) way to
// write or append to RAC files, but that is out of scope of this package.
//
// Do not modify its exported fields after calling any of its methods.
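//
// For example, a minimal one pass encoding might look like the sketch below.
// It assumes that the package defines a Codec constant such as CodecZlib and
// that the caller already holds the compressed chunk bytes; the chunks
// variable and its fields are illustrative only.
//
//	w := &rac.Writer{
//		Writer: dst, // an io.Writer such as an *os.File.
//		Codec:  rac.CodecZlib,
//	}
//	for _, c := range chunks {
//		// c.cBytes decompresses to c.dSize bytes.
//		if err := w.AddChunk(c.dSize, c.cBytes, 0, 0); err != nil {
//			return err
//		}
//	}
//	if err := w.Close(); err != nil {
//		return err
//	}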
type Writer struct {
	// Writer is where the RAC-encoded data is written to.
	//
	// Nil is an invalid value.
	Writer io.Writer

	// Codec is the compression codec for the RAC file.
	//
	// See the RAC specification for further discussion.
	//
	// Zero is an invalid value.
	Codec Codec

	// initialized is set true after the first AddXxx call.
	initialized bool

	// IndexLocation is whether the index is at the start or end of the RAC
	// file.
	//
	// See the RAC specification for further discussion.
	//
	// The zero value of this field is IndexLocationAtEnd: a one pass encoding.
	IndexLocation IndexLocation

	// TempFile is a temporary file to use for a two pass encoding. The field
	// name says "file" but it need not be a real file, in the operating system
	// sense.
	//
	// For IndexLocationAtEnd, this must be nil. For IndexLocationAtStart, this
	// must be non-nil.
	//
	// It does not need to implement io.Seeker, if it supports separate read
	// and write positions (e.g. it is a bytes.Buffer). If it does implement
	// io.Seeker (e.g. it is an os.File), its current position will be noted
	// (via SeekCurrent), then it will be written to (via the rac.Writer.AddXxx
	// methods), then its position will be reset (via SeekSet), then it will be
	// read from (via the rac.Writer.Close method).
	//
	// The rac.Writer does not call TempFile.Close even if that method exists
	// (e.g. the TempFile is an os.File). In that case, closing the temporary
	// file (and deleting it) after the rac.Writer is closed is the
	// responsibility of the rac.Writer caller, not the rac.Writer itself.
	TempFile io.ReadWriter

	// CPageSize guides padding the output to minimize the number of pages that
	// each chunk occupies (in what the RAC spec calls CSpace).
	//
	// It must either be zero (which means no padding is inserted) or a power
	// of 2, and no larger than MaxSize.
	//
	// For example, suppose that CSpace is partitioned into 1024-byte pages,
	// that 1000 bytes have already been written to the output, and the next
	// chunk is 1500 bytes long.
	//
	//   - With no padding (i.e. with CPageSize set to 0), this chunk will
	//     occupy the half-open range [1000, 2500) in CSpace, which straddles
	//     three 1024-byte pages: the page [0, 1024), the page [1024, 2048) and
	//     the page [2048, 3072). Call those pages p0, p1 and p2.
	//
	//   - With padding (i.e. with CPageSize set to 1024), 24 bytes of zeroes
	//     are first inserted so that this chunk occupies the half-open range
	//     [1024, 2524) in CSpace, which straddles only two pages (p1 and p2).
	//
	// This concept is similar, but not identical, to alignment. Even with a
	// non-zero CPageSize, chunk start offsets are not necessarily aligned to
	// page boundaries. For example, suppose that the chunk size was only 1040
	// bytes, not 1500 bytes. No padding will be inserted, as both [1000, 2040)
	// and [1024, 2064) straddle two pages: either pages p0 and p1, or pages p1
	// and p2.
	//
	// Nonetheless, if all chunks (or all but the final chunk) have a size
	// equal to (or just under) a multiple of the page size, then in practice,
	// each chunk's starting offset will be aligned to a page boundary.
	CPageSize uint64

	// err is the first error encountered. It is sticky: once a non-nil error
	// occurs, all public methods will return that error.
	err error

	// tempFileSeekStart is the TempFile's starting position, if the TempFile
	// is an io.Seeker.
	tempFileSeekStart int64

	// dataSize is the total size (so far) of the data portion (as opposed to
	// the index portion) of the compressed file. For a two pass encoding, this
	// should equal the size of the TempFile.
	dataSize uint64

	// dFileSize is the total size (so far) of the decompressed file. It is the
	// sum of all the dRangeSize arguments passed to AddChunk.
	dFileSize uint64

	// resourcesCOffCLens is the COffset and CLength values for each shared
	// resource. Those values are for the compressed file (what the RAC spec
	// calls CSpace).
	//
	// The first element (if it exists) is an unused placeholder, as a zero
	// OptResource means unused.
	resourcesCOffCLens []uint64

	// leafNodes are the non-resource leaf nodes of the hierarchical index.
	leafNodes []node

	// log2CPageSize is the base-2 logarithm of CPageSize, or zero if CPageSize
	// is zero.
	log2CPageSize uint32

	// padding is a slice of zeroes to pad output to CPageSize boundaries.
	padding []byte
}

func (w *Writer) checkParameters() error {
	if w.Writer == nil {
		w.err = errors.New("rac: invalid Writer")
		return w.err
	}
	if w.Codec == 0 {
		w.err = errors.New("rac: invalid Codec")
		return w.err
	}
	if !isZeroOrAPowerOf2(w.CPageSize) || (w.CPageSize > MaxSize) {
		w.err = errors.New("rac: invalid CPageSize")
		return w.err
	} else if w.CPageSize > 0 {
		// CPageSize == 1 is a valid power of 2; its base-2 logarithm is 0.
		for w.log2CPageSize = 0; w.CPageSize != (1 << w.log2CPageSize); w.log2CPageSize++ {
		}
	}
	return nil
}

func (w *Writer) initialize() error {
	if w.err != nil {
		return w.err
	}
	if w.initialized {
		return nil
	}
	w.initialized = true

	if err := w.checkParameters(); err != nil {
		return err
	}

	if s, ok := w.TempFile.(io.Seeker); ok {
		if n, err := s.Seek(0, io.SeekCurrent); err != nil {
			w.err = err
			return err
		} else {
			w.tempFileSeekStart = n
		}
	}

	switch w.IndexLocation {
	case IndexLocationAtEnd:
		if w.TempFile != nil {
			w.err = errors.New("rac: IndexLocationAtEnd requires a nil TempFile")
			return w.err
		}
		return w.write(indexLocationAtEndMagic)

	case IndexLocationAtStart:
		if w.TempFile == nil {
			w.err = errors.New("rac: IndexLocationAtStart requires a non-nil TempFile")
			return w.err
		}
	}
	return nil
}

func (w *Writer) write(data []byte) error {
	ioWriter := w.Writer
	if w.TempFile != nil {
		ioWriter = w.TempFile
	}

	if uint64(len(data)) > MaxSize {
		w.err = errors.New("rac: too much input")
		return w.err
	}
	if w.CPageSize > 0 {
		if err := w.writePadding(ioWriter, uint64(len(data))); err != nil {
			return err
		}
	}
	if _, err := ioWriter.Write(data); err != nil {
		w.err = err
		return err
	}
	w.dataSize += uint64(len(data))
	if w.dataSize > MaxSize {
		w.err = errors.New("rac: too much input")
		return w.err
	}
	return nil
}

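// writePadding writes padding (zero bytes) before a lenData-byte write if
// doing so reduces the number of CPageSize pages that the write straddles.
//
// As a worked trace of the example in the CPageSize documentation above:
// with CPageSize 1024, dataSize 1000 and lenData 1500, offset0 is 1000,
// offset1WithPadding is 1500 (2 pages) and offset1SansPadding is 2500 (3
// pages), so padToPageSize writes 24 bytes of zeroes. With lenData 1040,
// both page counts are 2 and no padding is written.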
func (w *Writer) writePadding(ioWriter io.Writer, lenData uint64) error {
	if lenData == 0 {
		return nil
	}

	offset0 := w.dataSize & (w.CPageSize - 1)
	offset1WithPadding := (0 * offset0) + lenData
	offset1SansPadding := (1 * offset0) + lenData
	numPagesWithPadding := (offset1WithPadding + (w.CPageSize - 1)) >> w.log2CPageSize
	numPagesSansPadding := (offset1SansPadding + (w.CPageSize - 1)) >> w.log2CPageSize

	if numPagesSansPadding == numPagesWithPadding {
		return nil
	}
	return w.padToPageSize(ioWriter, w.dataSize)
}

func (w *Writer) padToPageSize(ioWriter io.Writer, offset uint64) error {
	offset &= w.CPageSize - 1
	if (offset == 0) || (w.CPageSize == 0) {
		return nil
	}

	if w.padding == nil {
		n := w.CPageSize
		if n > 4096 {
			n = 4096
		}
		w.padding = make([]byte, n)
	}

	for remaining := w.CPageSize - offset; remaining > 0; {
		padding := w.padding
		if remaining < uint64(len(padding)) {
			padding = padding[:remaining]
		}

		n, err := ioWriter.Write(padding)
		if err != nil {
			w.err = err
			return err
		}
		remaining -= uint64(n)
		w.dataSize += uint64(n)
		if w.dataSize > MaxSize {
			w.err = errors.New("rac: too much input")
			return w.err
		}
	}
	return nil
}

func (w *Writer) roundUpToCPageBoundary(x uint64) uint64 {
	if w.CPageSize == 0 {
		return x
	}
	return (x + w.CPageSize - 1) &^ (w.CPageSize - 1)
}

// AddResource adds a shared resource to the RAC file. It returns a non-zero
// OptResource that identifies the resource's bytes. Future calls to AddChunk
// can pass these identifiers to indicate that decompressing a chunk depends on
// up to two shared resources.
//
// The caller may modify resource's contents after this method returns.
func (w *Writer) AddResource(resource []byte) (OptResource, error) {
	if len(w.resourcesCOffCLens) >= (1 << 30) {
		w.err = errors.New("rac: too many resources")
		return 0, w.err
	}

	if err := w.initialize(); err != nil {
		return 0, err
	}
	if err := w.write(resource); err != nil {
		return 0, err
	}

	if len(w.resourcesCOffCLens) == 0 {
		w.resourcesCOffCLens = make([]uint64, 1, 8)
	}
	id := OptResource(len(w.resourcesCOffCLens))
	cOffset := w.dataSize - uint64(len(resource))
	cLength := calcCLength(len(resource))
	w.resourcesCOffCLens = append(w.resourcesCOffCLens, cOffset|(cLength<<48))
	return id, nil
}

// AddChunk adds a chunk of compressed data - the (primary, secondary,
// tertiary) tuple - to the RAC file. Decompressing that chunk should produce
// dRangeSize bytes, although the rac.Writer does not attempt to verify that.
//
// The OptResource arguments may be zero, meaning that no resource is used. In
// that case, the corresponding STag or TTag (see the RAC specification for
// further discussion) will be 0xFF.
//
// The caller may modify primary's contents after this method returns.
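//
// For example, a sketch of two chunks sharing one secondary resource (the
// sizes and the sharedDict, c0 and c1 byte slices are illustrative only):
//
//	shared, err := w.AddResource(sharedDict)
//	if err != nil {
//		return err
//	}
//	if err := w.AddChunk(0x10000, c0, shared, 0); err != nil {
//		return err
//	}
//	if err := w.AddChunk(0x10000, c1, shared, 0); err != nil {
//		return err
//	}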
func (w *Writer) AddChunk(dRangeSize uint64, primary []byte, secondary OptResource, tertiary OptResource) error {
	if dRangeSize == 0 {
		return nil
	}
	if (dRangeSize > MaxSize) || ((w.dFileSize + dRangeSize) > MaxSize) {
		w.err = errors.New("rac: too much input")
		return w.err
	}
	if len(w.leafNodes) >= (1 << 30) {
		w.err = errors.New("rac: too many chunks")
		return w.err
	}

	if err := w.initialize(); err != nil {
		return err
	}
	if err := w.write(primary); err != nil {
		return err
	}

	cOffset := w.dataSize - uint64(len(primary))
	cLength := calcCLength(len(primary))
	w.dFileSize += dRangeSize
	w.leafNodes = append(w.leafNodes, node{
		dRangeSize:     dRangeSize,
		cOffsetCLength: cOffset | (cLength << 48),
		secondary:      secondary,
		tertiary:       tertiary,
	})
	return nil
}

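// writeEmpty writes the 32 byte encoding of an empty RAC file: a single root
// node, used by Close when no chunks were added. Its CRC-32 checksum is
// patched into bytes 4 and 5, as in nodeWriter.writeIndex below.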
func writeEmpty(w io.Writer, codec Codec) error {
	buf := [32]byte{
		0x72, 0xC3, 0x63, 0x01, 0x00, 0x00, 0x00, 0xFF,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, uint8(codec),
		0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xFF,
		0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
	}

	checksum := crc32.ChecksumIEEE(buf[6:])
	checksum ^= checksum >> 16
	buf[4] = uint8(checksum >> 0)
	buf[5] = uint8(checksum >> 8)

	_, err := w.Write(buf[:])
	return err
}

// Close writes the RAC index to w.Writer and marks that w accepts no further
// method calls.
//
// For a one pass encoding, no further action is taken. For a two pass encoding
// (i.e. IndexLocationAtStart), it then copies w.TempFile to w.Writer. Either
// way, if this method returns nil error, the entirety of what was written to
// w.Writer constitutes a valid RAC file.
//
// Closing the underlying w.Writer, w.TempFile or both is the responsibility of
// the rac.Writer caller, not the rac.Writer itself.
//
// It is not necessary to call Close, e.g. if an earlier AddXxx call returned a
// non-nil error. Unlike an os.File, failing to call rac.Writer.Close will not
// leak resources such as file descriptors.
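//
// For a two pass encoding, a sketch of the Writer setup (again, CodecZlib is
// an assumed codec constant) is:
//
//	w := &rac.Writer{
//		Writer:        dst,
//		Codec:         rac.CodecZlib,
//		IndexLocation: rac.IndexLocationAtStart,
//		TempFile:      &bytes.Buffer{},
//	}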
func (w *Writer) Close() error {
	if w.err != nil {
		return w.err
	}
	if !w.initialized {
		if err := w.checkParameters(); err != nil {
			return err
		}
	}

	if len(w.leafNodes) == 0 {
		return writeEmpty(w.Writer, w.Codec)
	}
	rootNode := gather(w.leafNodes)
	indexSize := rootNode.calcEncodedSize(0)

	nw := &nodeWriter{
		w:                  w.Writer,
		resourcesCOffCLens: w.resourcesCOffCLens,
		codec:              w.Codec,
	}

	if w.IndexLocation == IndexLocationAtEnd {
		nw.indexCOffset = w.roundUpToCPageBoundary(w.dataSize)
		nw.cFileSize = nw.indexCOffset + indexSize
	} else {
		nw.dataCOffset = w.roundUpToCPageBoundary(indexSize)
		nw.cFileSize = nw.dataCOffset + w.dataSize
	}
	if nw.cFileSize > MaxSize {
		w.err = errors.New("rac: too much input")
		return w.err
	}

	if w.IndexLocation == IndexLocationAtEnd {
		// Write the align-to-CPageSize padding.
		if w.CPageSize > 0 {
			if err := w.padToPageSize(w.Writer, w.dataSize); err != nil {
				return err
			}
		}

		// Write the index. The compressed data has already been written.
		if err := nw.writeIndex(&rootNode); err != nil {
			w.err = err
			return err
		}

	} else {
		expectedTempFileSize := w.dataSize

		// Write the index.
		if err := nw.writeIndex(&rootNode); err != nil {
			w.err = err
			return err
		}

		// Write the align-to-CPageSize padding.
		if w.CPageSize > 0 {
			if err := w.padToPageSize(w.Writer, indexSize); err != nil {
				return err
			}
		}

		// Write the compressed data.
		if s, ok := w.TempFile.(io.Seeker); ok {
			if _, err := s.Seek(w.tempFileSeekStart, io.SeekStart); err != nil {
				w.err = err
				return err
			}
		}
		if n, err := io.Copy(w.Writer, w.TempFile); err != nil {
			w.err = err
			return err
		} else if uint64(n) != expectedTempFileSize {
			w.err = errors.New("rac: inconsistent compressed data size")
			return w.err
		}
	}

	w.err = errors.New("rac: Writer is closed")
	return nil
}

type node struct {
	dRangeSize uint64

	children  []node // Excludes resource-node children.
	resources []int

	cOffsetCLength uint64
	secondary      OptResource
	tertiary       OptResource
}

// calcEncodedSize accumulates the encoded size of n and its children,
// traversing the nodes in depth-first order.
//
// As a side effect, it also sets n.cOffsetCLength for branch nodes.
func (n *node) calcEncodedSize(accumulator uint64) (newAccumulator uint64) {
	arity := len(n.children) + len(n.resources)
	if arity == 0 {
		return accumulator
	}
	size := (arity * 16) + 16
	cLength := calcCLength(size)
	n.cOffsetCLength = accumulator | (cLength << 48)
	accumulator += uint64(size)
	for i := range n.children {
		accumulator = n.children[i].calcEncodedSize(accumulator)
	}
	return accumulator
}

type nodeWriter struct {
	w io.Writer

	cFileSize          uint64
	dataCOffset        uint64
	indexCOffset       uint64
	resourcesCOffCLens []uint64

	codec Codec

	// A node's maximum arity is 255, so a node's maximum size in bytes is
	// ((255 * 16) + 16), which is 4096.
	buffer [4096]byte
}

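// writeIndex encodes the branch node n, and recursively its own branch node
// children, writing each encoded node to w.w. The ((arity * 16) + 16) byte
// layout, as built below, is:
//
//	[0, 8*arity):              DPtr values: resources first, then children.
//	[8*arity, 8*arity+8):      DPtrMax, with the Codec in the high byte.
//	[8*arity+8, 16*arity+8):   CPtr values: resources first, then children.
//	[16*arity+8, 16*arity+16): CPtrMax, with the version and arity in the
//	                           two high bytes.
//
// The low 6 bytes of the first DPtr slot are always zero, and are then
// overwritten by the magic bytes, the arity and the checksum.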
func (w *nodeWriter) writeIndex(n *node) error {
	const tagFF = 0xFF << 56

	buf, dPtr := w.buffer[:], uint64(0)
	arity := uint64(len(n.children) + len(n.resources))
	if arity > 0xFF {
		return errors.New("rac: internal error: arity is too large")
	}

	// DPtr's. We write resources before regular children (non-resources), so
	// that any TTag that refers to a resource always avoids the [0xC0, 0xFD]
	// reserved zone. The ratio of resources to regulars is at most 2:1, as
	// every chunk uses exactly 1 regular slot and up to 2 resource slots, and
	// that reserved zone is less than 33% of the [0x00, 0xFF] space.
	for i := range n.resources {
		putU64LE(buf[8*i:], dPtr|tagFF)
	}
	buf = buf[8*len(n.resources):]
	for i, o := range n.children {
		putU64LE(buf[8*i:], dPtr|resourceToTag(n.resources, o.tertiary))
		dPtr += o.dRangeSize
	}
	buf = buf[8*len(n.children):]

	// DPtrMax.
	putU64LE(buf, dPtr|(uint64(w.codec)<<56))
	buf = buf[8:]

	// CPtr's. While the RAC format allows otherwise, this Writer always keeps
	// the CBias at zero, so that a CPtr equals a COffset.
	for i, res := range n.resources {
		cOffsetCLength := w.resourcesCOffCLens[res] + w.dataCOffset
		putU64LE(buf[8*i:], cOffsetCLength|tagFF)
	}
	buf = buf[8*len(n.resources):]
	for i, o := range n.children {
		cOffsetCLength := o.cOffsetCLength
		if len(o.children) == 0 {
			cOffsetCLength += w.dataCOffset
		} else {
			cOffsetCLength += w.indexCOffset
		}
		putU64LE(buf[8*i:], cOffsetCLength|resourceToTag(n.resources, o.secondary))
	}
	buf = buf[8*len(n.children):]

	// CPtrMax.
	const version = 0x01
	putU64LE(buf, w.cFileSize|(version<<48)|(arity<<56))

	// Magic and Arity.
	w.buffer[0] = 0x72
	w.buffer[1] = 0xC3
	w.buffer[2] = 0x63
	w.buffer[3] = uint8(arity)

	// Checksum.
	size := (arity * 16) + 16
	checksum := crc32.ChecksumIEEE(w.buffer[6:size])
	checksum ^= checksum >> 16
	w.buffer[4] = uint8(checksum >> 0)
	w.buffer[5] = uint8(checksum >> 8)

	if _, err := w.w.Write(w.buffer[:size]); err != nil {
		return err
	}

	for i, o := range n.children {
		if len(o.children) != 0 {
			if err := w.writeIndex(&n.children[i]); err != nil {
				return err
			}
		}
	}
	return nil
}

func calcCLength(primarySize int) uint64 {
	if primarySize <= 0 {
		return 1
	}

	// Divide by 1024, rounding up.
	primarySize = ((primarySize - 1) >> 10) + 1

	if primarySize > 255 {
		return 0
	}
	return uint64(primarySize)
}

func resourceToTag(resources []int, r OptResource) uint64 {
	if r != 0 {
		for i, res := range resources {
			if res == int(r) {
				return uint64(i) << 56
			}
		}
	}
	return 0xFF << 56
}

// gather brings the given nodes into a tree, such that every branch node's
// arity (the count of its resource and non-resource child nodes) is at most
// 0xFF. It returns the root of that tree.
//
// TODO: it currently builds a tree where all leaf nodes have equal depth. For
// small RAC files (with only one branch node: the mandatory root node), this
// doesn't matter. For larger RAC files, it might be better to build an uneven
// tree to minimize average leaf node depth, with some branch nodes having both
// branch node children and leaf node children. It might even be worth ensuring
// that the root node always directly contains the first and last leaf nodes as
// children, as those leaves presumably contain the most commonly accessed
// parts of the decompressed file.
func gather(nodes []node) node {
	if len(nodes) == 0 {
		panic("gather: no nodes")
	}

	resources := map[OptResource]bool{}

	for {
		i, j, arity, newNodes := 0, 0, 0, []node(nil)
		for ; j < len(nodes); j++ {
			o := &nodes[j]

			new2 := (o.secondary != 0) && !resources[o.secondary]
			new3 := (o.tertiary != 0) && !resources[o.tertiary]
			arity += 1 + btoi(new2) + btoi(new3)
			if arity <= 0xFF {
				if new2 {
					resources[o.secondary] = true
				}
				if new3 {
					resources[o.tertiary] = true
				}
				continue
			}

			newNodes = append(newNodes, makeBranch(nodes[i:j], resources))
			if len(resources) != 0 {
				resources = map[OptResource]bool{}
			}

			i = j
			arity = 1
			if o.secondary != 0 {
				resources[o.secondary] = true
				arity++
			}
			if o.tertiary != 0 {
				resources[o.tertiary] = true
				arity++
			}
		}

		if i == 0 {
			return makeBranch(nodes, resources)
		}

		newNodes = append(newNodes, makeBranch(nodes[i:], resources))
		if len(resources) != 0 {
			resources = map[OptResource]bool{}
		}

		nodes, newNodes = newNodes, nil
	}
}

func makeBranch(children []node, resMap map[OptResource]bool) node {
	dRangeSize := uint64(0)
	for _, c := range children {
		dRangeSize += c.dRangeSize
	}

	resList := []int(nil)
	if len(resMap) != 0 {
		resList = make([]int, 0, len(resMap))
		for res := range resMap {
			resList = append(resList, int(res))
		}
		sort.Ints(resList)
	}

	return node{
		dRangeSize:     dRangeSize,
		children:       children,
		resources:      resList,
		cOffsetCLength: invalidCOffsetCLength,
	}
}