1// Copyright 2019 The Wuffs Authors. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package rac 16 17import ( 18 "errors" 19 "hash/crc32" 20 "io" 21 "sort" 22) 23 24func btoi(b bool) int { 25 if b { 26 return 1 27 } 28 return 0 29} 30 31func isZeroOrAPowerOf2(x uint64) bool { 32 return (x & (x - 1)) == 0 33} 34 35func putU64LE(b []byte, v uint64) { 36 _ = b[7] // Early bounds check to guarantee safety of writes below. 37 b[0] = byte(v) 38 b[1] = byte(v >> 8) 39 b[2] = byte(v >> 16) 40 b[3] = byte(v >> 24) 41 b[4] = byte(v >> 32) 42 b[5] = byte(v >> 40) 43 b[6] = byte(v >> 48) 44 b[7] = byte(v >> 56) 45} 46 47// Writer provides a relatively simple way to write a RAC file - one that is 48// created starting from nothing, as opposed to incrementally modifying an 49// existing RAC file. 50// 51// Other packages may provide a more flexible (and more complicated) way to 52// write or append to RAC files, but that is out of scope of this package. 53// 54// Do not modify its exported fields after calling any of its methods. 55type Writer struct { 56 // Writer is where the RAC-encoded data is written to. 57 // 58 // Nil is an invalid value. 59 Writer io.Writer 60 61 // Codec is the compression codec for the RAC file. 62 // 63 // See the RAC specification for further discussion. 64 // 65 // Zero is an invalid value. 66 Codec Codec 67 68 // initialized is set true after the first AddXxx call. 
69 initialized bool 70 71 // IndexLocation is whether the index is at the start or end of the RAC 72 // file. 73 // 74 // See the RAC specification for further discussion. 75 // 76 // The zero value of this field is IndexLocationAtEnd: a one pass encoding. 77 IndexLocation IndexLocation 78 79 // TempFile is a temporary file to use for a two pass encoding. The field 80 // name says "file" but it need not be a real file, in the operating system 81 // sense. 82 // 83 // For IndexLocationAtEnd, this must be nil. For IndexLocationAtStart, this 84 // must be non-nil. 85 // 86 // It does not need to implement io.Seeker, if it supports separate read 87 // and write positions (e.g. it is a bytes.Buffer). If it does implement 88 // io.Seeker (e.g. it is an os.File), its current position will be noted 89 // (via SeekCurrent), then it will be written to (via the rac.Writer.AddXxx 90 // methods), then its position will be reset (via SeekSet), then it will be 91 // read from (via the rac.Writer.Close method). 92 // 93 // The rac.Writer does not call TempFile.Close even if that method exists 94 // (e.g. the TempFile is an os.File). In that case, closing the temporary 95 // file (and deleting it) after the rac.Writer is closed is the 96 // responsibility of the rac.Writer caller, not the rac.Writer itself. 97 TempFile io.ReadWriter 98 99 // CPageSize guides padding the output to minimize the number of pages that 100 // each chunk occupies (in what the RAC spec calls CSpace). 101 // 102 // It must either be zero (which means no padding is inserted) or a power 103 // of 2, and no larger than MaxSize. 104 // 105 // For example, suppose that CSpace is partitioned into 1024-byte pages, 106 // that 1000 bytes have already been written to the output, and the next 107 // chunk is 1500 bytes long. 108 // 109 // - With no padding (i.e. 
with CPageSize set to 0), this chunk will 110 // occupy the half-open range [1000, 2500) in CSpace, which straddles 111 // three 1024-byte pages: the page [0, 1024), the page [1024, 2048) and 112 // the page [2048, 3072). Call those pages p0, p1 and p2. 113 // 114 // - With padding (i.e. with CPageSize set to 1024), 24 bytes of zeroes 115 // are first inserted so that this chunk occupies the half-open range 116 // [1024, 2524) in CSpace, which straddles only two pages (p1 and p2). 117 // 118 // This concept is similar, but not identical, to alignment. Even with a 119 // non-zero CPageSize, chunk start offsets are not necessarily aligned to 120 // page boundaries. For example, suppose that the chunk size was only 1040 121 // bytes, not 1500 bytes. No padding will be inserted, as both [1000, 2040) 122 // and [1024, 2064) straddle two pages: either pages p0 and p1, or pages p1 123 // and p2. 124 // 125 // Nonetheless, if all chunks (or all but the final chunk) have a size 126 // equal to (or just under) a multiple of the page size, then in practice, 127 // each chunk's starting offset will be aligned to a page boundary. 128 CPageSize uint64 129 130 // err is the first error encountered. It is sticky: once a non-nil error 131 // occurs, all public methods will return that error. 132 err error 133 134 // tempFileSeekStart is the TempFile's starting position, if the TempFile 135 // is an io.Seeker. 136 tempFileSeekStart int64 137 138 // dataSize is the total size (so far) of the data portion (as opposed to 139 // the index portion) of the compressed file. For a two pass encoding, this 140 // should equal the size of the TempFile. 141 dataSize uint64 142 143 // dFileSize is the total size (so far) of the decompressed file. It is the 144 // sum of all the dRangeSize arguments passed to AddChunk. 145 dFileSize uint64 146 147 // resourcesCOffCLens is the COffset and CLength values for each shared 148 // resource. 
Those values are for the compressed file (what the RAC spec 149 // calls CSpace). 150 // 151 // The first element (if it exists) is an unused placeholder, as a zero 152 // OptResource means unused. 153 resourcesCOffCLens []uint64 154 155 // leafNodes are the non-resource leaf nodes of the hierarchical index. 156 leafNodes []node 157 158 // log2CPageSize is the base-2 logarithm of CPageSize, or zero if CPageSize 159 // is zero. 160 log2CPageSize uint32 161 162 // padding is a slice of zeroes to pad output to CPageSize boundaries. 163 padding []byte 164} 165 166func (w *Writer) checkParameters() error { 167 if w.Writer == nil { 168 w.err = errors.New("rac: invalid Writer") 169 return w.err 170 } 171 if w.Codec == 0 { 172 w.err = errors.New("rac: invalid Codec") 173 return w.err 174 } 175 if !isZeroOrAPowerOf2(w.CPageSize) || (w.CPageSize > MaxSize) { 176 w.err = errors.New("rac: invalid CPageSize") 177 return w.err 178 } else if w.CPageSize > 0 { 179 for w.log2CPageSize = 1; w.CPageSize != (1 << w.log2CPageSize); w.log2CPageSize++ { 180 } 181 } 182 return nil 183} 184 185func (w *Writer) initialize() error { 186 if w.err != nil { 187 return w.err 188 } 189 if w.initialized { 190 return nil 191 } 192 w.initialized = true 193 194 if err := w.checkParameters(); err != nil { 195 return err 196 } 197 198 if s, ok := w.TempFile.(io.Seeker); ok { 199 if n, err := s.Seek(0, io.SeekCurrent); err != nil { 200 w.err = err 201 return err 202 } else { 203 w.tempFileSeekStart = n 204 } 205 } 206 207 switch w.IndexLocation { 208 case IndexLocationAtEnd: 209 if w.TempFile != nil { 210 w.err = errors.New("rac: IndexLocationAtEnd requires a nil TempFile") 211 return w.err 212 } 213 return w.write(indexLocationAtEndMagic) 214 215 case IndexLocationAtStart: 216 if w.TempFile == nil { 217 w.err = errors.New("rac: IndexLocationAtStart requires a non-nil TempFile") 218 return w.err 219 } 220 } 221 return nil 222} 223 224func (w *Writer) write(data []byte) error { 225 ioWriter := w.Writer 
226 if w.TempFile != nil { 227 ioWriter = w.TempFile 228 } 229 230 if uint64(len(data)) > MaxSize { 231 w.err = errors.New("rac: too much input") 232 return w.err 233 } 234 if w.CPageSize > 0 { 235 if err := w.writePadding(ioWriter, uint64(len(data))); err != nil { 236 return err 237 } 238 } 239 if _, err := ioWriter.Write(data); err != nil { 240 w.err = err 241 return err 242 } 243 w.dataSize += uint64(len(data)) 244 if w.dataSize > MaxSize { 245 w.err = errors.New("rac: too much input") 246 return w.err 247 } 248 return nil 249} 250 251func (w *Writer) writePadding(ioWriter io.Writer, lenData uint64) error { 252 if lenData == 0 { 253 return nil 254 } 255 256 offset0 := w.dataSize & (w.CPageSize - 1) 257 offset1WithPadding := (0 * offset0) + lenData 258 offset1SansPadding := (1 * offset0) + lenData 259 numPagesWithPadding := (offset1WithPadding + (w.CPageSize - 1)) >> w.log2CPageSize 260 numPagesSansPadding := (offset1SansPadding + (w.CPageSize - 1)) >> w.log2CPageSize 261 262 if numPagesSansPadding == numPagesWithPadding { 263 return nil 264 } 265 return w.padToPageSize(ioWriter, w.dataSize) 266} 267 268func (w *Writer) padToPageSize(ioWriter io.Writer, offset uint64) error { 269 offset &= w.CPageSize - 1 270 if (offset == 0) || (w.CPageSize == 0) { 271 return nil 272 } 273 274 if w.padding == nil { 275 n := w.CPageSize 276 if n > 4096 { 277 n = 4096 278 } 279 w.padding = make([]byte, n) 280 } 281 282 for remaining := w.CPageSize - offset; remaining > 0; { 283 padding := w.padding 284 if remaining < uint64(len(padding)) { 285 padding = padding[:remaining] 286 } 287 288 n, err := ioWriter.Write(padding) 289 if err != nil { 290 w.err = err 291 return err 292 } 293 remaining -= uint64(n) 294 w.dataSize += uint64(n) 295 if w.dataSize > MaxSize { 296 w.err = errors.New("rac: too much input") 297 return w.err 298 } 299 } 300 return nil 301} 302 303func (w *Writer) roundUpToCPageBoundary(x uint64) uint64 { 304 if w.CPageSize == 0 { 305 return x 306 } 307 return (x + 
w.CPageSize - 1) &^ (w.CPageSize - 1) 308} 309 310// AddResource adds a shared resource to the RAC file. It returns a non-zero 311// OptResource that identifies the resource's bytes. Future calls to AddChunk 312// can pass these identifiers to indicate that decompressing a chunk depends on 313// up to two shared resources. 314// 315// The caller may modify resource's contents after this method returns. 316func (w *Writer) AddResource(resource []byte) (OptResource, error) { 317 if len(w.resourcesCOffCLens) >= (1 << 30) { 318 w.err = errors.New("rac: too many resources") 319 return 0, w.err 320 } 321 322 if err := w.initialize(); err != nil { 323 return 0, err 324 } 325 if err := w.write(resource); err != nil { 326 return 0, err 327 } 328 329 if len(w.resourcesCOffCLens) == 0 { 330 w.resourcesCOffCLens = make([]uint64, 1, 8) 331 } 332 id := OptResource(len(w.resourcesCOffCLens)) 333 cOffset := w.dataSize - uint64(len(resource)) 334 cLength := calcCLength(len(resource)) 335 w.resourcesCOffCLens = append(w.resourcesCOffCLens, cOffset|(cLength<<48)) 336 return id, nil 337} 338 339// AddChunk adds a chunk of compressed data - the (primary, secondary, 340// tertiary) tuple - to the RAC file. Decompressing that chunk should produce 341// dRangeSize bytes, although the rac.Writer does not attempt to verify that. 342// 343// The OptResource arguments may be zero, meaning that no resource is used. In 344// that case, the corresponding STag or TTag (see the RAC specification for 345// further discussion) will be 0xFF. 346// 347// The caller may modify primary's contents after this method returns. 
func (w *Writer) AddChunk(dRangeSize uint64, primary []byte, secondary OptResource, tertiary OptResource) error {
	// A zero-sized DRange contributes nothing to the decompressed file.
	if dRangeSize == 0 {
		return nil
	}
	if (dRangeSize > MaxSize) || ((w.dFileSize + dRangeSize) > MaxSize) {
		w.err = errors.New("rac: too much input")
		return w.err
	}
	if len(w.leafNodes) >= (1 << 30) {
		w.err = errors.New("rac: too many chunks")
		return w.err
	}

	if err := w.initialize(); err != nil {
		return err
	}
	if err := w.write(primary); err != nil {
		return err
	}

	// w.write may have inserted CPageSize padding, so the chunk's COffset is
	// computed from the post-write dataSize, not the pre-write one.
	cOffset := w.dataSize - uint64(len(primary))
	cLength := calcCLength(len(primary))
	w.dFileSize += dRangeSize
	w.leafNodes = append(w.leafNodes, node{
		dRangeSize:     dRangeSize,
		cOffsetCLength: cOffset | (cLength << 48),
		secondary:      secondary,
		tertiary:       tertiary,
	})
	return nil
}

// writeEmpty writes the 32-byte RAC file for an empty decompressed file: a
// single root node whose one child points at an empty chunk, for the given
// codec.
func writeEmpty(w io.Writer, codec Codec) error {
	// Bytes 0-2 are the RAC magic, byte 3 is the arity (1). Bytes 4-5 (the
	// checksum) are filled in below.
	buf := [32]byte{
		0x72, 0xC3, 0x63, 0x01, 0x00, 0x00, 0x00, 0xFF,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, uint8(codec),
		0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0xFF,
		0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
	}

	// The checksum covers everything after the 6-byte (magic, arity,
	// checksum) prefix. Only the low 16 bits are stored, after folding the
	// high half of the CRC-32 into the low half.
	checksum := crc32.ChecksumIEEE(buf[6:])
	checksum ^= checksum >> 16
	buf[4] = uint8(checksum >> 0)
	buf[5] = uint8(checksum >> 8)

	_, err := w.Write(buf[:])
	return err
}

// Close writes the RAC index to w.Writer and marks that w accepts no further
// method calls.
//
// For a one pass encoding, no further action is taken. For a two pass encoding
// (i.e. IndexLocationAtStart), it then copies w.TempFile to w.Writer. Either
// way, if this method returns nil error, the entirety of what was written to
// w.Writer constitutes a valid RAC file.
//
// Closing the underlying w.Writer, w.TempFile or both is the responsibility of
// the rac.Writer caller, not the rac.Writer itself.
//
// It is not necessary to call Close, e.g. if an earlier AddXxx call returned a
// non-nil error. Unlike an os.File, failing to call rac.Writer.Close will not
// leak resources such as file descriptors.
func (w *Writer) Close() error {
	if w.err != nil {
		return w.err
	}
	// Close may be called without any prior AddXxx call, in which case the
	// parameters have not been validated yet.
	if !w.initialized {
		if err := w.checkParameters(); err != nil {
			return err
		}
	}

	if len(w.leafNodes) == 0 {
		return writeEmpty(w.Writer, w.Codec)
	}
	rootNode := gather(w.leafNodes)
	indexSize := rootNode.calcEncodedSize(0)

	nw := &nodeWriter{
		w:                  w.Writer,
		resourcesCOffCLens: w.resourcesCOffCLens,
		codec:              w.Codec,
	}

	// The index and the data are laid out in one order or the other, with
	// optional padding between them, so one of the two COffset biases is the
	// rounded-up size of whichever part comes first.
	if w.IndexLocation == IndexLocationAtEnd {
		nw.indexCOffset = w.roundUpToCPageBoundary(w.dataSize)
		nw.cFileSize = nw.indexCOffset + indexSize
	} else {
		nw.dataCOffset = w.roundUpToCPageBoundary(indexSize)
		nw.cFileSize = nw.dataCOffset + w.dataSize
	}
	if nw.cFileSize > MaxSize {
		w.err = errors.New("rac: too much input")
		return w.err
	}

	if w.IndexLocation == IndexLocationAtEnd {
		// Write the align-to-CPageSize padding.
		if w.CPageSize > 0 {
			if err := w.padToPageSize(w.Writer, w.dataSize); err != nil {
				return err
			}
		}

		// Write the index. The compressed data has already been written.
		if err := nw.writeIndex(&rootNode); err != nil {
			w.err = err
			return err
		}

	} else {
		// Capture this before padToPageSize below, which mutates w.dataSize.
		expectedTempFileSize := w.dataSize

		// Write the index.
		if err := nw.writeIndex(&rootNode); err != nil {
			w.err = err
			return err
		}

		// Write the align-to-CPageSize padding.
		if w.CPageSize > 0 {
			if err := w.padToPageSize(w.Writer, indexSize); err != nil {
				return err
			}
		}

		// Write the compressed data, seeking the TempFile back to where
		// initialize noted its position (when the TempFile is an io.Seeker).
		if s, ok := w.TempFile.(io.Seeker); ok {
			if _, err := s.Seek(w.tempFileSeekStart, io.SeekStart); err != nil {
				w.err = err
				return err
			}
		}
		if n, err := io.Copy(w.Writer, w.TempFile); err != nil {
			w.err = err
			return err
		} else if uint64(n) != expectedTempFileSize {
			w.err = errors.New("rac: inconsistent compressed data size")
			return w.err
		}
	}

	// Poison the sticky error so that all future method calls fail, but
	// report success for this Close call itself.
	w.err = errors.New("rac: Writer is closed")
	return nil
}

// node is an element of the hierarchical index: a leaf (one chunk) or a
// branch (whose children are other nodes).
type node struct {
	// dRangeSize is the size of this node's DRange: for a branch, the sum of
	// its children's dRangeSizes.
	dRangeSize uint64

	children []node // Excludes resource-node children.
	resources []int  // Indexes into nodeWriter.resourcesCOffCLens.

	// cOffsetCLength is the packed (COffset | CLength<<48) of this node's
	// encoded bytes.
	cOffsetCLength uint64
	secondary      OptResource
	tertiary       OptResource
}

// calcEncodedSize accumulates the encoded size of n and its children,
// traversing the nodes in depth-first order.
//
// As a side effect, it also sets n.cOffsetCLength for branch nodes.
func (n *node) calcEncodedSize(accumulator uint64) (newAccumulator uint64) {
	arity := len(n.children) + len(n.resources)
	if arity == 0 {
		// Leaf nodes occupy no index space of their own; their location is
		// recorded in their parent's slots.
		return accumulator
	}
	// A branch node's encoding is 16 bytes per child plus a 16-byte trailer.
	size := (arity * 16) + 16
	cLength := calcCLength(size)
	n.cOffsetCLength = accumulator | (cLength << 48)
	accumulator += uint64(size)
	for i := range n.children {
		accumulator = n.children[i].calcEncodedSize(accumulator)
	}
	return accumulator
}

// nodeWriter serializes the hierarchical index to w.
type nodeWriter struct {
	w io.Writer

	cFileSize          uint64
	dataCOffset        uint64
	indexCOffset       uint64
	resourcesCOffCLens []uint64

	codec Codec

	// A node's maximum arity is 255, so a node's maximum size in bytes is
	// ((255 * 16) + 16), which is 4096.
	buffer [4096]byte
}

// writeIndex encodes the branch node n into w.buffer, writes it out and then
// recurses into n's branch-node children (depth-first, matching the offsets
// assigned by calcEncodedSize).
func (w *nodeWriter) writeIndex(n *node) error {
	const tagFF = 0xFF << 56

	buf, dPtr := w.buffer[:], uint64(0)
	arity := uint64(len(n.children) + len(n.resources))
	if arity > 0xFF {
		return errors.New("rac: internal error: arity is too large")
	}

	// DPtr's. We write resources before regular children (non-resources), so
	// that any TTag that refers to a resource always avoids the [0xC0, 0xFD]
	// reserved zone. The ratio of resources to regulars is at most 2:1, as
	// every chunk uses exactly 1 regular slot and up to 2 resource slots, and
	// that reserved zone is less than 33% of the [0x00, 0xFF] space.
	//
	// Resources occupy an empty DRange, so their DPtr is the running dPtr
	// (still zero at this point).
	for i := range n.resources {
		putU64LE(buf[8*i:], dPtr|tagFF)
	}
	buf = buf[8*len(n.resources):]
	for i, o := range n.children {
		putU64LE(buf[8*i:], dPtr|resourceToTag(n.resources, o.tertiary))
		dPtr += o.dRangeSize
	}
	buf = buf[8*len(n.children):]

	// DPtrMax.
	putU64LE(buf, dPtr|(uint64(w.codec)<<56))
	buf = buf[8:]

	// CPtr's. While the RAC format allows otherwise, this Writer always keeps
	// the CBias at zero, so that a CPtr equals a COffset.
	for i, res := range n.resources {
		cOffsetCLength := w.resourcesCOffCLens[res] + w.dataCOffset
		putU64LE(buf[8*i:], cOffsetCLength|tagFF)
	}
	buf = buf[8*len(n.resources):]
	for i, o := range n.children {
		cOffsetCLength := o.cOffsetCLength
		// Leaf children point into the data section; branch children point
		// into the index section.
		if len(o.children) == 0 {
			cOffsetCLength += w.dataCOffset
		} else {
			cOffsetCLength += w.indexCOffset
		}
		putU64LE(buf[8*i:], cOffsetCLength|resourceToTag(n.resources, o.secondary))
	}
	buf = buf[8*len(n.children):]

	// CPtrMax.
	const version = 0x01
	putU64LE(buf, w.cFileSize|(version<<48)|(arity<<56))

	// Magic and Arity. These deliberately overwrite the low 4 bytes of the
	// first DPtr slot (per the RAC format, that slot's value is implicit).
	w.buffer[0] = 0x72
	w.buffer[1] = 0xC3
	w.buffer[2] = 0x63
	w.buffer[3] = uint8(arity)

	// Checksum. It covers everything after the 6-byte (magic, arity,
	// checksum) prefix; only the low 16 bits are kept, after folding the high
	// half of the CRC-32 into the low half.
	size := (arity * 16) + 16
	checksum := crc32.ChecksumIEEE(w.buffer[6:size])
	checksum ^= checksum >> 16
	w.buffer[4] = uint8(checksum >> 0)
	w.buffer[5] = uint8(checksum >> 8)

	if _, err := w.w.Write(w.buffer[:size]); err != nil {
		return err
	}

	for i, o := range n.children {
		if len(o.children) != 0 {
			if err := w.writeIndex(&n.children[i]); err != nil {
				return err
			}
		}
	}
	return nil
}

// calcCLength converts a byte count to the RAC CLength field: the size in
// units of 1024 bytes, rounded up. A non-positive size maps to 1, and a size
// of more than 255 units maps to 0 (presumably meaning "length unknown" per
// the RAC spec - confirm against the reader).
func calcCLength(primarySize int) uint64 {
	if primarySize <= 0 {
		return 1
	}

	// Divide by 1024, rounding up.
	primarySize = ((primarySize - 1) >> 10) + 1

	if primarySize > 255 {
		return 0
	}
	return uint64(primarySize)
}

// resourceToTag returns the STag/TTag (in bits 56-63) for the resource r: its
// index within resources if present, otherwise 0xFF (meaning "no resource").
func resourceToTag(resources []int, r OptResource) uint64 {
	if r != 0 {
		for i, res := range resources {
			if res == int(r) {
				return uint64(i) << 56
			}
		}
	}
	return 0xFF << 56
}

// gather brings the given nodes into a tree, such that every branch node's
// arity (the count of its resource and non-resource child nodes) is at most
// 0xFF. It returns the root of that tree.
//
// TODO: it currently builds a tree where all leaf nodes have equal depth. For
// small RAC files (with only one branch node: the mandatory root node), this
// doesn't matter. For larger RAC files, it might be better to build an uneven
// tree to minimize average leaf node depth, with some branch nodes having both
// branch node children and leaf node children. It might even be worth ensuring
// that the root node always directly contains the first and last leaf nodes as
// children, as those leaves presumably contain the most commonly accessed
// parts of the decompressed file.
654func gather(nodes []node) node { 655 if len(nodes) == 0 { 656 panic("gather: no nodes") 657 } 658 659 resources := map[OptResource]bool{} 660 661 for { 662 i, j, arity, newNodes := 0, 0, 0, []node(nil) 663 for ; j < len(nodes); j++ { 664 o := &nodes[j] 665 666 new2 := (o.secondary != 0) && !resources[o.secondary] 667 new3 := (o.tertiary != 0) && !resources[o.tertiary] 668 arity += 1 + btoi(new2) + btoi(new3) 669 if arity <= 0xFF { 670 if new2 { 671 resources[o.secondary] = true 672 } 673 if new3 { 674 resources[o.tertiary] = true 675 } 676 continue 677 } 678 679 newNodes = append(newNodes, makeBranch(nodes[i:j], resources)) 680 if len(resources) != 0 { 681 resources = map[OptResource]bool{} 682 } 683 684 i = j 685 arity = 1 686 if o.secondary != 0 { 687 resources[o.secondary] = true 688 arity++ 689 } 690 if o.tertiary != 0 { 691 resources[o.tertiary] = true 692 arity++ 693 } 694 } 695 696 if i == 0 { 697 return makeBranch(nodes, resources) 698 } 699 700 newNodes = append(newNodes, makeBranch(nodes[i:], resources)) 701 if len(resources) != 0 { 702 resources = map[OptResource]bool{} 703 } 704 705 nodes, newNodes = newNodes, nil 706 } 707} 708 709func makeBranch(children []node, resMap map[OptResource]bool) node { 710 dRangeSize := uint64(0) 711 for _, c := range children { 712 dRangeSize += c.dRangeSize 713 } 714 715 resList := []int(nil) 716 if len(resMap) != 0 { 717 resList = make([]int, 0, len(resMap)) 718 for res := range resMap { 719 resList = append(resList, int(res)) 720 } 721 sort.Ints(resList) 722 } 723 724 return node{ 725 dRangeSize: dRangeSize, 726 children: children, 727 resources: resList, 728 cOffsetCLength: invalidCOffsetCLength, 729 } 730} 731