1// Copyright 2019 The Wuffs Authors. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package raczlib 16 17// TODO: API for shared dictionaries. 18 19import ( 20 "bytes" 21 "compress/zlib" 22 "errors" 23 "io" 24 25 "github.com/google/wuffs/lib/rac" 26 "github.com/google/wuffs/lib/zlibcut" 27) 28 29const ( 30 defaultDChunkSize = 65536 // 64 KiB. 31 32 maxCChunkSize = 1 << 30 // 1 GiB. 33 maxTargetDChunkSize = 1 << 31 // 2 GiB. 34) 35 36var ( 37 errCChunkSizeIsTooSmall = errors.New("raczlib: CChunkSize is too small") 38 errInvalidWriter = errors.New("raczlib: invalid Writer") 39 errWriterIsClosed = errors.New("raczlib: Writer is closed") 40 41 errInternalShortCSize = errors.New("raczlib: internal error: short CSize") 42) 43 44func startingTargetDChunkSize(cChunkSize uint64) uint64 { return 2 * cChunkSize } 45 46// stripTrailingZeroes returns b shorn of any trailing '\x00' bytes. The RAC 47// file format will implicitly set any trailing zeroes. 48func stripTrailingZeroes(b []byte) []byte { 49 n := len(b) 50 for (n > 0) && (b[n-1] == 0) { 51 n-- 52 } 53 return b[:n] 54} 55 56// writeBuffer is a sequence of bytes, split into two slices: prev[p:] and 57// curr. Together, they hold the tail (so far) of the stream of bytes given to 58// an io.Writer. 59// 60// Conceptually, it is the concatenation of those two slices, but the struct 61// holds two slices, not one, to minimize copying and memory allocation. 62// 63// The first slice, prev[p:], is those bytes from all previous Write calls that 64// have been copied to an internal buffer but not yet fully processed. 65// 66// The second slice, curr, is those bytes from the current Write call that have 67// not been processed. 68// 69// The underlying array that backs the prev slice is owned by the writeBuffer, 70// and is re-used to minimize memory allocations. The curr slice is a sub-slice 71// of the []byte passed to the Write method, and is owned by the caller, not by 72// the writeBuffer. 73// 74// As bytes are fully processed, prev[p:] shrinks (by incrementing "p += etc") 75// and after len(prev[p:]) hits zero, curr shrinks (by re-assigning "curr = 76// curr[etc:]"). The two mechanisms differ (e.g. there is a p int that offsets 77// prev but not a corresponding c int that offsets curr) because of the 78// difference in backing-array memory ownership, mentioned above. 79// 80// The intended usage is for an io.Writer's Write(data) method to first call 81// writeBuffer.extend(data), then a mixture of multiple writeBuffer.peek(n) and 82// writeBuffer.advance(n), and finally writeBuffer.compact(). Thus, before and 83// after every Write call, writeBuffer.curr is empty and writebuffer.p is 0. 84type writeBuffer struct { 85 prev []byte 86 curr []byte 87 p int 88} 89 90// extend extends b's view of an io.Writer's incoming byte stream. 91func (b *writeBuffer) extend(curr []byte) { 92 if len(b.curr) != 0 { 93 panic("inconsistent writeBuffer state") 94 } 95 b.curr = curr 96} 97 98// length returns the total number of bytes available in b's buffers. 99func (b *writeBuffer) length() uint64 { 100 return uint64(len(b.prev)-b.p) + uint64(len(b.curr)) 101} 102 103// peek returns the next m bytes in b's buffers, where m is the minimum of n 104// and b.length(). 105// 106// The bytes are returned in two slices, not a single contiguous slice, in 107// order to minimize copying and memory allocation. 108func (b *writeBuffer) peek(n uint64) ([]byte, []byte) { 109 available := uint64(len(b.prev) - b.p) 110 if n <= available { 111 return b.prev[b.p : b.p+int(n)], nil 112 } 113 n -= available 114 if n <= uint64(len(b.curr)) { 115 return b.prev[b.p:], b.curr[:n] 116 } 117 return b.prev[b.p:], b.curr 118} 119 120// advance consumes the next n bytes. 121func (b *writeBuffer) advance(n uint64) { 122 available := uint64(len(b.prev) - b.p) 123 if n <= available { 124 b.p += int(n) 125 return 126 } 127 n -= available 128 b.curr = b.curr[n:] 129 b.p = len(b.prev) 130} 131 132// compact moves and copies any unprocessed bytes in b.prev and b.curr to be at 133// the start of b.prev. 134func (b *writeBuffer) compact() { 135 // Move any remaining bytes in prev to the front. 136 n := copy(b.prev, b.prev[b.p:]) 137 b.prev = b.prev[:n] 138 b.p = 0 139 // Move any remaining bytes in curr to prev. 140 b.prev = append(b.prev, b.curr...) 141 b.curr = nil 142} 143 144// Writer provides a relatively simple way to write a RAC file (with the Zlib 145// compression codec) - one that is created starting from nothing, as opposed 146// to incrementally modifying an existing RAC file. 147// 148// Other packages may provide a more flexible (and more complicated) way to 149// write or append to RAC files, but that is out of scope of this package. 150// 151// Do not modify its exported fields after calling any of its methods. 152// 153// Writer implements the io.Writer interface. 154type Writer struct { 155 // Writer is where the RAC-encoded data is written to. 156 // 157 // Nil is an invalid value. 158 Writer io.Writer 159 160 // IndexLocation is whether the index is at the start or end of the RAC 161 // file. 162 // 163 // See the RAC specification for further discussion. 164 // 165 // The zero value of this field is IndexLocationAtEnd: a one pass encoding. 166 IndexLocation IndexLocation 167 168 // TempFile is a temporary file to use for a two pass encoding. The field 169 // name says "file" but it need not be a real file, in the operating system 170 // sense. 171 // 172 // For IndexLocationAtEnd, this must be nil. For IndexLocationAtStart, this 173 // must be non-nil. 174 // 175 // It does not need to implement io.Seeker, if it supports separate read 176 // and write positions (e.g. it is a bytes.Buffer). If it does implement 177 // io.Seeker (e.g. it is an os.File), its current position will be noted 178 // (via SeekCurrent), then it will be written to (via the 179 // raczlib.Writer.Write method), then its position will be reset (via 180 // SeekSet), then it will be read from (via the raczlib.Writer.Close 181 // method). 182 // 183 // The raczlib.Writer does not call TempFile.Close even if that method 184 // exists (e.g. the TempFile is an os.File). In that case, closing the 185 // temporary file (and deleting it) after the raczlib.Writer is closed is 186 // the responsibility of the raczlib.Writer caller, not the raczlib.Writer 187 // itself. 188 TempFile io.ReadWriter 189 190 // CPageSize guides padding the output to minimize the number of pages that 191 // each chunk occupies (in what the RAC spec calls CSpace). 192 // 193 // It must either be zero (which means no padding is inserted) or a power 194 // of 2, and no larger than MaxSize. 195 // 196 // For example, suppose that CSpace is partitioned into 1024-byte pages, 197 // that 1000 bytes have already been written to the output, and the next 198 // chunk is 1500 bytes long. 199 // 200 // - With no padding (i.e. with CPageSize set to 0), this chunk will 201 // occupy the half-open range [1000, 2500) in CSpace, which straddles 202 // three 1024-byte pages: the page [0, 1024), the page [1024, 2048) and 203 // the page [2048, 3072). Call those pages p0, p1 and p2. 204 // 205 // - With padding (i.e. with CPageSize set to 1024), 24 bytes of zeroes 206 // are first inserted so that this chunk occupies the half-open range 207 // [1024, 2524) in CSpace, which straddles only two pages (p1 and p2). 208 // 209 // This concept is similar, but not identical, to alignment. Even with a 210 // non-zero CPageSize, chunk start offsets are not necessarily aligned to 211 // page boundaries. For example, suppose that the chunk size was only 1040 212 // bytes, not 1500 bytes. No padding will be inserted, as both [1000, 2040) 213 // and [1024, 2064) straddle two pages: either pages p0 and p1, or pages p1 214 // and p2. 215 // 216 // Nonetheless, if all chunks (or all but the final chunk) have a size 217 // equal to (or just under) a multiple of the page size, then in practice, 218 // each chunk's starting offset will be aligned to a page boundary. 219 CPageSize uint64 220 221 // CChunkSize is the compressed size (i.e. size in CSpace) of each chunk. 222 // The final chunk might be smaller than this. 223 // 224 // This field is ignored if DChunkSize is non-zero. 225 CChunkSize uint64 226 227 // DChunkSize is the uncompressed size (i.e. size in DSpace) of each chunk. 228 // The final chunk might be smaller than this. 229 // 230 // If both CChunkSize and DChunkSize are non-zero, DChunkSize takes 231 // precedence and CChunkSize is ignored. 232 // 233 // If both CChunkSize and DChunkSize are zero, a default DChunkSize value 234 // will be used. 235 DChunkSize uint64 236 237 // cChunkSize and dChunkSize are copies of CChunkSize and DChunkSize, 238 // validated during the initialize method. For example, if both CChunkSize 239 // and DChunkSize are zero, dChunkSize will be defaultDChunkSize. 240 cChunkSize uint64 241 dChunkSize uint64 242 243 // err is the first error encountered. It is sticky: once a non-nil error 244 // occurs, all public methods will return that error. 245 err error 246 247 racWriter rac.Writer 248 zlibWriter *zlib.Writer 249 250 compressed bytes.Buffer 251 uncompressed writeBuffer 252} 253 254func (w *Writer) initialize() error { 255 if w.err != nil { 256 return w.err 257 } 258 if w.racWriter.Writer != nil { 259 // We're already initialized. 260 return nil 261 } 262 if w.Writer == nil { 263 w.err = errInvalidWriter 264 return w.err 265 } 266 267 if w.DChunkSize > 0 { 268 w.dChunkSize = w.DChunkSize 269 } else if w.CChunkSize > 0 { 270 w.cChunkSize = w.CChunkSize 271 if w.cChunkSize > maxCChunkSize { 272 w.cChunkSize = maxCChunkSize 273 } 274 } else { 275 w.dChunkSize = defaultDChunkSize 276 } 277 278 w.racWriter.Writer = w.Writer 279 w.racWriter.Codec = rac.CodecZlib 280 w.racWriter.IndexLocation = w.IndexLocation 281 w.racWriter.TempFile = w.TempFile 282 w.racWriter.CPageSize = w.CPageSize 283 284 w.zlibWriter, w.err = zlib.NewWriterLevel(&w.compressed, zlib.BestCompression) 285 return w.err 286} 287 288func (w *Writer) compress(dBytes0 []byte, dBytes1 []byte) ([]byte, error) { 289 w.compressed.Reset() 290 w.zlibWriter.Reset(&w.compressed) 291 if len(dBytes0) > 0 { 292 if _, err := w.zlibWriter.Write(dBytes0); err != nil { 293 w.err = err 294 return nil, err 295 } 296 } 297 if len(dBytes1) > 0 { 298 if _, err := w.zlibWriter.Write(dBytes1); err != nil { 299 w.err = err 300 return nil, err 301 } 302 } 303 if err := w.zlibWriter.Close(); err != nil { 304 w.err = err 305 return nil, err 306 } 307 return w.compressed.Bytes(), nil 308} 309 310// Write implements io.Writer. 311func (w *Writer) Write(p []byte) (int, error) { 312 if err := w.initialize(); err != nil { 313 return 0, err 314 } 315 w.uncompressed.extend(p) 316 n, err := len(p), w.write(false) 317 w.uncompressed.compact() 318 if err != nil { 319 return 0, err 320 } 321 return n, nil 322} 323 324func (w *Writer) write(eof bool) error { 325 if w.dChunkSize > 0 { 326 return w.writeDChunks(eof) 327 } 328 return w.writeCChunks(eof) 329} 330 331func (w *Writer) writeDChunks(eof bool) error { 332 for { 333 peek0, peek1 := w.uncompressed.peek(w.dChunkSize) 334 dSize := uint64(len(peek0)) + uint64(len(peek1)) 335 if dSize == 0 { 336 return nil 337 } 338 if !eof && (dSize < w.dChunkSize) { 339 return nil 340 } 341 342 peek1 = stripTrailingZeroes(peek1) 343 if len(peek1) == 0 { 344 peek0 = stripTrailingZeroes(peek0) 345 } 346 cBytes, err := w.compress(peek0, peek1) 347 if err != nil { 348 return err 349 } 350 351 if err := w.racWriter.AddChunk(dSize, cBytes, 0, 0); err != nil { 352 w.err = err 353 return err 354 } 355 w.uncompressed.advance(dSize) 356 } 357} 358 359func (w *Writer) writeCChunks(eof bool) error { 360 // Each outer loop iteration tries to write exactly one chunk. 361outer: 362 for { 363 // We don't know, a priori, how many w.uncompressed bytes are needed to 364 // produce a compressed chunk of size cChunkSize. We use a 365 // startingTargetDChunkSize that is a small multiple of cChunkSize, and 366 // keep doubling that until we build something at least as large as 367 // cChunkSize, then zlibcut it back to be exactly cChunkSize. 368 targetDChunkSize := uint64(maxTargetDChunkSize) 369 if !eof { 370 targetDChunkSize = startingTargetDChunkSize(w.cChunkSize) 371 } 372 373 if n := w.uncompressed.length(); n == 0 { 374 return nil 375 } else if !eof && (n < targetDChunkSize) { 376 return nil 377 } 378 379 // The inner loop keeps doubling the targetDChunkSize until it's big 380 // enough to produce at least cChunkSize bytes of compressed data. 381 for { 382 next := targetDChunkSize * 2 383 if next > maxTargetDChunkSize { 384 next = maxTargetDChunkSize 385 } 386 force := next <= targetDChunkSize 387 388 if err := w.tryCChunk(targetDChunkSize, force); err == nil { 389 continue outer 390 } else if err != errInternalShortCSize { 391 return err 392 } 393 394 if w.uncompressed.length() <= targetDChunkSize { 395 // Growing the targetDChunkSize isn't going to change anything. 396 return nil 397 } 398 targetDChunkSize = next 399 } 400 } 401} 402 403func (w *Writer) tryCChunk(targetDChunkSize uint64, force bool) error { 404 peek0, peek1 := w.uncompressed.peek(targetDChunkSize) 405 dSize := uint64(len(peek0)) + uint64(len(peek1)) 406 cBytes, err := w.compress(peek0, peek1) 407 if err != nil { 408 return err 409 } 410 411 switch { 412 case uint64(len(cBytes)) < w.cChunkSize: 413 if !force { 414 return errInternalShortCSize 415 } 416 fallthrough 417 case uint64(len(cBytes)) == w.cChunkSize: 418 if err := w.racWriter.AddChunk(dSize, cBytes, 0, 0); err != nil { 419 w.err = err 420 return err 421 } 422 w.uncompressed.advance(dSize) 423 return nil 424 } 425 426 eLen, dLen, err := zlibcut.Cut(nil, cBytes, int(w.cChunkSize)) 427 if err != nil { 428 w.err = err 429 return err 430 } 431 if dLen == 0 { 432 w.err = errCChunkSizeIsTooSmall 433 return w.err 434 } 435 if err := w.racWriter.AddChunk(uint64(dLen), cBytes[:eLen], 0, 0); err != nil { 436 w.err = err 437 return err 438 } 439 w.uncompressed.advance(uint64(dLen)) 440 return nil 441} 442 443// Close writes the RAC index to w.Writer and marks that w accepts no further 444// method calls. 445// 446// For a one pass encoding, no further action is taken. For a two pass encoding 447// (i.e. IndexLocationAtStart), it then copies w.TempFile to w.Writer. Either 448// way, if this method returns nil error, the entirety of what was written to 449// w.Writer constitutes a valid RAC file. 450// 451// Closing the underlying w.Writer, w.TempFile or both is the responsibility of 452// the raczlib.Writer caller, not the raczlib.Writer itself. 453// 454// It is not necessary to call Close, e.g. if an earlier Write call returned a 455// non-nil error. Unlike an os.File, failing to call raczlib.Writer.Close will 456// not leak resources such as file descriptors. 457func (w *Writer) Close() error { 458 if err := w.initialize(); err != nil { 459 return err 460 } 461 if err := w.write(true); err != nil { 462 return err 463 } 464 if err := w.racWriter.Close(); err != nil { 465 w.err = err 466 return err 467 } 468 w.err = errWriterIsClosed 469 return nil 470} 471