• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2022 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_
18 #define SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_
19 
20 #include <stdint.h>
21 
22 #include <functional>
23 #include <memory>
24 #include <string>
25 #include <vector>
26 
27 #include "perfetto/base/status.h"
28 #include "perfetto/ext/base/string_view.h"
29 
30 // ZipReader allows reading Zip files in a streaming fashion.
31 // Key features:
32 // - Read-only access, there is no ZipWriter.
33 // - Files can be processed as they are seen in the zip archive, without needing
34 //   to see the whole .zip file first.
35 // - It does not read the final zip central directory. Only the metadata in the
36 //   inline file headers is exposed.
37 // - Only the compressed payload is kept around in memory.
38 // - Supports line-based streaming for compressed text files (e.g. logs). This
39 //   enables line-based processing of compressed logs without having to
40 //   fully decompress each individual text file in memory.
41 // - Does NOT support zip64, encryption and other advanced zip file features.
42 // - It is not suitable for security-sensitive contexts. E.g. it doesn't deal
43 //   with zip path traversal attacks or with duplicate entries (the same file
44 //   showing up twice with two different payloads).
45 //
46 // Possible future features:
47 // - The user could setup a filter (a glob, or a callback) to select the
48 //   interesting files (e.g. *.txt) and skip the appending of the other entries.
49 //   This would avoid completely the cost of keeping in memory the compressed
50 //   payload of unwanted files (e.g. dumpstate.bin in BRs).
51 namespace perfetto {
52 namespace trace_processor {
53 namespace util {
54 
class ZipReader;

// Size, in bytes, of the fixed-length part of a per-file zip header. This is
// the sum of the fixed-width fields of ZipFile::Header (everything except the
// variable-length file name) and the size of the raw header buffer that
// ZipReader accumulates while parsing.
constexpr size_t kZipFileHdrSize = 30;
58 
59 // Holds the metadata and compressed payload of a zip file and allows
60 // decompression. The lifecycle of a ZipFile is completely independent of the
61 // ZipReader that created it. ZipFile(s) can be std::move(d) around and even
62 // outlive the ZipReader.
63 class ZipFile {
64  public:
65   // Note: the lifetime of the lines passed in the vector argument is valid only
66   // for the duration of the callback. Don't retain the StringView(s) passed.
67   using LinesCallback =
68       std::function<void(const std::vector<base::StringView>&)>;
69 
70   ZipFile();
71   ~ZipFile();
72   ZipFile(ZipFile&&) noexcept;
73   ZipFile& operator=(ZipFile&&) noexcept;
74   ZipFile(const ZipFile&) = delete;
75   ZipFile& operator=(const ZipFile&) = delete;
76 
77   // Bulk decompression. It keeps around the compressed data internally, so
78   // this can be called several times.
79   base::Status Decompress(std::vector<uint8_t>*) const;
80 
81   // Streaming line-based decompression for text files.
82   // It decompresses the file in chunks and passes batches of lines to the
83   // caller, without decompressing the whole file into memory.
84   // The typical use case is processing large log files from a bugreport.
85   // Like the above, this is idempotent and keeps around the compressed data.
86   base::Status DecompressLines(LinesCallback) const;
87 
88   // File name, including the relative path (e.g., "FS/data/misc/foobar")
name()89   const std::string& name() const { return hdr_.fname; }
90 
91   // Seconds since the Epoch. This is effectively time_t on 64 bit platforms.
92   int64_t GetDatetime() const;
93 
94   // Returns the modified time in the format %Y-%m-%d %H:%M:%S.
95   std::string GetDatetimeStr() const;
96 
uncompressed_size()97   size_t uncompressed_size() const { return hdr_.uncompressed_size; }
compressed_size()98   size_t compressed_size() const { return hdr_.compressed_size; }
99 
100  private:
101   friend class ZipReader;
102 
103   base::Status DoDecompressionChecks() const;
104 
105   // Rationale for having this as a nested sub-struct:
106   // 1. Makes the move operator easier to maintain.
107   // 2. Allows the ZipReader to handle a copy of this struct for the file
108   //    being parsed. ZipReade will move the hdr into a full ZipFile once it
109   //    has established the file is complete and valid.
110   struct Header {
111     uint32_t signature = 0;
112     uint16_t version = 0;
113     uint16_t flags = 0;
114     uint16_t compression = 0;
115     uint32_t checksum = 0;
116     uint16_t mtime = 0;
117     uint16_t mdate = 0;
118     uint32_t compressed_size = 0;
119     uint32_t uncompressed_size = 0;
120     uint16_t fname_len = 0;
121     uint16_t extra_field_len = 0;
122     std::string fname;
123   };
124 
125   Header hdr_{};
126   std::unique_ptr<uint8_t[]> compressed_data_;
127   // If adding new fields here, remember to update the move operators.
128 };
129 
130 class ZipReader {
131  public:
132   ZipReader();
133   ~ZipReader();
134 
135   ZipReader(const ZipReader&) = delete;
136   ZipReader& operator=(const ZipReader&) = delete;
137   ZipReader(ZipReader&&) = delete;
138   ZipReader& operator=(ZipReader&&) = delete;
139 
140   // Parses data incrementally from a zip file in chunks. The chunks can be
141   // arbitrarily cut. You can pass the whole file in one go, byte by byte or
142   // anything in between.
143   // files() is updated incrementally as soon as a new whole compressed file
144   // has been processed. You don't need to get to the end of the zip file to
145   // see all files. The final "central directory" at the end of the file is
146   // actually ignored.
147   base::Status Parse(const void* data, size_t len);
148 
149   // Returns a list of all the files discovered so far.
files()150   const std::vector<ZipFile>& files() const { return files_; }
151 
152   // Moves ownership of the ZipFiles to the caller. The caller can use this
153   // to reduce the memory working set and retain only the files they care about.
TakeFiles()154   std::vector<ZipFile> TakeFiles() { return std::move(files_); }
155 
156   // Find a file by its path inside the zip archive.
157   ZipFile* Find(const std::string& path);
158 
159  private:
160   // Keeps track of the incremental parsing state of the current zip stream.
161   // When a compressed file is completely parsed, a ZipFile instance is
162   // constructed and appended to `files_`.
163   struct FileParseState {
164     uint8_t raw_hdr[kZipFileHdrSize]{};
165     size_t raw_hdr_size = 0;  // Actual bytes seen for `hdr_`.
166     std::unique_ptr<uint8_t[]> compressed_data;
167     size_t compressed_data_written = 0;
168     size_t ignore_bytes_after_fname = 0;
169     ZipFile::Header hdr{};
170   };
171   FileParseState cur_;
172   std::vector<ZipFile> files_;
173 };
174 
175 }  // namespace util
176 }  // namespace trace_processor
177 }  // namespace perfetto
178 
179 #endif  // SRC_TRACE_PROCESSOR_UTIL_ZIP_READER_H_
180