1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "puffin/src/include/puffin/utils.h"
6
7 #include <inttypes.h>
8
9 #include <algorithm>
10 #include <set>
11 #include <string>
12 #include <vector>
13
14 #include "puffin/src/bit_reader.h"
15 #include "puffin/src/file_stream.h"
16 #include "puffin/src/include/puffin/common.h"
17 #include "puffin/src/include/puffin/puffer.h"
18 #include "puffin/src/logging.h"
19 #include "puffin/src/memory_stream.h"
20 #include "puffin/src/puff_writer.h"
21
22 using std::set;
23 using std::string;
24 using std::vector;
25
26 namespace {
27 // Use memcpy to access the unaligned data of type |T|.
28 template <typename T>
get_unaligned(const void * address)29 inline T get_unaligned(const void* address) {
30 T result;
31 memcpy(&result, address, sizeof(T));
32 return result;
33 }
34
35 struct ExtentData {
36 puffin::BitExtent extent;
37 uint64_t byte_offset;
38 uint64_t byte_length;
39 const puffin::Buffer& data;
40
ExtentData__anon1c889f2d0111::ExtentData41 ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data)
42 : extent(in_extent), data(in_data) {
43 // Round start offset up and end offset down to exclude bits not in this
44 // extent. We simply ignore the bits at start and end that's not on byte
45 // boundary because as long as the majority of the bytes are the same,
46 // bsdiff will be able to reference it.
47 byte_offset = (extent.offset + 7) / 8;
48 uint64_t byte_end_offset = (extent.offset + extent.length) / 8;
49 CHECK(byte_end_offset <= data.size());
50 if (byte_end_offset > byte_offset) {
51 byte_length = byte_end_offset - byte_offset;
52 } else {
53 byte_length = 0;
54 }
55 }
56
Compare__anon1c889f2d0111::ExtentData57 int Compare(const ExtentData& other) const {
58 if (extent.length != other.extent.length) {
59 return extent.length < other.extent.length ? -1 : 1;
60 }
61 return memcmp(data.data() + byte_offset,
62 other.data.data() + other.byte_offset,
63 std::min(byte_length, other.byte_length));
64 }
operator <__anon1c889f2d0111::ExtentData65 bool operator<(const ExtentData& other) const { return Compare(other) < 0; }
operator ==__anon1c889f2d0111::ExtentData66 bool operator==(const ExtentData& other) const { return Compare(other) == 0; }
67 };
68
69 } // namespace
70
71 namespace puffin {
72
LocateDeflatesInDeflateStream(const uint8_t * data,uint64_t size,uint64_t virtual_offset,vector<BitExtent> * deflates,uint64_t * compressed_size)73 bool LocateDeflatesInDeflateStream(const uint8_t* data,
74 uint64_t size,
75 uint64_t virtual_offset,
76 vector<BitExtent>* deflates,
77 uint64_t* compressed_size) {
78 Puffer puffer;
79 BufferBitReader bit_reader(data, size);
80 BufferPuffWriter puff_writer(nullptr, 0);
81 vector<BitExtent> sub_deflates;
82 TEST_AND_RETURN_FALSE(
83 puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates));
84 for (const auto& deflate : sub_deflates) {
85 deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length);
86 }
87 if (compressed_size) {
88 *compressed_size = bit_reader.Offset();
89 }
90 return true;
91 }
92
93 // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
94 // definition of a zlib stream. For finding the deflate blocks, we relying on
95 // the proper size of the zlib stream in |data|. Basically the size of the zlib
96 // stream should be known before hand. Otherwise we need to parse the stream and
97 // find the location of compressed blocks using CalculateSizeOfDeflateBlock().
LocateDeflatesInZlib(const Buffer & data,vector<BitExtent> * deflates)98 bool LocateDeflatesInZlib(const Buffer& data, vector<BitExtent>* deflates) {
99 // A zlib stream has the following format:
100 // 0 1 compression method and flag
101 // 1 1 flag
102 // 2 4 preset dictionary (optional)
103 // 2 or 6 n compressed data
104 // n+(2 or 6) 4 Adler-32 checksum
105 TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer
106 uint16_t cmf = data[0];
107 auto compression_method = cmf & 0x0F;
108 // For deflate compression_method should be 8.
109 TEST_AND_RETURN_FALSE(compression_method == 8);
110
111 auto cinfo = (cmf & 0xF0) >> 4;
112 // Value greater than 7 is not allowed in deflate.
113 TEST_AND_RETURN_FALSE(cinfo <= 7);
114
115 auto flag = data[1];
116 TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
117
118 uint64_t header_len = 2;
119 if (flag & 0x20) {
120 header_len += 4; // 4 bytes for the preset dictionary.
121 }
122
123 // 4 is for ADLER32.
124 TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
125 data.data() + header_len, data.size() - header_len - 4, header_len,
126 deflates, nullptr));
127 return true;
128 }
129
FindDeflateSubBlocks(const UniqueStreamPtr & src,const vector<ByteExtent> & deflates,vector<BitExtent> * subblock_deflates)130 bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
131 const vector<ByteExtent>& deflates,
132 vector<BitExtent>* subblock_deflates) {
133 Puffer puffer;
134 Buffer deflate_buffer;
135 for (const auto& deflate : deflates) {
136 TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
137 // Read from src into deflate_buffer.
138 deflate_buffer.resize(deflate.length);
139 TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
140
141 // Find all the subblocks.
142 BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
143 // The uncompressed blocks will be ignored since we are passing a null
144 // buffered puff writer and a valid deflate locations output array. This
145 // should not happen in the puffdiff or anywhere else by default.
146 BufferPuffWriter puff_writer(nullptr, 0);
147 vector<BitExtent> subblocks;
148 TEST_AND_RETURN_FALSE(
149 puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
150 TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
151 for (const auto& subblock : subblocks) {
152 subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
153 subblock.length);
154 }
155 }
156 return true;
157 }
158
LocateDeflatesInZlibBlocks(const string & file_path,const vector<ByteExtent> & zlibs,vector<BitExtent> * deflates)159 bool LocateDeflatesInZlibBlocks(const string& file_path,
160 const vector<ByteExtent>& zlibs,
161 vector<BitExtent>* deflates) {
162 auto src = FileStream::Open(file_path, true, false);
163 TEST_AND_RETURN_FALSE(src);
164
165 Buffer buffer;
166 for (const auto& zlib : zlibs) {
167 buffer.resize(zlib.length);
168 TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
169 TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
170 vector<BitExtent> tmp_deflates;
171 TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates));
172 for (const auto& deflate : tmp_deflates) {
173 deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length);
174 }
175 }
176 return true;
177 }
178
179 // For more information about gzip format, refer to RFC 1952 located at:
180 // https://www.ietf.org/rfc/rfc1952.txt
LocateDeflatesInGzip(const Buffer & data,vector<BitExtent> * deflates)181 bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) {
182 uint64_t member_start = 0;
183 while (member_start + 10 <= data.size() && data[member_start + 0] == 0x1F &&
184 data[member_start + 1] == 0x8B && data[member_start + 2] == 8) {
185 // Each member entry has the following format
186 // 0 1 0x1F
187 // 1 1 0x8B
188 // 2 1 compression method (8 denotes deflate)
189 // 3 1 set of flags
190 // 4 4 modification time
191 // 8 1 extra flags
192 // 9 1 operating system
193
194 uint64_t offset = member_start + 10;
195 int flag = data[member_start + 3];
196 // Extra field
197 if (flag & 4) {
198 TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
199 uint16_t extra_length = data[offset++];
200 extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
201 TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
202 offset += extra_length;
203 }
204 // File name field
205 if (flag & 8) {
206 while (true) {
207 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
208 if (data[offset++] == 0) {
209 break;
210 }
211 }
212 }
213 // File comment field
214 if (flag & 16) {
215 while (true) {
216 TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
217 if (data[offset++] == 0) {
218 break;
219 }
220 }
221 }
222 // CRC16 field
223 if (flag & 2) {
224 offset += 2;
225 }
226
227 uint64_t compressed_size = 0;
228 TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
229 data.data() + offset, data.size() - offset, offset, deflates,
230 &compressed_size));
231 offset += compressed_size;
232
233 // Ignore CRC32 and uncompressed size.
234 TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
235 offset += 8;
236 member_start = offset;
237 }
238 // Return true if we've successfully parsed at least one gzip.
239 return member_start != 0;
240 }
241
242 // For more information about the zip format, refer to
243 // https://support.pkware.com/display/PKZIP/APPNOTE
LocateDeflatesInZipArchive(const Buffer & data,vector<BitExtent> * deflates)244 bool LocateDeflatesInZipArchive(const Buffer& data,
245 vector<BitExtent>* deflates) {
246 uint64_t pos = 0;
247 while (pos <= data.size() - 30) {
248 // TODO(xunchang) add support for big endian system when searching for
249 // magic numbers.
250 if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
251 pos++;
252 continue;
253 }
254
255 // local file header format
256 // 0 4 0x04034b50
257 // 4 2 minimum version needed to extract
258 // 6 2 general purpose bit flag
259 // 8 2 compression method
260 // 10 4 file last modification date & time
261 // 14 4 CRC-32
262 // 18 4 compressed size
263 // 22 4 uncompressed size
264 // 26 2 file name length
265 // 28 2 extra field length
266 // 30 n file name
267 // 30+n m extra field
268 auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
269 if (compression_method != 8) { // non-deflate type
270 pos += 4;
271 continue;
272 }
273
274 auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
275 auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
276 auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
277 uint64_t header_size = 30 + file_name_length + extra_field_length;
278
279 // sanity check
280 if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
281 pos > data.size() - header_size - compressed_size) {
282 pos += 4;
283 continue;
284 }
285
286 vector<BitExtent> tmp_deflates;
287 uint64_t offset = pos + header_size;
288 uint64_t calculated_compressed_size = 0;
289 if (!LocateDeflatesInDeflateStream(
290 data.data() + offset, data.size() - offset, offset, &tmp_deflates,
291 &calculated_compressed_size)) {
292 LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
293 << ", skip adding deflates for this entry.";
294 pos += 4;
295 continue;
296 }
297
298 // Double check the compressed size if it is available in the file header.
299 if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
300 LOG(WARNING) << "Compressed size in the file header: " << compressed_size
301 << " doesn't equal the real size: "
302 << calculated_compressed_size;
303 }
304
305 deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end());
306 pos += header_size + calculated_compressed_size;
307 }
308
309 return true;
310 }
311
FindPuffLocations(const UniqueStreamPtr & src,const vector<BitExtent> & deflates,vector<ByteExtent> * puffs,uint64_t * out_puff_size)312 bool FindPuffLocations(const UniqueStreamPtr& src,
313 const vector<BitExtent>& deflates,
314 vector<ByteExtent>* puffs,
315 uint64_t* out_puff_size) {
316 Puffer puffer;
317 Buffer deflate_buffer;
318
319 // Here accumulate the size difference between each corresponding deflate and
320 // puff. At the end we add this cummulative size difference to the size of the
321 // deflate stream to get the size of the puff stream. We use signed size
322 // because puff size could be smaller than deflate size.
323 int64_t total_size_difference = 0;
324 for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
325 // Read from src into deflate_buffer.
326 auto start_byte = deflate->offset / 8;
327 auto end_byte = (deflate->offset + deflate->length + 7) / 8;
328 deflate_buffer.resize(end_byte - start_byte);
329 TEST_AND_RETURN_FALSE(src->Seek(start_byte));
330 TEST_AND_RETURN_FALSE(
331 src->Read(deflate_buffer.data(), deflate_buffer.size()));
332 // Find the size of the puff.
333 BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
334 uint64_t bits_to_skip = deflate->offset % 8;
335 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
336 bit_reader.DropBits(bits_to_skip);
337
338 BufferPuffWriter puff_writer(nullptr, 0);
339 TEST_AND_RETURN_FALSE(
340 puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
341 TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
342
343 // 1 if a deflate ends at the same byte that the next deflate starts and
344 // there is a few bits gap between them. In practice this may never happen,
345 // but it is a good idea to support it anyways. If there is a gap, the value
346 // of the gap will be saved as an integer byte to the puff stream. The parts
347 // of the byte that belogs to the deflates are shifted out.
348 int gap = 0;
349 if (deflate != deflates.begin()) {
350 auto prev_deflate = std::prev(deflate);
351 if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
352 // If deflates are on byte boundary the gap will not be counted later,
353 // so we won't worry about it.
354 && (deflate->offset % 8 != 0)) {
355 gap = 1;
356 }
357 }
358
359 start_byte = ((deflate->offset + 7) / 8);
360 end_byte = (deflate->offset + deflate->length) / 8;
361 int64_t deflate_length_in_bytes = end_byte - start_byte;
362
363 // If there was no gap bits between the current and previous deflates, there
364 // will be no extra gap byte, so the offset will be shifted one byte back.
365 auto puff_offset = start_byte - gap + total_size_difference;
366 auto puff_size = puff_writer.Size();
367 // Add the location into puff.
368 puffs->emplace_back(puff_offset, puff_size);
369 total_size_difference +=
370 static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
371 }
372
373 uint64_t src_size;
374 TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
375 auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
376 TEST_AND_RETURN_FALSE(final_size >= 0);
377 *out_puff_size = final_size;
378 return true;
379 }
380
RemoveEqualBitExtents(const Buffer & data1,const Buffer & data2,vector<BitExtent> * extents1,vector<BitExtent> * extents2)381 void RemoveEqualBitExtents(const Buffer& data1,
382 const Buffer& data2,
383 vector<BitExtent>* extents1,
384 vector<BitExtent>* extents2) {
385 set<ExtentData> extent1_set, equal_extents;
386 for (const BitExtent& ext : *extents1) {
387 extent1_set.emplace(ext, data1);
388 }
389
390 auto new_extents2_end = extents2->begin();
391 for (const BitExtent& ext : *extents2) {
392 ExtentData extent_data(ext, data2);
393 if (extent1_set.find(extent_data) != extent1_set.end()) {
394 equal_extents.insert(extent_data);
395 } else {
396 *new_extents2_end++ = ext;
397 }
398 }
399 extents2->erase(new_extents2_end, extents2->end());
400 extents1->erase(
401 std::remove_if(extents1->begin(), extents1->end(),
402 [&equal_extents, &data1](const BitExtent& ext) {
403 return equal_extents.find(ExtentData(ext, data1)) !=
404 equal_extents.end();
405 }),
406 extents1->end());
407 }
408
RemoveDeflatesWithBadDistanceCaches(const Buffer & data,vector<BitExtent> * deflates)409 bool RemoveDeflatesWithBadDistanceCaches(const Buffer& data,
410 vector<BitExtent>* deflates) {
411 Puffer puffer(true /* exclude_bad_distance_caches */);
412 for (auto def = deflates->begin(); def != deflates->end();) {
413 uint64_t offset = def->offset / 8;
414 uint64_t length = (def->offset + def->length + 7) / 8 - offset;
415 BufferBitReader br(&data[offset], length);
416 BufferPuffWriter pw(nullptr, 0);
417
418 // Drop the first few bits in the buffer so we start exactly where the
419 // deflate starts.
420 uint64_t bits_to_drop = def->offset % 8;
421 TEST_AND_RETURN_FALSE(br.CacheBits(bits_to_drop));
422 br.DropBits(bits_to_drop);
423
424 vector<BitExtent> defs_out;
425 TEST_AND_RETURN_FALSE(puffer.PuffDeflate(&br, &pw, &defs_out));
426
427 TEST_AND_RETURN_FALSE(defs_out.size() <= 1);
428 if (defs_out.size() == 0) {
429 // This is a deflate we were looking for, remove it.
430 def = deflates->erase(def);
431 } else {
432 ++def;
433 }
434 }
435 return true;
436 }
437
438 } // namespace puffin
439