1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "puffin/src/include/puffin/utils.h"
6 
7 #include <inttypes.h>
8 
9 #include <algorithm>
10 #include <iterator>
11 #include <set>
12 #include <string>
13 #include <vector>
14 
15 #include "puffin/file_stream.h"
16 #include "puffin/src/bit_reader.h"
17 #include "puffin/src/include/puffin/common.h"
18 #include "puffin/src/include/puffin/puffer.h"
19 #include "puffin/src/logging.h"
20 #include "puffin/memory_stream.h"
21 #include "puffin/src/puff_writer.h"
22 
23 using std::set;
24 using std::string;
25 using std::vector;
26 
27 namespace {
28 // Use memcpy to access the unaligned data of type |T|.
29 template <typename T>
get_unaligned(const void * address)30 inline T get_unaligned(const void* address) {
31   T result;
32   memcpy(&result, address, sizeof(T));
33   return result;
34 }
35 
36 struct ExtentData {
37   puffin::BitExtent extent;
38   uint64_t byte_offset;
39   uint64_t byte_length;
40   const puffin::Buffer& data;
41 
ExtentData__anoncc4cb5900111::ExtentData42   ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data)
43       : extent(in_extent), data(in_data) {
44     // Round start offset up and end offset down to exclude bits not in this
45     // extent. We simply ignore the bits at start and end that's not on byte
46     // boundary because as long as the majority of the bytes are the same,
47     // bsdiff will be able to reference it.
48     byte_offset = (extent.offset + 7) / 8;
49     uint64_t byte_end_offset = (extent.offset + extent.length) / 8;
50     CHECK(byte_end_offset <= data.size());
51     if (byte_end_offset > byte_offset) {
52       byte_length = byte_end_offset - byte_offset;
53     } else {
54       byte_length = 0;
55     }
56   }
57 
Compare__anoncc4cb5900111::ExtentData58   int Compare(const ExtentData& other) const {
59     if (extent.length != other.extent.length) {
60       return extent.length < other.extent.length ? -1 : 1;
61     }
62     return memcmp(data.data() + byte_offset,
63                   other.data.data() + other.byte_offset,
64                   std::min(byte_length, other.byte_length));
65   }
operator <__anoncc4cb5900111::ExtentData66   bool operator<(const ExtentData& other) const { return Compare(other) < 0; }
operator ==__anoncc4cb5900111::ExtentData67   bool operator==(const ExtentData& other) const { return Compare(other) == 0; }
68 };
69 
70 }  // namespace
71 
72 namespace puffin {
73 
LocateDeflatesInDeflateStream(const uint8_t * data,uint64_t size,uint64_t virtual_offset,vector<BitExtent> * deflates,uint64_t * compressed_size)74 bool LocateDeflatesInDeflateStream(const uint8_t* data,
75                                    uint64_t size,
76                                    uint64_t virtual_offset,
77                                    vector<BitExtent>* deflates,
78                                    uint64_t* compressed_size) {
79   Puffer puffer;
80   BufferBitReader bit_reader(data, size);
81   BufferPuffWriter puff_writer(nullptr, 0);
82   vector<BitExtent> sub_deflates;
83   TEST_AND_RETURN_FALSE(
84       puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates));
85   for (const auto& deflate : sub_deflates) {
86     deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length);
87   }
88   if (compressed_size) {
89     *compressed_size = bit_reader.Offset();
90   }
91   return true;
92 }
93 
94 // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
95 // definition of a zlib stream.  For finding the deflate blocks, we relying on
96 // the proper size of the zlib stream in |data|. Basically the size of the zlib
97 // stream should be known before hand. Otherwise we need to parse the stream and
98 // find the location of compressed blocks using CalculateSizeOfDeflateBlock().
LocateDeflatesInZlib(const Buffer & data,vector<BitExtent> * deflates)99 bool LocateDeflatesInZlib(const Buffer& data, vector<BitExtent>* deflates) {
100   // A zlib stream has the following format:
101   // 0           1     compression method and flag
102   // 1           1     flag
103   // 2           4     preset dictionary (optional)
104   // 2 or 6      n     compressed data
105   // n+(2 or 6)  4     Adler-32 checksum
106   TEST_AND_RETURN_FALSE(data.size() >= 6 + 4);  // Header + Footer
107   uint16_t cmf = data[0];
108   auto compression_method = cmf & 0x0F;
109   // For deflate compression_method should be 8.
110   TEST_AND_RETURN_FALSE(compression_method == 8);
111 
112   auto cinfo = (cmf & 0xF0) >> 4;
113   // Value greater than 7 is not allowed in deflate.
114   TEST_AND_RETURN_FALSE(cinfo <= 7);
115 
116   auto flag = data[1];
117   TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
118 
119   uint64_t header_len = 2;
120   if (flag & 0x20) {
121     header_len += 4;  // 4 bytes for the preset dictionary.
122   }
123 
124   // 4 is for ADLER32.
125   TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
126       data.data() + header_len, data.size() - header_len - 4, header_len,
127       deflates, nullptr));
128   return true;
129 }
130 
FindDeflateSubBlocks(const UniqueStreamPtr & src,const vector<ByteExtent> & deflates,vector<BitExtent> * subblock_deflates)131 bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
132                           const vector<ByteExtent>& deflates,
133                           vector<BitExtent>* subblock_deflates) {
134   Puffer puffer;
135   Buffer deflate_buffer;
136   for (const auto& deflate : deflates) {
137     TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
138     // Read from src into deflate_buffer.
139     deflate_buffer.resize(deflate.length);
140     TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
141 
142     // Find all the subblocks.
143     BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
144     // The uncompressed blocks will be ignored since we are passing a null
145     // buffered puff writer and a valid deflate locations output array. This
146     // should not happen in the puffdiff or anywhere else by default.
147     BufferPuffWriter puff_writer(nullptr, 0);
148     vector<BitExtent> subblocks;
149     TEST_AND_RETURN_FALSE(
150         puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
151     TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
152     for (const auto& subblock : subblocks) {
153       subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
154                                       subblock.length);
155     }
156   }
157   return true;
158 }
159 
LocateDeflatesInZlibBlocks(const string & file_path,const vector<ByteExtent> & zlibs,vector<BitExtent> * deflates)160 bool LocateDeflatesInZlibBlocks(const string& file_path,
161                                 const vector<ByteExtent>& zlibs,
162                                 vector<BitExtent>* deflates) {
163   auto src = FileStream::Open(file_path, true, false);
164   TEST_AND_RETURN_FALSE(src);
165 
166   Buffer buffer;
167   for (const auto& zlib : zlibs) {
168     buffer.resize(zlib.length);
169     TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
170     TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
171     vector<BitExtent> tmp_deflates;
172     TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates));
173     for (const auto& deflate : tmp_deflates) {
174       deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length);
175     }
176   }
177   return true;
178 }
179 
180 namespace {
181 // For more information about gzip format, refer to RFC 1952 located at:
182 // https://www.ietf.org/rfc/rfc1952.txt
IsValidGzipHeader(const uint8_t * header,size_t size)183 bool IsValidGzipHeader(const uint8_t* header, size_t size) {
184   // Each gzip entry has the following format magic header:
185   // 0      1     0x1F
186   // 1      1     0x8B
187   // 2      1     compression method (8 denotes deflate)
188   static constexpr uint8_t magic[] = {0x1F, 0x8B, 8};
189   return size >= 10 && std::equal(std::begin(magic), std::end(magic), header);
190 }
191 }  // namespace
192 
LocateDeflatesInGzip(const Buffer & data,vector<BitExtent> * deflates)193 bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) {
194   TEST_AND_RETURN_FALSE(IsValidGzipHeader(data.data(), data.size()));
195   uint64_t member_start = 0;
196   do {
197     // After the magic header, the gzip contains:
198     // 3      1     set of flags
199     // 4      4     modification time
200     // 8      1     extra flags
201     // 9      1     operating system
202 
203     uint64_t offset = member_start + 10;
204     int flag = data[member_start + 3];
205     // Extra field
206     if (flag & 4) {
207       TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
208       uint16_t extra_length = data[offset++];
209       extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
210       TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
211       offset += extra_length;
212     }
213     // File name field
214     if (flag & 8) {
215       while (true) {
216         TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
217         if (data[offset++] == 0) {
218           break;
219         }
220       }
221     }
222     // File comment field
223     if (flag & 16) {
224       while (true) {
225         TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
226         if (data[offset++] == 0) {
227           break;
228         }
229       }
230     }
231     // CRC16 field
232     if (flag & 2) {
233       offset += 2;
234     }
235 
236     uint64_t compressed_size = 0;
237     TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
238         data.data() + offset, data.size() - offset, offset, deflates,
239         &compressed_size));
240     offset += compressed_size;
241 
242     // Ignore CRC32 and uncompressed size.
243     offset += 8;
244     member_start = offset;
245   } while (member_start < data.size() &&
246            IsValidGzipHeader(&data[member_start], data.size() - member_start));
247   return true;
248 }
249 
250 // For more information about the zip format, refer to
251 // https://support.pkware.com/display/PKZIP/APPNOTE
LocateDeflatesInZipArchive(const Buffer & data,vector<BitExtent> * deflates)252 bool LocateDeflatesInZipArchive(const Buffer& data,
253                                 vector<BitExtent>* deflates) {
254   uint64_t pos = 0;
255   while (pos + 30 <= data.size()) {
256     // TODO(xunchang) add support for big endian system when searching for
257     // magic numbers.
258     if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
259       pos++;
260       continue;
261     }
262 
263     // local file header format
264     // 0      4     0x04034b50
265     // 4      2     minimum version needed to extract
266     // 6      2     general purpose bit flag
267     // 8      2     compression method
268     // 10     4     file last modification date & time
269     // 14     4     CRC-32
270     // 18     4     compressed size
271     // 22     4     uncompressed size
272     // 26     2     file name length
273     // 28     2     extra field length
274     // 30     n     file name
275     // 30+n   m     extra field
276     auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
277     if (compression_method != 8) {  // non-deflate type
278       pos += 4;
279       continue;
280     }
281 
282     auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
283     auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
284     auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
285     uint64_t header_size = 30 + file_name_length + extra_field_length;
286 
287     // sanity check
288     if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
289         pos > data.size() - header_size - compressed_size) {
290       pos += 4;
291       continue;
292     }
293 
294     vector<BitExtent> tmp_deflates;
295     uint64_t offset = pos + header_size;
296     uint64_t calculated_compressed_size = 0;
297     if (!LocateDeflatesInDeflateStream(
298             data.data() + offset, data.size() - offset, offset, &tmp_deflates,
299             &calculated_compressed_size)) {
300       LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
301                  << ", skip adding deflates for this entry.";
302       pos += 4;
303       continue;
304     }
305 
306     // Double check the compressed size if it is available in the file header.
307     if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
308       LOG(WARNING) << "Compressed size in the file header: " << compressed_size
309                    << " doesn't equal the real size: "
310                    << calculated_compressed_size;
311     }
312 
313     deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end());
314     pos += header_size + calculated_compressed_size;
315   }
316 
317   return true;
318 }
319 
FindPuffLocations(const UniqueStreamPtr & src,const vector<BitExtent> & deflates,vector<ByteExtent> * puffs,uint64_t * out_puff_size)320 bool FindPuffLocations(const UniqueStreamPtr& src,
321                        const vector<BitExtent>& deflates,
322                        vector<ByteExtent>* puffs,
323                        uint64_t* out_puff_size) {
324   Puffer puffer;
325   Buffer deflate_buffer;
326 
327   // Here accumulate the size difference between each corresponding deflate and
328   // puff. At the end we add this cummulative size difference to the size of the
329   // deflate stream to get the size of the puff stream. We use signed size
330   // because puff size could be smaller than deflate size.
331   int64_t total_size_difference = 0;
332   for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
333     // Read from src into deflate_buffer.
334     auto start_byte = deflate->offset / 8;
335     auto end_byte = (deflate->offset + deflate->length + 7) / 8;
336     deflate_buffer.resize(end_byte - start_byte);
337     TEST_AND_RETURN_FALSE(src->Seek(start_byte));
338     TEST_AND_RETURN_FALSE(
339         src->Read(deflate_buffer.data(), deflate_buffer.size()));
340     // Find the size of the puff.
341     BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
342     uint64_t bits_to_skip = deflate->offset % 8;
343     TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
344     bit_reader.DropBits(bits_to_skip);
345 
346     BufferPuffWriter puff_writer(nullptr, 0);
347     TEST_AND_RETURN_FALSE(
348         puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
349     TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
350 
351     // 1 if a deflate ends at the same byte that the next deflate starts and
352     // there is a few bits gap between them. In practice this may never happen,
353     // but it is a good idea to support it anyways. If there is a gap, the value
354     // of the gap will be saved as an integer byte to the puff stream. The parts
355     // of the byte that belogs to the deflates are shifted out.
356     int gap = 0;
357     if (deflate != deflates.begin()) {
358       auto prev_deflate = std::prev(deflate);
359       if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
360           // If deflates are on byte boundary the gap will not be counted later,
361           // so we won't worry about it.
362           && (deflate->offset % 8 != 0)) {
363         gap = 1;
364       }
365     }
366 
367     start_byte = ((deflate->offset + 7) / 8);
368     end_byte = (deflate->offset + deflate->length) / 8;
369     int64_t deflate_length_in_bytes = end_byte - start_byte;
370 
371     // If there was no gap bits between the current and previous deflates, there
372     // will be no extra gap byte, so the offset will be shifted one byte back.
373     auto puff_offset = start_byte - gap + total_size_difference;
374     auto puff_size = puff_writer.Size();
375     // Add the location into puff.
376     puffs->emplace_back(puff_offset, puff_size);
377     total_size_difference +=
378         static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
379   }
380 
381   uint64_t src_size;
382   TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
383   auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
384   TEST_AND_RETURN_FALSE(final_size >= 0);
385   *out_puff_size = final_size;
386   return true;
387 }
388 
RemoveEqualBitExtents(const Buffer & data1,const Buffer & data2,vector<BitExtent> * extents1,vector<BitExtent> * extents2)389 void RemoveEqualBitExtents(const Buffer& data1,
390                            const Buffer& data2,
391                            vector<BitExtent>* extents1,
392                            vector<BitExtent>* extents2) {
393   set<ExtentData> extent1_set, equal_extents;
394   for (const BitExtent& ext : *extents1) {
395     extent1_set.emplace(ext, data1);
396   }
397 
398   auto new_extents2_end = extents2->begin();
399   for (const BitExtent& ext : *extents2) {
400     ExtentData extent_data(ext, data2);
401     if (extent1_set.find(extent_data) != extent1_set.end()) {
402       equal_extents.insert(extent_data);
403     } else {
404       *new_extents2_end++ = ext;
405     }
406   }
407   extents2->erase(new_extents2_end, extents2->end());
408   extents1->erase(
409       std::remove_if(extents1->begin(), extents1->end(),
410                      [&equal_extents, &data1](const BitExtent& ext) {
411                        return equal_extents.find(ExtentData(ext, data1)) !=
412                               equal_extents.end();
413                      }),
414       extents1->end());
415 }
416 
RemoveDeflatesWithBadDistanceCaches(const Buffer & data,vector<BitExtent> * deflates)417 bool RemoveDeflatesWithBadDistanceCaches(const Buffer& data,
418                                          vector<BitExtent>* deflates) {
419   Puffer puffer(true /* exclude_bad_distance_caches */);
420   for (auto def = deflates->begin(); def != deflates->end();) {
421     uint64_t offset = def->offset / 8;
422     uint64_t length = (def->offset + def->length + 7) / 8 - offset;
423     BufferBitReader br(&data[offset], length);
424     BufferPuffWriter pw(nullptr, 0);
425 
426     // Drop the first few bits in the buffer so we start exactly where the
427     // deflate starts.
428     uint64_t bits_to_drop = def->offset % 8;
429     TEST_AND_RETURN_FALSE(br.CacheBits(bits_to_drop));
430     br.DropBits(bits_to_drop);
431 
432     vector<BitExtent> defs_out;
433     TEST_AND_RETURN_FALSE(puffer.PuffDeflate(&br, &pw, &defs_out));
434 
435     TEST_AND_RETURN_FALSE(defs_out.size() <= 1);
436     if (defs_out.size() == 0) {
437       // This is a deflate we were looking for, remove it.
438       def = deflates->erase(def);
439     } else {
440       ++def;
441     }
442   }
443   return true;
444 }
445 
446 }  // namespace puffin
447