• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "components/zucchini/equivalence_map.h"
6 
7 #include <algorithm>
8 #include <utility>
9 
10 #include "base/containers/cxx20_erase.h"
11 #include "base/logging.h"
12 #include "base/numerics/safe_conversions.h"
13 #include "components/zucchini/encoded_view.h"
14 #include "components/zucchini/patch_reader.h"
15 #include "components/zucchini/suffix_array.h"
16 
17 namespace zucchini {
18 
19 namespace {
20 
21 // TODO(haungs): Tune these numbers to improve pathological case results.
22 
23 // In pathological cases Zucchini can exhibit O(n^2) behavior if the seed
24 // selection process runs to completion. To prevent this we impose a quota for
25 // the total length of equivalences the seed selection process can perform
26 // trials on. For regular use cases it is unlikely this quota will be exceeded,
27 // and if it is the effects on patch size are expected to be small.
28 constexpr uint64_t kSeedSelectionTotalVisitLengthQuota = 1 << 18;  // 256 KiB
29 
30 // The aforementioned quota alone is insufficient, as exploring backwards will
31 // still be very successful resulting in O(n) behavior in the case of a limited
32 // seed selection trials. This results in O(n^2) behavior returning. To mitigate
33 // this we also impose a cap on the ExtendEquivalenceBackward() exploration.
34 constexpr offset_t kBackwardsExtendLimit = 1 << 16;  // 64 KiB
35 
36 }  // namespace
37 
38 /******** Utility Functions ********/
39 
GetTokenSimilarity(const ImageIndex & old_image_index,const ImageIndex & new_image_index,const std::vector<TargetsAffinity> & targets_affinities,offset_t src,offset_t dst)40 double GetTokenSimilarity(
41     const ImageIndex& old_image_index,
42     const ImageIndex& new_image_index,
43     const std::vector<TargetsAffinity>& targets_affinities,
44     offset_t src,
45     offset_t dst) {
46   DCHECK(old_image_index.IsToken(src));
47   DCHECK(new_image_index.IsToken(dst));
48 
49   TypeTag old_type = old_image_index.LookupType(src);
50   TypeTag new_type = new_image_index.LookupType(dst);
51   if (old_type != new_type)
52     return kMismatchFatal;
53 
54   // Raw comparison.
55   if (!old_image_index.IsReference(src) && !new_image_index.IsReference(dst)) {
56     return old_image_index.GetRawValue(src) == new_image_index.GetRawValue(dst)
57                ? 1.0
58                : -1.5;
59   }
60 
61   const ReferenceSet& old_ref_set = old_image_index.refs(old_type);
62   const ReferenceSet& new_ref_set = new_image_index.refs(new_type);
63   Reference old_reference = old_ref_set.at(src);
64   Reference new_reference = new_ref_set.at(dst);
65   PoolTag pool_tag = old_ref_set.pool_tag();
66 
67   double affinity = targets_affinities[pool_tag.value()].AffinityBetween(
68       old_ref_set.target_pool().KeyForOffset(old_reference.target),
69       new_ref_set.target_pool().KeyForOffset(new_reference.target));
70 
71   // Both targets are not associated, which implies a weak match.
72   if (affinity == 0.0)
73     return 0.5 * old_ref_set.width();
74 
75   // At least one target is associated, so values are compared.
76   return affinity > 0.0 ? old_ref_set.width() : -2.0;
77 }
78 
GetEquivalenceSimilarity(const ImageIndex & old_image_index,const ImageIndex & new_image_index,const std::vector<TargetsAffinity> & targets_affinities,const Equivalence & equivalence)79 double GetEquivalenceSimilarity(
80     const ImageIndex& old_image_index,
81     const ImageIndex& new_image_index,
82     const std::vector<TargetsAffinity>& targets_affinities,
83     const Equivalence& equivalence) {
84   double similarity = 0.0;
85   for (offset_t k = 0; k < equivalence.length; ++k) {
86     // Non-tokens are joined with the nearest previous token: skip until we
87     // cover the unit.
88     if (!new_image_index.IsToken(equivalence.dst_offset + k))
89       continue;
90 
91     similarity += GetTokenSimilarity(
92         old_image_index, new_image_index, targets_affinities,
93         equivalence.src_offset + k, equivalence.dst_offset + k);
94     if (similarity == kMismatchFatal)
95       return kMismatchFatal;
96   }
97   return similarity;
98 }
99 
ExtendEquivalenceForward(const ImageIndex & old_image_index,const ImageIndex & new_image_index,const std::vector<TargetsAffinity> & targets_affinities,const EquivalenceCandidate & candidate,double min_similarity)100 EquivalenceCandidate ExtendEquivalenceForward(
101     const ImageIndex& old_image_index,
102     const ImageIndex& new_image_index,
103     const std::vector<TargetsAffinity>& targets_affinities,
104     const EquivalenceCandidate& candidate,
105     double min_similarity) {
106   Equivalence equivalence = candidate.eq;
107   offset_t best_k = equivalence.length;
108   double current_similarity = candidate.similarity;
109   double best_similarity = current_similarity;
110   double current_penalty = min_similarity;
111   for (offset_t k = best_k;
112        equivalence.src_offset + k < old_image_index.size() &&
113        equivalence.dst_offset + k < new_image_index.size();
114        ++k) {
115     // Mismatch in type, |candidate| cannot be extended further.
116     if (old_image_index.LookupType(equivalence.src_offset + k) !=
117         new_image_index.LookupType(equivalence.dst_offset + k)) {
118       break;
119     }
120 
121     if (!new_image_index.IsToken(equivalence.dst_offset + k)) {
122       // Non-tokens are joined with the nearest previous token: skip until we
123       // cover the unit, and extend |best_k| if applicable.
124       if (best_k == k)
125         best_k = k + 1;
126       continue;
127     }
128 
129     double similarity = GetTokenSimilarity(
130         old_image_index, new_image_index, targets_affinities,
131         equivalence.src_offset + k, equivalence.dst_offset + k);
132     current_similarity += similarity;
133     current_penalty = std::max(0.0, current_penalty) - similarity;
134 
135     if (current_similarity < 0.0 || current_penalty >= min_similarity)
136       break;
137     if (current_similarity >= best_similarity) {
138       best_similarity = current_similarity;
139       best_k = k + 1;
140     }
141   }
142   equivalence.length = best_k;
143   return {equivalence, best_similarity};
144 }
145 
ExtendEquivalenceBackward(const ImageIndex & old_image_index,const ImageIndex & new_image_index,const std::vector<TargetsAffinity> & targets_affinities,const EquivalenceCandidate & candidate,double min_similarity)146 EquivalenceCandidate ExtendEquivalenceBackward(
147     const ImageIndex& old_image_index,
148     const ImageIndex& new_image_index,
149     const std::vector<TargetsAffinity>& targets_affinities,
150     const EquivalenceCandidate& candidate,
151     double min_similarity) {
152   Equivalence equivalence = candidate.eq;
153   offset_t best_k = 0;
154   double current_similarity = candidate.similarity;
155   double best_similarity = current_similarity;
156   double current_penalty = 0.0;
157   offset_t k_min = std::min(
158       {equivalence.dst_offset, equivalence.src_offset, kBackwardsExtendLimit});
159   for (offset_t k = 1; k <= k_min; ++k) {
160     // Mismatch in type, |candidate| cannot be extended further.
161     if (old_image_index.LookupType(equivalence.src_offset - k) !=
162         new_image_index.LookupType(equivalence.dst_offset - k)) {
163       break;
164     }
165 
166     // Non-tokens are joined with the nearest previous token: skip until we
167     // reach the next token.
168     if (!new_image_index.IsToken(equivalence.dst_offset - k))
169       continue;
170 
171     DCHECK_EQ(old_image_index.LookupType(equivalence.src_offset - k),
172               new_image_index.LookupType(equivalence.dst_offset -
173                                          k));  // Sanity check.
174     double similarity = GetTokenSimilarity(
175         old_image_index, new_image_index, targets_affinities,
176         equivalence.src_offset - k, equivalence.dst_offset - k);
177 
178     current_similarity += similarity;
179     current_penalty = std::max(0.0, current_penalty) - similarity;
180 
181     if (current_similarity < 0.0 || current_penalty >= min_similarity)
182       break;
183     if (current_similarity >= best_similarity) {
184       best_similarity = current_similarity;
185       best_k = k;
186     }
187   }
188 
189   equivalence.dst_offset -= best_k;
190   equivalence.src_offset -= best_k;
191   equivalence.length += best_k;
192   return {equivalence, best_similarity};
193 }
194 
VisitEquivalenceSeed(const ImageIndex & old_image_index,const ImageIndex & new_image_index,const std::vector<TargetsAffinity> & targets_affinities,offset_t src,offset_t dst,double min_similarity)195 EquivalenceCandidate VisitEquivalenceSeed(
196     const ImageIndex& old_image_index,
197     const ImageIndex& new_image_index,
198     const std::vector<TargetsAffinity>& targets_affinities,
199     offset_t src,
200     offset_t dst,
201     double min_similarity) {
202   EquivalenceCandidate candidate{{src, dst, 0}, 0.0};  // Empty.
203   if (!old_image_index.IsToken(src))
204     return candidate;
205   candidate =
206       ExtendEquivalenceForward(old_image_index, new_image_index,
207                                targets_affinities, candidate, min_similarity);
208   if (candidate.similarity < min_similarity)
209     return candidate;  // Not worth exploring any more.
210   return ExtendEquivalenceBackward(old_image_index, new_image_index,
211                                    targets_affinities, candidate,
212                                    min_similarity);
213 }
214 
215 /******** OffsetMapper ********/
216 
OffsetMapper(std::deque<Equivalence> && equivalences,offset_t old_image_size,offset_t new_image_size)217 OffsetMapper::OffsetMapper(std::deque<Equivalence>&& equivalences,
218                            offset_t old_image_size,
219                            offset_t new_image_size)
220     : equivalences_(std::move(equivalences)),
221       old_image_size_(old_image_size),
222       new_image_size_(new_image_size) {
223   DCHECK_GT(new_image_size_, 0U);
224   DCHECK(std::is_sorted(equivalences_.begin(), equivalences_.end(),
225                         [](const Equivalence& a, const Equivalence& b) {
226                           return a.src_offset < b.src_offset;
227                         }));
228   // This is for testing. Assume pruned.
229 }
230 
OffsetMapper(EquivalenceSource && equivalence_source,offset_t old_image_size,offset_t new_image_size)231 OffsetMapper::OffsetMapper(EquivalenceSource&& equivalence_source,
232                            offset_t old_image_size,
233                            offset_t new_image_size)
234     : old_image_size_(old_image_size), new_image_size_(new_image_size) {
235   DCHECK_GT(new_image_size_, 0U);
236   for (auto e = equivalence_source.GetNext(); e.has_value();
237        e = equivalence_source.GetNext()) {
238     equivalences_.push_back(*e);
239   }
240   PruneEquivalencesAndSortBySource(&equivalences_);
241 }
242 
OffsetMapper(const EquivalenceMap & equivalence_map,offset_t old_image_size,offset_t new_image_size)243 OffsetMapper::OffsetMapper(const EquivalenceMap& equivalence_map,
244                            offset_t old_image_size,
245                            offset_t new_image_size)
246     : equivalences_(equivalence_map.size()),
247       old_image_size_(old_image_size),
248       new_image_size_(new_image_size) {
249   DCHECK_GT(new_image_size_, 0U);
250   std::transform(equivalence_map.begin(), equivalence_map.end(),
251                  equivalences_.begin(),
252                  [](const EquivalenceCandidate& c) { return c.eq; });
253   PruneEquivalencesAndSortBySource(&equivalences_);
254 }
255 
256 OffsetMapper::~OffsetMapper() = default;
257 
258 // Safely evaluates |offset - unit.src_offset + unit.dst_offset| with signed
259 // arithmetic, then clips the result to |[0, new_image_size_)|.
NaiveExtendedForwardProject(const Equivalence & unit,offset_t offset) const260 offset_t OffsetMapper::NaiveExtendedForwardProject(const Equivalence& unit,
261                                                    offset_t offset) const {
262   int64_t old_offset64 = offset;
263   int64_t src_offset64 = unit.src_offset;
264   int64_t dst_offset64 = unit.dst_offset;
265   uint64_t new_offset64 = std::min<uint64_t>(
266       std::max<int64_t>(0LL, old_offset64 - src_offset64 + dst_offset64),
267       new_image_size_ - 1);
268   return base::checked_cast<offset_t>(new_offset64);
269 }
270 
ExtendedForwardProject(offset_t offset) const271 offset_t OffsetMapper::ExtendedForwardProject(offset_t offset) const {
272   DCHECK(!equivalences_.empty());
273   if (offset < old_image_size_) {
274     // Finds the equivalence unit whose "old" block is nearest to |offset|,
275     // favoring the block with lower offset in case of a tie.
276     auto pos = std::upper_bound(
277         equivalences_.begin(), equivalences_.end(), offset,
278         [](offset_t a, const Equivalence& b) { return a < b.src_offset; });
279     // For tiebreaking: |offset - pos[-1].src_end()| is actually 1 less than
280     // |offset|'s distance to "old" block of |pos[-1]|. Therefore "<" is used.
281     if (pos != equivalences_.begin() &&
282         (pos == equivalences_.end() || offset < pos[-1].src_end() ||
283          offset - pos[-1].src_end() < pos->src_offset - offset)) {
284       --pos;
285     }
286     return NaiveExtendedForwardProject(*pos, offset);
287   }
288   // Fake offsets.
289   offset_t delta = offset - old_image_size_;
290   return delta < kOffsetBound - new_image_size_ ? new_image_size_ + delta
291                                                 : kOffsetBound - 1;
292 }
293 
ForwardProjectAll(std::deque<offset_t> * offsets) const294 void OffsetMapper::ForwardProjectAll(std::deque<offset_t>* offsets) const {
295   DCHECK(std::is_sorted(offsets->begin(), offsets->end()));
296   auto current = equivalences_.begin();
297   for (auto& src : *offsets) {
298     while (current != end() && current->src_end() <= src) {
299       ++current;
300     }
301 
302     if (current != end() && current->src_offset <= src) {
303       src = src - current->src_offset + current->dst_offset;
304     } else {
305       src = kInvalidOffset;
306     }
307   }
308   base::Erase(*offsets, kInvalidOffset);
309   offsets->shrink_to_fit();
310 }
311 
PruneEquivalencesAndSortBySource(std::deque<Equivalence> * equivalences)312 void OffsetMapper::PruneEquivalencesAndSortBySource(
313     std::deque<Equivalence>* equivalences) {
314   std::sort(equivalences->begin(), equivalences->end(),
315             [](const Equivalence& a, const Equivalence& b) {
316               return a.src_offset < b.src_offset;
317             });
318 
319   for (auto current = equivalences->begin(); current != equivalences->end();
320        ++current) {
321     // A "reaper" is an equivalence after |current| that overlaps with it, but
322     // is longer, and so truncates |current|.  For example:
323     //  ******  <=  |current|
324     //    **
325     //    ****
326     //      ****
327     //      **********  <= |next| as reaper.
328     // If a reaper is found (as |next|), every equivalence strictly between
329     // |current| and |next| would be truncated to 0 and discarded. Handling this
330     // case is important to avoid O(n^2) behavior.
331     bool next_is_reaper = false;
332 
333     // Look ahead to resolve overlaps, until a better candidate is found.
334     auto next = current + 1;
335     for (; next != equivalences->end(); ++next) {
336       DCHECK_GE(next->src_offset, current->src_offset);
337       if (next->src_offset >= current->src_end())
338         break;  // No more overlap.
339 
340       if (current->length < next->length) {
341         // |next| is better: So it is a reaper that shrinks |current|.
342         offset_t delta = current->src_end() - next->src_offset;
343         current->length -= delta;
344         next_is_reaper = true;
345         break;
346       }
347     }
348 
349     if (next_is_reaper) {
350       // Discard all equivalences strictly between |cur| and |next|.
351       for (auto reduced = current + 1; reduced != next; ++reduced)
352         reduced->length = 0;
353       current = next - 1;
354     } else {
355       // Shrink all equivalences that overlap with |current|. These are all
356       // worse than |current| since no reaper is found.
357       for (auto reduced = current + 1; reduced != next; ++reduced) {
358         offset_t delta = current->src_end() - reduced->src_offset;
359         reduced->length -= std::min(reduced->length, delta);
360         reduced->src_offset += delta;
361         reduced->dst_offset += delta;
362         DCHECK_EQ(reduced->src_offset, current->src_end());
363       }
364     }
365   }
366 
367   // Discard all equivalences with length == 0.
368   base::EraseIf(*equivalences, [](const Equivalence& equivalence) {
369     return equivalence.length == 0;
370   });
371   equivalences->shrink_to_fit();
372 }
373 
374 /******** EquivalenceMap ********/
375 
376 EquivalenceMap::EquivalenceMap() = default;
377 
EquivalenceMap(std::vector<EquivalenceCandidate> && equivalences)378 EquivalenceMap::EquivalenceMap(std::vector<EquivalenceCandidate>&& equivalences)
379     : candidates_(std::move(equivalences)) {
380   SortByDestination();
381 }
382 
383 EquivalenceMap::EquivalenceMap(EquivalenceMap&&) = default;
384 
385 EquivalenceMap::~EquivalenceMap() = default;
386 
Build(const std::vector<offset_t> & old_sa,const EncodedView & old_view,const EncodedView & new_view,const std::vector<TargetsAffinity> & targets_affinities,double min_similarity)387 void EquivalenceMap::Build(
388     const std::vector<offset_t>& old_sa,
389     const EncodedView& old_view,
390     const EncodedView& new_view,
391     const std::vector<TargetsAffinity>& targets_affinities,
392     double min_similarity) {
393   DCHECK_EQ(old_sa.size(), old_view.size());
394 
395   CreateCandidates(old_sa, old_view, new_view, targets_affinities,
396                    min_similarity);
397   SortByDestination();
398   Prune(old_view, new_view, targets_affinities, min_similarity);
399 
400   offset_t coverage = 0;
401   offset_t current_offset = 0;
402   for (auto candidate : candidates_) {
403     DCHECK_GE(candidate.eq.dst_offset, current_offset);
404     coverage += candidate.eq.length;
405     current_offset = candidate.eq.dst_end();
406   }
407   LOG(INFO) << "Equivalence Count: " << size();
408   LOG(INFO) << "Coverage / Extra / Total: " << coverage << " / "
409             << new_view.size() - coverage << " / " << new_view.size();
410 }
411 
CreateCandidates(const std::vector<offset_t> & old_sa,const EncodedView & old_view,const EncodedView & new_view,const std::vector<TargetsAffinity> & targets_affinities,double min_similarity)412 void EquivalenceMap::CreateCandidates(
413     const std::vector<offset_t>& old_sa,
414     const EncodedView& old_view,
415     const EncodedView& new_view,
416     const std::vector<TargetsAffinity>& targets_affinities,
417     double min_similarity) {
418   candidates_.clear();
419 
420   // This is an heuristic to find 'good' equivalences on encoded views.
421   // Equivalences are found in ascending order of |new_image|.
422   offset_t dst_offset = 0;
423 
424   while (dst_offset < new_view.size()) {
425     if (!new_view.IsToken(dst_offset)) {
426       ++dst_offset;
427       continue;
428     }
429     auto match =
430         SuffixLowerBound(old_sa, old_view.begin(),
431                          new_view.begin() + dst_offset, new_view.end());
432 
433     offset_t next_dst_offset = dst_offset + 1;
434     // TODO(huangs): Clean up.
435     double best_similarity = min_similarity;
436     uint64_t total_visit_length = 0;
437     EquivalenceCandidate best_candidate = {{0, 0, 0}, 0.0};
438     for (auto it = match; it != old_sa.end(); ++it) {
439       EquivalenceCandidate candidate = VisitEquivalenceSeed(
440           old_view.image_index(), new_view.image_index(), targets_affinities,
441           static_cast<offset_t>(*it), dst_offset, min_similarity);
442       if (candidate.similarity > best_similarity) {
443         best_candidate = candidate;
444         best_similarity = candidate.similarity;
445         next_dst_offset = candidate.eq.dst_end();
446         total_visit_length += candidate.eq.length;
447         if (total_visit_length > kSeedSelectionTotalVisitLengthQuota) {
448           break;
449         }
450       } else {
451         break;
452       }
453     }
454     total_visit_length = 0;
455     for (auto it = match; it != old_sa.begin(); --it) {
456       EquivalenceCandidate candidate = VisitEquivalenceSeed(
457           old_view.image_index(), new_view.image_index(), targets_affinities,
458           static_cast<offset_t>(it[-1]), dst_offset, min_similarity);
459       if (candidate.similarity > best_similarity) {
460         best_candidate = candidate;
461         best_similarity = candidate.similarity;
462         next_dst_offset = candidate.eq.dst_end();
463         total_visit_length += candidate.eq.length;
464         if (total_visit_length > kSeedSelectionTotalVisitLengthQuota) {
465           break;
466         }
467       } else {
468         break;
469       }
470     }
471     if (best_candidate.similarity >= min_similarity) {
472       candidates_.push_back(best_candidate);
473     }
474 
475     dst_offset = next_dst_offset;
476   }
477 }
478 
SortByDestination()479 void EquivalenceMap::SortByDestination() {
480   std::sort(candidates_.begin(), candidates_.end(),
481             [](const EquivalenceCandidate& a, const EquivalenceCandidate& b) {
482               return a.eq.dst_offset < b.eq.dst_offset;
483             });
484 }
485 
Prune(const EncodedView & old_view,const EncodedView & new_view,const std::vector<TargetsAffinity> & target_affinities,double min_similarity)486 void EquivalenceMap::Prune(
487     const EncodedView& old_view,
488     const EncodedView& new_view,
489     const std::vector<TargetsAffinity>& target_affinities,
490     double min_similarity) {
491   // TODO(etiennep): unify with
492   // OffsetMapper::PruneEquivalencesAndSortBySource().
493   for (auto current = candidates_.begin(); current != candidates_.end();
494        ++current) {
495     if (current->similarity < min_similarity)
496       continue;  // This candidate will be discarded anyways.
497 
498     bool next_is_reaper = false;
499 
500     // Look ahead to resolve overlaps, until a better candidate is found.
501     auto next = current + 1;
502     for (; next != candidates_.end(); ++next) {
503       DCHECK_GE(next->eq.dst_offset, current->eq.dst_offset);
504       if (next->eq.dst_offset >= current->eq.dst_offset + current->eq.length)
505         break;  // No more overlap.
506 
507       if (current->similarity < next->similarity) {
508         // |next| is better: So it is a reaper that shrinks |current|.
509         offset_t delta = current->eq.dst_end() - next->eq.dst_offset;
510         current->eq.length -= delta;
511         current->similarity = GetEquivalenceSimilarity(
512             old_view.image_index(), new_view.image_index(), target_affinities,
513             current->eq);
514 
515         next_is_reaper = true;
516         break;
517       }
518     }
519 
520     if (next_is_reaper) {
521       // Discard all equivalences strictly between |cur| and |next|.
522       for (auto reduced = current + 1; reduced != next; ++reduced) {
523         reduced->eq.length = 0;
524         reduced->similarity = 0;
525       }
526       current = next - 1;
527     } else {
528       // Shrinks all overlapping candidates following and worse than |current|.
529       for (auto reduced = current + 1; reduced != next; ++reduced) {
530         offset_t delta = current->eq.dst_end() - reduced->eq.dst_offset;
531         reduced->eq.length -= std::min(reduced->eq.length, delta);
532         reduced->eq.src_offset += delta;
533         reduced->eq.dst_offset += delta;
534         reduced->similarity = GetEquivalenceSimilarity(
535             old_view.image_index(), new_view.image_index(), target_affinities,
536             reduced->eq);
537         DCHECK_EQ(reduced->eq.dst_offset, current->eq.dst_end());
538       }
539     }
540   }
541 
542   // Discard all candidates with similarity smaller than |min_similarity|.
543   base::EraseIf(candidates_,
544                 [min_similarity](const EquivalenceCandidate& candidate) {
545                   return candidate.similarity < min_similarity;
546                 });
547 }
548 
549 }  // namespace zucchini
550