// Fast memory copying and comparison routines.
//   strings::fastmemcmp_inlined() replaces memcmp()
//   strings::memcpy_inlined() replaces memcpy()
//   strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
//
// strings::*_inlined() routines are inline versions of the
// routines exported by this module.  Sometimes using the inlined
// versions is faster.  Measure before using the inlined versions.

#ifndef DYNAMIC_DEPTH_INTERNAL_STRINGS_FASTMEM_H_  // NOLINT
#define DYNAMIC_DEPTH_INTERNAL_STRINGS_FASTMEM_H_  // NOLINT

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "base/integral_types.h"
#include "base/macros.h"
#include "base/port.h"

namespace dynamic_depth {
namespace strings {

// Return true if the n bytes at a equal the n bytes at b.
// The regions are allowed to overlap.
//
// Performance is similar to memcmp(), but this is faster for moderately-sized
// inputs, or for inputs that share a common prefix and differ somewhere in
// their last 8 bytes. Further optimizations can be added later if it makes
// sense to do so.  Alternatively, if the compiler & runtime improve to
// eliminate the need for this, we can remove it.  Please keep this in sync
// with google_internal::gg_memeq() in //third_party/stl/gcc3/string.
inline bool memeq(const char* a, const char* b, size_t n) {
  size_t n_rounded_down = n & ~static_cast<size_t>(7);
  if (PREDICT_FALSE(n_rounded_down == 0)) {  // n <= 7
    return memcmp(a, b, n) == 0;
  }
  // n >= 8
  uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
  uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
  if ((u | v) != 0) {  // The first or last 8 bytes differ.
    return false;
  }
  // The next line forces n to be a multiple of 8.
  n = n_rounded_down;
  if (n >= 80) {
    // In 2013 or later, this should be fast on long strings.
    return memcmp(a, b, n) == 0;
  }
  // Now force n to be a multiple of 16.  Arguably, a "switch" would be smart
  // here, but there's a difficult-to-evaluate code size vs. speed issue.  The
  // current approach often re-compares some bytes (worst case is if n initially
  // was 16, 32, 48, or 64), but is fairly short.
  size_t e = n & 8;
  a += e;
  b += e;
  n -= e;
  // n is now in {0, 16, 32, ...}.  Process 0 or more 16-byte chunks.
  while (n > 0) {
    uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
    uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
    if ((x | y) != 0) {
      return false;
    }
    a += 16;
    b += 16;
    n -= 16;
  }
  return true;
}
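
// Illustrative usage sketch (not part of the original header): memeq() can
// stand in for a memcmp()-based equality test on small fixed-size blobs.
// Key16Example and KeysEqualExample are hypothetical names used only here.
struct Key16Example {
  char bytes[16];
};

inline bool KeysEqualExample(const Key16Example& x, const Key16Example& y) {
  // Same result as memcmp(x.bytes, y.bytes, sizeof(x.bytes)) == 0, but the
  // 16-byte case is handled inline without falling back to memcmp().
  return memeq(x.bytes, y.bytes, sizeof(x.bytes));
}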

// Compare the n bytes at va with the n bytes at vb, returning a negative,
// zero, or positive value just like memcmp().  Sizes 0 through 7 are handled
// inline with byte-by-byte comparisons; larger sizes fall back to memcmp().
inline int fastmemcmp_inlined(const void* va, const void* vb, size_t n) {
  const unsigned char* pa = static_cast<const unsigned char*>(va);
  const unsigned char* pb = static_cast<const unsigned char*>(vb);
  switch (n) {
    default:
      return memcmp(va, vb, n);
    case 7:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 6:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 5:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 4:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 3:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 2:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      ++pa;
      ++pb;
      FALLTHROUGH_INTENDED;
    case 1:
      if (*pa != *pb) return *pa < *pb ? -1 : +1;
      FALLTHROUGH_INTENDED;
    case 0:
      break;
  }
  return 0;
}
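
// Illustrative usage sketch (not part of the original header): because
// fastmemcmp_inlined() orders bytes exactly as memcmp() does, it can serve
// as the comparison step for lexicographic ordering of short keys.
// CompareShortKeysExample is a hypothetical helper used only here.
inline int CompareShortKeysExample(const void* key_a, const void* key_b,
                                   size_t n) {
  // Returns a negative, zero, or positive value, matching
  // memcmp(key_a, key_b, n); sizes up to 7 avoid the library call entirely.
  return fastmemcmp_inlined(key_a, key_b, n);
}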

// The standard memcpy operation is slow for variable small sizes.
// This implementation inlines the optimal realization for sizes 1 to 16.
// To avoid code bloat, don't use this in spots that are not
// performance-critical, or when sizes <= 16 are not expected to be frequent.
inline void memcpy_inlined(char* dst, const char* src, size_t size) {
  // The compiler inlines the copy with a minimal amount of data movement
  // when the third argument of memcpy is a constant.
  switch (size) {
    case 1:
      memcpy(dst, src, 1);
      break;
    case 2:
      memcpy(dst, src, 2);
      break;
    case 3:
      memcpy(dst, src, 3);
      break;
    case 4:
      memcpy(dst, src, 4);
      break;
    case 5:
      memcpy(dst, src, 5);
      break;
    case 6:
      memcpy(dst, src, 6);
      break;
    case 7:
      memcpy(dst, src, 7);
      break;
    case 8:
      memcpy(dst, src, 8);
      break;
    case 9:
      memcpy(dst, src, 9);
      break;
    case 10:
      memcpy(dst, src, 10);
      break;
    case 11:
      memcpy(dst, src, 11);
      break;
    case 12:
      memcpy(dst, src, 12);
      break;
    case 13:
      memcpy(dst, src, 13);
      break;
    case 14:
      memcpy(dst, src, 14);
      break;
    case 15:
      memcpy(dst, src, 15);
      break;
    case 16:
      memcpy(dst, src, 16);
      break;
    default:
      memcpy(dst, src, size);
      break;
  }
}
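
// Illustrative usage sketch (not part of the original header):
// memcpy_inlined() is a drop-in replacement for memcpy() when copies of 16
// bytes or fewer dominate.  AppendSmallExample is a hypothetical helper used
// only here.
inline char* AppendSmallExample(char* dst, const char* src, size_t size) {
  // For size <= 16 the switch above expands to a fixed-size memcpy that the
  // compiler can lower to a few loads and stores; larger sizes fall back to
  // the regular memcpy().
  memcpy_inlined(dst, src, size);
  return dst + size;
}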

}  // namespace strings
}  // namespace dynamic_depth

#endif  // DYNAMIC_DEPTH_INTERNAL_STRINGS_FASTMEM_H_  // NOLINT