1 // Copyright 2008 Google Inc. All Rights Reserved.
2
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6
7 // http://www.apache.org/licenses/LICENSE-2.0
8
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "adler32memcpy.h"
16
17 // We are using (a modified form of) adler-32 checksum algorithm instead
18 // of CRC since adler-32 is faster than CRC.
19 // (Comparison: http://guru.multimedia.cx/crc32-vs-adler32/)
// This form of Adler is slightly modified: instead of treating the data in
// units of bytes, 32-bit words are taken as the unit, and two independent
// checksums with 64-bit accumulators are computed (we could have one
// checksum but two checksums make the code run faster).
24
// Adler-32 implementation:
// Data is treated as 1-byte numbers and
// there are two 16-bit numbers a and b.
// Initialize a with 1 and b with 0.
// for each data unit 'd'
//      a += d
//      b += a
// checksum = (b << 16) | a
// Neither sum may overflow; standard Adler-32 guarantees this by reducing
// both sums modulo 65521.
34 //
35 // Adler-64+64 implementation:
36 // (applied in this code)
37 // Data is treated as 32-bit numbers and whole data is separated into two
38 // streams, and hence the two checksums a1, a2, b1 and b2.
39 // Initialize a1 and a2 with 1, b1 and b2 with 0
40 // add first dataunit to a1
41 // add a1 to b1
42 // add second dataunit to a1
43 // add a1 to b1
44 // add third dataunit to a2
45 // add a2 to b2
46 // add fourth dataunit to a2
47 // add a2 to b2
48 // ...
49 // repeat the sequence back for next 4 dataunits
50 //
51 // variable A = XMM6 and variable B = XMM7.
52 // (a1 = lower 8 bytes of XMM6 and b1 = lower 8 bytes of XMM7)
53
54 // Assumptions
55 // 1. size_in_bytes is a multiple of 16.
56 // 2. srcmem and dstmem are 16 byte aligned.
57 // 3. size_in_bytes is less than 2^19 bytes.
58
59 // Assumption 3 ensures that there is no overflow when numbers are being
60 // added (we can remove this assumption by doing modulus with a prime
61 // number when it is just about to overflow but that would be a very costly
62 // exercise)
63
64 // Returns true if the checksums are equal.
Equals(const AdlerChecksum & other) const65 bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
66 return ( (a1_ == other.a1_) && (a2_ == other.a2_) &&
67 (b1_ == other.b1_) && (b2_ == other.b2_) );
68 }
69
70 // Returns string representation of the Adler checksum.
ToHexString() const71 string AdlerChecksum::ToHexString() const {
72 char buffer[128];
73 snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
74 return string(buffer);
75 }
76
77 // Sets components of the Adler checksum.
Set(uint64 a1,uint64 a2,uint64 b1,uint64 b2)78 void AdlerChecksum::Set(uint64 a1, uint64 a2, uint64 b1, uint64 b2) {
79 a1_ = a1;
80 a2_ = a2;
81 b1_ = b1;
82 b2_ = b2;
83 }
84
85 // Calculates Adler checksum for supplied data.
CalculateAdlerChecksum(uint64 * data64,unsigned int size_in_bytes,AdlerChecksum * checksum)86 bool CalculateAdlerChecksum(uint64 *data64, unsigned int size_in_bytes,
87 AdlerChecksum *checksum) {
88 // Use this data wrapper to access memory with 64bit read/write.
89 datacast_t data;
90 unsigned int count = size_in_bytes / sizeof(data);
91
92 if (count > (1U) << 19) {
93 // Size is too large, must be strictly less than 512 KB.
94 return false;
95 }
96
97 uint64 a1 = 1;
98 uint64 a2 = 1;
99 uint64 b1 = 0;
100 uint64 b2 = 0;
101
102 unsigned int i = 0;
103 while (i < count) {
104 // Process 64 bits at a time.
105 data.l64 = data64[i];
106 a1 = a1 + data.l32.l;
107 b1 = b1 + a1;
108 a1 = a1 + data.l32.h;
109 b1 = b1 + a1;
110 i++;
111
112 data.l64 = data64[i];
113 a2 = a2 + data.l32.l;
114 b2 = b2 + a2;
115 a2 = a2 + data.l32.h;
116 b2 = b2 + a2;
117 i++;
118 }
119 checksum->Set(a1, a2, b1, b2);
120 return true;
121 }
122
123 // C implementation of Adler memory copy.
AdlerMemcpyC(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)124 bool AdlerMemcpyC(uint64 *dstmem64, uint64 *srcmem64,
125 unsigned int size_in_bytes, AdlerChecksum *checksum) {
126 // Use this data wrapper to access memory with 64bit read/write.
127 datacast_t data;
128 unsigned int count = size_in_bytes / sizeof(data);
129
130 if (count > ((1U) << 19)) {
131 // Size is too large, must be strictly less than 512 KB.
132 return false;
133 }
134
135 uint64 a1 = 1;
136 uint64 a2 = 1;
137 uint64 b1 = 0;
138 uint64 b2 = 0;
139
140 unsigned int i = 0;
141 while (i < count) {
142 // Process 64 bits at a time.
143 data.l64 = srcmem64[i];
144 a1 = a1 + data.l32.l;
145 b1 = b1 + a1;
146 a1 = a1 + data.l32.h;
147 b1 = b1 + a1;
148 dstmem64[i] = data.l64;
149 i++;
150
151 data.l64 = srcmem64[i];
152 a2 = a2 + data.l32.l;
153 b2 = b2 + a2;
154 a2 = a2 + data.l32.h;
155 b2 = b2 + a2;
156 dstmem64[i] = data.l64;
157 i++;
158 }
159 checksum->Set(a1, a2, b1, b2);
160 return true;
161 }
162
163 // C implementation of Adler memory copy with some float point ops,
164 // attempting to warm up the CPU.
AdlerMemcpyWarmC(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)165 bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
166 unsigned int size_in_bytes, AdlerChecksum *checksum) {
167 // Use this data wrapper to access memory with 64bit read/write.
168 datacast_t data;
169 unsigned int count = size_in_bytes / sizeof(data);
170
171 if (count > ((1U) << 19)) {
172 // Size is too large, must be strictly less than 512 KB.
173 return false;
174 }
175
176 uint64 a1 = 1;
177 uint64 a2 = 1;
178 uint64 b1 = 0;
179 uint64 b2 = 0;
180
181 double a = 2.0 * static_cast<double>(srcmem64[0]);
182 double b = 5.0 * static_cast<double>(srcmem64[0]);
183 double c = 7.0 * static_cast<double>(srcmem64[0]);
184 double d = 9.0 * static_cast<double>(srcmem64[0]);
185
186 unsigned int i = 0;
187 while (i < count) {
188 // Process 64 bits at a time.
189 data.l64 = srcmem64[i];
190 a1 = a1 + data.l32.l;
191 b1 = b1 + a1;
192 a1 = a1 + data.l32.h;
193 b1 = b1 + a1;
194 dstmem64[i] = data.l64;
195 i++;
196
197 // Warm cpu up.
198 a = a * b;
199 b = b + c;
200
201 data.l64 = srcmem64[i];
202 a2 = a2 + data.l32.l;
203 b2 = b2 + a2;
204 a2 = a2 + data.l32.h;
205 b2 = b2 + a2;
206 dstmem64[i] = data.l64;
207 i++;
208
209 // Warm cpu up.
210 c = c * d;
211 d = d + d;
212 }
213
214 // Warm cpu up.
215 d = a + b + c + d;
216 if (d == 1.0) {
217 // Reference the result so that it can't be discarded by the compiler.
218 printf("Log: This will probably never happen.\n");
219 }
220
221 checksum->Set(a1, a2, b1, b2);
222 return true;
223 }
224
225 // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
AdlerMemcpyAsm(uint64 * dstmem64,uint64 * srcmem64,unsigned int size_in_bytes,AdlerChecksum * checksum)226 bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
227 unsigned int size_in_bytes, AdlerChecksum *checksum) {
228 // Use assembly implementation where supported.
229 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
230
231 // Pull a bit of tricky preprocessing to make the inline asm both
232 // 32 bit and 64 bit.
233 #ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86...
234 #define rAX "%%eax"
235 #define rCX "%%ecx"
236 #define rDX "%%edx"
237 #define rBX "%%ebx"
238 #define rSP "%%esp"
239 #define rBP "%%ebp"
240 #define rSI "%%esi"
241 #define rDI "%%edi"
242 #endif
243
244 #ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros.
245 #define rAX "%%rax"
246 #define rCX "%%rcx"
247 #define rDX "%%rdx"
248 #define rBX "%%rbx"
249 #define rSP "%%rsp"
250 #define rBP "%%rbp"
251 #define rSI "%%rsi"
252 #define rDI "%%rdi"
253 #endif
254
255 // Elements 0 to 3 are used for holding checksum terms a1, a2,
256 // b1, b2 respectively. These elements are filled by asm code.
257 // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
258 // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
259 // setting word index 0 and word index 2 to zero).
260 // Element 6 and 7 are used for setting a1 and a2 to 1.
261 volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
262 {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
263
264 if ((size_in_bytes >> 19) > 0) {
265 // Size is too large. Must be less than 2^19 bytes = 512 KB.
266 return false;
267 }
268
269 // Number of 32-bit words which are not added to a1/a2 in the main loop.
270 uint32 remaining_words = (size_in_bytes % 48) / 4;
271
272 // Since we are moving 48 bytes at a time number of iterations = total size/48
273 // is value of counter.
274 uint32 num_of_48_byte_units = size_in_bytes / 48;
275
276 asm volatile (
277 // Source address is in ESI (extended source index)
278 // destination is in EDI (extended destination index)
279 // and counter is already in ECX (extended counter
280 // index).
281 "cmp $0, " rCX ";" // Compare counter to zero.
282 "jz END;"
283
284 // XMM6 is initialized with 1 and XMM7 with 0.
285 "prefetchnta 0(" rSI ");"
286 "prefetchnta 64(" rSI ");"
287 "movdqu 48(" rAX "), %%xmm6;"
288 "xorps %%xmm7, %%xmm7;"
289
290 // Start of the loop which copies 48 bytes from source to dst each time.
291 "TOP:\n"
292
293 // Make 6 moves each of 16 bytes from srcmem to XMM registers.
294 // We are using 2 words out of 4 words in each XMM register,
295 // word index 0 and word index 2
296 "movdqa 0(" rSI "), %%xmm0;"
297 "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here.
298 "movdqa 16(" rSI "), %%xmm2;"
299 "movdqu 20(" rSI "), %%xmm3;"
300 "movdqa 32(" rSI "), %%xmm4;"
301 "movdqu 36(" rSI "), %%xmm5;"
302
303 // Move 3 * 16 bytes from XMM registers to dstmem.
304 // Note: this copy must be performed before pinsrw instructions since
305 // they will modify the XMM registers.
306 "movntdq %%xmm0, 0(" rDI ");"
307 "movntdq %%xmm2, 16(" rDI ");"
308 "movntdq %%xmm4, 32(" rDI ");"
309
310 // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
311 "andps 32(" rAX "), %%xmm0;"
312 "andps 32(" rAX "), %%xmm1;"
313 "andps 32(" rAX "), %%xmm2;"
314 "andps 32(" rAX "), %%xmm3;"
315 "andps 32(" rAX "), %%xmm4;"
316 "andps 32(" rAX "), %%xmm5;"
317
318 // Add XMM0 to XMM6 and then add XMM6 to XMM7.
319 // Repeat this for XMM1, ..., XMM5.
320 // Overflow(for XMM7) can occur only if there are more
321 // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
322 // if size_in_bytes > 2^19 than overflow occurs.
323 "paddq %%xmm0, %%xmm6;"
324 "paddq %%xmm6, %%xmm7;"
325 "paddq %%xmm1, %%xmm6;"
326 "paddq %%xmm6, %%xmm7;"
327 "paddq %%xmm2, %%xmm6;"
328 "paddq %%xmm6, %%xmm7;"
329 "paddq %%xmm3, %%xmm6;"
330 "paddq %%xmm6, %%xmm7;"
331 "paddq %%xmm4, %%xmm6;"
332 "paddq %%xmm6, %%xmm7;"
333 "paddq %%xmm5, %%xmm6;"
334 "paddq %%xmm6, %%xmm7;"
335
336 // Increment ESI and EDI by 48 bytes and decrement counter by 1.
337 "add $48, " rSI ";"
338 "add $48, " rDI ";"
339 "prefetchnta 0(" rSI ");"
340 "prefetchnta 64(" rSI ");"
341 "dec " rCX ";"
342 "jnz TOP;"
343
344 // Now only remaining_words 32-bit words are left.
345 // make a loop, add first two words to a1 and next two to a2 (just like
346 // above loop, the only extra thing we are doing is rechecking
347 // rDX (=remaining_words) everytime we add a number to a1/a2.
348 "REM_IS_STILL_NOT_ZERO:\n"
349 // Unless remaining_words becomes less than 4 words(16 bytes)
350 // there is not much issue and remaining_words will always
351 // be a multiple of four by assumption.
352 "cmp $4, " rDX ";"
353 // In case for some weird reasons if remaining_words becomes
354 // less than 4 but not zero then also break the code and go off to END.
355 "jl END;"
356 // Otherwise just go on and copy data in chunks of 4-words at a time till
357 // whole data (<48 bytes) is copied.
358 "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1.
359
360 "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without
361 "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary.
362
363 "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination.
364 "andps 32(" rAX "), %%xmm0;"
365 "andps 32(" rAX "), %%xmm1;"
366 "paddq %%xmm0, %%xmm6;"
367 "paddq %%xmm6, %%xmm7;"
368 "paddq %%xmm1, %%xmm6;"
369 "paddq %%xmm6, %%xmm7;"
370 "add $16, " rSI ";"
371 "add $16, " rDI ";"
372 "sub $4, " rDX ";"
373 // Decrement %rDX by 4 since %rDX is number of 32-bit
374 // words left after considering all 48-byte units.
375 "jmp REM_IS_STILL_NOT_ZERO;"
376
377 "END:\n"
378 // Report checksum values A and B (both right now are two concatenated
379 // 64 bit numbers and have to be converted to 64 bit numbers)
380 // seems like Adler128 (since size of each part is 4 byte rather than
381 // 1 byte).
382 "movdqa %%xmm6, 0(" rAX ");"
383 "movdqa %%xmm7, 16(" rAX ");"
384 "sfence;"
385
386 // No output registers.
387 :
388 // Input registers.
389 : "S" (srcmem64), "D" (dstmem64), "a" (checksum_arr),
390 "c" (num_of_48_byte_units), "d" (remaining_words)
391 ); // asm.
392
393 if (checksum != NULL) {
394 checksum->Set(checksum_arr[0], checksum_arr[1],
395 checksum_arr[2], checksum_arr[3]);
396 }
397
398 // Everything went fine, so return true (this does not mean
399 // that there is no problem with memory this just mean that data was copied
400 // from src to dst and checksum was calculated successfully).
401 return true;
402 #elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
403 // Elements 0 to 3 are used for holding checksum terms a1, a2,
404 // b1, b2 respectively. These elements are filled by asm code.
405 // Checksum is seeded with the null checksum.
406 volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
407 {1, 1, 0, 0};
408
409 if ((size_in_bytes >> 19) > 0) {
410 // Size is too large. Must be less than 2^19 bytes = 512 KB.
411 return false;
412 }
413
414 // Since we are moving 64 bytes at a time number of iterations = total size/64
415 uint32 blocks = size_in_bytes / 64;
416
417 uint64 *dst = dstmem64;
418 uint64 *src = srcmem64;
419
420 #define src_r "r3"
421 #define dst_r "r4"
422 #define blocks_r "r5"
423 #define crc_r "r6"
424
425 asm volatile (
426 "mov " src_r ", %[src]; \n"
427 "mov " dst_r ", %[dst]; \n"
428 "mov " crc_r ", %[crc]; \n"
429 "mov " blocks_r ", %[blocks]; \n"
430
431 // Loop over block count.
432 "cmp " blocks_r ", #0; \n" // Compare counter to zero.
433 "ble END; \n"
434
435
436 // Preload upcoming cacheline.
437 "pld [" src_r ", #0x0]; \n"
438 "pld [" src_r ", #0x20]; \n"
439
440 // Init checksum
441 "vldm " crc_r ", {q0}; \n"
442 "vmov.i32 q1, #0; \n"
443
444 // Start of the loop which copies 48 bytes from source to dst each time.
445 "TOP: \n"
446
447 // Make 3 moves each of 16 bytes from srcmem to qX registers.
448 // We are using 2 words out of 4 words in each qX register,
449 // word index 0 and word index 2. We'll swizzle them in a bit.
450 // Copy it.
451 "vldm " src_r "!, {q8, q9, q10, q11}; \n"
452 "vstm " dst_r "!, {q8, q9, q10, q11}; \n"
453
454 // Arrange it.
455 "vmov.i64 q12, #0; \n"
456 "vmov.i64 q13, #0; \n"
457 "vmov.i64 q14, #0; \n"
458 "vmov.i64 q15, #0; \n"
459 // This exchenges words 1,3 in the filled registers with
460 // words 0,2 in the empty registers.
461 "vtrn.32 q8, q12; \n"
462 "vtrn.32 q9, q13; \n"
463 "vtrn.32 q10, q14; \n"
464 "vtrn.32 q11, q15; \n"
465
466 // Sum into q0, then into q1.
467 // Repeat this for q8 - q13.
468 // Overflow can occur only if there are more
469 // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
470 // if size_in_bytes > 2^19 than overflow occurs.
471 "vadd.i64 q0, q0, q8; \n"
472 "vadd.i64 q1, q1, q0; \n"
473 "vadd.i64 q0, q0, q12; \n"
474 "vadd.i64 q1, q1, q0; \n"
475 "vadd.i64 q0, q0, q9; \n"
476 "vadd.i64 q1, q1, q0; \n"
477 "vadd.i64 q0, q0, q13; \n"
478 "vadd.i64 q1, q1, q0; \n"
479
480 "vadd.i64 q0, q0, q10; \n"
481 "vadd.i64 q1, q1, q0; \n"
482 "vadd.i64 q0, q0, q14; \n"
483 "vadd.i64 q1, q1, q0; \n"
484 "vadd.i64 q0, q0, q11; \n"
485 "vadd.i64 q1, q1, q0; \n"
486 "vadd.i64 q0, q0, q15; \n"
487 "vadd.i64 q1, q1, q0; \n"
488
489 // Increment counter and loop.
490 "sub " blocks_r ", " blocks_r ", #1; \n"
491 "cmp " blocks_r ", #0; \n" // Compare counter to zero.
492 "bgt TOP; \n"
493
494
495 "END:\n"
496 // Report checksum values A and B (both right now are two concatenated
497 // 64 bit numbers and have to be converted to 64 bit numbers)
498 // seems like Adler128 (since size of each part is 4 byte rather than
499 // 1 byte).
500 "vstm " crc_r ", {q0, q1}; \n"
501
502 // Output registers.
503 :
504 // Input registers.
505 : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
506 : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
507 ); // asm.
508
509 if (checksum != NULL) {
510 checksum->Set(checksum_arr[0], checksum_arr[1],
511 checksum_arr[2], checksum_arr[3]);
512 }
513
514 // Everything went fine, so return true (this does not mean
515 // that there is no problem with memory this just mean that data was copied
516 // from src to dst and checksum was calculated successfully).
517 return true;
518 #else
519 #warning "No vector copy defined for this architecture."
520 // Fall back to C implementation for anything else.
521 return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
522 #endif
523 }
524