1 /*
2 * Copyright (C) 2019 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
13 * distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
// A Neon vectorized implementation of the GNU symbol hash function.

// This function generally accesses beyond the bounds of the name string. Specifically, it reads
// each aligned 8-byte chunk containing a byte of the string, including the final NUL byte. This
// should be acceptable for use with MTE, which uses 16-byte granules. Typically, the function is
// used to hash strings in an ELF file's string table, where MTE is presumably unaware of the
// bounds of each symbol, but the linker also hashes the symbol name passed to dlsym.
36
37 #include "linker_gnu_hash_neon.h"
38
39 #include <arm_neon.h>
40 #include <stdio.h>
41 #include <stdint.h>
42 #include <stdlib.h>
43
// Per-misalignment initializer for the vectorized hash loop. For a string whose
// address is misaligned by K = (addr & 7) bytes, kInitTable[K] supplies:
//  - ignore_mask: 0xff for each of the leading K bytes of the first aligned
//    8-byte chunk, OR'ed into the chunk so those bytes are non-NUL and fully set.
//  - accum: the starting hash accumulator, pre-adjusted to cancel the 0xff
//    bytes and the K-byte phase shift (see the table in calculate_gnu_hash_neon).
// aligned(8) lets ignore_mask itself be loaded with an aligned vld1_u8.
struct __attribute__((aligned(8))) GnuHashInitEntry {
  uint64_t ignore_mask;  // bytes to force to 0xff in the first chunk
  uint32_t accum;        // initial accumulator value for this misalignment
};
48
// Powers of the GNU hash multiplier: kStepN == 33**N (mod 2**32). Used to
// advance the accumulators by whole chunks (33**8 per 8-byte chunk) and by
// the leftover 0-7 bytes at the end.
constexpr uint32_t kStep0 = 1;
constexpr uint32_t kStep1 = kStep0 * 33;
constexpr uint32_t kStep2 = kStep1 * 33;
constexpr uint32_t kStep3 = kStep2 * 33;
constexpr uint32_t kStep4 = kStep3 * 33;
constexpr uint32_t kStep5 = kStep4 * 33;
constexpr uint32_t kStep6 = kStep5 * 33;
constexpr uint32_t kStep7 = kStep6 * 33;
constexpr uint32_t kStep8 = kStep7 * 33;
constexpr uint32_t kStep9 = kStep8 * 33;
constexpr uint32_t kStep10 = kStep9 * 33;
constexpr uint32_t kStep11 = kStep10 * 33;
61
// Step by -1 through -7: 33 * 0x3e0f83e1 == 1 (mod 2**32)
// That is, 0x3e0f83e1 is the modular multiplicative inverse of 33, so
// kStepNk == 33**(-k) (mod 2**32). These back-steps let the initial
// accumulator compensate for the 0-7 bytes of misalignment padding.
constexpr uint32_t kStepN1 = kStep0 * 0x3e0f83e1;
constexpr uint32_t kStepN2 = kStepN1 * 0x3e0f83e1;
constexpr uint32_t kStepN3 = kStepN2 * 0x3e0f83e1;
constexpr uint32_t kStepN4 = kStepN3 * 0x3e0f83e1;
constexpr uint32_t kStepN5 = kStepN4 * 0x3e0f83e1;
constexpr uint32_t kStepN6 = kStepN5 * 0x3e0f83e1;
constexpr uint32_t kStepN7 = kStepN6 * 0x3e0f83e1;
70
71 // Calculate the GNU hash and string length of the symbol name.
72 //
73 // The hash calculation is an optimized version of this function:
74 //
75 // uint32_t calculate_gnu_hash(const uint8_t* name) {
76 // uint32_t h = 5381;
77 // for (; *name != '\0'; ++name) {
78 // h *= 33;
79 // h += *name;
80 // }
81 // return h;
82 // }
83 //
// This does a within-alignment out-of-bounds read for performance reasons.
// Returns {gnu_hash(name), strlen(name)}. The definition line below was
// restored from an extraction-garbled duplicate signature; the algorithm is
// unchanged. HWASan is disabled because the aligned 8-byte loads deliberately
// read past the end of the string (within its alignment granule).
__attribute__((no_sanitize("hwaddress")))
std::pair<uint32_t, uint32_t> calculate_gnu_hash_neon(const char* name) {

  // The input string may be misaligned by 0-7 bytes (K). This function loads the first aligned
  // 8-byte chunk, then counteracts the misalignment:
  //  - The initial K bytes are set to 0xff in the working chunk vector.
  //  - The accumulator is initialized to 5381 * modinv(33)**K.
  //  - The accumulator also cancels out each initial 0xff byte.
  // If we could set bytes to NUL instead, then the accumulator wouldn't need to cancel out the
  // 0xff values, but this would break the NUL check.

  static const struct GnuHashInitEntry kInitTable[] = {
    { // (addr&7) == 0
      0ull,
      5381u*kStep0,
    }, { // (addr&7) == 1
      0xffull,
      5381u*kStepN1 - 0xffu*kStepN1,
    }, { // (addr&7) == 2
      0xffffull,
      5381u*kStepN2 - 0xffu*kStepN1 - 0xffu*kStepN2,
    }, { // (addr&7) == 3
      0xffffffull,
      5381u*kStepN3 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3,
    }, { // (addr&7) == 4
      0xffffffffull,
      5381u*kStepN4 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4,
    }, { // (addr&7) == 5
      0xffffffffffull,
      5381u*kStepN5 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5,
    }, { // (addr&7) == 6
      0xffffffffffffull,
      5381u*kStepN6 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6,
    }, { // (addr&7) == 7
      0xffffffffffffffull,
      5381u*kStepN7 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6 - 0xffu*kStepN7,
    },
  };

  uint8_t offset = reinterpret_cast<uintptr_t>(name) & 7;
  const uint64_t* chunk_ptr = reinterpret_cast<const uint64_t*>(reinterpret_cast<uintptr_t>(name) & ~7);
  const struct GnuHashInitEntry* entry = &kInitTable[offset];

  // First chunk: aligned load, with the pre-string bytes forced to 0xff so the
  // NUL scan below cannot trigger on garbage before the string starts.
  uint8x8_t chunk = vld1_u8(reinterpret_cast<const uint8_t*>(chunk_ptr));
  chunk |= vld1_u8(reinterpret_cast<const uint8_t*>(&entry->ignore_mask));

  uint32x4_t accum_lo = { 0 };
  uint32x4_t accum_hi = { entry->accum, 0, 0, 0 };
  const uint16x4_t kInclineVec = { kStep3, kStep2, kStep1, kStep0 };
  const uint32x4_t kStep8Vec = vdupq_n_u32(kStep8);
  uint8x8_t is_nul;
  uint16x8_t expand;

  while (1) {
    // Exit the loop if any of the 8 bytes is NUL.
    is_nul = vceq_u8(chunk, (uint8x8_t){ 0 });
    expand = vmovl_u8(chunk);
    uint64x1_t is_nul_64 = vreinterpret_u64_u8(is_nul);
    if (vget_lane_u64(is_nul_64, 0)) break;

    // Multiply both accumulators by 33**8.
    accum_lo = vmulq_u32(accum_lo, kStep8Vec);
    accum_hi = vmulq_u32(accum_hi, kStep8Vec);

    // Multiply each 4-piece subchunk by (33**3, 33**2, 33*1, 1), then accumulate the result. The lo
    // accumulator will be behind by 33**4 until the very end of the computation.
    accum_lo = vmlal_u16(accum_lo, vget_low_u16(expand), kInclineVec);
    accum_hi = vmlal_u16(accum_hi, vget_high_u16(expand), kInclineVec);

    // Load the next chunk.
    chunk = vld1_u8(reinterpret_cast<const uint8_t*>(++chunk_ptr));
  }

  // Reverse the is-NUL vector so we can use clz to count the number of remaining bytes.
  is_nul = vrev64_u8(is_nul);
  const uint64_t is_nul_u64 = vget_lane_u64(vreinterpret_u64_u8(is_nul), 0);
  const uint32_t num_valid_bits = __builtin_clzll(is_nul_u64);

  const uint32_t name_len = reinterpret_cast<const char*>(chunk_ptr) - name + (num_valid_bits >> 3);

  // Pairs of {lo multiplier, hi multiplier} indexed by remaining-byte count.
  // num_valid_bits is a multiple of 8, so (num_valid_bits >> 2) steps through
  // the table two entries (one pair) per remaining byte.
  static const uint32_t kFinalStepTable[] = {
    kStep4, kStep0, // 0 remaining bytes
    kStep5, kStep1, // 1 remaining byte
    kStep6, kStep2, // 2 remaining bytes
    kStep7, kStep3, // 3 remaining bytes
    kStep8, kStep4, // 4 remaining bytes
    kStep9, kStep5, // 5 remaining bytes
    kStep10, kStep6, // 6 remaining bytes
    kStep11, kStep7, // 7 remaining bytes
  };

  // Advance the lo/hi accumulators appropriately for the number of remaining bytes. Multiply 33**4
  // into the lo accumulator to catch it up with the hi accumulator.
  const uint32_t* final_step = &kFinalStepTable[num_valid_bits >> 2];
  accum_lo = vmulq_u32(accum_lo, vdupq_n_u32(final_step[0]));
  accum_lo = vmlaq_u32(accum_lo, accum_hi, vdupq_n_u32(final_step[1]));

  static const uint32_t kFinalInclineTable[] = {
    0, kStep6, kStep5, kStep4, kStep3, kStep2, kStep1, kStep0,
    0, 0, 0, 0, 0, 0, 0, 0,
  };

  // Prepare a vector to multiply powers of 33 into each of the remaining bytes.
  // Bytes at and after the NUL get a multiplier of 0 so they don't contribute.
  const uint32_t* const incline = &kFinalInclineTable[8 - (num_valid_bits >> 3)];
  const uint32x4_t incline_lo = vld1q_u32(incline);
  const uint32x4_t incline_hi = vld1q_u32(incline + 4);

  // Multiply 33 into each of the remaining 4-piece vectors, then accumulate everything into
  // accum_lo. Combine everything into a single 32-bit result.
  accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_low_u16(expand)), incline_lo);
  accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_high_u16(expand)), incline_hi);

  uint32x2_t sum = vadd_u32(vget_low_u32(accum_lo), vget_high_u32(accum_lo));
  const uint32_t hash = sum[0] + sum[1];

  return { hash, name_len };
}
202