// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef BASE_ALLOCATOR_PARTITION_ALLOCATOR_STARSCAN_SCAN_LOOP_H_
#define BASE_ALLOCATOR_PARTITION_ALLOCATOR_STARSCAN_SCAN_LOOP_H_

#include <cstddef>
#include <cstdint>

#include "base/allocator/partition_allocator/partition_alloc_base/compiler_specific.h"
#include "base/allocator/partition_allocator/partition_alloc_buildflags.h"
#include "base/allocator/partition_allocator/partition_alloc_check.h"
#include "base/allocator/partition_allocator/partition_alloc_config.h"
#include "base/allocator/partition_allocator/starscan/starscan_fwd.h"
#include "base/allocator/partition_allocator/tagging.h"
#include "build/build_config.h"

#if defined(ARCH_CPU_X86_64)
// Include order is important, so we disable formatting.
// clang-format off
// Including these headers directly should generally be avoided. For the
// scanning loop, we check at runtime which SIMD extension we can use. Since
// Chrome is compiled with -msse3 (the minimal requirement), we include the
// headers directly to make the intrinsics available. Another option could be to
// use inline assembly, but that would hinder compiler optimization for
// vectorized instructions.
#include <immintrin.h>
#include <smmintrin.h>
#include <avxintrin.h>
#include <avx2intrin.h>
// clang-format on
#endif

#if PA_CONFIG(STARSCAN_NEON_SUPPORTED)
#include <arm_neon.h>
#endif

namespace partition_alloc::internal {

// Iterates over a range of memory using the best available SIMD extension.
// Assumes that 64-bit platforms have pool support and that the begin pointer
// of incoming ranges is properly aligned. The class is designed around the
// CRTP version of the "template method" pattern (in GoF terms). CRTP is needed
// for fast static dispatch.
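// As used below, Derived is expected to provide:
//  - RegularPoolBase()/RegularPoolMask(): the regular pool's base address and
//    the mask selecting its bits; both are invoked as Derived::... in
//    RunUnvectorized(), i.e. they must be callable without an instance;
//  - CheckPointer(uintptr_t): invoked for every scanned word that may point
//    into the regular pool.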
template <typename Derived>
class ScanLoop {
 public:
  explicit ScanLoop(SimdSupport simd_type) : simd_type_(simd_type) {}

  ScanLoop(const ScanLoop&) = delete;
  ScanLoop& operator=(const ScanLoop&) = delete;

  // Scans the input range. Assumes the range is properly aligned. Please note
  // that the function doesn't MTE-tag the input range, as it assumes that MTE
  // is disabled when the function is called. See DisableMTEScope for details.
  void Run(uintptr_t begin, uintptr_t end);

 private:
  const Derived& derived() const { return static_cast<const Derived&>(*this); }
  Derived& derived() { return static_cast<Derived&>(*this); }

#if defined(ARCH_CPU_X86_64)
  __attribute__((target("avx2"))) void RunAVX2(uintptr_t, uintptr_t);
  __attribute__((target("sse4.1"))) void RunSSE4(uintptr_t, uintptr_t);
#endif
#if PA_CONFIG(STARSCAN_NEON_SUPPORTED)
  void RunNEON(uintptr_t, uintptr_t);
#endif

  void RunUnvectorized(uintptr_t, uintptr_t);

  SimdSupport simd_type_;
};
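
// For illustration only: a hypothetical scanner built on top of ScanLoop. The
// class and member bodies below are not part of this header; they merely
// sketch how a derived class might satisfy the CRTP contract described above.
//
//   class ExampleScanLoop : public ScanLoop<ExampleScanLoop> {
//    public:
//     explicit ExampleScanLoop(SimdSupport simd) : ScanLoop(simd) {}
//     static uintptr_t RegularPoolBase();   // Base address of the regular pool.
//     static uintptr_t RegularPoolMask();   // Mask selecting the pool bits.
//     void CheckPointer(uintptr_t maybe_ptr);  // Called for candidate pointers.
//   };
//
//   // e.g. ExampleScanLoop(simd_support).Run(begin, end);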

template <typename Derived>
void ScanLoop<Derived>::Run(uintptr_t begin, uintptr_t end) {
  // Vectorization is used only on 64-bit platforms, since it relies on the
  // 64-bit regular pool, and only where a suitable SIMD extension (SSE4.1/AVX2
  // on x86_64, NEON on ARM) is available.
#if defined(ARCH_CPU_X86_64)
  if (simd_type_ == SimdSupport::kAVX2)
    return RunAVX2(begin, end);
  if (simd_type_ == SimdSupport::kSSE41)
    return RunSSE4(begin, end);
#elif PA_CONFIG(STARSCAN_NEON_SUPPORTED)
  if (simd_type_ == SimdSupport::kNEON)
    return RunNEON(begin, end);
#endif  // PA_CONFIG(STARSCAN_NEON_SUPPORTED)
  return RunUnvectorized(begin, end);
}

template <typename Derived>
void ScanLoop<Derived>::RunUnvectorized(uintptr_t begin, uintptr_t end) {
  PA_SCAN_DCHECK(!(begin % sizeof(uintptr_t)));
  PA_SCAN_DCHECK(!(end % sizeof(uintptr_t)));
#if BUILDFLAG(HAS_64_BIT_POINTERS)
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |mask| to untag, for efficiency.
  const uintptr_t mask = Derived::RegularPoolMask() & kPtrUntagMask;
  const uintptr_t base = Derived::RegularPoolBase();
#endif  // BUILDFLAG(HAS_64_BIT_POINTERS)
  for (; begin < end; begin += sizeof(uintptr_t)) {
    // Read the region word-by-word. Everything that we read is a potential
    // pointer to or inside an object on the heap. Such an object should be
    // quarantined if an attempt is made to free it.
    //
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const uintptr_t maybe_ptr = *reinterpret_cast<uintptr_t*>(begin);
#if BUILDFLAG(HAS_64_BIT_POINTERS)
    if (PA_LIKELY((maybe_ptr & mask) != base))
      continue;
#else
    if (!maybe_ptr)
      continue;
#endif  // BUILDFLAG(HAS_64_BIT_POINTERS)
    derived().CheckPointer(maybe_ptr);
  }
}

#if defined(ARCH_CPU_X86_64)
template <typename Derived>
__attribute__((target("avx2"))) void ScanLoop<Derived>::RunAVX2(uintptr_t begin,
                                                                 uintptr_t end) {
  static constexpr size_t kAlignmentRequirement = 32;
  static constexpr size_t kWordsInVector = 4;
  static constexpr size_t kBytesInVector = kWordsInVector * sizeof(uintptr_t);
  PA_SCAN_DCHECK(!(begin % kAlignmentRequirement));
  // Stick to integer instructions. This brings slightly better throughput. For
  // example, according to the Intel docs, on Broadwell and Haswell the CPI of
  // vmovdqa (_mm256_load_si256) is half (0.25) that of vmovapd
  // (_mm256_load_pd).
  const __m256i vbase = _mm256_set1_epi64x(derived().RegularPoolBase());
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |regular_pool_mask| to untag, for efficiency.
  const __m256i regular_pool_mask =
      _mm256_set1_epi64x(derived().RegularPoolMask() & kPtrUntagMask);

  static_assert(sizeof(__m256i) == kBytesInVector);
  for (; begin <= (end - kBytesInVector); begin += kBytesInVector) {
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const __m256i maybe_ptrs =
        _mm256_load_si256(reinterpret_cast<__m256i*>(begin));
    const __m256i vand = _mm256_and_si256(maybe_ptrs, regular_pool_mask);
    const __m256i vcmp = _mm256_cmpeq_epi64(vand, vbase);
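    // _mm256_movemask_pd() packs the sign bit of each 64-bit lane of |vcmp|
    // into bits 0..3, so bit i of |mask| is set iff lane i of |maybe_ptrs|
    // (masked) compared equal to the pool base.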
    const int mask = _mm256_movemask_pd(_mm256_castsi256_pd(vcmp));
    if (PA_LIKELY(!mask))
      continue;
    // It's important to extract pointers from the already loaded vector.
    // Otherwise, new loads can break the in-pool assumption checked above.
    if (mask & 0b0001)
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 0));
    if (mask & 0b0010)
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 1));
    if (mask & 0b0100)
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 2));
    if (mask & 0b1000)
      derived().CheckPointer(_mm256_extract_epi64(maybe_ptrs, 3));
  }
  // Run unvectorized on the remainder of the region.
  RunUnvectorized(begin, end);
}

template <typename Derived>
__attribute__((target("sse4.1"))) void ScanLoop<Derived>::RunSSE4(
    uintptr_t begin,
    uintptr_t end) {
  static constexpr size_t kAlignmentRequirement = 16;
  static constexpr size_t kWordsInVector = 2;
  static constexpr size_t kBytesInVector = kWordsInVector * sizeof(uintptr_t);
  PA_SCAN_DCHECK(!(begin % kAlignmentRequirement));
  const __m128i vbase = _mm_set1_epi64x(derived().RegularPoolBase());
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |regular_pool_mask| to untag, for efficiency.
  const __m128i regular_pool_mask =
      _mm_set1_epi64x(derived().RegularPoolMask() & kPtrUntagMask);

  static_assert(sizeof(__m128i) == kBytesInVector);
  for (; begin <= (end - kBytesInVector); begin += kBytesInVector) {
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const __m128i maybe_ptrs =
        _mm_loadu_si128(reinterpret_cast<__m128i*>(begin));
    const __m128i vand = _mm_and_si128(maybe_ptrs, regular_pool_mask);
    const __m128i vcmp = _mm_cmpeq_epi64(vand, vbase);
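    // _mm_movemask_pd() yields a two-bit mask: bit i is set iff 64-bit lane i
    // of |maybe_ptrs| (masked) compared equal to the pool base.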
    const int mask = _mm_movemask_pd(_mm_castsi128_pd(vcmp));
    if (PA_LIKELY(!mask))
      continue;
    // It's important to extract pointers from the already loaded vector.
    // Otherwise, new loads can break the in-pool assumption checked above.
    if (mask & 0b01) {
      derived().CheckPointer(_mm_cvtsi128_si64(maybe_ptrs));
    }
    if (mask & 0b10) {
      // The shuffle mask moves dwords 3 and 2 (i.e. the upper 64-bit lane)
      // into positions 1 and 0, so the second pointer can be extracted with
      // _mm_cvtsi128_si64().
      static constexpr int kSecondWordMask = (3 << 2) | (2 << 0);
      const __m128i shuffled = _mm_shuffle_epi32(maybe_ptrs, kSecondWordMask);
      derived().CheckPointer(_mm_cvtsi128_si64(shuffled));
    }
  }
  // Run unvectorized on the remainder of the region.
  RunUnvectorized(begin, end);
}
#endif  // defined(ARCH_CPU_X86_64)

#if PA_CONFIG(STARSCAN_NEON_SUPPORTED)
template <typename Derived>
void ScanLoop<Derived>::RunNEON(uintptr_t begin, uintptr_t end) {
  static constexpr size_t kAlignmentRequirement = 16;
  static constexpr size_t kWordsInVector = 2;
  static constexpr size_t kBytesInVector = kWordsInVector * sizeof(uintptr_t);
  PA_SCAN_DCHECK(!(begin % kAlignmentRequirement));
  const uint64x2_t vbase = vdupq_n_u64(derived().RegularPoolBase());
  // If the read value is a pointer into the PA region, it's likely
  // MTE-tagged. Piggyback on |regular_pool_mask| to untag, for efficiency.
  const uint64x2_t regular_pool_mask =
      vdupq_n_u64(derived().RegularPoolMask() & kPtrUntagMask);

  for (; begin <= (end - kBytesInVector); begin += kBytesInVector) {
    // Keep it MTE-untagged. See DisableMTEScope for details.
    const uint64x2_t maybe_ptrs = vld1q_u64(reinterpret_cast<uint64_t*>(begin));
    const uint64x2_t vand = vandq_u64(maybe_ptrs, regular_pool_mask);
    const uint64x2_t vcmp = vceqq_u64(vand, vbase);
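    // vmaxvq_u32() reduces the vector to its maximum 32-bit lane; the result
    // is non-zero iff at least one 64-bit lane of |vcmp| is all-ones, i.e. at
    // least one scanned word may point into the regular pool.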
    const uint32_t max = vmaxvq_u32(vreinterpretq_u32_u64(vcmp));
    if (PA_LIKELY(!max))
      continue;
    // It's important to extract pointers from the already loaded vector.
    // Otherwise, new loads can break the in-pool assumption checked above.
    if (vgetq_lane_u64(vcmp, 0))
      derived().CheckPointer(vgetq_lane_u64(maybe_ptrs, 0));
    if (vgetq_lane_u64(vcmp, 1))
      derived().CheckPointer(vgetq_lane_u64(maybe_ptrs, 1));
  }
  // Run unvectorized on the remainder of the region.
  RunUnvectorized(begin, end);
}
#endif  // PA_CONFIG(STARSCAN_NEON_SUPPORTED)

}  // namespace partition_alloc::internal

#endif  // BASE_ALLOCATOR_PARTITION_ALLOCATOR_STARSCAN_SCAN_LOOP_H_