• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2023 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // -----------------------------------------------------------------------------
16 // File: prefetch.h
17 // -----------------------------------------------------------------------------
18 //
19 // This header file defines prefetch functions to prefetch memory contents
20 // into the first level cache (L1) for the current CPU. The prefetch logic
21 // offered in this header is limited to prefetching first level cachelines
22 // only, and is aimed at relatively 'simple' prefetching logic.
23 //
24 #ifndef ABSL_BASE_PREFETCH_H_
25 #define ABSL_BASE_PREFETCH_H_
26 
27 #include "absl/base/config.h"
28 
29 #if defined(ABSL_INTERNAL_HAVE_SSE)
30 #include <xmmintrin.h>
31 #endif
32 
33 #if defined(_MSC_VER) && _MSC_VER >= 1900 && \
34     (defined(_M_X64) || defined(_M_IX86))
35 #include <intrin.h>
36 #pragma intrinsic(_mm_prefetch)
37 #endif
38 
39 namespace absl {
40 ABSL_NAMESPACE_BEGIN
41 
42 // Moves data into the L1 cache before it is read, or "prefetches" it.
43 //
44 // The value of `addr` is the address of the memory to prefetch. If
45 // the target and compiler support it, data prefetch instructions are
46 // generated. If the prefetch is done some time before the memory is
47 // read, it may be in the cache by the time the read occurs.
48 //
49 // This method prefetches data with the highest degree of temporal locality;
50 // data is prefetched where possible into all levels of the cache.
51 //
52 // Incorrect or gratuitous use of this function can degrade performance.
53 // Use this function only when representative benchmarks show an improvement.
54 //
55 // Example:
56 //
57 //  // Computes incremental checksum for `data`.
58 //  int ComputeChecksum(int sum, absl::string_view data);
59 //
60 //  // Computes cumulative checksum for all values in `data`
61 //  int ComputeChecksum(absl::Span<const std::string> data) {
62 //    int sum = 0;
63 //    auto it = data.begin();
64 //    auto pit = data.begin();
65 //    auto end = data.end();
66 //    for (int dist = 8; dist > 0 && pit != data.end(); --dist, ++pit) {
67 //      absl::PrefetchToLocalCache(pit->data());
68 //    }
69 //    for (; pit != end; ++pit, ++it) {
70 //      sum = ComputeChecksum(sum, *it);
71 //      absl::PrefetchToLocalCache(pit->data());
72 //    }
73 //    for (; it != end; ++it) {
74 //      sum = ComputeChecksum(sum, *it);
75 //    }
76 //    return sum;
77 //  }
78 //
79 void PrefetchToLocalCache(const void* addr);
80 
81 // Moves data into the L1 cache before it is read, or "prefetches" it.
82 //
83 // This function is identical to `PrefetchToLocalCache()` except that it has
84 // non-temporal locality: the fetched data should not be left in any of the
85 // cache tiers. This is useful for cases where the data is used only once /
86 // short term, for example, invoking a destructor on an object.
87 //
88 // Incorrect or gratuitous use of this function can degrade performance.
89 // Use this function only when representative benchmarks show an improvement.
90 //
91 // Example:
92 //
93 //  template <typename Iterator>
94 //  void DestroyPointers(Iterator begin, Iterator end) {
//    int dist = 8;
98 //    auto prefetch_it = begin;
//    while (prefetch_it != end && --dist) {
100 //      absl::PrefetchToLocalCacheNta(*prefetch_it++);
101 //    }
102 //    while (prefetch_it != end) {
103 //      delete *begin++;
104 //      absl::PrefetchToLocalCacheNta(*prefetch_it++);
105 //    }
106 //    while (begin != end) {
107 //      delete *begin++;
108 //    }
109 //  }
110 //
111 void PrefetchToLocalCacheNta(const void* addr);
112 
113 // Moves data into the L1 cache with the intent to modify it.
114 //
115 // This function is similar to `PrefetchToLocalCache()` except that it
// prefetches cachelines with an 'intent to modify'. This typically includes
117 // invalidating cache entries for this address in all other cache tiers, and an
118 // exclusive access intent.
119 //
120 // Incorrect or gratuitous use of this function can degrade performance. As this
121 // function can invalidate cached cachelines on other caches and computer cores,
122 // incorrect usage of this function can have an even greater negative impact
123 // than incorrect regular prefetches.
124 // Use this function only when representative benchmarks show an improvement.
125 //
126 // Example:
127 //
128 //  void* Arena::Allocate(size_t size) {
129 //    void* ptr = AllocateBlock(size);
//    absl::PrefetchToLocalCacheForWrite(ptr);
131 //    return ptr;
132 //  }
133 //
134 void PrefetchToLocalCacheForWrite(const void* addr);
135 
136 #if ABSL_HAVE_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
137 
138 #define ABSL_HAVE_PREFETCH 1
139 
140 // See __builtin_prefetch:
141 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html.
142 //
// Read prefetch of the memory at `addr` with the highest temporal-locality
// hint. `__builtin_prefetch` takes (address, rw, locality).
inline void PrefetchToLocalCache(const void* addr) {
  // rw = 0 requests a prefetch for reading; locality = 3 is the strongest
  // "keep this cached" hint the builtin accepts.
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/3);
}
146 
// Read prefetch of the memory at `addr` with a non-temporal hint: the data is
// expected to be used once / short term and need not stay cached afterwards.
inline void PrefetchToLocalCacheNta(const void* addr) {
  // rw = 0 requests a prefetch for reading; locality = 0 is the builtin's
  // "no temporal locality" hint.
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/0);
}
150 
// Prefetches the memory at `addr` with the intent to modify it.
inline void PrefetchToLocalCacheForWrite(const void* addr) {
  // [x86] gcc/clang don't generate PREFETCHW for __builtin_prefetch(.., 1)
  // unless -march=broadwell or newer; this is not generally the default, so we
  // manually emit prefetchw. PREFETCHW is recognized as a no-op on older Intel
  // processors and has been present on AMD processors since the K6-2.
#if defined(__x86_64__)
  asm("prefetchw (%0)" : : "r"(addr));
#else
  // rw = 1 requests a prefetch for writing; locality = 3 is the strongest
  // "keep this cached" hint the builtin accepts.
  __builtin_prefetch(addr, /*rw=*/1, /*locality=*/3);
#endif
}
162 
163 #elif defined(ABSL_INTERNAL_HAVE_SSE)
164 
165 #define ABSL_HAVE_PREFETCH 1
166 
// SSE-intrinsic variant: read prefetch of the memory at `addr` using the
// `_MM_HINT_T0` hint.
inline void PrefetchToLocalCache(const void* addr) {
  // `_mm_prefetch` takes a `const char*`; converting from `const void*` only
  // needs a static_cast.
  _mm_prefetch(static_cast<const char*>(addr), _MM_HINT_T0);
}
170 
// SSE-intrinsic variant: read prefetch of the memory at `addr` using the
// non-temporal `_MM_HINT_NTA` hint.
inline void PrefetchToLocalCacheNta(const void* addr) {
  // `_mm_prefetch` takes a `const char*`; converting from `const void*` only
  // needs a static_cast.
  _mm_prefetch(static_cast<const char*>(addr), _MM_HINT_NTA);
}
174 
// SSE-intrinsic variant: prefetches the memory at `addr` with the intent to
// modify it, when a write-prefetch primitive is available.
inline void PrefetchToLocalCacheForWrite(const void* addr) {
#if defined(_MM_HINT_ET0)
  _mm_prefetch(static_cast<const char*>(addr), _MM_HINT_ET0);
#elif !defined(_MSC_VER) && defined(__x86_64__)
  // _MM_HINT_ET0 is not universally supported. PREFETCHW is recognized as a
  // no-op on older Intel processors and has been present on AMD processors
  // since the K6-2. We have this disabled for MSVC compilers as this
  // miscompiles on older MSVC compilers.
  asm("prefetchw (%0)" : : "r"(addr));
#else
  // No write-prefetch primitive is selected here; deliberately do nothing, but
  // mark `addr` as used so the header stays warning-free.
  static_cast<void>(addr);
#endif
}
186 
187 #else
188 
// No-op fallback used when neither `__builtin_prefetch` nor SSE intrinsics are
// available. The parameter is unnamed to avoid unused-parameter warnings.
inline void PrefetchToLocalCache(const void* /*addr*/) {}
// No-op fallback used when neither `__builtin_prefetch` nor SSE intrinsics are
// available. The parameter is unnamed to avoid unused-parameter warnings.
inline void PrefetchToLocalCacheNta(const void* /*addr*/) {}
// No-op fallback used when neither `__builtin_prefetch` nor SSE intrinsics are
// available. The parameter is unnamed to avoid unused-parameter warnings.
inline void PrefetchToLocalCacheForWrite(const void* /*addr*/) {}
192 
193 #endif
194 
195 ABSL_NAMESPACE_END
196 }  // namespace absl
197 
198 #endif  // ABSL_BASE_PREFETCH_H_
199