//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

// LSC Cache options
// Load message caching control.
// Used as the `cacheOpt` argument of the LSC load and prefetch builtins.
// NOTE(review): enumerator values are implicit (0..7 in declaration order);
// presumably the compiler depends on this numbering, so do not reorder - confirm.
enum LSC_LDCC {
    LSC_LDCC_DEFAULT,
    LSC_LDCC_L1UC_L3UC,     // Override to L1 uncached and L3 uncached
    LSC_LDCC_L1UC_L3C,      // Override to L1 uncached and L3 cached
    LSC_LDCC_L1C_L3UC,      // Override to L1 cached and L3 uncached
    LSC_LDCC_L1C_L3C,       // Override to L1 cached and L3 cached
    LSC_LDCC_L1S_L3UC,      // Override to L1 streaming load and L3 uncached
    LSC_LDCC_L1S_L3C,       // Override to L1 streaming load and L3 cached
    LSC_LDCC_L1IAR_L3C,     // Override to L1 invalidate-after-read, and L3 cached
};

// Store message caching control (also used for atomics).
// Used as the `cacheOpt` argument of the LSC global store builtins.
// NOTE(review): enumerator values are implicit (0..7 in declaration order);
// presumably the compiler depends on this numbering, so do not reorder - confirm.
enum LSC_STCC {
    LSC_STCC_DEFAULT,
    LSC_STCC_L1UC_L3UC,     // Override to L1 uncached and L3 uncached
    LSC_STCC_L1UC_L3WB,     // Override to L1 uncached and L3 written back
    LSC_STCC_L1WT_L3UC,     // Override to L1 written through and L3 uncached
    LSC_STCC_L1WT_L3WB,     // Override to L1 written through and L3 written back
    LSC_STCC_L1S_L3UC,      // Override to L1 streaming and L3 uncached
    LSC_STCC_L1S_L3WB,      // Override to L1 streaming and L3 written back
    LSC_STCC_L1WB_L3WB,     // Override to L1 written back and L3 written back
};

// LSC Loads

// Global address space
// Compiler builtins (declared only; no definition in this file). Each takes a
// base pointer, an immediate element offset and a load cache-control override.
// NOTE(review): the trailing tags presumably encode data size in bits (D8/D16/
// D32/D64) and vector length (V1..V8), with U32 meaning widened to 32 bits - confirm.
uint    __builtin_IB_lsc_load_global_uchar_to_uint (const __global uchar  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
uint    __builtin_IB_lsc_load_global_ushort_to_uint(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
uint    __builtin_IB_lsc_load_global_uint  (const __global uint   *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
uint2   __builtin_IB_lsc_load_global_uint2 (const __global uint2  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
uint3   __builtin_IB_lsc_load_global_uint3 (const __global uint3  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
uint4   __builtin_IB_lsc_load_global_uint4 (const __global uint4  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
uint8   __builtin_IB_lsc_load_global_uint8 (const __global uint8  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
ulong   __builtin_IB_lsc_load_global_ulong (const __global ulong  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
ulong2  __builtin_IB_lsc_load_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
ulong3  __builtin_IB_lsc_load_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
ulong4  __builtin_IB_lsc_load_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
ulong8  __builtin_IB_lsc_load_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8

// Local address space
// Compiler builtins (declared only). Same shape as the global loads but on
// __local pointers and without a cache-control argument.
uint    __builtin_IB_lsc_load_local_uchar_to_uint (const __local uchar  *base, int immElemOff); //D8U32
uint    __builtin_IB_lsc_load_local_ushort_to_uint(const __local ushort *base, int immElemOff); //D16U32
uint    __builtin_IB_lsc_load_local_uint  (const __local uint   *base, int immElemOff); //D32V1
uint2   __builtin_IB_lsc_load_local_uint2 (const __local uint2  *base, int immElemOff); //D32V2
uint3   __builtin_IB_lsc_load_local_uint3 (const __local uint3  *base, int immElemOff); //D32V3
uint4   __builtin_IB_lsc_load_local_uint4 (const __local uint4  *base, int immElemOff); //D32V4
uint8   __builtin_IB_lsc_load_local_uint8 (const __local uint8  *base, int immElemOff); //D32V8
ulong   __builtin_IB_lsc_load_local_ulong (const __local ulong  *base, int immElemOff); //D64V1
ulong2  __builtin_IB_lsc_load_local_ulong2(const __local ulong2 *base, int immElemOff); //D64V2
ulong3  __builtin_IB_lsc_load_local_ulong3(const __local ulong3 *base, int immElemOff); //D64V3
ulong4  __builtin_IB_lsc_load_local_ulong4(const __local ulong4 *base, int immElemOff); //D64V4
ulong8  __builtin_IB_lsc_load_local_ulong8(const __local ulong8 *base, int immElemOff); //D64V8

// LSC Stores

// Global address space
// Compiler builtins (declared only). Each takes a base pointer, an immediate
// element offset, the value to store and a store cache-control override.
void  __builtin_IB_lsc_store_global_uchar_from_uint (__global uchar  *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D8U32
void  __builtin_IB_lsc_store_global_ushort_from_uint(__global ushort *base, int immElemOff, uint val, enum LSC_STCC cacheOpt); //D16U32
void  __builtin_IB_lsc_store_global_uint  (__global uint   *base, int immElemOff, uint val, enum LSC_STCC cacheOpt);   //D32V1
void  __builtin_IB_lsc_store_global_uint2 (__global uint2  *base, int immElemOff, uint2 val, enum LSC_STCC cacheOpt);  //D32V2
void  __builtin_IB_lsc_store_global_uint3 (__global uint3  *base, int immElemOff, uint3 val, enum LSC_STCC cacheOpt);  //D32V3
void  __builtin_IB_lsc_store_global_uint4 (__global uint4  *base, int immElemOff, uint4 val, enum LSC_STCC cacheOpt);  //D32V4
void  __builtin_IB_lsc_store_global_uint8 (__global uint8  *base, int immElemOff, uint8 val, enum LSC_STCC cacheOpt);  //D32V8
void  __builtin_IB_lsc_store_global_ulong (__global ulong  *base, int immElemOff, ulong val, enum LSC_STCC cacheOpt);  //D64V1
void  __builtin_IB_lsc_store_global_ulong2(__global ulong2 *base, int immElemOff, ulong2 val, enum LSC_STCC cacheOpt); //D64V2
void  __builtin_IB_lsc_store_global_ulong3(__global ulong3 *base, int immElemOff, ulong3 val, enum LSC_STCC cacheOpt); //D64V3
void  __builtin_IB_lsc_store_global_ulong4(__global ulong4 *base, int immElemOff, ulong4 val, enum LSC_STCC cacheOpt); //D64V4
void  __builtin_IB_lsc_store_global_ulong8(__global ulong8 *base, int immElemOff, ulong8 val, enum LSC_STCC cacheOpt); //D64V8

// Local address space
// Compiler builtins (declared only). Same shape as the global stores but on
// __local pointers and without a cache-control argument.
void  __builtin_IB_lsc_store_local_uchar_from_uint (__local uchar  *base, int immElemOff, uint val); //D8U32
void  __builtin_IB_lsc_store_local_ushort_from_uint(__local ushort *base, int immElemOff, uint val); //D16U32
void  __builtin_IB_lsc_store_local_uint  (__local uint   *base, int immElemOff, uint val);   //D32V1
void  __builtin_IB_lsc_store_local_uint2 (__local uint2  *base, int immElemOff, uint2 val);  //D32V2
void  __builtin_IB_lsc_store_local_uint3 (__local uint3  *base, int immElemOff, uint3 val);  //D32V3
void  __builtin_IB_lsc_store_local_uint4 (__local uint4  *base, int immElemOff, uint4 val);  //D32V4
void  __builtin_IB_lsc_store_local_uint8 (__local uint8  *base, int immElemOff, uint8 val);  //D32V8
void  __builtin_IB_lsc_store_local_ulong (__local ulong  *base, int immElemOff, ulong val);  //D64V1
void  __builtin_IB_lsc_store_local_ulong2(__local ulong2 *base, int immElemOff, ulong2 val); //D64V2
void  __builtin_IB_lsc_store_local_ulong3(__local ulong3 *base, int immElemOff, ulong3 val); //D64V3
void  __builtin_IB_lsc_store_local_ulong4(__local ulong4 *base, int immElemOff, ulong4 val); //D64V4
void  __builtin_IB_lsc_store_local_ulong8(__local ulong8 *base, int immElemOff, ulong8 val); //D64V8

// LSC prefetching

// LSC Pre-Fetch Load functions with CacheControls
// Global address space
// Compiler builtins (declared only). Same arguments as the global loads
// (base pointer, immediate element offset, load cache control) but returning void.
void __builtin_IB_lsc_prefetch_global_uchar (const __global uchar  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D8U32
void __builtin_IB_lsc_prefetch_global_ushort(const __global ushort *base, int immElemOff, enum LSC_LDCC cacheOpt); //D16U32
void __builtin_IB_lsc_prefetch_global_uint  (const __global uint   *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V1
void __builtin_IB_lsc_prefetch_global_uint2 (const __global uint2  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V2
void __builtin_IB_lsc_prefetch_global_uint3 (const __global uint3  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V3
void __builtin_IB_lsc_prefetch_global_uint4 (const __global uint4  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V4
void __builtin_IB_lsc_prefetch_global_uint8 (const __global uint8  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D32V8
void __builtin_IB_lsc_prefetch_global_ulong (const __global ulong  *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V1
void __builtin_IB_lsc_prefetch_global_ulong2(const __global ulong2 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V2
void __builtin_IB_lsc_prefetch_global_ulong3(const __global ulong3 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V3
void __builtin_IB_lsc_prefetch_global_ulong4(const __global ulong4 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V4
void __builtin_IB_lsc_prefetch_global_ulong8(const __global ulong8 *base, int immElemOff, enum LSC_LDCC cacheOpt); //D64V8

// LSC Fence support

// FS - Fence Scope
// Used as the `scope` argument of the global LSC fence builtins.
// NOTE(review): enumerator values are implicit (0..6 in declaration order);
// presumably the compiler depends on this numbering, so do not reorder - confirm.
enum LSC_FS {
    LSC_FS_THREAD_GROUP,
    LSC_FS_LOCAL,
    LSC_FS_TILE,
    LSC_FS_GPU,
    LSC_FS_GPUs,
    LSC_FS_SYSTEM_RELEASE,
    LSC_FS_SYSTEM_ACQUIRE
};

// FT - Fence Type
// Used as the `flushType` argument of the global LSC fence builtins.
// NOTE(review): enumerator values are implicit (0..5 in declaration order);
// presumably the compiler depends on this numbering, so do not reorder - confirm.
enum LSC_FT {
    LSC_FT_DEFAULT,
    LSC_FT_EVICT,
    LSC_FT_INVALIDATE,
    LSC_FT_DISCARD,
    LSC_FT_CLEAN,
    LSC_FT_L3
};

// LSC Fence functions
// Compiler builtins (declared only). The global fences take a fence scope and a
// flush type; the local fence takes no arguments. The trailing comment on each
// line names the memory port given in the original source.
void  __builtin_IB_lsc_fence_global_untyped(enum LSC_FS scope, enum LSC_FT flushType);             // Mem Port - UGM
void  __builtin_IB_lsc_fence_global_untyped_cross_tile(enum LSC_FS scope, enum LSC_FT flushType);  // Mem Port - UGML
void  __builtin_IB_lsc_fence_global_typed(enum LSC_FS scope, enum LSC_FT flushType);               // Mem Port - TGM
void  __builtin_IB_lsc_fence_local(void);                                                          // Mem Port - SLM

// Exported functions

// LSC Loads
// uchar
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_uchar_to_uint (as `base` and `immElemOff`) and
// returns the builtin's uint result. The name suffix is the LSC_LDCC
// enumerator supplied as the cache option.
uint load_uchar_to_uint_L1UC_L3UC(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint load_uchar_to_uint_L1UC_L3C(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1UC_L3C);
}

uint load_uchar_to_uint_L1C_L3UC(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3UC);
}

uint load_uchar_to_uint_L1C_L3C(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1C_L3C);
}

uint load_uchar_to_uint_L1S_L3UC(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3UC);
}

uint load_uchar_to_uint_L1S_L3C(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1S_L3C);
}

uint load_uchar_to_uint_L1IAR_L3C(global uchar* it, int offset)
{
    return __builtin_IB_lsc_load_global_uchar_to_uint(it, offset, LSC_LDCC_L1IAR_L3C);
}

// ushort
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_ushort_to_uint and returns its uint result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
uint load_ushort_to_uint_L1UC_L3UC(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint load_ushort_to_uint_L1UC_L3C(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1UC_L3C);
}

uint load_ushort_to_uint_L1C_L3UC(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3UC);
}

uint load_ushort_to_uint_L1C_L3C(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1C_L3C);
}

uint load_ushort_to_uint_L1S_L3UC(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3UC);
}

uint load_ushort_to_uint_L1S_L3C(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1S_L3C);
}

uint load_ushort_to_uint_L1IAR_L3C(global ushort* it, int offset)
{
    return __builtin_IB_lsc_load_global_ushort_to_uint(it, offset, LSC_LDCC_L1IAR_L3C);
}

// uint
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_uint and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
uint load_uint_L1UC_L3UC(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint load_uint_L1UC_L3C(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1UC_L3C);
}

uint load_uint_L1C_L3UC(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3UC);
}

uint load_uint_L1C_L3C(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1C_L3C);
}

uint load_uint_L1S_L3UC(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3UC);
}

uint load_uint_L1S_L3C(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1S_L3C);
}

uint load_uint_L1IAR_L3C(global uint* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint(it, offset, LSC_LDCC_L1IAR_L3C);
}

// uint2
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_uint2 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
uint2 load_uint2_L1UC_L3UC(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint2 load_uint2_L1UC_L3C(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1UC_L3C);
}

uint2 load_uint2_L1C_L3UC(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3UC);
}

uint2 load_uint2_L1C_L3C(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1C_L3C);
}

uint2 load_uint2_L1S_L3UC(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3UC);
}

uint2 load_uint2_L1S_L3C(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1S_L3C);
}

uint2 load_uint2_L1IAR_L3C(global uint2* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint2(it, offset, LSC_LDCC_L1IAR_L3C);
}

// uint3
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_uint3 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
uint3 load_uint3_L1UC_L3UC(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint3 load_uint3_L1UC_L3C(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1UC_L3C);
}

uint3 load_uint3_L1C_L3UC(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3UC);
}

uint3 load_uint3_L1C_L3C(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1C_L3C);
}

uint3 load_uint3_L1S_L3UC(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3UC);
}

uint3 load_uint3_L1S_L3C(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1S_L3C);
}

uint3 load_uint3_L1IAR_L3C(global uint3* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint3(it, offset, LSC_LDCC_L1IAR_L3C);
}

// uint4
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_uint4 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
uint4 load_uint4_L1UC_L3UC(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint4 load_uint4_L1UC_L3C(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1UC_L3C);
}

uint4 load_uint4_L1C_L3UC(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3UC);
}

uint4 load_uint4_L1C_L3C(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1C_L3C);
}

uint4 load_uint4_L1S_L3UC(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3UC);
}

uint4 load_uint4_L1S_L3C(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1S_L3C);
}

uint4 load_uint4_L1IAR_L3C(global uint4* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint4(it, offset, LSC_LDCC_L1IAR_L3C);
}

// uint8
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_uint8 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
uint8 load_uint8_L1UC_L3UC(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3UC);
}

uint8 load_uint8_L1UC_L3C(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1UC_L3C);
}

uint8 load_uint8_L1C_L3UC(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3UC);
}

uint8 load_uint8_L1C_L3C(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1C_L3C);
}

uint8 load_uint8_L1S_L3UC(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3UC);
}

uint8 load_uint8_L1S_L3C(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1S_L3C);
}

uint8 load_uint8_L1IAR_L3C(global uint8* it, int offset)
{
    return __builtin_IB_lsc_load_global_uint8(it, offset, LSC_LDCC_L1IAR_L3C);
}

// ulong
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_ulong and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
ulong load_ulong_L1UC_L3UC(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3UC);
}

ulong load_ulong_L1UC_L3C(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1UC_L3C);
}

ulong load_ulong_L1C_L3UC(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3UC);
}

ulong load_ulong_L1C_L3C(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1C_L3C);
}

ulong load_ulong_L1S_L3UC(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3UC);
}

ulong load_ulong_L1S_L3C(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1S_L3C);
}

ulong load_ulong_L1IAR_L3C(global ulong* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong(it, offset, LSC_LDCC_L1IAR_L3C);
}

// ulong2
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_ulong2 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
ulong2 load_ulong2_L1UC_L3UC(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3UC);
}

ulong2 load_ulong2_L1UC_L3C(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1UC_L3C);
}

ulong2 load_ulong2_L1C_L3UC(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3UC);
}

ulong2 load_ulong2_L1C_L3C(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1C_L3C);
}

ulong2 load_ulong2_L1S_L3UC(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3UC);
}

ulong2 load_ulong2_L1S_L3C(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1S_L3C);
}

ulong2 load_ulong2_L1IAR_L3C(global ulong2* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong2(it, offset, LSC_LDCC_L1IAR_L3C);
}

// ulong3
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_ulong3 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
ulong3 load_ulong3_L1UC_L3UC(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3UC);
}

ulong3 load_ulong3_L1UC_L3C(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1UC_L3C);
}

ulong3 load_ulong3_L1C_L3UC(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3UC);
}

ulong3 load_ulong3_L1C_L3C(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1C_L3C);
}

ulong3 load_ulong3_L1S_L3UC(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3UC);
}

ulong3 load_ulong3_L1S_L3C(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1S_L3C);
}

ulong3 load_ulong3_L1IAR_L3C(global ulong3* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong3(it, offset, LSC_LDCC_L1IAR_L3C);
}

// ulong4
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_ulong4 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
ulong4 load_ulong4_L1UC_L3UC(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3UC);
}

ulong4 load_ulong4_L1UC_L3C(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1UC_L3C);
}

ulong4 load_ulong4_L1C_L3UC(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3UC);
}

ulong4 load_ulong4_L1C_L3C(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1C_L3C);
}

ulong4 load_ulong4_L1S_L3UC(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3UC);
}

ulong4 load_ulong4_L1S_L3C(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1S_L3C);
}

ulong4 load_ulong4_L1IAR_L3C(global ulong4* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong4(it, offset, LSC_LDCC_L1IAR_L3C);
}

// ulong8
// Load wrappers: each passes `it` and `offset` straight through to
// __builtin_IB_lsc_load_global_ulong8 and returns its result.
// The name suffix is the LSC_LDCC enumerator supplied as the cache option.
ulong8 load_ulong8_L1UC_L3UC(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3UC);
}

ulong8 load_ulong8_L1UC_L3C(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1UC_L3C);
}

ulong8 load_ulong8_L1C_L3UC(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3UC);
}

ulong8 load_ulong8_L1C_L3C(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1C_L3C);
}

ulong8 load_ulong8_L1S_L3UC(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3UC);
}

ulong8 load_ulong8_L1S_L3C(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1S_L3C);
}

ulong8 load_ulong8_L1IAR_L3C(global ulong8* it, int offset)
{
    return __builtin_IB_lsc_load_global_ulong8(it, offset, LSC_LDCC_L1IAR_L3C);
}

// LSC Stores
// uchar
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_uchar_from_uint (as `base`, `immElemOff` and
// `val`) and returns nothing. The name suffix is the LSC_STCC enumerator
// supplied as the cache option.
void store_uchar_from_uint_L1UC_L3UC(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_uchar_from_uint_L1UC_L3WB(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_uchar_from_uint_L1WT_L3UC(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_uchar_from_uint_L1WT_L3WB(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_uchar_from_uint_L1S_L3UC(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_uchar_from_uint_L1S_L3WB(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_uchar_from_uint_L1WB_L3WB(global uchar* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uchar_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// ushort
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_ushort_from_uint.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_ushort_from_uint_L1UC_L3UC(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_ushort_from_uint_L1UC_L3WB(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_ushort_from_uint_L1WT_L3UC(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_ushort_from_uint_L1WT_L3WB(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_ushort_from_uint_L1S_L3UC(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_ushort_from_uint_L1S_L3WB(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_ushort_from_uint_L1WB_L3WB(global ushort* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_ushort_from_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// uint
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_uint.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_uint_L1UC_L3UC(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_uint_L1UC_L3WB(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_uint_L1WT_L3UC(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_uint_L1WT_L3WB(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_uint_L1S_L3UC(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_uint_L1S_L3WB(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_uint_L1WB_L3WB(global uint* it, int offset, uint value)
{
    __builtin_IB_lsc_store_global_uint(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// uint2
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_uint2.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_uint2_L1UC_L3UC(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_uint2_L1UC_L3WB(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_uint2_L1WT_L3UC(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_uint2_L1WT_L3WB(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_uint2_L1S_L3UC(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_uint2_L1S_L3WB(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_uint2_L1WB_L3WB(global uint2* it, int offset, uint2 value)
{
    __builtin_IB_lsc_store_global_uint2(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// uint3
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_uint3.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_uint3_L1UC_L3UC(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_uint3_L1UC_L3WB(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_uint3_L1WT_L3UC(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_uint3_L1WT_L3WB(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_uint3_L1S_L3UC(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_uint3_L1S_L3WB(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_uint3_L1WB_L3WB(global uint3* it, int offset, uint3 value)
{
    __builtin_IB_lsc_store_global_uint3(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// uint4
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_uint4.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_uint4_L1UC_L3UC(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_uint4_L1UC_L3WB(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_uint4_L1WT_L3UC(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_uint4_L1WT_L3WB(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_uint4_L1S_L3UC(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_uint4_L1S_L3WB(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_uint4_L1WB_L3WB(global uint4* it, int offset, uint4 value)
{
    __builtin_IB_lsc_store_global_uint4(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// uint8
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_uint8.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_uint8_L1UC_L3UC(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_uint8_L1UC_L3WB(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_uint8_L1WT_L3UC(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_uint8_L1WT_L3WB(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_uint8_L1S_L3UC(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_uint8_L1S_L3WB(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_uint8_L1WB_L3WB(global uint8* it, int offset, uint8 value)
{
    __builtin_IB_lsc_store_global_uint8(it, offset, value, LSC_STCC_L1WB_L3WB);
}

// ulong
// Store wrappers: each passes `it`, `offset` and `value` straight through to
// __builtin_IB_lsc_store_global_ulong.
// The name suffix is the LSC_STCC enumerator supplied as the cache option.
void store_ulong_L1UC_L3UC(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3UC);
}

void store_ulong_L1UC_L3WB(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1UC_L3WB);
}

void store_ulong_L1WT_L3UC(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3UC);
}

void store_ulong_L1WT_L3WB(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WT_L3WB);
}

void store_ulong_L1S_L3UC(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3UC);
}

void store_ulong_L1S_L3WB(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1S_L3WB);
}

void store_ulong_L1WB_L3WB(global ulong* it, int offset, ulong value)
{
    __builtin_IB_lsc_store_global_ulong(it, offset, value, LSC_STCC_L1WB_L3WB);
}

863// ulong2
864void store_ulong2_L1UC_L3UC(global ulong2* it, int offset, ulong2 value)
865{
866    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3UC);
867}
868
869void store_ulong2_L1UC_L3WB(global ulong2* it, int offset, ulong2 value)
870{
871    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1UC_L3WB);
872}
873
874void store_ulong2_L1WT_L3UC(global ulong2* it, int offset, ulong2 value)
875{
876    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3UC);
877}
878
879void store_ulong2_L1WT_L3WB(global ulong2* it, int offset, ulong2 value)
880{
881    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WT_L3WB);
882}
883
884void store_ulong2_L1S_L3UC(global ulong2* it, int offset, ulong2 value)
885{
886    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3UC);
887}
888
889void store_ulong2_L1S_L3WB(global ulong2* it, int offset, ulong2 value)
890{
891    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1S_L3WB);
892}
893
894void store_ulong2_L1WB_L3WB(global ulong2* it, int offset, ulong2 value)
895{
896    __builtin_IB_lsc_store_global_ulong2(it, offset, value, LSC_STCC_L1WB_L3WB);
897}
898
899// ulong3
900void store_ulong3_L1UC_L3UC(global ulong3* it, int offset, ulong3 value)
901{
902    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3UC);
903}
904
905void store_ulong3_L1UC_L3WB(global ulong3* it, int offset, ulong3 value)
906{
907    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1UC_L3WB);
908}
909
910void store_ulong3_L1WT_L3UC(global ulong3* it, int offset, ulong3 value)
911{
912    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3UC);
913}
914
915void store_ulong3_L1WT_L3WB(global ulong3* it, int offset, ulong3 value)
916{
917    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WT_L3WB);
918}
919
920void store_ulong3_L1S_L3UC(global ulong3* it, int offset, ulong3 value)
921{
922    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3UC);
923}
924
925void store_ulong3_L1S_L3WB(global ulong3* it, int offset, ulong3 value)
926{
927    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1S_L3WB);
928}
929
930void store_ulong3_L1WB_L3WB(global ulong3* it, int offset, ulong3 value)
931{
932    __builtin_IB_lsc_store_global_ulong3(it, offset, value, LSC_STCC_L1WB_L3WB);
933}
934
935// ulong4
936void store_ulong4_L1UC_L3UC(global ulong4* it, int offset, ulong4 value)
937{
938    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3UC);
939}
940
941void store_ulong4_L1UC_L3WB(global ulong4* it, int offset, ulong4 value)
942{
943    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1UC_L3WB);
944}
945
946void store_ulong4_L1WT_L3UC(global ulong4* it, int offset, ulong4 value)
947{
948    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3UC);
949}
950
951void store_ulong4_L1WT_L3WB(global ulong4* it, int offset, ulong4 value)
952{
953    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WT_L3WB);
954}
955
956void store_ulong4_L1S_L3UC(global ulong4* it, int offset, ulong4 value)
957{
958    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3UC);
959}
960
961void store_ulong4_L1S_L3WB(global ulong4* it, int offset, ulong4 value)
962{
963    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1S_L3WB);
964}
965
966void store_ulong4_L1WB_L3WB(global ulong4* it, int offset, ulong4 value)
967{
968    __builtin_IB_lsc_store_global_ulong4(it, offset, value, LSC_STCC_L1WB_L3WB);
969}
970
971// ulong8
972void store_ulong8_L1UC_L3UC(global ulong8* it, int offset, ulong8 value)
973{
974    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3UC);
975}
976
977void store_ulong8_L1UC_L3WB(global ulong8* it, int offset, ulong8 value)
978{
979    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1UC_L3WB);
980}
981
982void store_ulong8_L1WT_L3UC(global ulong8* it, int offset, ulong8 value)
983{
984    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3UC);
985}
986
987void store_ulong8_L1WT_L3WB(global ulong8* it, int offset, ulong8 value)
988{
989    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WT_L3WB);
990}
991
992void store_ulong8_L1S_L3UC(global ulong8* it, int offset, ulong8 value)
993{
994    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3UC);
995}
996
997void store_ulong8_L1S_L3WB(global ulong8* it, int offset, ulong8 value)
998{
999    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1S_L3WB);
1000}
1001
1002void store_ulong8_L1WB_L3WB(global ulong8* it, int offset, ulong8 value)
1003{
1004    __builtin_IB_lsc_store_global_ulong8(it, offset, value, LSC_STCC_L1WB_L3WB);
1005}
1006
1007// LSC Fence support
1008void mem_fence_gpu_default()
1009{
1010    __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_DEFAULT);
1011}
1012
1013void mem_fence_workgroup_default()
1014{
1015    __builtin_IB_lsc_fence_global_untyped(LSC_FS_THREAD_GROUP, LSC_FT_DEFAULT);
1016}
1017
1018void mem_fence_gpu_invalidate()
1019{
1020    // NOTE: 'FS_TILE' is used here to avoid DG2 HW bug where L3 is needlessly flushed on a 'GPU' scope fence
1021    __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_INVALIDATE);
1022}
1023
1024void mem_fence_gpu_evict()
1025{
1026    __builtin_IB_lsc_fence_global_untyped(LSC_FS_TILE, LSC_FT_EVICT);
1027}
1028
1029void mem_fence_evict_to_memory()
1030{
1031    __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_EVICT);
1032    __builtin_IB_lsc_fence_global_untyped(LSC_FS_GPU, LSC_FT_L3);
1033}
1034