Home
last modified time | relevance | path

Searched refs:lane_id (Results 1 – 8 of 8) sorted by relevance

/external/llvm-project/llvm/test/CodeGen/Hexagon/
Djt-in-text.ll15 define void @test2(i32 %lane_id, i32 %rx_pwr_st) #0 {
17 %lane_id.addr = alloca i32, align 4
19 store i32 %lane_id, i32* %lane_id.addr, align 4
21 %0 = load i32, i32* %lane_id.addr, align 4
/external/tensorflow/tensorflow/core/util/
Dgpu_device_functions.h200 unsigned int lane_id; in GpuLaneId() local
205 asm("mov.u32 %0, %%laneid;" : "=r"(lane_id)); in GpuLaneId()
208 lane_id = __lane_id(); in GpuLaneId()
210 return lane_id; in GpuLaneId()
246 int lane_id = GpuLaneId(); in GpuShuffleGetSrcLane() local
247 int lane_base = lane_id & ~width + 1; in GpuShuffleGetSrcLane()
255 unsigned lane_id = GpuLaneId(); in GpuShuffleUpGetSrcLane() local
256 if ((lane_id & width - 1) < delta) { in GpuShuffleUpGetSrcLane()
257 return lane_id; in GpuShuffleUpGetSrcLane()
259 return lane_id - delta; in GpuShuffleUpGetSrcLane()
[all …]
Dgpu_kernel_helper_test.cu.cc100 unsigned lane_id = GpuLaneId(); in GpuShuffleGetSrcLaneTest() local
106 op_name, param, width, lane_id, actual, expected); in GpuShuffleGetSrcLaneTest()
117 GpuShuffleSync(kCudaWarpAll, lane_id, src_lane, width); in GpuShuffleGetSrcLaneTest()
124 GpuShuffleUpSync(kCudaWarpAll, lane_id, delta, width); in GpuShuffleGetSrcLaneTest()
131 GpuShuffleDownSync(kCudaWarpAll, lane_id, delta, width); in GpuShuffleGetSrcLaneTest()
138 GpuShuffleXorSync(kCudaWarpAll, lane_id, lane_lane, width); in GpuShuffleGetSrcLaneTest()
/external/llvm-project/openmp/libomptarget/deviceRTLs/nvptx/docs/
DReductionDesign.txt63 b. its lane_id
64 c. the offset of the lane_id which hosts a remote ReduceData
107 ShuffleReduceFn is used here with lane_id set to 0 because it is not used
108 therefore we save instructions by not retrieving lane_id from the corresponding
134 int lane_id) {
140 ShuffleReduceFn(reduce_data, lane_id, offset, 1);
153 if (lane_id < offset) {
213 from physical lane_id as defined by nvidia.
215 lane_id (instead of the physical one defined by nvidia) that would make
223 offsets, instead of absolute lane_id. Therefore the subtraction is performed
[all …]
/external/llvm-project/openmp/libomptarget/deviceRTLs/
Dinterface.h197 typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
/external/tensorflow/tensorflow/compiler/xla/service/gpu/
Dir_emitter_unnested.h120 llvm::Value* lane_id; member
Dir_emitter_unnested.cc4485 ksl.If("intra_warp_reduce_write", is_zero(thread_id_info.lane_id), [&] { in EmitEpilogueForReduction()
4496 {b_.getInt32(0), constant(j), thread_id_info.lane_id})); in EmitEpilogueForReduction()
4555 b_.CreateAnd(has_output, is_zero(thread_id_info.lane_id)), [&] { in EmitEpilogueForReduction()
/external/mesa3d/src/amd/compiler/
Daco_instruction_selection.cpp7169 Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1)); in emit_boolean_reduce() local
7170 …fset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id); in emit_boolean_reduce()