Searched refs:ldg (Results 1 – 25 of 50) sorted by relevance


/external/tensorflow/tensorflow/core/kernels/
compare_and_bitpack_op_gpu.cu.cc:44 const T thresh = ldg(threshold); in CompareAndBitpackKernel()
48 ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) | in CompareAndBitpackKernel()
49 (((ldg(block + 2) > thresh) << 5)) | in CompareAndBitpackKernel()
50 (((ldg(block + 3) > thresh) << 4)) | in CompareAndBitpackKernel()
51 (((ldg(block + 4) > thresh) << 3)) | in CompareAndBitpackKernel()
52 (((ldg(block + 5) > thresh) << 2)) | in CompareAndBitpackKernel()
53 (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh)))); in CompareAndBitpackKernel()
66 const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i)); in CompareAndBitpackKernel()
84 const float thresh = ldg(threshold); in CompareAndBitpackKernel()
86 const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i)); in CompareAndBitpackKernel()
[all …]
population_count_op_gpu.cu.cc:39 CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); } in PopulationCountKernel()
47 output[i] = __popc(ldg(reinterpret_cast<const uint8*>(input + i))); in PopulationCountKernel()
56 output[i] = __popc(ldg(reinterpret_cast<const uint16*>(input + i))); in PopulationCountKernel()
63 CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); } in PopulationCountKernel()
bias_op_gpu.cu.cc:60 output[index] = ldg(input + index) + ldg(bias + bias_offset); in BiasNHWCKernel()
70 output[index] = ldg(input + index) + ldg(bias + bias_offset); in BiasNCHWKernel()
106 CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index)); in BiasGradNHWC_Naive()
118 CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index)); in BiasGradNCHW_Naive()
138 CudaAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index))); in BiasGradNHWC_SharedAtomics()
171 T val = ldg(output_backprop + in BiasGradNCHW_SharedAtomics()
sparse_tensor_dense_matmul_op_gpu.cu.cc:42 const int i = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 1 : 0)); in SparseTensorDenseMatMulKernel()
43 const int k = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 0 : 1)); in SparseTensorDenseMatMulKernel()
55 const T a_value = ldg(a_values + a_ix); in SparseTensorDenseMatMulKernel()
58 const T b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j)); in SparseTensorDenseMatMulKernel()
depthwise_conv_op_gpu.h:131 sum += ldg(input + input_offset) * ldg(filter + filter_offset); in DepthwiseConv2dGPUKernelNHWC()
153 sum += ldg(input + input_offset) * ldg(filter + filter_offset); in DepthwiseConv2dGPUKernelNHWC()
264 tile_ptr[0] = static_cast<S>(ldg(in_ptr)); in DepthwiseConv2dGPUKernelNHWCSmall()
266 tile_ptr[tile_offset] = static_cast<S>(ldg(tensor_offset + in_ptr)); in DepthwiseConv2dGPUKernelNHWCSmall()
271 static_cast<S>(ldg(filter_offset + filter)); in DepthwiseConv2dGPUKernelNHWCSmall()
409 sum += ldg(input + input_offset) * ldg(filter + filter_offset); in DepthwiseConv2dGPUKernelNCHW()
436 sum += ldg(input + input_offset) * ldg(filter + filter_offset); in DepthwiseConv2dGPUKernelNCHW()
548 tile_ptr[0] = static_cast<S>(ldg(in_ptr)); in DepthwiseConv2dGPUKernelNCHWSmall()
550 tile_ptr[tile_offset] = static_cast<S>(ldg(block_pixels + in_ptr)); in DepthwiseConv2dGPUKernelNCHWSmall()
558 static_cast<S>(ldg(filter_offset + filter)); in DepthwiseConv2dGPUKernelNCHWSmall()
[all …]
softmax_op_gpu.cu.cc:83 U max_val = strict_cast<U>(ldg(max_logits + row)); in GenerateNormalizedProb()
88 result = input - max_val - log(ldg(sum_probs + row)); in GenerateNormalizedProb()
90 result = exp(input - max_val) / ldg(sum_probs + row); in GenerateNormalizedProb()
106 strict_cast<U>(logits_[gid] - ldg(max_logits_ + gid / num_cols_)); in operator ()()
inplace_ops_functor_gpu.cu.cc:38 *p = ldg(q); in DoParallelConcatOpKernel()
93 *p = ldg(q); in DoInplaceOpKernel()
96 *p += ldg(q); in DoInplaceOpKernel()
99 *p -= ldg(q); in DoInplaceOpKernel()
resize_nearest_neighbor_op_gpu.cu.cc:62 top_data[index] = ldg(bottom_data_n + idx); in ResizeNearestNeighborNHWC()
91 top_data[index] = ldg(bottom_data_n + idx); in LegacyResizeNearestNeighborNHWC()
122 CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index)); in ResizeNearestNeighborBackwardNHWC()
151 CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index)); in LegacyResizeNearestNeighborBackwardNHWC()
gather_functor_gpu.cu.h:54 Index gather_i = ldg(indices + indices_i); in GatherOpKernel()
67 out[i] = ldg(params + params_i); in GatherOpKernel()
depthtospace_op_gpu.cu.cc:58 *(output_ptr + out_idx) = ldg(input_ptr + inp_idx); in D2S_NHWC()
97 *(output_ptr + output_idx) = ldg(input_ptr + input_idx); in D2S_NCHW()
135 output_ptr[bY * output_width + bX] = ldg( in D2S_NCHW_LOOP()
spacetodepth_op_gpu.cu.cc:57 *(output_ptr + out_idx) = ldg(input_ptr + inp_idx); in S2D_NHWC()
95 *(output_ptr + output_idx) = ldg(input_ptr + input_idx); in S2D_NCHW()
135 ldg(input_ptr + bY * input_width + bX); in S2D_NCHW_LOOP()
gather_nd_op_gpu.cu.cc:44 const Index index_j = ldg(indices_i + j); in GatherSliceOpKernel()
61 out[i] = (out_of_bounds) ? T(0) : ldg(params + offset + loc_offset); in GatherSliceOpKernel()
multinomial_op_gpu.cu.cc:47 if (ldg(maxima + maxima_idx) == ldg(scores + index)) { in MultinomialKernel()
spacetobatch_functor_gpu.cu.cc:89 ldg(space_tensor_ptr + space_tensor_idx); in S2B()
92 ldg(batch_tensor_ptr + batch_tensor_idx); in S2B()
scatter_nd_op_gpu.cu.cc:95 const Index ix_d = internal::SubtleMustCopy(ldg(indices + offset)); in ScatterNdOpKernel()
102 update(out + i + si, ldg(updates + (index * slice_size + si))); in ScatterNdOpKernel()
segment_reduction_ops_gpu.cu.cc:91 sum += ldg(input + (input_outer_dim_index_base + j) * inner_dim_size + in SortedSegmentSumCustomKernel()
124 KernelReductionFunctor()(output + output_index, ldg(input + input_index)); in UnsortedSegmentCustomKernel()
transpose_functor_gpu.cu.cc:49 dst[o_idx] = Eigen::numext::conj(ldg(src + i_idx)); in TransposeKernel()
51 dst[o_idx] = ldg(src + i_idx); in TransposeKernel()
dynamic_stitch_op_gpu.cu.cc:44 output[output_index] = ldg(data_ptrs[input_index] + slice_offset); in DynamicStitchKernel()
dynamic_partition_op_gpu.cu.cc:68 int32 N = min(ldg(size), out_size); in MoveValuesKernel()
70 int32 key = ldg(keys + i); in MoveValuesKernel()
71 int32 value = ldg(values + i); in MoveValuesKernel()
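
The hits above all follow the same pattern: every read of tensor memory inside a kernel goes through ldg() rather than a plain dereference. As a rough illustration only (a hedged, standalone sketch, not TensorFlow's actual helper; the name BiasAddSketch and the float-only specialization are assumptions), the wrapper and a kernel in the style of BiasNHWCKernel could look like this:

    // Sketch of an ldg()-style read-only load helper. __ldg() loads through
    // the read-only data cache (PTX ld.global.nc) and requires sm_35 or newer;
    // older architectures fall back to an ordinary global load.
    __device__ __forceinline__ float ldg(const float* ptr) {
    #if __CUDA_ARCH__ >= 350
      return __ldg(ptr);
    #else
      return *ptr;
    #endif
    }

    // Elementwise bias add in the spirit of BiasNHWCKernel above: both the
    // input element and the broadcast bias value are fetched through ldg().
    __global__ void BiasAddSketch(const float* input, const float* bias,
                                  float* output, int size, int bias_size) {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      if (index < size) {
        output[index] = ldg(input + index) + ldg(bias + index % bias_size);
      }
    }
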
/external/tensorflow/tensorflow/contrib/seq2seq/kernels/
beam_search_ops_gpu.cu.cc:39 Eigen::numext::mini(max_time, ldg(max_sequence_lengths + batch)); in GatherTreeOpKernel()
47 beams[initial_beam_ix] = ldg(step_ids + initial_beam_ix); in GatherTreeOpKernel()
48 int32 parent = ldg(parent_ids + initial_beam_ix); in GatherTreeOpKernel()
58 beams[level_beam_ix] = ldg(step_ids + level_parent_ix); in GatherTreeOpKernel()
59 parent = ldg(parent_ids + level_parent_ix); in GatherTreeOpKernel()
/external/swiftshader/third_party/llvm-7.0/llvm/test/CodeGen/NVPTX/
ldu-ldg.ll:6 declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
7 declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
27 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
34 %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
bug26185-2.ll:3 ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
4 ; extending variants in the backend, but the ldg/ldu selection code may pick
read-global-variable-constant.ll:4 ; ld.global.nc (aka ldg).
/external/llvm/test/CodeGen/NVPTX/
ldu-ldg.ll:6 declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
7 declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
27 %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
34 %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
bug26185-2.ll:3 ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
4 ; extending variants in the backend, but the ldg/ldu selection code may pick
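
The NVPTX tests above exercise the llvm.nvvm.ldg intrinsics and the ld.global.nc selection logic. At the CUDA source level there are two common routes to that instruction; the kernel below is a hedged sketch (the name ScaleSketch is an assumption), contrasting an explicit __ldg() call with a const __restrict__ pointer that merely permits the compiler to pick ld.global.nc on its own:

    // Two routes to ld.global.nc, mirroring what the tests above verify.
    // __ldg() always requests the read-only cache (sm_35+); a pointer marked
    // const __restrict__ only allows the compiler to choose it when it can
    // prove the data is not written through any alias.
    __global__ void ScaleSketch(const float* __restrict__ in, float* out,
                                float k, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float a = __ldg(in + i);  // explicit read-only cached load
        out[i] = a * k;
      }
    }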
