/external/tensorflow/tensorflow/core/kernels/ |
D | compare_and_bitpack_op_gpu.cu.cc |
    44  const T thresh = ldg(threshold);  in CompareAndBitpackKernel()
    48  ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |  in CompareAndBitpackKernel()
    49  (((ldg(block + 2) > thresh) << 5)) |  in CompareAndBitpackKernel()
    50  (((ldg(block + 3) > thresh) << 4)) |  in CompareAndBitpackKernel()
    51  (((ldg(block + 4) > thresh) << 3)) |  in CompareAndBitpackKernel()
    52  (((ldg(block + 5) > thresh) << 2)) |  in CompareAndBitpackKernel()
    53  (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh))));  in CompareAndBitpackKernel()
    66  const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));  in CompareAndBitpackKernel()
    84  const float thresh = ldg(threshold);  in CompareAndBitpackKernel()
    86  const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));  in CompareAndBitpackKernel()
    [all …]
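
For readers unfamiliar with the helper: TensorFlow's ldg() is a thin wrapper over CUDA's __ldg() read-only-cache load. A minimal standalone sketch of the pattern these matches show, packing eight threshold comparisons into one output byte; the kernel and parameter names are illustrative, not the actual TensorFlow code:

__global__ void CompareAndBitpackSketch(const float* __restrict__ input,
                                        const float* __restrict__ threshold,
                                        unsigned char* output, int num_bytes) {
  // One threshold is shared by the whole launch; read it once per thread
  // through the read-only data cache.
  const float thresh = __ldg(threshold);
  // Grid-stride loop: one output byte (= 8 input floats) per iteration.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_bytes;
       i += gridDim.x * blockDim.x) {
    const float* block = input + 8 * i;
    output[i] = ((__ldg(block)     > thresh) << 7) |
                ((__ldg(block + 1) > thresh) << 6) |
                ((__ldg(block + 2) > thresh) << 5) |
                ((__ldg(block + 3) > thresh) << 4) |
                ((__ldg(block + 4) > thresh) << 3) |
                ((__ldg(block + 5) > thresh) << 2) |
                ((__ldg(block + 6) > thresh) << 1) |
                 (__ldg(block + 7) > thresh);
  }
}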
|
D | population_count_op_gpu.cu.cc |
    39  CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); }  in PopulationCountKernel()
    47  output[i] = __popc(ldg(reinterpret_cast<const uint8*>(input + i)));  in PopulationCountKernel()
    56  output[i] = __popc(ldg(reinterpret_cast<const uint16*>(input + i)));  in PopulationCountKernel()
    63  CUDA_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); }  in PopulationCountKernel()
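
The same load-through-__ldg idea applied to population count, using the CUDA __popc/__popcll intrinsics; a plain grid-stride loop stands in for TensorFlow's CUDA_1D_KERNEL_LOOP macro, and the names are illustrative:

__global__ void PopulationCount32Sketch(const unsigned int* __restrict__ input,
                                        unsigned char* output, int size) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += gridDim.x * blockDim.x) {
    output[i] = __popc(__ldg(input + i));    // 32-bit population count
  }
}

__global__ void PopulationCount64Sketch(const unsigned long long* __restrict__ input,
                                        unsigned char* output, int size) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += gridDim.x * blockDim.x) {
    output[i] = __popcll(__ldg(input + i));  // 64-bit variant
  }
}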
|
D | bias_op_gpu.cu.cc |
    60  output[index] = ldg(input + index) + ldg(bias + bias_offset);  in BiasNHWCKernel()
    70  output[index] = ldg(input + index) + ldg(bias + bias_offset);  in BiasNCHWKernel()
    106  CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));  in BiasGradNHWC_Naive()
    118  CudaAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));  in BiasGradNCHW_Naive()
    138  CudaAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));  in BiasGradNHWC_SharedAtomics()
    171  T val = ldg(output_backprop +  in BiasGradNCHW_SharedAtomics()
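
These bias matches show two recurring patterns: a forward add that pulls both operands through the read-only cache, and a naive gradient that scatters into the bias vector with atomic adds (TensorFlow's CudaAtomicAdd wraps the same primitive as plain atomicAdd). A hedged sketch, assuming float NHWC data with the channel dimension innermost:

__global__ void BiasAddNHWCSketch(const float* __restrict__ input,
                                  const float* __restrict__ bias,
                                  float* output, int total, int bias_size) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < total;
       index += gridDim.x * blockDim.x) {
    int bias_offset = index % bias_size;  // channel index (innermost in NHWC)
    output[index] = __ldg(input + index) + __ldg(bias + bias_offset);
  }
}

__global__ void BiasGradNHWCNaiveSketch(const float* __restrict__ output_backprop,
                                        float* bias_backprop, int total,
                                        int bias_size) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < total;
       index += gridDim.x * blockDim.x) {
    int bias_offset = index % bias_size;
    // Many threads hit the same channel, hence the atomic accumulation.
    atomicAdd(bias_backprop + bias_offset, __ldg(output_backprop + index));
  }
}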
|
D | sparse_tensor_dense_matmul_op_gpu.cu.cc |
    42  const int i = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 1 : 0));  in SparseTensorDenseMatMulKernel()
    43  const int k = ldg(a_indices + 2 * a_ix + ((ADJ_A) ? 0 : 1));  in SparseTensorDenseMatMulKernel()
    55  const T a_value = ldg(a_values + a_ix);  in SparseTensorDenseMatMulKernel()
    58  const T b_value = ldg(b + ((ADJ_B) ? j * b_cols + k : k * b_cols + j));  in SparseTensorDenseMatMulKernel()
|
D | depthwise_conv_op_gpu.h |
    131  sum += ldg(input + input_offset) * ldg(filter + filter_offset);  in DepthwiseConv2dGPUKernelNHWC()
    153  sum += ldg(input + input_offset) * ldg(filter + filter_offset);  in DepthwiseConv2dGPUKernelNHWC()
    264  tile_ptr[0] = static_cast<S>(ldg(in_ptr));  in DepthwiseConv2dGPUKernelNHWCSmall()
    266  tile_ptr[tile_offset] = static_cast<S>(ldg(tensor_offset + in_ptr));  in DepthwiseConv2dGPUKernelNHWCSmall()
    271  static_cast<S>(ldg(filter_offset + filter));  in DepthwiseConv2dGPUKernelNHWCSmall()
    409  sum += ldg(input + input_offset) * ldg(filter + filter_offset);  in DepthwiseConv2dGPUKernelNCHW()
    436  sum += ldg(input + input_offset) * ldg(filter + filter_offset);  in DepthwiseConv2dGPUKernelNCHW()
    548  tile_ptr[0] = static_cast<S>(ldg(in_ptr));  in DepthwiseConv2dGPUKernelNCHWSmall()
    550  tile_ptr[tile_offset] = static_cast<S>(ldg(block_pixels + in_ptr));  in DepthwiseConv2dGPUKernelNCHWSmall()
    558  static_cast<S>(ldg(filter_offset + filter));  in DepthwiseConv2dGPUKernelNCHWSmall()
    [all …]
|
D | softmax_op_gpu.cu.cc |
    83  U max_val = strict_cast<U>(ldg(max_logits + row));  in GenerateNormalizedProb()
    88  result = input - max_val - log(ldg(sum_probs + row));  in GenerateNormalizedProb()
    90  result = exp(input - max_val) / ldg(sum_probs + row);  in GenerateNormalizedProb()
    106  strict_cast<U>(logits_[gid] - ldg(max_logits_ + gid / num_cols_));  in operator ()()
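
A sketch of the normalization step these softmax matches belong to: with per-row maxima and per-row sums of exp(x - max) already computed, each thread emits one softmax (or log-softmax) value. The log_space flag and all other names are assumptions for illustration, not the TensorFlow signature:

__global__ void GenerateNormalizedProbSketch(const float* __restrict__ logits,
                                             const float* __restrict__ max_logits,
                                             const float* __restrict__ sum_probs,
                                             float* output, int num_rows,
                                             int num_cols, bool log_space) {
  int total = num_rows * num_cols;
  for (int gid = blockIdx.x * blockDim.x + threadIdx.x; gid < total;
       gid += gridDim.x * blockDim.x) {
    int row = gid / num_cols;
    // Subtracting the row maximum keeps exp() from overflowing.
    float shifted = logits[gid] - __ldg(max_logits + row);
    output[gid] = log_space ? shifted - logf(__ldg(sum_probs + row))
                            : expf(shifted) / __ldg(sum_probs + row);
  }
}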
|
D | inplace_ops_functor_gpu.cu.cc |
    38  *p = ldg(q);  in DoParallelConcatOpKernel()
    93  *p = ldg(q);  in DoInplaceOpKernel()
    96  *p += ldg(q);  in DoInplaceOpKernel()
    99  *p -= ldg(q);  in DoInplaceOpKernel()
|
D | resize_nearest_neighbor_op_gpu.cu.cc |
    62  top_data[index] = ldg(bottom_data_n + idx);  in ResizeNearestNeighborNHWC()
    91  top_data[index] = ldg(bottom_data_n + idx);  in LegacyResizeNearestNeighborNHWC()
    122  CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));  in ResizeNearestNeighborBackwardNHWC()
    151  CudaAtomicAdd(bottom_diff_n + idx, ldg(top_diff + index));  in LegacyResizeNearestNeighborBackwardNHWC()
|
D | gather_functor_gpu.cu.h |
    54  Index gather_i = ldg(indices + indices_i);  in GatherOpKernel()
    67  out[i] = ldg(params + params_i);  in GatherOpKernel()
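
A sketch of the double indirection in this gather kernel: the index is read through __ldg, then the gathered element is read through __ldg as well. The [outer, gather, inner] decomposition mirrors the usual gather layout, but the names and float/int types are assumptions:

__global__ void GatherOpSketch(const float* __restrict__ params,
                               const int* __restrict__ indices, float* out,
                               long long gather_dim_size, long long indices_size,
                               long long slice_size, long long out_size) {
  for (long long i = blockIdx.x * blockDim.x + threadIdx.x; i < out_size;
       i += (long long)gridDim.x * blockDim.x) {
    long long batch_i   = i / (indices_size * slice_size);  // outer batch
    long long indices_i = (i / slice_size) % indices_size;  // which index
    long long slice_i   = i % slice_size;                   // inner offset
    long long gather_i  = __ldg(indices + indices_i);       // indirect index
    long long params_i =
        (batch_i * gather_dim_size + gather_i) * slice_size + slice_i;
    out[i] = __ldg(params + params_i);
  }
}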
|
D | depthtospace_op_gpu.cu.cc |
    58  *(output_ptr + out_idx) = ldg(input_ptr + inp_idx);  in D2S_NHWC()
    97  *(output_ptr + output_idx) = ldg(input_ptr + input_idx);  in D2S_NCHW()
    135  output_ptr[bY * output_width + bX] = ldg(  in D2S_NCHW_LOOP()
|
D | spacetodepth_op_gpu.cu.cc |
    57  *(output_ptr + out_idx) = ldg(input_ptr + inp_idx);  in S2D_NHWC()
    95  *(output_ptr + output_idx) = ldg(input_ptr + input_idx);  in S2D_NCHW()
    135  ldg(input_ptr + bY * input_width + bX);  in S2D_NCHW_LOOP()
|
D | gather_nd_op_gpu.cu.cc |
    44  const Index index_j = ldg(indices_i + j);  in GatherSliceOpKernel()
    61  out[i] = (out_of_bounds) ? T(0) : ldg(params + offset + loc_offset);  in GatherSliceOpKernel()
|
D | multinomial_op_gpu.cu.cc | 47 if (ldg(maxima + maxima_idx) == ldg(scores + index)) { in MultinomialKernel()
|
D | spacetobatch_functor_gpu.cu.cc |
    89  ldg(space_tensor_ptr + space_tensor_idx);  in S2B()
    92  ldg(batch_tensor_ptr + batch_tensor_idx);  in S2B()
|
D | scatter_nd_op_gpu.cu.cc |
    95  const Index ix_d = internal::SubtleMustCopy(ldg(indices + offset));  in ScatterNdOpKernel()
    102  update(out + i + si, ldg(updates + (index * slice_size + si)));  in ScatterNdOpKernel()
|
D | segment_reduction_ops_gpu.cu.cc |
    91  sum += ldg(input + (input_outer_dim_index_base + j) * inner_dim_size +  in SortedSegmentSumCustomKernel()
    124  KernelReductionFunctor()(output + output_index, ldg(input + input_index));  in UnsortedSegmentCustomKernel()
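
A sketch of the unsorted-segment case matched above: every input row is accumulated into its segment's output row with an atomic add, with both the data and the segment ids pulled through __ldg. Names are illustrative and a float sum reduction is assumed:

__global__ void UnsortedSegmentSumSketch(const float* __restrict__ input,
                                         const int* __restrict__ segment_ids,
                                         float* output, int num_rows,
                                         int inner_dim_size) {
  int total = num_rows * inner_dim_size;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < total;
       i += gridDim.x * blockDim.x) {
    int row = i / inner_dim_size;
    int col = i % inner_dim_size;
    int segment = __ldg(segment_ids + row);  // which output row to add into
    atomicAdd(output + segment * inner_dim_size + col, __ldg(input + i));
  }
}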
|
D | transpose_functor_gpu.cu.cc |
    49  dst[o_idx] = Eigen::numext::conj(ldg(src + i_idx));  in TransposeKernel()
    51  dst[o_idx] = ldg(src + i_idx);  in TransposeKernel()
|
D | dynamic_stitch_op_gpu.cu.cc | 44 output[output_index] = ldg(data_ptrs[input_index] + slice_offset); in DynamicStitchKernel()
|
D | dynamic_partition_op_gpu.cu.cc |
    68  int32 N = min(ldg(size), out_size);  in MoveValuesKernel()
    70  int32 key = ldg(keys + i);  in MoveValuesKernel()
    71  int32 value = ldg(values + i);  in MoveValuesKernel()
|
/external/tensorflow/tensorflow/contrib/seq2seq/kernels/ |
D | beam_search_ops_gpu.cu.cc |
    39  Eigen::numext::mini(max_time, ldg(max_sequence_lengths + batch));  in GatherTreeOpKernel()
    47  beams[initial_beam_ix] = ldg(step_ids + initial_beam_ix);  in GatherTreeOpKernel()
    48  int32 parent = ldg(parent_ids + initial_beam_ix);  in GatherTreeOpKernel()
    58  beams[level_beam_ix] = ldg(step_ids + level_parent_ix);  in GatherTreeOpKernel()
    59  parent = ldg(parent_ids + level_parent_ix);  in GatherTreeOpKernel()
|
/external/swiftshader/third_party/llvm-7.0/llvm/test/CodeGen/NVPTX/ |
D | ldu-ldg.ll |
    6  declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
    7  declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
    27  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
    34  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
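
These NVPTX tests cover the @llvm.nvvm.ldg.* intrinsics that, to my understanding, __ldg() calls are lowered to, ending up as the PTX ld.global.nc instruction (a load through the non-coherent, read-only data cache). A trivial CUDA source that takes this path, purely for illustration:

__global__ void LdgSketch(const int* __restrict__ in, int* out) {
  // __ldg forces the load through the read-only cache; on the LLVM/NVPTX side
  // this is what the ldg intrinsics above represent.
  out[threadIdx.x] = __ldg(in + threadIdx.x);
}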
|
D | bug26185-2.ll |
    3  ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
    4  ; extending variants in the backend, but the ldg/ldu selection code may pick
|
D | read-global-variable-constant.ll | 4 ; ld.global.nc (aka ldg).
|
/external/llvm/test/CodeGen/NVPTX/ |
D | ldu-ldg.ll |
    6  declare i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 %align)
    7  declare i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 %align)
    27  %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1i8(i8 addrspace(1)* %ptr, i32 4)
    34  %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1i32(i32 addrspace(1)* %ptr, i32 4)
|
D | bug26185-2.ll |
    3  ; Verify that we correctly emit code for extending ldg/ldu. We do not expose
    4  ; extending variants in the backend, but the ldg/ldu selection code may pick
|