/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/ |
D | reduce_fp32.c | 26 int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_dat… in ReduceMean() argument 36 const float *outer_src = src_data + j * axis_size * inner_size; in ReduceMean() 37 float *outer_dst = dst_data + j * inner_size; in ReduceMean() 38 for (k = 0; k < inner_size; k++) { in ReduceMean() 43 tmp += inner_src[i * inner_size]; in ReduceMean() 51 int IntReduceMean(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data… in IntReduceMean() argument 65 int block_mod = inner_size % C4NUM; in IntReduceMean() 66 int block_c4 = inner_size - block_mod; in IntReduceMean() 69 const int *outer_src = src_data + j * axis_size * inner_size; in IntReduceMean() 70 int *outer_dst = dst_data + j * inner_size; in IntReduceMean() [all …]
|
D | reduce_fp32.h | 25 int ReduceMean(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_dat… 27 int IntReduceMean(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data… 29 int ReduceSum(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data… 31 int IntReduceSum(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data,… 33 int ReduceMax(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data… 35 int IntReduceMax(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data,… 37 int ReduceMin(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_data… 39 int IntReduceMin(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data,… 41 int ReduceProd(int outer_size, int inner_size, int axis_size, const float *src_data, float *dst_dat… 43 int IntReduceProd(int outer_size, int inner_size, int axis_size, const int *src_data, int *dst_data… [all …]
|
D | log_softmax_fp32.c | 50 int inner_size = 1; in LogSoftmax() local 57 inner_size *= input_shape[i]; in LogSoftmax() 60 int outter_offset = i * input_shape[axis] * inner_size; in LogSoftmax() 61 int sum_outter_offset = i * inner_size; in LogSoftmax() 62 for (int k = 0; k < inner_size; k++) { in LogSoftmax() 67 int axis_offset = inner_offset + j * inner_size; in LogSoftmax() 71 int axis_offset = inner_offset + j * inner_size; in LogSoftmax() 78 int outter_offset = i * input_shape[axis] * inner_size; in LogSoftmax() 79 int sum_outter_offset = i * inner_size; in LogSoftmax() 81 int axis_offset = outter_offset + j * inner_size; in LogSoftmax() [all …]
|
D | softmax_fp32.c | 109 int inner_size = 1; in Softmax() local 116 inner_size *= input_shape[i]; in Softmax() 119 int outter_offset = i * input_shape[axis] * inner_size; in Softmax() 120 int sum_outter_offset = i * inner_size; in Softmax() 121 for (int k = 0; k < inner_size; k++) { in Softmax() 126 int axis_offset = inner_offset + j * inner_size; in Softmax() 130 int axis_offset = inner_offset + j * inner_size; in Softmax() 137 int outter_offset = i * input_shape[axis] * inner_size; in Softmax() 138 int sum_outter_offset = i * inner_size; in Softmax() 140 int axis_offset = outter_offset + j * inner_size; in Softmax() [all …]
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/ |
D | scatter_functor_impl.cu | 21 __global__ void ScatterUpdateKernel(const size_t inner_size, const size_t updates_size, const S *in… in ScatterUpdateKernel() argument 24 const size_t index = pos / inner_size; in ScatterUpdateKernel() 25 const size_t offset = pos % inner_size; in ScatterUpdateKernel() 26 const size_t current_pos = indices[index] * inner_size + offset; in ScatterUpdateKernel() 32 __global__ void ScatterAddKernel(const size_t inner_size, const size_t updates_size, const S *indic… in ScatterAddKernel() argument 35 const size_t index = pos / inner_size; in ScatterAddKernel() 36 const size_t offset = pos % inner_size; in ScatterAddKernel() 37 const size_t current_pos = indices[index] * inner_size + offset; in ScatterAddKernel() 43 __global__ void ScatterSubKernel(const size_t inner_size, const size_t updates_size, const S *indic… in ScatterSubKernel() argument 46 const size_t index = pos / inner_size; in ScatterSubKernel() [all …]
|
D | index_add_impl.cu | 23 const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size) { in IndexAddAtomic() argument 25 const size_t src_axis_idx = (pos / inner_size) % src_axis_size; in IndexAddAtomic() 26 const size_t src_outer_idx = pos / (src_axis_size * inner_size); in IndexAddAtomic() 29 const size_t dst_inner_idx = pos % inner_size; in IndexAddAtomic() 30 …const size_t dst_idx = src_outer_idx * (dst_axis_size * inner_size) + dst_axis_idx * inner_size + … in IndexAddAtomic() 38 const size_t src_axis_size, const size_t dst_axis_size, const size_t inner_size) { in IndexAdd() argument 40 const size_t src_axis_idx = (pos / inner_size) % src_axis_size; in IndexAdd() 41 const size_t src_outer_idx = pos / (src_axis_size * inner_size); in IndexAdd() 44 const size_t dst_inner_idx = pos % inner_size; in IndexAdd() 45 …const size_t dst_idx = src_outer_idx * (dst_axis_size * inner_size) + dst_axis_idx * inner_size + … in IndexAdd() [all …]
|
D | argmax_impl.cu | 22 const size_t inner_size, S *output) { in Argmax() argument 23 for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < outer_size * inner_size; in Argmax() 25 size_t x = pos / inner_size % outer_size; in Argmax() 26 size_t y = pos % inner_size; in Argmax() 28 size_t input_offset = x * bound * inner_size + 0 * inner_size + y; in Argmax() 31 input_offset = x * bound * inner_size + i * inner_size + y; in Argmax() 42 void CalArgmax(const T *input, const S bound, const size_t outer_size, const size_t inner_size, in CalArgmax() argument 44 …gmax<<<GET_BLOCKS(outer_size), GET_THREADS, 0, cuda_stream>>>(input, bound, outer_size, inner_size, in CalArgmax() 50 const size_t inner_size, int *output, cudaStream_t cuda_stream); 52 const size_t inner_size, int *output, cudaStream_t cuda_stream);
|
D | unsorted_segment_max.cu | 22 size_t inner_size, bool fp16_flag, T init_K, T *output) { in UnsortedSegmentMax() argument 27 …ize_t t_idx = blockIdx.x * blockDim.x + threadIdx.x; t_idx < KWARPSIZE * num_segments * inner_size; in UnsortedSegmentMax() 29 size_t segment_id = t_idx / KWARPSIZE / inner_size; in UnsortedSegmentMax() 30 size_t inner_id = t_idx / KWARPSIZE % inner_size; in UnsortedSegmentMax() 36 T other_K = input[i * inner_size + inner_id]; in UnsortedSegmentMax() 53 output[segment_id * inner_size + inner_id] = threadK; in UnsortedSegmentMax() 61 size_t inner_size, T *output, cudaStream_t stream) { in CalUnsortedSegmentMax() argument 62 size_t size = (inner_size * KWARPSIZE * num_segments); in CalUnsortedSegmentMax() 70 … inner_size, fp16_flag, init_K, output); in CalUnsortedSegmentMax() 75 size_t outer_size, size_t inner_size, float *output, [all …]
|
D | general_reduction_impl.cu | 47 __global__ void ThreadReduction(bool small, size_t outer_size, size_t bound, size_t inner_size, con… in ThreadReduction() argument 55 for (int t_idx = blockIdx.x * blockDim.x + threadIdx.x; t_idx < outer_size * inner_size; in ThreadReduction() 57 int outer_id = t_idx / inner_size; in ThreadReduction() 58 int inner_id = t_idx % inner_size; in ThreadReduction() 64 T other_K = input[outer_id * bound * inner_size + i * inner_size + inner_id]; in ThreadReduction() 72 output[outer_id * inner_size + inner_id] = threadK; in ThreadReduction() 73 output_index[outer_id * inner_size + inner_id] = threadV; in ThreadReduction() 78 __global__ void WarpReduction(bool small, size_t outer_size, size_t bound, size_t inner_size, const… in WarpReduction() argument 85 …or (int t_idx = blockIdx.x * blockDim.x + threadIdx.x; t_idx < kWarpSize * outer_size * inner_size; in WarpReduction() 87 int outer_id = t_idx / kWarpSize / inner_size; in WarpReduction() [all …]
|
D | unsorted_segment_min.cu | 32 size_t outer_size, size_t inner_size, T init_K, T *output) { in UnsortedSegmentMin() argument 34 … (int t_idx = blockIdx.x * blockDim.x + threadIdx.x; t_idx < KWARPSIZE * num_segments * inner_size; in UnsortedSegmentMin() 36 int segment_id = t_idx / KWARPSIZE / inner_size; in UnsortedSegmentMin() 37 int inner_id = t_idx / KWARPSIZE % inner_size; in UnsortedSegmentMin() 43 T other_K = input[i * inner_size + inner_id]; in UnsortedSegmentMin() 58 output[segment_id * inner_size + inner_id] = threadK; in UnsortedSegmentMin() 66 size_t inner_size, T *output, cudaStream_t stream) { in CalUnsortedSegmentMin() argument 67 int size = (inner_size * KWARPSIZE * num_segments); in CalUnsortedSegmentMin() 70 inner_size, init_K, output); in CalUnsortedSegmentMin() 75 … size_t outer_size, size_t inner_size, float *output, cudaStream_t stream); [all …]
|
D | topk_impl.cu | 27 …<<<block_num_limit, BLOCK, 0, stream>>>(outer_size, inner_size, input, output, output_index, k_cut… 100 inline __device__ void TopKStep(const int &outer_size, const int &inner_size, const T *input, T *ou… in TopKStep() argument 113 int limit = (inner_size / kWarpSize) * kWarpSize; in TopKStep() 135 LEFT_INSERT_THREAD_QUEUE((input[outer_id * inner_size + i]), (outer_id * inner_size + i)); in TopKStep() 153 if (i < inner_size) { in TopKStep() 154 LEFT_INSERT_THREAD_QUEUE((input[outer_id * inner_size + i]), (outer_id * inner_size + i)); in TopKStep() 171 output_index[outer_id * k_cut + (*k_prime) + i] = shared_V[i] % inner_size; in TopKStep() 178 __global__ void TopKBlock(int outer_size, int inner_size, const T *input, T *output, S *output_inde… in TopKBlock() argument 200 …outer_size, inner_size, input, output, output_index, k_cut, init_K, outer_id, shared_K, shared_V, … in TopKBlock() 207 void FastTopK(const int outer_size, const int inner_size, const T *input, S k_cut, T *output, S *ou… in FastTopK() argument [all …]
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp16/ |
D | reduce_fp16.c | 21 int ReduceMeanFp16(int outer_size, int inner_size, int axis_size, const float16_t *src_data, float1… in ReduceMeanFp16() argument 31 const float16_t *outer_src = src_data + j * axis_size * inner_size; in ReduceMeanFp16() 32 float16_t *outer_dst = dst_data + j * inner_size; in ReduceMeanFp16() 33 for (k = 0; k < inner_size; k++) { in ReduceMeanFp16() 38 tmp += inner_src[i * inner_size]; in ReduceMeanFp16() 46 int ReduceMaxFp16(int outer_size, int inner_size, int axis_size, const float16_t *src_data, float16… in ReduceMaxFp16() argument 53 const float16_t *outer_src = src_data + j * axis_size * inner_size; in ReduceMaxFp16() 54 float16_t *outer_dst = dst_data + j * inner_size; in ReduceMaxFp16() 55 for (k = 0; k < inner_size; k++) { in ReduceMaxFp16() 60 tmp = tmp > inner_src[i * inner_size] ? tmp : inner_src[i * inner_size]; in ReduceMaxFp16() [all …]
|
D | log_softmax_fp16.c | 53 int inner_size = 1; in LogSoftmaxFp16() local 60 inner_size *= input_shape[i]; in LogSoftmaxFp16() 63 int outter_offset = i * input_shape[axis] * inner_size; in LogSoftmaxFp16() 64 int sum_outter_offset = i * inner_size; in LogSoftmaxFp16() 65 for (int k = 0; k < inner_size; k++) { in LogSoftmaxFp16() 70 int axis_offset = inner_offset + j * inner_size; in LogSoftmaxFp16() 74 int axis_offset = inner_offset + j * inner_size; in LogSoftmaxFp16() 81 int outter_offset = i * input_shape[axis] * inner_size; in LogSoftmaxFp16() 82 int sum_outter_offset = i * inner_size; in LogSoftmaxFp16() 84 int axis_offset = outter_offset + j * inner_size; in LogSoftmaxFp16() [all …]
|
D | softmax_fp16.c | 99 int inner_size = 1; in SoftmaxFp16() local 106 inner_size *= input_shape[i]; in SoftmaxFp16() 109 int outter_offset = i * input_shape[axis] * inner_size; in SoftmaxFp16() 110 int sum_outter_offset = i * inner_size; in SoftmaxFp16() 111 for (int k = 0; k < inner_size; k++) { in SoftmaxFp16() 116 int axis_offset = inner_offset + j * inner_size; in SoftmaxFp16() 120 int axis_offset = inner_offset + j * inner_size; in SoftmaxFp16() 127 int outter_offset = i * input_shape[axis] * inner_size; in SoftmaxFp16() 128 int sum_outter_offset = i * inner_size; in SoftmaxFp16() 130 int axis_offset = outter_offset + j * inner_size; in SoftmaxFp16() [all …]
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/int8/ |
D | reduce_int8.c | 176 int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *… in ReduceMeanInt8() argument 183 const int32_t *outer_src = src_data + j * axis_size * inner_size; in ReduceMeanInt8() 184 int32_t *outer_dst = dst_data + j * inner_size; in ReduceMeanInt8() 185 for (k = 0; k < inner_size; k++) { in ReduceMeanInt8() 190 int32_t tmp = inner_src[i * inner_size] - quant->in_zp_; in ReduceMeanInt8() 210 int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32… in ReduceMeanLastAxis() argument 217 const int32_t *outer_src = src_data + j * axis_size * inner_size; in ReduceMeanLastAxis() 218 int8_t *outer_dst = dst_data + j * inner_size; in ReduceMeanLastAxis() 219 for (k = 0; k < inner_size; k++) { in ReduceMeanLastAxis() 224 int32_t tmp = inner_src[i * inner_size] - quant->in_zp_; in ReduceMeanLastAxis() [all …]
|
D | reduce_int8.h | 43 int ReduceMeanInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *… 45 int ReduceMeanLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32… 47 int ReduceSumInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *s… 49 int ReduceSumLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_… 51 int ReduceMaxInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *s… 53 int ReduceMaxLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_… 55 int ReduceMinInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *s… 57 int ReduceMinLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32_… 59 int ReduceProdLastAxis(const int outer_size, const int inner_size, const int axis_size, const int32… 61 int ReduceProdInt8(const int outer_size, const int inner_size, const int axis_size, const int32_t *… [all …]
|
D | softmax_int8.c | 27 int inner_size = 1; in SoftmaxInt8() local 32 inner_size *= input_shape[i]; in SoftmaxInt8() 36 int outter_offset = o * axis_shape_size * inner_size; in SoftmaxInt8() 38 for (int c = 0; c < inner_size; c++) { in SoftmaxInt8() 41 int axis_offset = outter_offset + c + i * inner_size; in SoftmaxInt8() 47 int axis_offset = outter_offset + c + i * inner_size; in SoftmaxInt8() 58 int axis_offset = outter_offset + i * inner_size; in SoftmaxInt8() 59 for (int c = 0; c < inner_size; ++c) { in SoftmaxInt8()
|
D | l2_norm_int8.c | 23 const int inner_size = param->shape_[param->shape_num_ - 1]; in L2NormalizationInt8() local 27 for (int j = 0; j < inner_size; ++j) { in L2NormalizationInt8() 28 int32_t in = input_data[i * inner_size + j] - quant_param->in_.zp_; in L2NormalizationInt8() 34 for (int k = 0; k < inner_size; ++k) { in L2NormalizationInt8() 35 int32_t in = input_data[i * inner_size + k] - quant_param->in_.zp_; in L2NormalizationInt8() 37 output_data[i * inner_size + k] = MSMIN(127, MSMAX(-128, out)); in L2NormalizationInt8()
|
D | gather_int8.c | 22 int GatherInt8(int8_t *in_data, int8_t *out_data, int outer_size, int inner_size, int limit, const … in GatherInt8() argument 29 const int8_t *inputm = in_data + inner_size * m * limit; in GatherInt8() 30 int8_t *outputm = out_data + inner_size * m * indices_element_size; in GatherInt8() 35 for (j = 0; j < inner_size; ++j) { in GatherInt8() 36 int32_t tmp = round(alpha * (inputm[indices[i] * inner_size + j] - z1)) + z2; in GatherInt8() 39 outputm[i * inner_size + j] = (int8_t)tmp; in GatherInt8()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/ |
D | softmax.c | 59 int inner_size) { in SoftMaxP1() argument 60 if (inner_size == 1) { in SoftMaxP1() 65 int outter_offset = i * length * inner_size; in SoftMaxP1() 66 int sum_outter_offset = i * inner_size; in SoftMaxP1() 67 for (int k = 0; k < inner_size; k++) { in SoftMaxP1() 71 int axis_offset = inner_offset + j * inner_size; in SoftMaxP1() 75 int axis_offset = inner_offset + j * inner_size; in SoftMaxP1() 80 int axis_offset = inner_offset + j * inner_size; in SoftMaxP1() 89 int inner_size) { in SoftMaxP2() argument 91 int outter_offset = i * length * inner_size; in SoftMaxP2() [all …]
|
D | softmax_grad.c | 29 int inner_size = 1, outter_size = 1; in SoftmaxGrad() local 34 inner_size *= input_shape[i]; in SoftmaxGrad() 37 for (int i = 0; i < inner_size * input_shape[axis]; i++) sum_mul[i] = 1.0; in SoftmaxGrad() 43 const int N = inner_size; in SoftmaxGrad() 46 memset(sum_data, 0, (size_t)(inner_size) * sizeof(float)); in SoftmaxGrad() 47 for (int k = 0; k < inner_size; k++) { in SoftmaxGrad() 50 int offset = inner_offset + j * inner_size; in SoftmaxGrad()
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/ |
D | shift_cpu_kernel.cc | 90 const int64_t inner_size = SizeToLong(axisIterator_.InnerSize()); in Launch() local 94 (void)std::fill_n(output, outer_size * axis_size * inner_size, fill_value); in Launch() 98 if (inputs[0]->size != outer_size * axis_size * inner_size * sizeof(T)) { in Launch() 103 if ((inner_size == 1) && (outer_size == 1)) { in Launch() 119 (void)tasks.emplace_back([this, i, fill_value, axis_size, inner_size, input, output, outputs] { in Launch() 120 size_t offset = i * axis_size * inner_size; in Launch() 121 size_t input_offset = offset + copy_src_begin_ * inner_size; in Launch() 122 size_t output_offset = offset + copy_dst_begin_ * inner_size; in Launch() 123 size_t copy_size = copy_size_ * inner_size * sizeof(T); in Launch() 129 size_t fill_offset = offset + fill_begin_ * inner_size; in Launch() [all …]
|
/third_party/mindspore/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32_grad/ |
D | softmax_grad_fp32_tests.cc | 61 int inner_size = 1; in TEST_F() local 64 inner_size *= softmax_param->input_shape_[i]; in TEST_F() 66 float *sum_data = new (std::nothrow) float[inner_size]; in TEST_F() 68 …float *sum_mul = new (std::nothrow) float[inner_size * softmax_param->input_shape_[softmax_param->… in TEST_F() 121 int inner_size = 1; in TEST_F() local 124 inner_size *= softmax_param->input_shape_[i]; in TEST_F() 126 float *sum_data = new (std::nothrow) float[inner_size]; in TEST_F() 128 …float *sum_mul = new (std::nothrow) float[inner_size * softmax_param->input_shape_[softmax_param->… in TEST_F() 186 int inner_size = 1; in TEST_F() local 189 inner_size *= softmax_param->input_shape_[i]; in TEST_F() [all …]
|
/third_party/mindspore/mindspore/lite/micro/example/mnist_stm32f746/mnist_stm32f746/operator_library/kernels/nnacl/fp32/ |
D | softmax_fp32.c | 109 int inner_size = 1; in Softmax() local 116 inner_size *= input_shape[i]; in Softmax() 119 int outter_offset = i * input_shape[axis] * inner_size; in Softmax() 120 int sum_outter_offset = i * inner_size; in Softmax() 121 for (int k = 0; k < inner_size; k++) { in Softmax() 125 int axis_offset = inner_offset + j * inner_size; in Softmax() 129 int axis_offset = inner_offset + j * inner_size; in Softmax() 136 int outter_offset = i * input_shape[axis] * inner_size; in Softmax() 137 int sum_outter_offset = i * inner_size; in Softmax() 139 int axis_offset = outter_offset + j * inner_size; in Softmax() [all …]
|
/third_party/mindspore/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/ |
D | gather_base.c | 19 int Gather(const void *input, int outer_size, int inner_size, int limit, const int *indices, int in… in Gather() argument 28 const int8_t *int8_in_m = int8_in + inner_size * m * limit * data_size; in Gather() 29 int8_t *int8_out_m = int8_out + inner_size * m * indices_element_size * data_size; in Gather() 39 memcpy(int8_out_m + i * inner_size * data_size, int8_in_m + index * inner_size * data_size, in Gather() 40 data_size * inner_size); in Gather()
|