1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // NEON implementations of Image methods for compatible devices. Control
17 // should never enter this compilation unit on incompatible devices.
18
19 #ifdef __ARM_NEON
20
21 #include <arm_neon.h>
22
23 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
24 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
25 #include "tensorflow/examples/android/jni/object_tracking/image.h"
26 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
27
28 namespace tf_tracking {
29
GetSum(const float32x4_t & values)30 inline static float GetSum(const float32x4_t& values) {
31 static float32_t summed_values[4];
32 vst1q_f32(summed_values, values);
33 return summed_values[0]
34 + summed_values[1]
35 + summed_values[2]
36 + summed_values[3];
37 }
38
39
ComputeMeanNeon(const float * const values,const int num_vals)40 float ComputeMeanNeon(const float* const values, const int num_vals) {
41 SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);
42
43 const float32_t* const arm_vals = (const float32_t* const) values;
44 float32x4_t accum = vdupq_n_f32(0.0f);
45
46 int offset = 0;
47 for (; offset <= num_vals - 4; offset += 4) {
48 accum = vaddq_f32(accum, vld1q_f32(&arm_vals[offset]));
49 }
50
51 // Pull the accumulated values into a single variable.
52 float sum = GetSum(accum);
53
54 // Get the remaining 1 to 3 values.
55 for (; offset < num_vals; ++offset) {
56 sum += values[offset];
57 }
58
59 const float mean_neon = sum / static_cast<float>(num_vals);
60
61 #ifdef SANITY_CHECKS
62 const float mean_cpu = ComputeMeanCpu(values, num_vals);
63 SCHECK(NearlyEqual(mean_neon, mean_cpu, EPSILON * num_vals),
64 "Neon mismatch with CPU mean! %.10f vs %.10f",
65 mean_neon, mean_cpu);
66 #endif
67
68 return mean_neon;
69 }
70
71
ComputeStdDevNeon(const float * const values,const int num_vals,const float mean)72 float ComputeStdDevNeon(const float* const values,
73 const int num_vals, const float mean) {
74 SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);
75
76 const float32_t* const arm_vals = (const float32_t* const) values;
77 const float32x4_t mean_vec = vdupq_n_f32(-mean);
78
79 float32x4_t accum = vdupq_n_f32(0.0f);
80
81 int offset = 0;
82 for (; offset <= num_vals - 4; offset += 4) {
83 const float32x4_t deltas =
84 vaddq_f32(mean_vec, vld1q_f32(&arm_vals[offset]));
85
86 accum = vmlaq_f32(accum, deltas, deltas);
87 }
88
89 // Pull the accumulated values into a single variable.
90 float squared_sum = GetSum(accum);
91
92 // Get the remaining 1 to 3 values.
93 for (; offset < num_vals; ++offset) {
94 squared_sum += Square(values[offset] - mean);
95 }
96
97 const float std_dev_neon = sqrt(squared_sum / static_cast<float>(num_vals));
98
99 #ifdef SANITY_CHECKS
100 const float std_dev_cpu = ComputeStdDevCpu(values, num_vals, mean);
101 SCHECK(NearlyEqual(std_dev_neon, std_dev_cpu, EPSILON * num_vals),
102 "Neon mismatch with CPU std dev! %.10f vs %.10f",
103 std_dev_neon, std_dev_cpu);
104 #endif
105
106 return std_dev_neon;
107 }
108
109
ComputeCrossCorrelationNeon(const float * const values1,const float * const values2,const int num_vals)110 float ComputeCrossCorrelationNeon(const float* const values1,
111 const float* const values2,
112 const int num_vals) {
113 SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);
114
115 const float32_t* const arm_vals1 = (const float32_t* const) values1;
116 const float32_t* const arm_vals2 = (const float32_t* const) values2;
117
118 float32x4_t accum = vdupq_n_f32(0.0f);
119
120 int offset = 0;
121 for (; offset <= num_vals - 4; offset += 4) {
122 accum = vmlaq_f32(accum,
123 vld1q_f32(&arm_vals1[offset]),
124 vld1q_f32(&arm_vals2[offset]));
125 }
126
127 // Pull the accumulated values into a single variable.
128 float sxy = GetSum(accum);
129
130 // Get the remaining 1 to 3 values.
131 for (; offset < num_vals; ++offset) {
132 sxy += values1[offset] * values2[offset];
133 }
134
135 const float cross_correlation_neon = sxy / num_vals;
136
137 #ifdef SANITY_CHECKS
138 const float cross_correlation_cpu =
139 ComputeCrossCorrelationCpu(values1, values2, num_vals);
140 SCHECK(NearlyEqual(cross_correlation_neon, cross_correlation_cpu,
141 EPSILON * num_vals),
142 "Neon mismatch with CPU cross correlation! %.10f vs %.10f",
143 cross_correlation_neon, cross_correlation_cpu);
144 #endif
145
146 return cross_correlation_neon;
147 }
148
149 } // namespace tf_tracking
150
151 #endif // __ARM_NEON
152