// Example code illustrating the theory exposed in doc/quantization.md

/* Command line to build and run on x86:

c++ doc/quantization_example.cc -I . --std=c++11 -msse4.1 -lpthread \
-o /tmp/quantization_example && \
/tmp/quantization_example

*/

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
#include "../public/gemmlowp.h"
#include "../public/output_stages.h"
// We will handle both float and quantized matrices, which we will
// represent as gemmlowp::MatrixMap.
// We will need to be able to print them.

// Output a matrix to a std::ostream
template <typename tScalar, gemmlowp::MapOrder tOrder>
std::ostream& operator<<(std::ostream& s,
                         const gemmlowp::MatrixMap<tScalar, tOrder>& m) {
  for (int i = 0; i < m.rows(); i++) {
    for (int j = 0; j < m.cols(); j++) {
      if (j) {
        s << '\t';
      }
      s << static_cast<float>(m(i, j));
    }
    s << '\n';
  }
  return s;
}

// Find the min and max value in a float matrix.
template <gemmlowp::MapOrder tOrder>
void FindMinMax(const gemmlowp::MatrixMap<float, tOrder>& m, float* min,
                float* max) {
  *min = *max = m(0, 0);
  for (int i = 0; i < m.rows(); i++) {
    for (int j = 0; j < m.cols(); j++) {
      const float val = m(i, j);
      *min = std::min(*min, val);
      *max = std::max(*max, val);
    }
  }
}

// A structure to hold quantization parameters 'scale' and 'zero_point'
// as discussed in doc/quantization.md. As explained there, the meaning
// of these values is as the constants in the quantization equation
//
//   real_value = scale * (quantized_value - zero_point)
//
// In other words, 'zero_point' is the quantized value that corresponds
// to the real value 0, and 'scale' is the difference of real values
// corresponding to consecutive quantized values.
struct QuantizationParams {
  float scale;
  std::uint8_t zero_point;
};
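
// For example, with scale = 0.5 and zero_point = 10, the quantized value
// 10 represents the real value 0, the quantized value 12 represents
// 0.5 * (12 - 10) = 1.0, and the representable real range is
// [0.5 * (0 - 10), 0.5 * (255 - 10)] = [-5, 122.5].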

// Given the min and max values of a float array, return
// reasonable quantization parameters to use for this array.
QuantizationParams ChooseQuantizationParams(float min, float max) {
  // We extend the [min, max] interval to ensure that it contains 0.
  // Otherwise, we would not meet the requirement that 0 be an exactly
  // representable value.
  min = std::min(min, 0.f);
  max = std::max(max, 0.f);

  // the min and max quantized values, as floating-point values
  const float qmin = 0;
  const float qmax = 255;

  // First determine the scale.
  const double scale = (max - min) / (qmax - qmin);

  // Zero-point computation.
  // First the initial floating-point computation. The zero-point can be
  // determined from solving an affine equation for any known pair
  // (real value, corresponding quantized value).
  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  // Let's use the first one here.
  const double initial_zero_point = qmin - min / scale;

  // Now we need to nudge the zero point to be an integer
  // (our zero points are integer, and this is motivated by the requirement
  // to be able to represent the real value "0" exactly as a quantized value,
  // which is required in multiple places, for example in Im2col with SAME
  // padding).
  std::uint8_t nudged_zero_point = 0;
  if (initial_zero_point < qmin) {
    nudged_zero_point = qmin;
  } else if (initial_zero_point > qmax) {
    nudged_zero_point = qmax;
  } else {
    nudged_zero_point =
        static_cast<std::uint8_t>(std::round(initial_zero_point));
  }

  QuantizationParams result;
  result.scale = scale;
  result.zero_point = nudged_zero_point;
  return result;
}
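
// For example, for min = -1.f and max = 1.f, the above gives
// scale = (1 - (-1)) / (255 - 0) = 2/255 ~= 0.00784, and
// initial_zero_point = 0 - (-1) / scale = 127.5, nudged to 128: the real
// value 0 is then exactly representable, as the quantized value 128.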

// Plain float matrix multiplication, used to compute the reference result.
template <gemmlowp::MapOrder tLhsOrder, gemmlowp::MapOrder tRhsOrder,
          gemmlowp::MapOrder tResultOrder>
void FloatMatrixMultiplication(
    const gemmlowp::MatrixMap<const float, tLhsOrder>& lhs,
    const gemmlowp::MatrixMap<const float, tRhsOrder>& rhs,
    gemmlowp::MatrixMap<float, tResultOrder>* result) {
  assert(lhs.cols() == rhs.rows());
  assert(lhs.rows() == result->rows());
  assert(rhs.cols() == result->cols());
  for (int i = 0; i < lhs.rows(); i++) {
    for (int k = 0; k < rhs.cols(); k++) {
      (*result)(i, k) = 0;
      for (int j = 0; j < lhs.cols(); j++) {
        (*result)(i, k) += lhs(i, j) * rhs(j, k);
      }
    }
  }
}

// Quantizes a float array into a uint8 array with the given quantization
// parameters: q = zero_point + r / scale, rounded and clamped to [0, 255].
void Quantize(const QuantizationParams& qparams, const std::vector<float>& src,
              std::vector<std::uint8_t>* dst) {
  assert(src.size() == dst->size());
  for (std::size_t i = 0; i < src.size(); i++) {
    const float real_val = src[i];
    const float transformed_val = qparams.zero_point + real_val / qparams.scale;
    const float clamped_val = std::max(0.f, std::min(255.f, transformed_val));
    (*dst)[i] = static_cast<std::uint8_t>(std::round(clamped_val));
  }
}

// Dequantizes a uint8 array back into floats by applying the quantization
// equation: r = scale * (q - zero_point).
void Dequantize(const QuantizationParams& qparams,
                const std::vector<std::uint8_t>& src, std::vector<float>* dst) {
  assert(src.size() == dst->size());
  for (std::size_t i = 0; i < src.size(); i++) {
    const std::uint8_t quantized_val = src[i];
    (*dst)[i] = qparams.scale * (quantized_val - qparams.zero_point);
  }
}

// A matrix that owns its storage, with a MatrixMap view over it.
template <typename tScalar, gemmlowp::MapOrder tOrder>
class MatrixWithStorage {
 public:
  MatrixWithStorage(int rows, int cols)
      : storage(rows * cols), matrix_map(storage.data(), rows, cols) {}
  void MakeRandom() {
    static std::mt19937 random_engine;
    std::uniform_real_distribution<float> distribution(-1, 1);
    for (auto& x : storage) {
      x = static_cast<tScalar>(distribution(random_engine));
    }
  }
  gemmlowp::MatrixMap<const tScalar, tOrder> ConstMap() const {
    return gemmlowp::MatrixMap<const tScalar, tOrder>(
        storage.data(), matrix_map.rows(), matrix_map.cols());
  }
  gemmlowp::MatrixMap<tScalar, tOrder> Map() {
    return gemmlowp::MatrixMap<tScalar, tOrder>(
        storage.data(), matrix_map.rows(), matrix_map.cols());
  }
  const std::vector<tScalar>& Storage() const { return storage; }
  std::vector<tScalar>& Storage() { return storage; }

 private:
  std::vector<tScalar> storage;
  gemmlowp::MatrixMap<tScalar, tOrder> matrix_map;
};

template <typename tScalar, gemmlowp::MapOrder tOrder>
std::ostream& operator<<(std::ostream& s,
                         const MatrixWithStorage<tScalar, tOrder>& m) {
  return s << m.ConstMap();
}

// Given a real_multiplier in the interval (0, 1),
// produces a pair (quantized_multiplier, right_shift) where
// quantized_multiplier is an int32 representing a fixed-point value
// in the interval [-1, 1) (in practice we only produce positive values)
// and right_shift is an amount to shift right by, so that the
// floating-point multiplication of some int32 input value by real_multiplier,
//
//   return static_cast<int32>(int32_value * real_multiplier);
//
// is best approximated by the integer-arithmetic-only code
//
//   return RoundingRightShift(
//       FixedPointMultiplication(int32_value, quantized_multiplier),
//       right_shift);
//
// This is how to obtain the fixed-point multiplier and right shift
// parameters to pass to OutputStageQuantizeDownInt32ByFixedPoint.
//
// Note: all this code only needs to run offline to generate the quantized
// neural network workload, not at runtime on the device on which quantized
// neural networks need to run. So it's not performance-critical at all.
void QuantizeMultiplierSmallerThanOne(float real_multiplier,
                                      std::int32_t* quantized_multiplier,
                                      int* right_shift) {
  assert(real_multiplier > 0.f);
  assert(real_multiplier < 1.f);
  int s = 0;
  // We want to bring the real multiplier into the interval [1/2, 1).
  // We can do so by multiplying it by two, and recording how many times
  // we multiplied by two so that we can compensate that by a right
  // shift by the same amount.
  while (real_multiplier < 0.5f) {
    real_multiplier *= 2.0f;
    s++;
  }
  // Now that the real multiplier is in [1/2, 1), we convert it
  // into a fixed-point number.
  std::int64_t q =
      static_cast<std::int64_t>(std::round(real_multiplier * (1ll << 31)));
  assert(q <= (1ll << 31));
  // Handle the special case when the real multiplier was so close to 1
  // that its fixed-point approximation was indistinguishable from 1.
  // We handle this by dividing it by two, and remembering to decrement
  // the right shift amount.
  if (q == (1ll << 31)) {
    q /= 2;
    s--;
  }
  assert(s >= 0);
  assert(q <= std::numeric_limits<std::int32_t>::max());
  *quantized_multiplier = static_cast<std::int32_t>(q);
  *right_shift = s;
}
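
// Worked example: real_multiplier = 0.125 is doubled twice by the loop
// above to reach 0.5, so right_shift = 2 and
// quantized_multiplier = round(0.5 * 2^31) = 2^30 = 1073741824. The runtime
// then approximates x * 0.125 by a fixed-point multiplication by
// 1073741824 / 2^31 followed by a rounding right shift by 2.
//
// Below is a minimal sketch, for illustration only, of what that
// integer-arithmetic-only runtime computation can look like. It is NOT
// gemmlowp's actual implementation (the output stage uses the primitives
// from fixedpoint/fixedpoint.h, which also handle saturation); the function
// name and the exact rounding behavior here are our own.
std::int32_t SketchMultiplyByQuantizedMultiplier(
    std::int32_t x, std::int32_t quantized_multiplier, int right_shift) {
  // Fixed-point multiplication: multiply into 64 bits, then take the high
  // 32 bits with rounding, i.e. multiply by quantized_multiplier / 2^31.
  const std::int64_t product =
      static_cast<std::int64_t>(x) * quantized_multiplier;
  const std::int64_t fixed_point_product = (product + (1ll << 30)) >> 31;
  // Rounding right shift by right_shift.
  const std::int64_t rounding =
      right_shift > 0 ? (1ll << (right_shift - 1)) : 0;
  return static_cast<std::int32_t>(
      (fixed_point_product + rounding) >> right_shift);
}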

int main() {
  std::cout.precision(3);

  const int rows = 2;
  const int depth = 4;
  const int cols = 3;
  const auto kOrder = gemmlowp::MapOrder::ColMajor;

  std::cout << "First, let us make some float matrices LHS and RHS, "
            << "and compute their product.\n"
            << std::endl;
  MatrixWithStorage<float, kOrder> float_lhs(rows, depth);
  float_lhs.MakeRandom();
  MatrixWithStorage<float, kOrder> float_rhs(depth, cols);
  float_rhs.MakeRandom();
  MatrixWithStorage<float, kOrder> reference_float_result(rows, cols);
  auto reference_float_result_map = reference_float_result.Map();
  FloatMatrixMultiplication(float_lhs.ConstMap(), float_rhs.ConstMap(),
                            &reference_float_result_map);
  std::cout << "Here is the float LHS matrix:\n" << float_lhs << std::endl;
  std::cout << "Here is the float RHS matrix:\n" << float_rhs << std::endl;
  std::cout << "Here is the float product (LHS * RHS) matrix obtained by "
            << "ordinary float matrix multiplication, i.e. as far as we are "
            << "concerned, the REFERENCE RESULT:\n"
            << reference_float_result << std::endl;

  std::cout
      << "Now we embark on reproducing this result using "
      << "quantized arithmetic. The code below splits into two parts: "
      << "quantization code that only needs to run offline (e.g. to "
      << "generate a quantized neural network workload), and actual "
      << "runtime quantized code, which is typically performance-critical "
      << "and where we typically do not want to use any floating-point "
      << "arithmetic. We want to clearly distinguish between the two.\n"
      << std::endl;

  std::cout << "The below is OFFLINE QUANTIZATION CODE. We still use some "
            << "floating-point arithmetic in the process of generating the "
            << "quantized workload to be run on-device.\n"
            << std::endl;

  std::cout
      << "Now, let us choose quantization parameters for these matrices. "
      << "You might ask, what good is quantization if we need to pick "
      << "quantization parameters for the result before we can run the "
      << "quantized computation to obtain the result? The idea is that we "
      << "target applications such as neural networks, where unknown results "
      << "are only allowed to vary within preexisting bounds. In practice, "
      << "the bounds for the results are typically learned during the "
      << "neural network training process. The min and max of the result "
      << "do not have to be exact. If they are too broad, we just get lower "
      << "quantization accuracy. If they are too narrow, we just get "
      << "clamping at the bounds.\n"
      << std::endl;

  float lhs_min, lhs_max, rhs_min, rhs_max, result_min, result_max;
  FindMinMax(float_lhs.Map(), &lhs_min, &lhs_max);
  FindMinMax(float_rhs.Map(), &rhs_min, &rhs_max);
  FindMinMax(reference_float_result.Map(), &result_min, &result_max);
  const auto lhs_qparams = ChooseQuantizationParams(lhs_min, lhs_max);
  const auto rhs_qparams = ChooseQuantizationParams(rhs_min, rhs_max);
  const auto result_qparams = ChooseQuantizationParams(result_min, result_max);

  std::cout << "For LHS, we have min = " << lhs_min << ", max = " << lhs_max
            << ", scale = " << lhs_qparams.scale
            << ", zero_point = " << static_cast<float>(lhs_qparams.zero_point)
            << std::endl;
  std::cout << "For RHS, we have min = " << rhs_min << ", max = " << rhs_max
            << ", scale = " << rhs_qparams.scale
            << ", zero_point = " << static_cast<float>(rhs_qparams.zero_point)
            << std::endl;
  std::cout << "For the result, we have min = " << result_min
            << ", max = " << result_max << ", scale = " << result_qparams.scale
            << ", zero_point = "
            << static_cast<float>(result_qparams.zero_point) << std::endl;

  std::cout << std::endl;

  MatrixWithStorage<std::uint8_t, kOrder> uint8_lhs(rows, depth);
  MatrixWithStorage<std::uint8_t, kOrder> uint8_rhs(depth, cols);
  MatrixWithStorage<std::uint8_t, kOrder> actual_uint8_result(rows, cols);

  Quantize(lhs_qparams, float_lhs.Storage(), &uint8_lhs.Storage());
  Quantize(rhs_qparams, float_rhs.Storage(), &uint8_rhs.Storage());

  std::cout << "Quantized uint8 LHS matrix:\n" << uint8_lhs << std::endl;
  std::cout << "Quantized uint8 RHS matrix:\n" << uint8_rhs << std::endl;

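  // gemmlowp's GemmWithOutputPipeline adds lhs_offset and rhs_offset to the
  // uint8 matrix entries before multiplying and accumulating them. Passing
  // the negated zero points therefore makes the GEMM accumulate products of
  // (quantized_value - zero_point) terms, as the quantization equation
  // requires; result_offset is the result's zero point, added back after the
  // fixed-point rescaling in the output pipeline below.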
  const int lhs_offset = -lhs_qparams.zero_point;
  const int rhs_offset = -rhs_qparams.zero_point;
  const int result_offset = result_qparams.zero_point;

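  // As derived in doc/quantization.md, the int32 accumulators carry an
  // implicit scale of lhs_scale * rhs_scale, so re-expressing them in the
  // result's own scale means multiplying by this combined ratio. In practice
  // it lies in (0, 1), which is what QuantizeMultiplierSmallerThanOne
  // requires.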
  const float real_multiplier =
      lhs_qparams.scale * rhs_qparams.scale / result_qparams.scale;
  std::int32_t quantized_multiplier;
  int right_shift;
  QuantizeMultiplierSmallerThanOne(real_multiplier, &quantized_multiplier,
                                   &right_shift);

  std::cout << "End of OFFLINE QUANTIZATION CODE.\n" << std::endl;

  std::cout << "The below is ON-DEVICE RUNTIME QUANTIZED CODE. "
            << "This is the part that is performance-critical and may only "
            << "use quantized arithmetic.\n"
            << std::endl;

  gemmlowp::OutputStageQuantizeDownInt32ByFixedPoint quantize_down_stage;
  quantize_down_stage.result_offset_after_shift = result_offset;
  quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
  quantize_down_stage.result_shift = right_shift;
  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
  const auto& output_pipeline =
      std::make_tuple(quantize_down_stage, saturating_cast_stage);

  auto actual_uint8_result_map = actual_uint8_result.Map();
  gemmlowp::GemmContext gemm_context;
  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
                                   gemmlowp::DefaultL8R8BitDepthParams>(
      &gemm_context, uint8_lhs.ConstMap(), uint8_rhs.ConstMap(),
      &actual_uint8_result_map, lhs_offset, rhs_offset, output_pipeline);

  std::cout << "Quantized uint8 result matrix obtained by quantized "
            << "multiplication:\n"
            << actual_uint8_result << std::endl;

  std::cout << "End of ON-DEVICE RUNTIME QUANTIZED CODE.\n" << std::endl;

  MatrixWithStorage<float, kOrder> actual_float_result(rows, cols);
  Dequantize(result_qparams, actual_uint8_result.Storage(),
             &actual_float_result.Storage());
  std::cout
      << "Here is the actual float product (LHS * RHS) matrix obtained by "
      << "dequantizing the above uint8 result, i.e. "
      << "as far as we are concerned, the ACTUAL RESULT:\n"
      << actual_float_result << std::endl;

  MatrixWithStorage<float, kOrder> diff_float_result(rows, cols);
  for (int i = 0; i < rows; i++) {
    for (int j = 0; j < cols; j++) {
      diff_float_result.Map()(i, j) =
          actual_float_result.Map()(i, j) - reference_float_result.Map()(i, j);
    }
  }

  std::cout << "Difference between ACTUAL and REFERENCE float results:\n"
            << diff_float_result << std::endl;
}