# Copyright 2016 The Gemmlowp Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generators for 1D transform kernels.

Covers requantization, quantization, dequantization, min/max clamping and
bias addition. Each generator drives an assembly emitter to produce the
inner loop of the corresponding kernel.
"""

import common


def _DuplicateGeneralRegister(size, emitter, registers, value, min_register):
    """Emit a VDup of `value` into a freshly allocated quad register.

    Args:
      size: element size forwarded to EmitVDup (callers here pass 8 or 32).
      emitter: assembly emitter used to output the instruction.
      registers: register allocator providing QuadRegister().
      value: source operand for the VDup (a mapped parameter in this file).
      min_register: forwarded to registers.QuadRegister(); presumably the
        lowest quad register index that may be allocated -- confirm against
        the allocator.

    Returns:
      The allocated quad register that receives the duplicated value.
    """
    register = registers.QuadRegister(min_register)
    emitter.EmitVDup(size, register, value)
    return register


def _DuplicateGeneralMemoryRegister(size, emitter, registers, value,
                                    min_register):
    """Emit a load of `value` followed by a VDup into a new quad register.

    Unlike _DuplicateGeneralRegister, `value` is first loaded with EmitLdr
    into a temporary general register, which is released again once the VDup
    has been emitted.

    Args:
      size: element size forwarded to EmitVDup (callers here pass 32).
      emitter: assembly emitter used to output the instructions.
      registers: register allocator.
      value: operand handed to EmitLdr (a mapped memory parameter here).
      min_register: forwarded to registers.QuadRegister().

    Returns:
      The allocated quad register that receives the duplicated value.
    """
    register = registers.QuadRegister(min_register)
    general = registers.GeneralRegister()
    emitter.EmitLdr(general, value)
    emitter.EmitVDup(size, register, general)
    registers.FreeRegister(general)
    return register


class MinMaxTransformation(object):
    """Emits code that clamps uint8 values between params.min and params.max."""

    def Check(self, in_type, out_type, kernel_size, leftovers):
        """Assert the specialization is supported (uint8 -> uint8, 16 wide)."""
        # Compare by value: `is` on str/int literals only works by accident
        # of interning and is flagged with a SyntaxWarning on Python 3.8+.
        assert in_type == 'uint8_t'
        assert out_type == 'uint8_t'
        assert kernel_size == 16
        assert leftovers < 16

    def Prepare(self, emitter, registers, unused_kernel_size):
        """Duplicate the min and max parameters into vector registers."""
        emitter.EmitNewline()
        emitter.EmitComment('MinMax::Prepare')

        self.min = _DuplicateGeneralRegister(
            8, emitter, registers,
            registers.MapParameter('min', 'params.min'), 4)
        self.max = _DuplicateGeneralRegister(
            8, emitter, registers,
            registers.MapParameter('max', 'params.max'), 4)

    def Transform(self, emitter, registers, input_address, elements,
                  output_address):
        """Generate the MinMax transform inner loop code.

        Args:
          emitter: assembly emitter.
          registers: register allocator.
          input_address: register holding the input pointer.
          elements: number of 8-bit elements processed by this step.
          output_address: register holding the output pointer.
        """
        emitter.EmitNewline()
        emitter.EmitComment('MinMax::Transform')

        # One quad register per 16 elements, rounded up. Floor division
        # keeps this an int on Python 3 so it can be passed to range().
        register_count = (elements + 15) // 16
        load = [registers.QuadRegister() for unused_i in range(register_count)]

        emitter.EmitVLoadAE(8, elements, load, input_address, None)
        emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(16))

        for register in load:
            emitter.EmitVMax('u8', register, register, self.min)

        for register in load:
            emitter.EmitVMin('u8', register, register, self.max)

        emitter.EmitNewline()
        emitter.EmitVStoreAE(8, elements, load, output_address, None)
        emitter.EmitPld(output_address)
        registers.FreeRegisters(load)


class DequantizeTransformation(object):
    """Emits code that converts uint8 values to float."""

    def Check(self, in_type, out_type, kernel_size, leftovers):
        """Assert the specialization is supported (uint8 -> float, 16 wide)."""
        assert in_type == 'uint8_t'
        assert out_type == 'float'
        assert kernel_size == 16
        assert leftovers < 16

    def Prepare(self, emitter, registers, unused_kernel_size):
        """Duplicate quantization offsets to vector registers."""
        emitter.EmitNewline()
        emitter.EmitComment('Dequantize::Prepare')

        self.range_min = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('range_min', 'params.range_min'), 4)
        self.range_offset = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('range_offset', 'params.range_offset'), 4)
        self.range_scale = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('range_scale', 'params.range_scale'), 4)

    def Transform(self, emitter, registers, input_address, elements,
                  output_address):
        """Emit the dequantization inner loop.

        After widening, each register gets, in order: a VCvt to f32, a VSub
        with range_offset, a VMul with range_scale and a VAdd with range_min.

        Args:
          emitter: assembly emitter.
          registers: register allocator.
          input_address: register holding the input pointer.
          elements: number of elements processed by this step.
          output_address: register holding the output pointer.
        """
        emitter.EmitNewline()
        emitter.EmitComment('Dequantize::Transform')

        # One quad register per 4 output elements, rounded up.
        register_count = (elements + 3) // 4
        load = [registers.QuadRegister() for unused_i in range(register_count)]

        emitter.EmitVLoadAE(8, elements, load, input_address, None)
        emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(32))

        # Widen u8 -> 16 bit -> 32 bit, spreading the data loaded into the
        # low registers across all of `load`. The exact sequence depends on
        # how many registers are in use.
        if len(load) == 1:
            emitter.EmitVMovl('u8', load[0], load[0])
            emitter.EmitVMovl('s16', load[0], load[0])
        elif len(load) == 2:
            emitter.EmitVMovl('u8', load[0], load[0])
            emitter.EmitVMovl2('s16', load[0], load[1], load[0])
        elif len(load) == 3:
            emitter.EmitVMovl2('u8', load[0], load[1], load[0])
            emitter.EmitVMovl('s16', load[2], load[1])
            emitter.EmitVMovl2('s16', load[0], load[1], load[0])
        elif len(load) == 4:
            emitter.EmitVMovl2('u8', load[0], load[1], load[0])
            emitter.EmitVMovl2('s16', load[2], load[3], load[1])
            emitter.EmitVMovl2('s16', load[0], load[1], load[0])
        else:
            assert False, 'Unsupported register count: %d' % len(load)

        for register in load:
            emitter.EmitVCvt('f32', 's32', register, register)

        for register in load:
            emitter.EmitVSub('f32', register, register, self.range_offset)

        for register in load:
            emitter.EmitVMul('f32', register, register, self.range_scale)

        for register in load:
            emitter.EmitVAdd('f32', register, register, self.range_min)

        emitter.EmitNewline()
        emitter.EmitVStoreAE(32, elements, load, output_address, None)
        emitter.EmitPld(output_address)
        registers.FreeRegisters(load)


class QuantizeTransformation(object):
    """Emits code that converts float values to uint8."""

    def Check(self, in_type, out_type, kernel_size, leftovers):
        """Assert the specialization is supported (float -> uint8, 16 wide)."""
        assert in_type == 'float'
        assert out_type == 'uint8_t'
        assert kernel_size == 16
        assert leftovers < 16

    def Prepare(self, emitter, registers, unused_kernel_size):
        """Duplicate quantization offsets to vector registers."""
        emitter.EmitNewline()
        emitter.EmitComment('Quantize::Prepare')

        self.range_min = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('range_min', 'params.range_min'), 4)
        self.range_offset = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('range_offset', 'params.range_offset'), 4)
        self.range_scale = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('range_scale', 'params.range_scale'), 4)

    def Transform(self, emitter, registers, input_address, elements,
                  output_address):
        """Emit quantization inner loop code.

        Each register gets, in order: a VSub with range_min, a VMul with
        range_scale, a VAdd with range_offset and a VCvt to s32. The results
        are then narrowed down to 8 bits and stored.

        Args:
          emitter: assembly emitter.
          registers: register allocator.
          input_address: register holding the input pointer.
          elements: number of elements processed by this step.
          output_address: register holding the output pointer.
        """
        emitter.EmitNewline()
        emitter.EmitComment('Quantize::Transform')

        # One quad register per 4 input floats, rounded up.
        register_count = (elements + 3) // 4
        load = [registers.QuadRegister() for unused_i in range(register_count)]

        emitter.EmitVLoadAE(32, elements, load, input_address, None)
        emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(64))

        for register in load:
            emitter.EmitVSub('f32', register, register, self.range_min)

        for register in load:
            emitter.EmitVMul('f32', register, register, self.range_scale)

        for register in load:
            emitter.EmitVAdd('f32', register, register, self.range_offset)

        for register in load:
            emitter.EmitVCvt('s32', 'f32', register, register)

        # Narrow 32 bit -> 16 bit -> 8 bit, collecting the results towards
        # load[0]. The exact sequence depends on how many registers are in
        # use.
        if len(load) == 1:
            emitter.EmitVQmovn('s32', load[0], load[0])
            emitter.EmitVQmovun('s16', load[0], load[0])
        elif len(load) == 2:
            emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
            emitter.EmitVQmovun('s16', load[0], load[0])
        elif len(load) == 3:
            emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
            emitter.EmitVQmovn('s32', load[2], load[2])
            emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
        elif len(load) == 4:
            emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
            emitter.EmitVQmovn2('s32', load[2], load[2], load[3])
            emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
        else:
            assert False, 'Unsupported register count: %d' % len(load)

        emitter.EmitNewline()
        emitter.EmitVStoreAE(8, elements, load, output_address, None)
        emitter.EmitPld(output_address)
        registers.FreeRegisters(load)


class RequantizeTransformation(object):
    """Emits code that converts int32 values to uint8."""

    def Check(self, in_type, out_type, kernel_size, leftovers):
        """Assert the specialization is supported (int32 -> uint8, 16 wide)."""
        assert in_type == 'int32_t'
        assert out_type == 'uint8_t'
        assert kernel_size == 16
        assert leftovers < 16

    def Prepare(self, emitter, registers, unused_kernel_size):
        """Duplicate quantization parameters to vector registers."""
        emitter.EmitNewline()
        emitter.EmitComment('Requantize::Prepare')

        # Starts out holding input_range_min; output_range_min is subtracted
        # from it by the VSub emitted at the end of this method.
        self.range_min_delta = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('input_range_min',
                                   'params.input_range_min'), 4)
        self.output_range_min = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('output_range_min',
                                   'params.output_range_min'), 4)
        self.input_range_offset = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('input_range_offset',
                                   'params.input_range_offset'), 4)
        self.input_range_scale = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('input_range_scale',
                                   'params.input_range_scale'), 4)
        self.one_over_output_range_scale = _DuplicateGeneralRegister(
            32, emitter, registers,
            registers.MapParameter('one_over_output_range_scale',
                                   'params.one_over_output_range_scale'), 4)

        emitter.EmitVSub('f32', self.range_min_delta, self.range_min_delta,
                         self.output_range_min)

    def Transform(self, emitter, registers, input_address, elements,
                  output_address):
        """Emit requantization inner loop code.

        Each register gets, in order: a VCvt to f32, a VSub with
        input_range_offset, a VMul with input_range_scale, a VAdd with
        range_min_delta, a VMul with one_over_output_range_scale and a VCvt
        back to s32. The results are then narrowed down to 8 bits and stored.

        Args:
          emitter: assembly emitter.
          registers: register allocator.
          input_address: register holding the input pointer.
          elements: number of elements processed by this step.
          output_address: register holding the output pointer.
        """
        emitter.EmitNewline()
        emitter.EmitComment('Requantize::Transform')

        # One quad register per 4 input values, rounded up.
        register_count = (elements + 3) // 4
        load = [registers.QuadRegister() for unused_i in range(register_count)]

        emitter.EmitVLoadAE(32, elements, load, input_address, None)
        emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(64))

        for register in load:
            emitter.EmitVCvt('f32', 's32', register, register)

        for register in load:
            emitter.EmitVSub('f32', register, register,
                             self.input_range_offset)

        for register in load:
            emitter.EmitVMul('f32', register, register, self.input_range_scale)

        for register in load:
            emitter.EmitVAdd('f32', register, register, self.range_min_delta)

        for register in load:
            emitter.EmitVMul('f32', register, register,
                             self.one_over_output_range_scale)

        for register in load:
            emitter.EmitVCvt('s32', 'f32', register, register)

        # Narrow 32 bit -> 16 bit -> 8 bit, collecting the results towards
        # load[0]. The exact sequence depends on how many registers are in
        # use.
        if len(load) == 1:
            emitter.EmitVQmovn('s32', load[0], load[0])
            emitter.EmitVQmovun('s16', load[0], load[0])
        elif len(load) == 2:
            emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
            emitter.EmitVQmovun('s16', load[0], load[0])
        elif len(load) == 3:
            emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
            emitter.EmitVQmovn('s32', load[2], load[2])
            emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
        elif len(load) == 4:
            emitter.EmitVQmovn2('s32', load[0], load[0], load[1])
            emitter.EmitVQmovn2('s32', load[2], load[2], load[3])
            emitter.EmitVQmovun2('s16', load[0], load[0], load[2])
        else:
            assert False, 'Unsupported register count: %d' % len(load)

        emitter.EmitNewline()
        emitter.EmitVStoreAE(8, elements, load, output_address, None)
        emitter.EmitPld(output_address)
        registers.FreeRegisters(load)


class BaseTransform(common.Transform1DKernelGenerator):
    """Kernel generator that wraps a transformation in a main/leftover loop.

    The `transformation` object must provide Check(), Prepare() and
    Transform() as implemented by the *Transformation classes in this file.
    """

    def __init__(self, cc_emitter, kernel_name, asm_emitter, transformation):
        """Store the emitters and the transformation to generate.

        Args:
          cc_emitter: emitter handed to the Transform1DKernelGenerator base.
          kernel_name: kernel name handed to the base class.
          asm_emitter: emitter used for the inline assembly body.
          transformation: object providing Check/Prepare/Transform.
        """
        common.Transform1DKernelGenerator.__init__(self, cc_emitter,
                                                   kernel_name)
        self.asm_emitter = asm_emitter
        self.transformation = transformation

    def EmitTransform(self, in_type, out_type, kernel_size, leftovers):
        """Emit the assembly block for one (kernel_size, leftovers) variant.

        Emits a loop (label 1) processing `kernel_size` elements per
        iteration and, when `leftovers` is non-zero, a trailing step
        (label 2) processing the remaining `leftovers` elements.

        Args:
          in_type: input element type name, validated by the transformation.
          out_type: output element type name, validated by the transformation.
          kernel_size: number of elements processed per main loop iteration.
          leftovers: number of trailing elements handled after the main loop.
        """
        self.transformation.Check(in_type, out_type, kernel_size, leftovers)

        registers = self.asm_emitter.CreateRegisters()

        # The count is decremented by the generated code, so the assembly
        # works on a local copy of params.count.
        self.emitter.EmitDeclare('int', 'params_count_copy', 'params.count')

        self.asm_emitter.PushIndent(self.emitter.indent)
        self.asm_emitter.EmitAsmBegin()

        count = registers.MapOutputParameter('count', 'params_count_copy')
        input_address = registers.MapOutputParameter('input')
        output_address = registers.MapOutputParameter('output')

        self.transformation.Prepare(self.asm_emitter, registers, kernel_size)

        if leftovers:
            self.asm_emitter.EmitNewline()
            self.asm_emitter.EmitComment('Reduce count by leftovers.')
            self.asm_emitter.EmitSubs(
                count, count, self.asm_emitter.ImmediateConstant(leftovers))
            # Nothing but leftovers to process: jump past the main loop.
            self.asm_emitter.EmitBeqFront(2)

        self.asm_emitter.EmitNewline()
        self.asm_emitter.EmitNumericalLabel(1)
        self.asm_emitter.EmitSubs(
            count, count, self.asm_emitter.ImmediateConstant(kernel_size))

        self.transformation.Transform(self.asm_emitter, registers,
                                      input_address, kernel_size,
                                      output_address)

        self.asm_emitter.EmitNewline()
        self.asm_emitter.EmitBneBack(1)

        if leftovers:
            self.asm_emitter.EmitNumericalLabel(2)
            self.asm_emitter.EmitNewline()
            self.asm_emitter.EmitComment('Handle leftovers.')
            self.transformation.Transform(self.asm_emitter, registers,
                                          input_address, leftovers,
                                          output_address)

        self.asm_emitter.EmitAsmEnd(registers)
        self.asm_emitter.PopIndent(len(self.emitter.indent))


class Requantize(BaseTransform):
    """Generator for the 'Requantize' kernel."""

    def __init__(self, cc_emitter, asm_emitter):
        """Bind BaseTransform to a RequantizeTransformation."""
        BaseTransform.__init__(self, cc_emitter, 'Requantize', asm_emitter,
                               RequantizeTransformation())


class Quantize(BaseTransform):
    """Generator for the 'Quantize' kernel."""

    def __init__(self, cc_emitter, asm_emitter):
        """Bind BaseTransform to a QuantizeTransformation."""
        BaseTransform.__init__(self, cc_emitter, 'Quantize', asm_emitter,
                               QuantizeTransformation())


class Dequantize(BaseTransform):
    """Generator for the 'Dequantize' kernel."""

    def __init__(self, cc_emitter, asm_emitter):
        """Bind BaseTransform to a DequantizeTransformation."""
        BaseTransform.__init__(self, cc_emitter, 'Dequantize', asm_emitter,
                               DequantizeTransformation())


class MinMax(BaseTransform):
    """Generator for the 'MinMax<numerical_type>' kernel."""

    def __init__(self, numerical_type, cc_emitter, asm_emitter):
        """Bind BaseTransform to a MinMaxTransformation.

        Args:
          numerical_type: type name substituted into the kernel name.
          cc_emitter: emitter handed to the base class.
          asm_emitter: emitter used for the inline assembly body.
        """
        BaseTransform.__init__(self, cc_emitter,
                               'MinMax<%s>' % numerical_type, asm_emitter,
                               MinMaxTransformation())


class BiasAdd(common.Transform1DKernelGenerator):
    """Generator for the 'BiasAdd<bias_type>' kernel.

    The generated code loops over params.rows rows; for each row it walks
    params.count elements of the input together with params.bias.
    """

    def __init__(self, bias_type, cc_emitter, asm_emitter):
        """Store the assembly emitter and initialise the base generator.

        Args:
          bias_type: type name substituted into the kernel name.
          cc_emitter: emitter handed to the base class.
          asm_emitter: emitter used for the inline assembly body.
        """
        common.Transform1DKernelGenerator.__init__(self, cc_emitter,
                                                   'BiasAdd<%s>' % bias_type)
        self.asm_emitter = asm_emitter

    def EmitTransform(self, in_type, out_type, kernel_size, leftovers):
        """Emit the assembly block for one (kernel_size, leftovers) variant.

        Args:
          in_type: input element type name; must be 'uint8_t'.
          out_type: output element type name; must be 'int32_t'.
          kernel_size: elements per inner loop iteration; must be 16.
          leftovers: trailing elements per row; must be below 16.
        """
        # Compare by value: `is` on str/int literals only works by accident
        # of interning and is flagged with a SyntaxWarning on Python 3.8+.
        assert in_type == 'uint8_t'
        assert out_type == 'int32_t'
        assert kernel_size == 16
        assert leftovers < 16

        registers = self.asm_emitter.CreateRegisters()

        # The row counter is decremented by the generated code, so the
        # assembly works on a local copy of params.rows.
        self.emitter.EmitDeclare('int', 'params_rows_copy', 'params.rows')

        self.asm_emitter.PushIndent(self.emitter.indent)
        self.asm_emitter.EmitAsmBegin()

        self._Prepare(self.asm_emitter, registers)

        rows = registers.MapParameter('rows', 'params_rows_copy')

        self.asm_emitter.EmitNumericalLabel(1)

        self._ProcessRow(self.asm_emitter, registers, kernel_size, leftovers)

        self.asm_emitter.EmitSubs(rows, rows,
                                  self.asm_emitter.ImmediateConstant(1))
        self.asm_emitter.EmitBneBack(1)

        self.asm_emitter.EmitAsmEnd(registers)
        self.asm_emitter.PopIndent(len(self.emitter.indent))

    def _Prepare(self, emitter, registers):
        """Load the range parameters from memory into vector registers."""
        self.input_range_min = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter('input_range_min',
                                         'params.input_range_min'), 8)
        self.input_range_scale = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter('input_range_scale',
                                         'params.input_range_scale'), 8)
        self.bias_range_min = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter('bias_range_min',
                                         'params.bias_range_min'), 8)
        self.bias_range_scale = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter('bias_range_scale',
                                         'params.bias_range_scale'), 8)
        self.output_range_min = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter('output_range_min',
                                         'params.output_range_min'), 8)
        self.one_over_output_range_scale = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter(
                'one_over_output_range_scale',
                'params.one_over_output_range_scale'), 8)
        self.output_range_offset = _DuplicateGeneralMemoryRegister(
            32, emitter, registers,
            registers.MapMemoryParameter('output_range_offset',
                                         'params.output_range_offset'), 8)

    def _ProcessRow(self, emitter, registers, kernel_size, leftovers):
        """Emit the per-row loop: main loop (label 2) plus leftovers (label 3).

        Args:
          emitter: assembly emitter.
          registers: register allocator.
          kernel_size: elements processed per inner loop iteration.
          leftovers: trailing elements handled after the inner loop.
        """
        const_count = registers.MapParameter('count', 'params.count')
        const_bias = registers.MapParameter('bias', 'params.bias')

        count = registers.GeneralRegister()
        bias = registers.GeneralRegister()

        input_address = registers.MapOutputParameter('input')
        output_address = registers.MapOutputParameter('output')

        # Work on copies so the mapped count and bias parameters themselves
        # are not modified by the row code.
        emitter.EmitMov(count, const_count)
        emitter.EmitMov(bias, const_bias)

        if leftovers:
            emitter.EmitSubs(count, count,
                             emitter.ImmediateConstant(leftovers))
            # Nothing but leftovers in this row: jump past the main loop.
            emitter.EmitBeqFront(3)

        emitter.EmitNumericalLabel(2)
        emitter.EmitSubs(count, count, emitter.ImmediateConstant(kernel_size))

        self._BiasAdd(emitter, registers, kernel_size, input_address, bias,
                      output_address)

        emitter.EmitBneBack(2)

        if leftovers:
            emitter.EmitNumericalLabel(3)
            self._BiasAdd(emitter, registers, leftovers, input_address, bias,
                          output_address)

    def _BiasAdd(self, emitter, registers, elements, input_address, bias,
                 output_address):
        """Emit one bias-add step over `elements` elements.

        Input and bias are both loaded as 8-bit data, widened, converted to
        float and brought into their ranges (VMul by scale, VAdd of min).
        They are then summed, and the sum gets a VSub with output_range_min,
        a VMul with one_over_output_range_scale, a VAdd with
        output_range_offset and a VCvt to s32 before being stored.

        Args:
          emitter: assembly emitter.
          registers: register allocator.
          elements: number of elements processed by this step.
          input_address: register holding the input pointer.
          bias: register holding the bias pointer.
          output_address: register holding the output pointer.
        """
        emitter.EmitNewline()
        emitter.EmitComment('BiasAdd::Transform')

        # One quad register per 4 elements, rounded up. Floor division
        # keeps this an int on Python 3 so it can be passed to range().
        register_count = (elements + 3) // 4

        load_input = [
            registers.QuadRegister() for unused_i in range(register_count)
        ]
        load_bias = [
            registers.QuadRegister() for unused_i in range(register_count)
        ]

        emitter.EmitVLoadAE(8, elements, load_input, input_address, None)
        emitter.EmitVLoadAE(8, elements, load_bias, bias, None)
        emitter.EmitPldOffset(input_address, emitter.ImmediateConstant(32))

        # Widen u8 -> 16 bit -> 32 bit for both operands. The exact sequence
        # depends on how many registers are in use.
        if len(load_input) == 1:
            emitter.EmitVMovl('u8', load_input[0], load_input[0])
            emitter.EmitVMovl('u8', load_bias[0], load_bias[0])
            emitter.EmitVMovl('s16', load_input[0], load_input[0])
            emitter.EmitVMovl('s16', load_bias[0], load_bias[0])
        elif len(load_input) == 2:
            emitter.EmitVMovl('u8', load_input[0], load_input[0])
            emitter.EmitVMovl('u8', load_bias[0], load_bias[0])
            emitter.EmitVMovl2('s16', load_input[0], load_input[1],
                               load_input[0])
            emitter.EmitVMovl2('s16', load_bias[0], load_bias[1],
                               load_bias[0])
        elif len(load_input) == 3:
            emitter.EmitVMovl2('u8', load_input[0], load_input[1],
                               load_input[0])
            emitter.EmitVMovl2('u8', load_bias[0], load_bias[1],
                               load_bias[0])
            emitter.EmitVMovl('s16', load_input[2], load_input[1])
            emitter.EmitVMovl('s16', load_bias[2], load_bias[1])
            emitter.EmitVMovl2('s16', load_input[0], load_input[1],
                               load_input[0])
            emitter.EmitVMovl2('s16', load_bias[0], load_bias[1],
                               load_bias[0])
        elif len(load_input) == 4:
            emitter.EmitVMovl2('u8', load_input[0], load_input[1],
                               load_input[0])
            emitter.EmitVMovl2('u8', load_bias[0], load_bias[1],
                               load_bias[0])
            emitter.EmitVMovl2('s16', load_input[2], load_input[3],
                               load_input[1])
            emitter.EmitVMovl2('s16', load_bias[2], load_bias[3],
                               load_bias[1])
            emitter.EmitVMovl2('s16', load_input[0], load_input[1],
                               load_input[0])
            emitter.EmitVMovl2('s16', load_bias[0], load_bias[1],
                               load_bias[0])
        else:
            assert False, 'Unsupported register count: %d' % len(load_input)

        for register in load_input + load_bias:
            emitter.EmitVCvt('f32', 's32', register, register)

        for register in load_input:
            emitter.EmitVMul('f32', register, register, self.input_range_scale)

        for register in load_bias:
            emitter.EmitVMul('f32', register, register, self.bias_range_scale)

        for register in load_input:
            emitter.EmitVAdd('f32', register, register, self.input_range_min)

        for register in load_bias:
            emitter.EmitVAdd('f32', register, register, self.bias_range_min)

        # Accumulate the bias into the input registers.
        for (register_1, register_2) in zip(load_input, load_bias):
            emitter.EmitVAdd('f32', register_1, register_1, register_2)

        for register in load_input:
            emitter.EmitVSub('f32', register, register, self.output_range_min)

        for register in load_input:
            emitter.EmitVMul('f32', register, register,
                             self.one_over_output_range_scale)

        for register in load_input:
            emitter.EmitVAdd('f32', register, register,
                             self.output_range_offset)

        for register in load_input:
            emitter.EmitVCvt('s32', 'f32', register, register)

        emitter.EmitNewline()
        emitter.EmitVStoreAE(32, elements, load_input, output_address, None)
        emitter.EmitPld(output_address)
        registers.FreeRegisters(load_input + load_bias)


def GenerateKernels(cc_emitter, asm_emitter, shapes):
    """Generate the quantization/dequantization/requantization kernels.

    Also generates the MinMax and BiasAdd kernels. Every generator is
    specialized once per entry of `shapes`.

    Args:
      cc_emitter: emitter handed to each kernel generator.
      asm_emitter: emitter used for the inline assembly bodies.
      shapes: iterable of indexable pairs; shape[0] and shape[1] are passed
        on to SpecializeTransform1DKernel (presumably kernel size and
        leftovers -- confirm against common.Transform1DKernelGenerator).
    """
    requantize = Requantize(cc_emitter, asm_emitter)
    quantize = Quantize(cc_emitter, asm_emitter)
    dequantize = Dequantize(cc_emitter, asm_emitter)
    minmax = MinMax('uint8_t', cc_emitter, asm_emitter)
    biasadd = BiasAdd('uint8_t', cc_emitter, asm_emitter)

    for shape in shapes:
        requantize.SpecializeTransform1DKernel('int32_t', 'uint8_t', shape[0],
                                               shape[1])

    for shape in shapes:
        quantize.SpecializeTransform1DKernel('float', 'uint8_t', shape[0],
                                             shape[1])

    for shape in shapes:
        dequantize.SpecializeTransform1DKernel('uint8_t', 'float', shape[0],
                                               shape[1])

    for shape in shapes:
        minmax.SpecializeTransform1DKernel('uint8_t', 'uint8_t', shape[0],
                                           shape[1])

    for shape in shapes:
        biasadd.SpecializeTransform1DKernel('uint8_t', 'int32_t', shape[0],
                                            shape[1])