• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <math.h>
18 
19 #include <cstdint>
20 
21 #include "RenderScriptToolkit.h"
22 #include "TaskProcessor.h"
23 #include "Utils.h"
24 
25 #if defined(ARCH_X86_HAVE_AVX2)
26 #include <stdint.h>
27 #include <x86intrin.h>
28 #include <xmmintrin.h>
29 #endif
30 
31 #define LOG_TAG "renderscript.toolkit.Resize"
32 
33 namespace android {
34 namespace renderscript {
35 
36 class ResizeTask : public Task {
37     const uchar* mIn;
38     uchar* mOut;
39     float mScaleX;
40     float mScaleY;
41     size_t mInputSizeX;
42     size_t mInputSizeY;
43 
44     void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
45     void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
46     void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
47 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
48     void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
49     void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
50     void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
51 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
52 
53     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
54     virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
55                              size_t endY) override;
56 
57    public:
ResizeTask(const uchar * input,uchar * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)58     ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
59                size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
60                const Restriction* restriction)
61         : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
62           mIn{input},
63           mOut{output},
64           mInputSizeX{inputSizeX},
65           mInputSizeY{inputSizeY} {
66         mScaleX = static_cast<float>(inputSizeX) / outputSizeX;
67         mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
68     }
69 };
70 
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)71 void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
72                              size_t endY) {
73     typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);
74 
75     KernelFunction kernel;
76     switch (mVectorSize) {
77         case 4:
78             kernel = &ResizeTask::kernelU4;
79             break;
80         case 3:
81             kernel = &ResizeTask::kernelU4;
82             break;
83         case 2:
84             kernel = &ResizeTask::kernelU2;
85             break;
86         case 1:
87             kernel = &ResizeTask::kernelU1;
88             break;
89         default:
90             ALOGE("Bad vector size %zd", mVectorSize);
91     }
92 
93     for (size_t y = startY; y < endY; y++) {
94         size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
95         uchar* out = mOut + offset;
96         std::invoke(kernel, this, out, startX, endX, y);
97     }
98 }
99 
cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3,float x)100 static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
101     return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
102             + x * (3.f * (p1 - p2) + p3 - p0)));
103 }
104 
cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3,float x)105 static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
106     return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
107             + x * (3.f * (p1 - p2) + p3 - p0)));
108 }
109 
110 
#if defined(ARCH_X86_HAVE_AVX2)
/**
 * Scalar cubic (Catmull-Rom form) interpolation between p1 and p2 at position x.
 *
 * Two sub-terms are evaluated with fused multiply intrinsics; the remaining arithmetic is
 * ordinary float math, kept in one expression so its evaluation order is unchanged.
 */
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // 4 * p2 - p3, computed as one fused multiply-subtract.
    const float fourP2MinusP3 =
            _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
    // 3 * (p1 - p2) + (p3 - p0), computed as one fused multiply-add.
    const float cubicTerm = _mm_cvtss_f32(
            _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));

    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + fourP2MinusP3 + x * (cubicTerm)));
}
#else
/**
 * Scalar cubic (Catmull-Rom form) interpolation between p1 and p2 at position x.
 *
 * Algebraically the result is p1 for x == 0 and p2 for x == 1. The polynomial is kept as a
 * single nested expression so the order of the floating point operations stays as written.
 */
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    return p1 + 0.5f * x *
                        (p2 - p0 +
                         x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3 +
                              x * (3.f * (p1 - p2) + p3 - p0)));
}
#endif
126 
/**
 * Computes one bicubic output cell from four source rows of uchar4 cells.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position, in source cell coordinates.
 * @param yf        Vertical interpolation position between rows yp1 and yp2.
 * @param width     Number of cells in each source row; column indices are clamped against it.
 * @return The interpolated cell, rounded to nearest and clamped to [0, 255].
 */
static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                         float xf, float yf, int width) {
    // Leftmost column of the four-tap window and the fractional offset inside it.
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int lastColumn = width - 1;
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single row.
    const auto interpolateRow = [&](const uchar4 *row) {
        return cubicInterpolate(convert<float4>(row[col0]), convert<float4>(row[col1]),
                                convert<float4>(row[col2]), convert<float4>(row[col3]), fracX);
    };
    const float4 row0 = interpolateRow(yp0);
    const float4 row1 = interpolateRow(yp1);
    const float4 row2 = interpolateRow(yp2);
    const float4 row3 = interpolateRow(yp3);

    // Vertical pass, then add 0.5 before clamping so the conversion rounds to nearest.
    float4 result = cubicInterpolate(row0, row1, row2, row3, yf);
    result = clamp(result + 0.5f, 0.f, 255.f);
    return convert<uchar4>(result);
}
161 
/**
 * Computes one bicubic output cell from four source rows of uchar2 cells.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position, in source cell coordinates.
 * @param yf        Vertical interpolation position between rows yp1 and yp2.
 * @param width     Number of cells in each source row; column indices are clamped against it.
 * @return The interpolated cell, rounded to nearest and clamped to [0, 255].
 */
static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                         float xf, float yf, int width) {
    // Leftmost column of the four-tap window and the fractional offset inside it.
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int lastColumn = width - 1;
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single row.
    const auto interpolateRow = [&](const uchar2 *row) {
        return cubicInterpolate(convert<float2>(row[col0]), convert<float2>(row[col1]),
                                convert<float2>(row[col2]), convert<float2>(row[col3]), fracX);
    };
    const float2 row0 = interpolateRow(yp0);
    const float2 row1 = interpolateRow(yp1);
    const float2 row2 = interpolateRow(yp2);
    const float2 row3 = interpolateRow(yp3);

    // Vertical pass, then add 0.5 before clamping so the conversion rounds to nearest.
    float2 result = cubicInterpolate(row0, row1, row2, row3, yf);
    result = clamp(result + 0.5f, 0.f, 255.f);
    return convert<uchar2>(result);
}
196 
/**
 * Computes one bicubic output value from four source rows of single byte cells.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position, in source cell coordinates.
 * @param yf        Vertical interpolation position between rows yp1 and yp2.
 * @param width     Number of cells in each source row; column indices are clamped against it.
 * @return The interpolated value, rounded to nearest and clamped to [0, 255].
 */
static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                        float xf, float yf, int width) {
    // Leftmost column of the four-tap window and the fractional offset inside it.
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int lastColumn = width - 1;
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single row.
    const auto interpolateRow = [&](const uchar *row) {
        return cubicInterpolate(static_cast<float>(row[col0]), static_cast<float>(row[col1]),
                                static_cast<float>(row[col2]), static_cast<float>(row[col3]),
                                fracX);
    };
    const float row0 = interpolateRow(yp0);
    const float row1 = interpolateRow(yp1);
    const float row2 = interpolateRow(yp2);
    const float row3 = interpolateRow(yp3);

    // Vertical pass, then add 0.5 before clamping so the truncating cast rounds to nearest.
    float result = cubicInterpolate(row0, row1, row2, row3, yf);
    result = clamp(result + 0.5f, 0.f, 255.f);
    return static_cast<uchar>(result);
}
221 
222 extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
223 
224 extern "C" void rsdIntrinsicResizeB4_K(
225             uchar4 *dst,
226             size_t count,
227             uint32_t xf,
228             uint32_t xinc,
229             uchar4 const *srcn,
230             uchar4 const *src0,
231             uchar4 const *src1,
232             uchar4 const *src2,
233             size_t xclip,
234             size_t avail,
235             uint64_t osc_ctl,
236             int32_t const *yr);
237 
238 extern "C" void rsdIntrinsicResizeB2_K(
239             uchar2 *dst,
240             size_t count,
241             uint32_t xf,
242             uint32_t xinc,
243             uchar2 const *srcn,
244             uchar2 const *src0,
245             uchar2 const *src1,
246             uchar2 const *src2,
247             size_t xclip,
248             size_t avail,
249             uint64_t osc_ctl,
250             int32_t const *yr);
251 
252 extern "C" void rsdIntrinsicResizeB1_K(
253             uchar *dst,
254             size_t count,
255             uint32_t xf,
256             uint32_t xinc,
257             uchar const *srcn,
258             uchar const *src0,
259             uchar const *src1,
260             uchar const *src2,
261             size_t xclip,
262             size_t avail,
263             uint64_t osc_ctl,
264             int32_t const *yr);
265 
266 #if defined(ARCH_ARM_USE_INTRINSICS)
/**
 * Fills yr[0..3] with the four vertical cubic weights for offset yf, in 16.16 fixed point
 * (0x10000 represents 1.0).
 *
 * The weights follow the same polynomial as cubicInterpolate(), except that the two outer
 * ones (yr[0] and yr[3]) are stored with the opposite sign.
 *
 * @param yr  Output array of at least four elements.
 * @param yf  Fractional position between the two middle rows.
 */
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 rounded to fixed point.
    const int32_t t1 = rint(yf * 0x10000);
    const int32_t t2 = rint(yf * yf * 0x10000);
    const int32_t t3 = rint(yf * yf * yf * 0x10000);

    // Each weight is half of an integer polynomial in t1..t3.
    yr[0] = (t3 + t1 - 2 * t2) >> 1;
    yr[1] = (0x20000 + 3 * t3 - 5 * t2) >> 1;
    yr[2] = (t1 + 4 * t2 - 3 * t3) >> 1;
    yr[3] = (t2 - t3) >> 1;
}
277 #endif
278 
279 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Computes one bicubic output cell from four source rows of float4 cells.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position, in source cell coordinates.
 * @param yf        Vertical interpolation position between rows yp1 and yp2.
 * @param width     Number of cells in each source row; column indices are clamped against it.
 * @return The interpolated cell. Unlike the byte variants, no rounding or clamping is applied.
 */
static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
                         float xf, float yf, int width) {
    // Leftmost column of the four-tap window and the fractional offset inside it.
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int lastColumn = width - 1;
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single row.
    const auto interpolateRow = [&](const float4 *row) {
        return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], fracX);
    };
    const float4 row0 = interpolateRow(yp0);
    const float4 row1 = interpolateRow(yp1);
    const float4 row2 = interpolateRow(yp2);
    const float4 row3 = interpolateRow(yp3);

    // Vertical pass.
    return cubicInterpolate(row0, row1, row2, row3, yf);
}
302 
/**
 * Computes one bicubic output cell from four source rows of float2 cells.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position, in source cell coordinates.
 * @param yf        Vertical interpolation position between rows yp1 and yp2.
 * @param width     Number of cells in each source row; column indices are clamped against it.
 * @return The interpolated cell. Unlike the byte variants, no rounding or clamping is applied.
 */
static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
                         float xf, float yf, int width) {
    // Leftmost column of the four-tap window and the fractional offset inside it.
    const int firstTap = static_cast<int>(floor(xf - 1));
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int lastColumn = width - 1;
    const int col0 = std::max(0, firstTap + 0);
    const int col1 = std::max(0, firstTap + 1);
    const int col2 = std::min(lastColumn, firstTap + 2);
    const int col3 = std::min(lastColumn, firstTap + 3);

    // Horizontal pass over a single row.
    const auto interpolateRow = [&](const float2 *row) {
        return cubicInterpolate(row[col0], row[col1], row[col2], row[col3], fracX);
    };
    const float2 row0 = interpolateRow(yp0);
    const float2 row1 = interpolateRow(yp1);
    const float2 row2 = interpolateRow(yp2);
    const float2 row3 = interpolateRow(yp3);

    // Vertical pass.
    return cubicInterpolate(row0, row1, row2, row3, yf);
}
325 
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)326 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
327                         float xf, float yf, int width) {
328     int startx = (int) floor(xf - 1);
329     xf = xf - floor(xf);
330     int maxx = width - 1;
331     int xs0 = std::max(0, startx + 0);
332     int xs1 = std::max(0, startx + 1);
333     int xs2 = std::min(maxx, startx + 2);
334     int xs3 = std::min(maxx, startx + 3);
335 
336     float p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
337                                  yp0[xs2], yp0[xs3], xf);
338     float p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
339                                  yp1[xs2], yp1[xs3], xf);
340     float p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
341                                  yp2[xs2], yp2[xs3], xf);
342     float p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
343                                  yp3[xs2], yp3[xs3], xf);
344 
345     float p  = cubicInterpolate(p0, p1, p2, p3, yf);
346     return p;
347 }
348 #endif
349 
kernelU4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)350 void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
351     const uchar *pin = mIn;
352     const int srcHeight = mInputSizeY;
353     const int srcWidth = mInputSizeX;
354     const size_t stride = mInputSizeX * paddedSize(mVectorSize);
355 
356 
357 #if defined(ARCH_X86_HAVE_AVX2)
358     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
359                                           _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
360 #else
361     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
362 #endif
363 
364 
365     int starty = (int) floor(yf - 1);
366     yf = yf - floor(yf);
367     int maxy = srcHeight - 1;
368     int ys0 = std::max(0, starty + 0);
369     int ys1 = std::max(0, starty + 1);
370     int ys2 = std::min(maxy, starty + 2);
371     int ys3 = std::min(maxy, starty + 3);
372 
373     const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
374     const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
375     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
376     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
377 
378     uchar4 *out = ((uchar4 *)outPtr);
379     uint32_t x1 = xstart;
380     uint32_t x2 = xend;
381 
382 #if defined(ARCH_ARM_USE_INTRINSICS)
383     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
384         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
385         long xf16 = rint(xf * 0x10000);
386         uint32_t xinc16 = rint(mScaleX * 0x10000);
387 
388         int xoff = (xf16 >> 16) - 1;
389         int xclip = std::max(0, xoff) - xoff;
390         int len = x2 - x1;
391 
392         int32_t yr[4];
393         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
394         mkYCoeff(yr, yf);
395 
396         xoff += xclip;
397 
398         rsdIntrinsicResizeB4_K(
399                 out, len,
400                 xf16 & 0xffff, xinc16,
401                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
402                 xclip, srcWidth - xoff + xclip,
403                 osc_ctl, yr);
404         out += len;
405         x1 += len;
406     }
407 #endif
408 
409     while(x1 < x2) {
410 #if defined(ARCH_X86_HAVE_AVX2)
411         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
412                                               _mm_set1_ps(0.5f)));
413 #else
414         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
415 #endif
416         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
417         out++;
418         x1++;
419     }
420 }
421 
kernelU2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)422 void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
423     const uchar *pin = mIn;
424     const int srcHeight = mInputSizeY;
425     const int srcWidth = mInputSizeX;
426     const size_t stride = mInputSizeX * mVectorSize;
427 
428 
429 #if defined(ARCH_X86_HAVE_AVX2)
430     float yf = _mm_cvtss_f32(
431             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
432 #else
433     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
434 #endif
435 
436     int starty = (int) floor(yf - 1);
437     yf = yf - floor(yf);
438     int maxy = srcHeight - 1;
439     int ys0 = std::max(0, starty + 0);
440     int ys1 = std::max(0, starty + 1);
441     int ys2 = std::min(maxy, starty + 2);
442     int ys3 = std::min(maxy, starty + 3);
443 
444     const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
445     const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
446     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
447     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
448 
449     uchar2 *out = ((uchar2 *)outPtr);
450     uint32_t x1 = xstart;
451     uint32_t x2 = xend;
452 
453 #if defined(ARCH_ARM_USE_INTRINSICS)
454     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
455         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
456         long xf16 = rint(xf * 0x10000);
457         uint32_t xinc16 = rint(mScaleX * 0x10000);
458 
459         int xoff = (xf16 >> 16) - 1;
460         int xclip = std::max(0, xoff) - xoff;
461         int len = x2 - x1;
462 
463         int32_t yr[4];
464         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
465         mkYCoeff(yr, yf);
466 
467         xoff += xclip;
468 
469         rsdIntrinsicResizeB2_K(
470                 out, len,
471                 xf16 & 0xffff, xinc16,
472                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
473                 xclip, srcWidth - xoff + xclip,
474                 osc_ctl, yr);
475         out += len;
476         x1 += len;
477     }
478 #endif
479 
480     while(x1 < x2) {
481 
482 #if defined(ARCH_X86_HAVE_AVX2)
483         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
484                                               _mm_set1_ps(0.5f)));
485 #else
486         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
487 #endif
488         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
489         out++;
490         x1++;
491     }
492 }
493 
kernelU1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)494 void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
495     //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
496     const uchar *pin = mIn;
497     const int srcHeight = mInputSizeY;
498     const int srcWidth = mInputSizeX;
499     const size_t stride = mInputSizeX * mVectorSize;
500 
501     // ALOGI("Toolkit   ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
502     // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
503 
504 #if defined(ARCH_X86_HAVE_AVX2)
505     float yf = _mm_cvtss_f32(
506             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
507 #else
508     float yf = (currentY + 0.5f) * mScaleY - 0.5f;
509 #endif
510 
511     int starty = (int) floor(yf - 1);
512     yf = yf - floor(yf);
513     int maxy = srcHeight - 1;
514     int ys0 = std::max(0, starty + 0);
515     int ys1 = std::min(maxy, std::max(0, starty + 1));
516     int ys2 = std::min(maxy, starty + 2);
517     int ys3 = std::min(maxy, starty + 3);
518 
519     const uchar *yp0 = pin + stride * ys0;
520     const uchar *yp1 = pin + stride * ys1;
521     const uchar *yp2 = pin + stride * ys2;
522     const uchar *yp3 = pin + stride * ys3;
523 
524     uchar *out = ((uchar *)outPtr);
525     uint32_t x1 = xstart;
526     uint32_t x2 = xend;
527 
528 #if defined(ARCH_ARM_USE_INTRINSICS)
529     if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
530         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
531         long xf16 = rint(xf * 0x10000);
532         uint32_t xinc16 = rint(mScaleX * 0x10000);
533 
534         int xoff = (xf16 >> 16) - 1;
535         int xclip = std::max(0, xoff) - xoff;
536         int len = x2 - x1;
537 
538         int32_t yr[4];
539         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
540         mkYCoeff(yr, yf);
541 
542         // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
543         // xclip %d, len %d, osc_ctl %lu)",
544         //       ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
545         //       osc_ctl);
546         // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
547         // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
548         // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
549 
550         xoff += xclip;
551 
552         rsdIntrinsicResizeB1_K(
553                 out, len,
554                 xf16 & 0xffff, xinc16,
555                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
556                 xclip, srcWidth - xoff + xclip,
557                 osc_ctl, yr);
558         out += len;
559         x1 += len;
560     }
561 #endif
562 
563     while(x1 < x2) {
564 
565 #if defined(ARCH_X86_HAVE_AVX2)
566         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
567                                               _mm_set1_ps(0.5f)));
568 #else
569         float xf = (x1 + 0.5f) * mScaleX - 0.5f;
570 #endif
571 
572         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
573         out++;
574         x1++;
575     }
576 }
577 
578 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
kernelF4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)579 void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
580     const uchar *pin = mIn;
581     const int srcHeight = inputSizeY;
582     const int srcWidth = inputSizeX;
583     const size_t stride = sizeX * vectorSize;
584 
585 #if defined(ARCH_X86_HAVE_AVX2)
586     float yf = _mm_cvtss_f32(
587             _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
588 #else
589     float yf = (currentY + 0.5f) * scaleY - 0.5f;
590 #endif
591 
592     int starty = (int) floor(yf - 1);
593     yf = yf - floor(yf);
594     int maxy = srcHeight - 1;
595     int ys0 = std::max(0, starty + 0);
596     int ys1 = std::max(0, starty + 1);
597     int ys2 = std::min(maxy, starty + 2);
598     int ys3 = std::min(maxy, starty + 3);
599 
600     const float4 *yp0 = (const float4 *)(pin + stride * ys0);
601     const float4 *yp1 = (const float4 *)(pin + stride * ys1);
602     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
603     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
604 
605     float4 *out = ((float4 *)outPtr);
606     uint32_t x1 = xstart;
607     uint32_t x2 = xend;
608 
609     while(x1 < x2) {
610 
611 #if defined(ARCH_X86_HAVE_AVX2)
612         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
613                                               _mm_set1_ps(0.5f)));
614 #else
615         float xf = (x1 + 0.5f) * scaleX - 0.5f;
616 #endif
617 
618         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
619         out++;
620         x1++;
621     }
622 }
623 
kernelF2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)624 void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
625     const uchar *pin = mIn;
626     const int srcHeight = inputSizeY;
627     const int srcWidth = inputSizeX;
628     const size_t stride = sizeX * vectorSize;
629 
630 
631 #if defined(ARCH_X86_HAVE_AVX2)
632     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
633                                           _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
634 #else
635     float yf = (currentY + 0.5f) * scaleY - 0.5f;
636 #endif
637 
638     int starty = (int) floor(yf - 1);
639     yf = yf - floor(yf);
640     int maxy = srcHeight - 1;
641     int ys0 = std::max(0, starty + 0);
642     int ys1 = std::max(0, starty + 1);
643     int ys2 = std::min(maxy, starty + 2);
644     int ys3 = std::min(maxy, starty + 3);
645 
646     const float2 *yp0 = (const float2 *)(pin + stride * ys0);
647     const float2 *yp1 = (const float2 *)(pin + stride * ys1);
648     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
649     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
650 
651     float2 *out = ((float2 *)outPtr);
652     uint32_t x1 = xstart;
653     uint32_t x2 = xend;
654 
655     while(x1 < x2) {
656 
657 #if defined(ARCH_X86_HAVE_AVX2)
658         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
659                                               _mm_set1_ps(0.5f)));
660 #else
661         float xf = (x1 + 0.5f) * scaleX - 0.5f;
662 #endif
663 
664         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
665         out++;
666         x1++;
667     }
668 }
669 
kernelF1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)670 void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
671     const uchar *pin = mIn;
672     const int srcHeight = inputSizeY;
673     const int srcWidth = inputSizeX;
674     const size_t stride = sizeX * vectorSize;
675 
676 
677 #if defined(ARCH_X86_HAVE_AVX2)
678     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
679                                           _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
680 #else
681     float yf = (currentY + 0.5f) * scaleY - 0.5f;
682 #endif
683 
684     int starty = (int) floor(yf - 1);
685     yf = yf - floor(yf);
686     int maxy = srcHeight - 1;
687     int ys0 = std::max(0, starty + 0);
688     int ys1 = std::max(0, starty + 1);
689     int ys2 = std::min(maxy, starty + 2);
690     int ys3 = std::min(maxy, starty + 3);
691 
692     const float *yp0 = (const float *)(pin + stride * ys0);
693     const float *yp1 = (const float *)(pin + stride * ys1);
694     const float *yp2 = (const float *)(pin + stride * ys2);
695     const float *yp3 = (const float *)(pin + stride * ys3);
696 
697     float *out = ((float *)outPtr);
698     uint32_t x1 = xstart;
699     uint32_t x2 = xend;
700 
701     while(x1 < x2) {
702 
703 #if defined(ARCH_X86_HAVE_AVX2)
704         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
705                                               _mm_set1_ps(0.5f)));
706 #else
707         float xf = (x1 + 0.5f) * scaleX - 0.5f;
708 #endif
709 
710         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
711         out++;
712         x1++;
713     }
714 }
715 
preLaunch(uint32_t slot,const RsScriptCall * sc)716 void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
717 {
718 
719     //check the data type to determine F or U.
720     if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
721         switch(mAlloc->getType()->getElement()->getVectorSize()) {
722         case 1:
723             mRootPtr = &kernelU1;
724             break;
725         case 2:
726             mRootPtr = &kernelU2;
727             break;
728         case 3:
729         case 4:
730             mRootPtr = &kernelU4;
731             break;
732         }
733     } else {
734         switch(mAlloc->getType()->getElement()->getVectorSize()) {
735         case 1:
736             mRootPtr = &kernelF1;
737             break;
738         case 2:
739             mRootPtr = &kernelF2;
740             break;
741         case 3:
742         case 4:
743             mRootPtr = &kernelF4;
744             break;
745         }
746     }
747 }
748 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
749 
resize(const uint8_t * input,uint8_t * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)750 void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
751                                  size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
752                                  size_t outputSizeY, const Restriction* restriction) {
753 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
754     if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
755         return;
756     }
757     if (vectorSize < 1 || vectorSize > 4) {
758         ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
759         return;
760     }
761 #endif
762 
763     ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
764                     outputSizeX, outputSizeY, restriction);
765     processor->doTask(&task);
766 }
767 
768 }  // namespace renderscript
769 }  // namespace android
770