1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <math.h>
18
19 #include <cstdint>
20
21 #include "RenderScriptToolkit.h"
22 #include "TaskProcessor.h"
23 #include "Utils.h"
24
25 #if defined(ARCH_X86_HAVE_AVX2)
26 #include <stdint.h>
27 #include <x86intrin.h>
28 #include <xmmintrin.h>
29 #endif
30
31 #define LOG_TAG "renderscript.toolkit.Resize"
32
33 namespace android {
34 namespace renderscript {
35
36 class ResizeTask : public Task {
37 const uchar* mIn;
38 uchar* mOut;
39 float mScaleX;
40 float mScaleY;
41 size_t mInputSizeX;
42 size_t mInputSizeY;
43
44 void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
45 void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
46 void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
47 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
48 void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
49 void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
50 void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
51 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
52
53 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
54 virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
55 size_t endY) override;
56
57 public:
ResizeTask(const uchar * input,uchar * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)58 ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY,
59 size_t vectorSize, size_t outputSizeX, size_t outputSizeY,
60 const Restriction* restriction)
61 : Task{outputSizeX, outputSizeY, vectorSize, false, restriction},
62 mIn{input},
63 mOut{output},
64 mInputSizeX{inputSizeX},
65 mInputSizeY{inputSizeY} {
66 mScaleX = static_cast<float>(inputSizeX) / outputSizeX;
67 mScaleY = static_cast<float>(inputSizeY) / outputSizeY;
68 }
69 };
70
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)71 void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
72 size_t endY) {
73 typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t);
74
75 KernelFunction kernel;
76 switch (mVectorSize) {
77 case 4:
78 kernel = &ResizeTask::kernelU4;
79 break;
80 case 3:
81 kernel = &ResizeTask::kernelU4;
82 break;
83 case 2:
84 kernel = &ResizeTask::kernelU2;
85 break;
86 case 1:
87 kernel = &ResizeTask::kernelU1;
88 break;
89 default:
90 ALOGE("Bad vector size %zd", mVectorSize);
91 }
92
93 for (size_t y = startY; y < endY; y++) {
94 size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize);
95 uchar* out = mOut + offset;
96 std::invoke(kernel, this, out, startX, endX, y);
97 }
98 }
99
cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3,float x)100 static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) {
101 return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
102 + x * (3.f * (p1 - p2) + p3 - p0)));
103 }
104
cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3,float x)105 static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
106 return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
107 + x * (3.f * (p1 - p2) + p3 - p0)));
108 }
109
110
#if defined(ARCH_X86_HAVE_AVX2)
/**
 * Scalar cubic through four consecutive samples; returns p1 when x is 0.
 *
 * This variant computes two of the sub-terms with fused multiply-add/subtract intrinsics.
 */
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // 4 * p2 - p3, fused.
    const float fourP2MinusP3 =
            _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
    // 3 * (p1 - p2) + (p3 - p0), fused.
    const float cubicTerm = _mm_cvtss_f32(
            _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));
    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + fourP2MinusP3 + x * cubicTerm));
}
#else
/**
 * Scalar cubic through four consecutive samples; returns p1 when x is 0.
 *
 * The polynomial is evaluated in nested (Horner-like) form.
 */
static float cubicInterpolate(float p0, float p1, float p2, float p3, float x) {
    // Innermost term first; each line is one nesting level of the polynomial.
    const float cubicTerm = 3.f * (p1 - p2) + p3 - p0;
    const float quadraticTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
    const float linearTerm = p2 - p0 + x * quadraticTerm;
    return p1 + 0.5f * x * linearTerm;
}
#endif
126
/**
 * Produces one uchar4 output cell by cubic interpolation over a 4x4 input neighborhood.
 *
 * @param yp0..yp3 The four input rows to sample.
 * @param xf       Horizontal sample position in input columns; its floor selects the columns
 *                 and its fractional part is the horizontal weight.
 * @param yf       Vertical weight, passed straight to the vertical interpolation.
 * @param width    Number of cells in each input row; column indices are clamped against it.
 */
static uchar4 OneBiCubic(const uchar4* yp0, const uchar4* yp1, const uchar4* yp2,
                         const uchar4* yp3, float xf, float yf, int width) {
    const int firstColumn = static_cast<int>(floor(xf - 1));
    const float xFraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left columns are clamped at 0, the two right columns at the last column.
    const int c0 = std::max(0, firstColumn + 0);
    const int c1 = std::max(0, firstColumn + 1);
    const int c2 = std::min(lastColumn, firstColumn + 2);
    const int c3 = std::min(lastColumn, firstColumn + 3);

    // Horizontal pass over a single row.
    auto interpolateRow = [&](const uchar4* row) {
        return cubicInterpolate(convert<float4>(row[c0]), convert<float4>(row[c1]),
                                convert<float4>(row[c2]), convert<float4>(row[c3]), xFraction);
    };

    const float4 row0 = interpolateRow(yp0);
    const float4 row1 = interpolateRow(yp1);
    const float4 row2 = interpolateRow(yp2);
    const float4 row3 = interpolateRow(yp3);

    // Vertical pass, then add 0.5 and clamp to the uchar range before converting back.
    float4 result = cubicInterpolate(row0, row1, row2, row3, yf);
    result = clamp(result + 0.5f, 0.f, 255.f);
    return convert<uchar4>(result);
}
161
/**
 * Produces one uchar2 output cell by cubic interpolation over a 4x4 input neighborhood.
 *
 * @param yp0..yp3 The four input rows to sample.
 * @param xf       Horizontal sample position in input columns; its floor selects the columns
 *                 and its fractional part is the horizontal weight.
 * @param yf       Vertical weight, passed straight to the vertical interpolation.
 * @param width    Number of cells in each input row; column indices are clamped against it.
 */
static uchar2 OneBiCubic(const uchar2* yp0, const uchar2* yp1, const uchar2* yp2,
                         const uchar2* yp3, float xf, float yf, int width) {
    const int firstColumn = static_cast<int>(floor(xf - 1));
    const float xFraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left columns are clamped at 0, the two right columns at the last column.
    const int c0 = std::max(0, firstColumn + 0);
    const int c1 = std::max(0, firstColumn + 1);
    const int c2 = std::min(lastColumn, firstColumn + 2);
    const int c3 = std::min(lastColumn, firstColumn + 3);

    // Horizontal pass over a single row.
    auto interpolateRow = [&](const uchar2* row) {
        return cubicInterpolate(convert<float2>(row[c0]), convert<float2>(row[c1]),
                                convert<float2>(row[c2]), convert<float2>(row[c3]), xFraction);
    };

    const float2 row0 = interpolateRow(yp0);
    const float2 row1 = interpolateRow(yp1);
    const float2 row2 = interpolateRow(yp2);
    const float2 row3 = interpolateRow(yp3);

    // Vertical pass, then add 0.5 and clamp to the uchar range before converting back.
    float2 result = cubicInterpolate(row0, row1, row2, row3, yf);
    result = clamp(result + 0.5f, 0.f, 255.f);
    return convert<uchar2>(result);
}
196
/**
 * Produces one single-channel uchar output cell by cubic interpolation over a 4x4 input
 * neighborhood.
 *
 * @param yp0..yp3 The four input rows to sample.
 * @param xf       Horizontal sample position in input columns; its floor selects the columns
 *                 and its fractional part is the horizontal weight.
 * @param yf       Vertical weight, passed straight to the vertical interpolation.
 * @param width    Number of cells in each input row; column indices are clamped against it.
 */
static uchar OneBiCubic(const uchar* yp0, const uchar* yp1, const uchar* yp2, const uchar* yp3,
                        float xf, float yf, int width) {
    const int firstColumn = static_cast<int>(floor(xf - 1));
    const float xFraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left columns are clamped at 0, the two right columns at the last column.
    const int c0 = std::max(0, firstColumn + 0);
    const int c1 = std::max(0, firstColumn + 1);
    const int c2 = std::min(lastColumn, firstColumn + 2);
    const int c3 = std::min(lastColumn, firstColumn + 3);

    // Horizontal pass over a single row.
    auto interpolateRow = [&](const uchar* row) {
        return cubicInterpolate(static_cast<float>(row[c0]), static_cast<float>(row[c1]),
                                static_cast<float>(row[c2]), static_cast<float>(row[c3]),
                                xFraction);
    };

    const float row0 = interpolateRow(yp0);
    const float row1 = interpolateRow(yp1);
    const float row2 = interpolateRow(yp2);
    const float row3 = interpolateRow(yp3);

    // Vertical pass. Adding 0.5 before the truncating cast rounds to the nearest value.
    float result = cubicInterpolate(row0, row1, row2, row3, yf);
    result = clamp(result + 0.5f, 0.f, 255.f);
    return static_cast<uchar>(result);
}
221
222 extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
223
224 extern "C" void rsdIntrinsicResizeB4_K(
225 uchar4 *dst,
226 size_t count,
227 uint32_t xf,
228 uint32_t xinc,
229 uchar4 const *srcn,
230 uchar4 const *src0,
231 uchar4 const *src1,
232 uchar4 const *src2,
233 size_t xclip,
234 size_t avail,
235 uint64_t osc_ctl,
236 int32_t const *yr);
237
238 extern "C" void rsdIntrinsicResizeB2_K(
239 uchar2 *dst,
240 size_t count,
241 uint32_t xf,
242 uint32_t xinc,
243 uchar2 const *srcn,
244 uchar2 const *src0,
245 uchar2 const *src1,
246 uchar2 const *src2,
247 size_t xclip,
248 size_t avail,
249 uint64_t osc_ctl,
250 int32_t const *yr);
251
252 extern "C" void rsdIntrinsicResizeB1_K(
253 uchar *dst,
254 size_t count,
255 uint32_t xf,
256 uint32_t xinc,
257 uchar const *srcn,
258 uchar const *src0,
259 uchar const *src1,
260 uchar const *src2,
261 size_t xclip,
262 size_t avail,
263 uint64_t osc_ctl,
264 int32_t const *yr);
265
266 #if defined(ARCH_ARM_USE_INTRINSICS)
/**
 * Fills yr[0..3] with the four vertical cubic coefficients for fraction yf, in 16.16 fixed
 * point.
 *
 * yr[0] and yr[3] are stored negated relative to the usual cubic weights (for example, yf = 0.5
 * gives +4096 where the weight is -1/16).
 */
static void mkYCoeff(int32_t* yr, float yf) {
    // yf, yf^2 and yf^3 scaled by 0x10000 (1.0 in 16.16 fixed point).
    const int32_t linear = rint(yf * 0x10000);
    const int32_t squared = rint(yf * yf * 0x10000);
    const int32_t cubed = rint(yf * yf * yf * 0x10000);

    // Unary minus binds tighter than >>, so the negation happens before the halving shift.
    yr[0] = (-(2 * squared - cubed - linear)) >> 1;
    yr[1] = (3 * cubed - 5 * squared + 0x20000) >> 1;
    yr[2] = (-3 * cubed + 4 * squared + linear) >> 1;
    yr[3] = (-(cubed - squared)) >> 1;
}
277 #endif
278
279 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Produces one float4 output cell by cubic interpolation over a 4x4 input neighborhood.
 * The result is returned without clamping.
 *
 * @param yp0..yp3 The four input rows to sample.
 * @param xf       Horizontal sample position in input columns; its floor selects the columns
 *                 and its fractional part is the horizontal weight.
 * @param yf       Vertical weight, passed straight to the vertical interpolation.
 * @param width    Number of cells in each input row; column indices are clamped against it.
 */
static float4 OneBiCubic(const float4* yp0, const float4* yp1, const float4* yp2,
                         const float4* yp3, float xf, float yf, int width) {
    const int firstColumn = static_cast<int>(floor(xf - 1));
    const float xFraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left columns are clamped at 0, the two right columns at the last column.
    const int c0 = std::max(0, firstColumn + 0);
    const int c1 = std::max(0, firstColumn + 1);
    const int c2 = std::min(lastColumn, firstColumn + 2);
    const int c3 = std::min(lastColumn, firstColumn + 3);

    // Horizontal pass over a single row.
    auto interpolateRow = [&](const float4* row) {
        return cubicInterpolate(row[c0], row[c1], row[c2], row[c3], xFraction);
    };

    const float4 row0 = interpolateRow(yp0);
    const float4 row1 = interpolateRow(yp1);
    const float4 row2 = interpolateRow(yp2);
    const float4 row3 = interpolateRow(yp3);

    // Vertical pass.
    return cubicInterpolate(row0, row1, row2, row3, yf);
}
302
/**
 * Produces one float2 output cell by cubic interpolation over a 4x4 input neighborhood.
 * The result is returned without clamping.
 *
 * @param yp0..yp3 The four input rows to sample.
 * @param xf       Horizontal sample position in input columns; its floor selects the columns
 *                 and its fractional part is the horizontal weight.
 * @param yf       Vertical weight, passed straight to the vertical interpolation.
 * @param width    Number of cells in each input row; column indices are clamped against it.
 */
static float2 OneBiCubic(const float2* yp0, const float2* yp1, const float2* yp2,
                         const float2* yp3, float xf, float yf, int width) {
    const int firstColumn = static_cast<int>(floor(xf - 1));
    const float xFraction = xf - floor(xf);
    const int lastColumn = width - 1;

    // The two left columns are clamped at 0, the two right columns at the last column.
    const int c0 = std::max(0, firstColumn + 0);
    const int c1 = std::max(0, firstColumn + 1);
    const int c2 = std::min(lastColumn, firstColumn + 2);
    const int c3 = std::min(lastColumn, firstColumn + 3);

    // Horizontal pass over a single row.
    auto interpolateRow = [&](const float2* row) {
        return cubicInterpolate(row[c0], row[c1], row[c2], row[c3], xFraction);
    };

    const float2 row0 = interpolateRow(yp0);
    const float2 row1 = interpolateRow(yp1);
    const float2 row2 = interpolateRow(yp2);
    const float2 row3 = interpolateRow(yp3);

    // Vertical pass.
    return cubicInterpolate(row0, row1, row2, row3, yf);
}
325
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)326 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
327 float xf, float yf, int width) {
328 int startx = (int) floor(xf - 1);
329 xf = xf - floor(xf);
330 int maxx = width - 1;
331 int xs0 = std::max(0, startx + 0);
332 int xs1 = std::max(0, startx + 1);
333 int xs2 = std::min(maxx, startx + 2);
334 int xs3 = std::min(maxx, startx + 3);
335
336 float p0 = cubicInterpolate(yp0[xs0], yp0[xs1],
337 yp0[xs2], yp0[xs3], xf);
338 float p1 = cubicInterpolate(yp1[xs0], yp1[xs1],
339 yp1[xs2], yp1[xs3], xf);
340 float p2 = cubicInterpolate(yp2[xs0], yp2[xs1],
341 yp2[xs2], yp2[xs3], xf);
342 float p3 = cubicInterpolate(yp3[xs0], yp3[xs1],
343 yp3[xs2], yp3[xs3], xf);
344
345 float p = cubicInterpolate(p0, p1, p2, p3, yf);
346 return p;
347 }
348 #endif
349
kernelU4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)350 void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
351 const uchar *pin = mIn;
352 const int srcHeight = mInputSizeY;
353 const int srcWidth = mInputSizeX;
354 const size_t stride = mInputSizeX * paddedSize(mVectorSize);
355
356
357 #if defined(ARCH_X86_HAVE_AVX2)
358 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
359 _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
360 #else
361 float yf = (currentY + 0.5f) * mScaleY - 0.5f;
362 #endif
363
364
365 int starty = (int) floor(yf - 1);
366 yf = yf - floor(yf);
367 int maxy = srcHeight - 1;
368 int ys0 = std::max(0, starty + 0);
369 int ys1 = std::max(0, starty + 1);
370 int ys2 = std::min(maxy, starty + 2);
371 int ys3 = std::min(maxy, starty + 3);
372
373 const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
374 const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
375 const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
376 const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
377
378 uchar4 *out = ((uchar4 *)outPtr);
379 uint32_t x1 = xstart;
380 uint32_t x2 = xend;
381
382 #if defined(ARCH_ARM_USE_INTRINSICS)
383 if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
384 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
385 long xf16 = rint(xf * 0x10000);
386 uint32_t xinc16 = rint(mScaleX * 0x10000);
387
388 int xoff = (xf16 >> 16) - 1;
389 int xclip = std::max(0, xoff) - xoff;
390 int len = x2 - x1;
391
392 int32_t yr[4];
393 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
394 mkYCoeff(yr, yf);
395
396 xoff += xclip;
397
398 rsdIntrinsicResizeB4_K(
399 out, len,
400 xf16 & 0xffff, xinc16,
401 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
402 xclip, srcWidth - xoff + xclip,
403 osc_ctl, yr);
404 out += len;
405 x1 += len;
406 }
407 #endif
408
409 while(x1 < x2) {
410 #if defined(ARCH_X86_HAVE_AVX2)
411 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
412 _mm_set1_ps(0.5f)));
413 #else
414 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
415 #endif
416 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
417 out++;
418 x1++;
419 }
420 }
421
kernelU2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)422 void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
423 const uchar *pin = mIn;
424 const int srcHeight = mInputSizeY;
425 const int srcWidth = mInputSizeX;
426 const size_t stride = mInputSizeX * mVectorSize;
427
428
429 #if defined(ARCH_X86_HAVE_AVX2)
430 float yf = _mm_cvtss_f32(
431 _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
432 #else
433 float yf = (currentY + 0.5f) * mScaleY - 0.5f;
434 #endif
435
436 int starty = (int) floor(yf - 1);
437 yf = yf - floor(yf);
438 int maxy = srcHeight - 1;
439 int ys0 = std::max(0, starty + 0);
440 int ys1 = std::max(0, starty + 1);
441 int ys2 = std::min(maxy, starty + 2);
442 int ys3 = std::min(maxy, starty + 3);
443
444 const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
445 const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
446 const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
447 const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
448
449 uchar2 *out = ((uchar2 *)outPtr);
450 uint32_t x1 = xstart;
451 uint32_t x2 = xend;
452
453 #if defined(ARCH_ARM_USE_INTRINSICS)
454 if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
455 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
456 long xf16 = rint(xf * 0x10000);
457 uint32_t xinc16 = rint(mScaleX * 0x10000);
458
459 int xoff = (xf16 >> 16) - 1;
460 int xclip = std::max(0, xoff) - xoff;
461 int len = x2 - x1;
462
463 int32_t yr[4];
464 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
465 mkYCoeff(yr, yf);
466
467 xoff += xclip;
468
469 rsdIntrinsicResizeB2_K(
470 out, len,
471 xf16 & 0xffff, xinc16,
472 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
473 xclip, srcWidth - xoff + xclip,
474 osc_ctl, yr);
475 out += len;
476 x1 += len;
477 }
478 #endif
479
480 while(x1 < x2) {
481
482 #if defined(ARCH_X86_HAVE_AVX2)
483 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
484 _mm_set1_ps(0.5f)));
485 #else
486 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
487 #endif
488 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
489 out++;
490 x1++;
491 }
492 }
493
kernelU1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)494 void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
495 //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend);
496 const uchar *pin = mIn;
497 const int srcHeight = mInputSizeY;
498 const int srcWidth = mInputSizeX;
499 const size_t stride = mInputSizeX * mVectorSize;
500
501 // ALOGI("Toolkit ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth,
502 // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr);
503
504 #if defined(ARCH_X86_HAVE_AVX2)
505 float yf = _mm_cvtss_f32(
506 _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
507 #else
508 float yf = (currentY + 0.5f) * mScaleY - 0.5f;
509 #endif
510
511 int starty = (int) floor(yf - 1);
512 yf = yf - floor(yf);
513 int maxy = srcHeight - 1;
514 int ys0 = std::max(0, starty + 0);
515 int ys1 = std::min(maxy, std::max(0, starty + 1));
516 int ys2 = std::min(maxy, starty + 2);
517 int ys3 = std::min(maxy, starty + 3);
518
519 const uchar *yp0 = pin + stride * ys0;
520 const uchar *yp1 = pin + stride * ys1;
521 const uchar *yp2 = pin + stride * ys2;
522 const uchar *yp3 = pin + stride * ys3;
523
524 uchar *out = ((uchar *)outPtr);
525 uint32_t x1 = xstart;
526 uint32_t x2 = xend;
527
528 #if defined(ARCH_ARM_USE_INTRINSICS)
529 if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) {
530 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
531 long xf16 = rint(xf * 0x10000);
532 uint32_t xinc16 = rint(mScaleX * 0x10000);
533
534 int xoff = (xf16 >> 16) - 1;
535 int xclip = std::max(0, xoff) - xoff;
536 int len = x2 - x1;
537
538 int32_t yr[4];
539 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
540 mkYCoeff(yr, yf);
541
542 // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d,
543 // xclip %d, len %d, osc_ctl %lu)",
544 // ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long)
545 // osc_ctl);
546 // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf,
547 // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3
548 // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff);
549
550 xoff += xclip;
551
552 rsdIntrinsicResizeB1_K(
553 out, len,
554 xf16 & 0xffff, xinc16,
555 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
556 xclip, srcWidth - xoff + xclip,
557 osc_ctl, yr);
558 out += len;
559 x1 += len;
560 }
561 #endif
562
563 while(x1 < x2) {
564
565 #if defined(ARCH_X86_HAVE_AVX2)
566 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
567 _mm_set1_ps(0.5f)));
568 #else
569 float xf = (x1 + 0.5f) * mScaleX - 0.5f;
570 #endif
571
572 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
573 out++;
574 x1++;
575 }
576 }
577
578 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
kernelF4(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)579 void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
580 const uchar *pin = mIn;
581 const int srcHeight = inputSizeY;
582 const int srcWidth = inputSizeX;
583 const size_t stride = sizeX * vectorSize;
584
585 #if defined(ARCH_X86_HAVE_AVX2)
586 float yf = _mm_cvtss_f32(
587 _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
588 #else
589 float yf = (currentY + 0.5f) * scaleY - 0.5f;
590 #endif
591
592 int starty = (int) floor(yf - 1);
593 yf = yf - floor(yf);
594 int maxy = srcHeight - 1;
595 int ys0 = std::max(0, starty + 0);
596 int ys1 = std::max(0, starty + 1);
597 int ys2 = std::min(maxy, starty + 2);
598 int ys3 = std::min(maxy, starty + 3);
599
600 const float4 *yp0 = (const float4 *)(pin + stride * ys0);
601 const float4 *yp1 = (const float4 *)(pin + stride * ys1);
602 const float4 *yp2 = (const float4 *)(pin + stride * ys2);
603 const float4 *yp3 = (const float4 *)(pin + stride * ys3);
604
605 float4 *out = ((float4 *)outPtr);
606 uint32_t x1 = xstart;
607 uint32_t x2 = xend;
608
609 while(x1 < x2) {
610
611 #if defined(ARCH_X86_HAVE_AVX2)
612 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
613 _mm_set1_ps(0.5f)));
614 #else
615 float xf = (x1 + 0.5f) * scaleX - 0.5f;
616 #endif
617
618 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
619 out++;
620 x1++;
621 }
622 }
623
kernelF2(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)624 void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
625 const uchar *pin = mIn;
626 const int srcHeight = inputSizeY;
627 const int srcWidth = inputSizeX;
628 const size_t stride = sizeX * vectorSize;
629
630
631 #if defined(ARCH_X86_HAVE_AVX2)
632 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
633 _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
634 #else
635 float yf = (currentY + 0.5f) * scaleY - 0.5f;
636 #endif
637
638 int starty = (int) floor(yf - 1);
639 yf = yf - floor(yf);
640 int maxy = srcHeight - 1;
641 int ys0 = std::max(0, starty + 0);
642 int ys1 = std::max(0, starty + 1);
643 int ys2 = std::min(maxy, starty + 2);
644 int ys3 = std::min(maxy, starty + 3);
645
646 const float2 *yp0 = (const float2 *)(pin + stride * ys0);
647 const float2 *yp1 = (const float2 *)(pin + stride * ys1);
648 const float2 *yp2 = (const float2 *)(pin + stride * ys2);
649 const float2 *yp3 = (const float2 *)(pin + stride * ys3);
650
651 float2 *out = ((float2 *)outPtr);
652 uint32_t x1 = xstart;
653 uint32_t x2 = xend;
654
655 while(x1 < x2) {
656
657 #if defined(ARCH_X86_HAVE_AVX2)
658 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
659 _mm_set1_ps(0.5f)));
660 #else
661 float xf = (x1 + 0.5f) * scaleX - 0.5f;
662 #endif
663
664 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
665 out++;
666 x1++;
667 }
668 }
669
kernelF1(uchar * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)670 void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
671 const uchar *pin = mIn;
672 const int srcHeight = inputSizeY;
673 const int srcWidth = inputSizeX;
674 const size_t stride = sizeX * vectorSize;
675
676
677 #if defined(ARCH_X86_HAVE_AVX2)
678 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f),
679 _mm_set1_ps(scaleY), _mm_set1_ps(0.5f)));
680 #else
681 float yf = (currentY + 0.5f) * scaleY - 0.5f;
682 #endif
683
684 int starty = (int) floor(yf - 1);
685 yf = yf - floor(yf);
686 int maxy = srcHeight - 1;
687 int ys0 = std::max(0, starty + 0);
688 int ys1 = std::max(0, starty + 1);
689 int ys2 = std::min(maxy, starty + 2);
690 int ys3 = std::min(maxy, starty + 3);
691
692 const float *yp0 = (const float *)(pin + stride * ys0);
693 const float *yp1 = (const float *)(pin + stride * ys1);
694 const float *yp2 = (const float *)(pin + stride * ys2);
695 const float *yp3 = (const float *)(pin + stride * ys3);
696
697 float *out = ((float *)outPtr);
698 uint32_t x1 = xstart;
699 uint32_t x2 = xend;
700
701 while(x1 < x2) {
702
703 #if defined(ARCH_X86_HAVE_AVX2)
704 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) ,
705 _mm_set1_ps(0.5f)));
706 #else
707 float xf = (x1 + 0.5f) * scaleX - 0.5f;
708 #endif
709
710 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
711 out++;
712 x1++;
713 }
714 }
715
preLaunch(uint32_t slot,const RsScriptCall * sc)716 void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc)
717 {
718
719 //check the data type to determine F or U.
720 if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
721 switch(mAlloc->getType()->getElement()->getVectorSize()) {
722 case 1:
723 mRootPtr = &kernelU1;
724 break;
725 case 2:
726 mRootPtr = &kernelU2;
727 break;
728 case 3:
729 case 4:
730 mRootPtr = &kernelU4;
731 break;
732 }
733 } else {
734 switch(mAlloc->getType()->getElement()->getVectorSize()) {
735 case 1:
736 mRootPtr = &kernelF1;
737 break;
738 case 2:
739 mRootPtr = &kernelF2;
740 break;
741 case 3:
742 case 4:
743 mRootPtr = &kernelF4;
744 break;
745 }
746 }
747 }
748 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
749
resize(const uint8_t * input,uint8_t * output,size_t inputSizeX,size_t inputSizeY,size_t vectorSize,size_t outputSizeX,size_t outputSizeY,const Restriction * restriction)750 void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX,
751 size_t inputSizeY, size_t vectorSize, size_t outputSizeX,
752 size_t outputSizeY, const Restriction* restriction) {
753 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
754 if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) {
755 return;
756 }
757 if (vectorSize < 1 || vectorSize > 4) {
758 ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
759 return;
760 }
761 #endif
762
763 ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize,
764 outputSizeX, outputSizeY, restriction);
765 processor->doTask(&task);
766 }
767
768 } // namespace renderscript
769 } // namespace android
770