• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* libs/pixelflinger/codeflinger/texturing.cpp
2 **
3 ** Copyright 2006, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17 
18 #include <assert.h>
19 #include <stdint.h>
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <sys/types.h>
23 
24 #include <cutils/log.h>
25 
26 #include "codeflinger/GGLAssembler.h"
27 
28 
29 namespace android {
30 
31 // ---------------------------------------------------------------------------
32 
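// Iterators are initialized at the pixel center. Starting from the
// fixed-point definition, the expression is simplified step by step:
//   ((intToFixedCenter(x) * dx) >> 16) + x0
//   ((((x << 16) + 0x8000) * dx) >> 16) + x0
//   ((((x << 16) * dx) + (0x8000 * dx)) >> 16) + x0
//   ((x * dx) + (dx >> 1)) + x0
//   (x * dx) + ((dx >> 1) + x0)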
39 
init_iterated_color(fragment_parts_t & parts,const reg_t & x)40 void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
41 {
42     context_t const* c = mBuilderContext.c;
43     const needs_t& needs = mBuilderContext.needs;
44 
45     if (mSmooth) {
46         // NOTE: we could take this case in the mDithering + !mSmooth case,
47         // but this would use up to 4 more registers for the color components
48         // for only a little added quality.
49         // Currently, this causes the system to run out of registers in
50         // some case (see issue #719496)
51 
52         comment("compute initial iterated color (smooth and/or dither case)");
53 
54         parts.iterated_packed = 0;
55         parts.packed = 0;
56 
57         // 0x1: color component
58         // 0x2: iterators
59         const int optReload = mOptLevel >> 1;
60         if (optReload >= 3)         parts.reload = 0; // reload nothing
61         else if (optReload == 2)    parts.reload = 2; // reload iterators
62         else if (optReload == 1)    parts.reload = 1; // reload colors
63         else if (optReload <= 0)    parts.reload = 3; // reload both
64 
65         if (!mSmooth) {
66             // we're not smoothing (just dithering), we never have to
67             // reload the iterators
68             parts.reload &= ~2;
69         }
70 
71         Scratch scratches(registerFile());
72         const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
73         const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
74         for (int i=0 ; i<4 ; i++) {
75             if (!mInfo[i].iterated)
76                 continue;
77 
78             // this component exists in the destination and is not replaced
79             // by a texture unit.
80             const int c = (parts.reload & 1) ? t0 : obtainReg();
81             if (i==0) CONTEXT_LOAD(c, iterators.ydady);
82             if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
83             if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
84             if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
85             parts.argb[i].reg = c;
86 
87             if (mInfo[i].smooth) {
88                 parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
89                 const int dvdx = parts.argb_dx[i].reg;
90                 CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
91                 MLA(AL, 0, c, x.reg, dvdx, c);
92 
93                 // adjust the color iterator to make sure it won't overflow
94                 if (!mAA) {
95                     // this is not needed when we're using anti-aliasing
96                     // because we will (have to) clamp the components
97                     // anyway.
98                     int end = scratches.obtain();
99                     MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
100                     MLA(AL, 1, end, dvdx, end, c);
101                     SUB(MI, 0, c, c, end);
102                     BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
103                     scratches.recycle(end);
104                 }
105             }
106 
107             if (parts.reload & 1) {
108                 CONTEXT_STORE(c, generated_vars.argb[i].c);
109             }
110         }
111     } else {
112         // We're not smoothed, so we can
113         // just use a packed version of the color and extract the
114         // components as needed (or not at all if we don't blend)
115 
116         // figure out if we need the iterated color
117         int load = 0;
118         for (int i=0 ; i<4 ; i++) {
119             component_info_t& info = mInfo[i];
120             if ((info.inDest || info.needed) && !info.replaced)
121                 load |= 1;
122         }
123 
124         parts.iterated_packed = 1;
125         parts.packed = (!mTextureMachine.mask && !mBlending
126                 && !mFog && !mDithering);
127         parts.reload = 0;
128         if (load || parts.packed) {
129             if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
130                 comment("load initial iterated color (8888 packed)");
131                 parts.iterated.setTo(obtainReg(),
132                         &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
133                 CONTEXT_LOAD(parts.iterated.reg, packed8888);
134             } else {
135                 comment("load initial iterated color (dest format packed)");
136 
137                 parts.iterated.setTo(obtainReg(), &mCbFormat);
138 
139                 // pre-mask the iterated color
140                 const int bits = parts.iterated.size();
141                 const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
142                 uint32_t mask = 0;
143                 if (mMasking) {
144                     for (int i=0 ; i<4 ; i++) {
145                         const int component_mask = 1<<i;
146                         const int h = parts.iterated.format.c[i].h;
147                         const int l = parts.iterated.format.c[i].l;
148                         if (h && (!(mMasking & component_mask))) {
149                             mask |= ((1<<(h-l))-1) << l;
150                         }
151                     }
152                 }
153 
154                 if (mMasking && ((mask & size)==0)) {
155                     // none of the components are present in the mask
156                 } else {
157                     CONTEXT_LOAD(parts.iterated.reg, packed);
158                     if (mCbFormat.size == 1) {
159                         AND(AL, 0, parts.iterated.reg,
160                                 parts.iterated.reg, imm(0xFF));
161                     } else if (mCbFormat.size == 2) {
162                         MOV(AL, 0, parts.iterated.reg,
163                                 reg_imm(parts.iterated.reg, LSR, 16));
164                     }
165                 }
166 
167                 // pre-mask the iterated color
168                 if (mMasking) {
169                     build_and_immediate(parts.iterated.reg, parts.iterated.reg,
170                             mask, bits);
171                 }
172             }
173         }
174     }
175 }
176 
build_iterated_color(component_t & fragment,const fragment_parts_t & parts,int component,Scratch & regs)177 void GGLAssembler::build_iterated_color(
178         component_t& fragment,
179         const fragment_parts_t& parts,
180         int component,
181         Scratch& regs)
182 {
183     fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);
184 
185     if (!mInfo[component].iterated)
186         return;
187 
188     if (parts.iterated_packed) {
189         // iterated colors are packed, extract the one we need
190         extract(fragment, parts.iterated, component);
191     } else {
192         fragment.h = GGL_COLOR_BITS;
193         fragment.l = GGL_COLOR_BITS - 8;
194         fragment.flags |= CLEAR_LO;
195         // iterated colors are held in their own register,
196         // (smooth and/or dithering case)
197         if (parts.reload==3) {
198             // this implies mSmooth
199             Scratch scratches(registerFile());
200             int dx = scratches.obtain();
201             CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
202             CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
203             ADD(AL, 0, dx, fragment.reg, dx);
204             CONTEXT_STORE(dx, generated_vars.argb[component].c);
205         } else if (parts.reload & 1) {
206             CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
207         } else {
208             // we don't reload, so simply rename the register and mark as
209             // non CORRUPTIBLE so that the texture env or blending code
210             // won't modify this (renamed) register
211             regs.recycle(fragment.reg);
212             fragment.reg = parts.argb[component].reg;
213             fragment.flags &= ~CORRUPTIBLE;
214         }
215         if (mInfo[component].smooth && mAA) {
216             // when using smooth shading AND anti-aliasing, we need to clamp
217             // the iterators because there is always an extra pixel on the
218             // edges, which most of the time will cause an overflow
219             // (since technically its outside of the domain).
220             BIC(AL, 0, fragment.reg, fragment.reg,
221                     reg_imm(fragment.reg, ASR, 31));
222             component_sat(fragment);
223         }
224     }
225 }
226 
227 // ---------------------------------------------------------------------------
228 
decodeLogicOpNeeds(const needs_t & needs)229 void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
230 {
231     // gather some informations about the components we need to process...
232     const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
233     switch(opcode) {
234     case GGL_COPY:
235         mLogicOp = 0;
236         break;
237     case GGL_CLEAR:
238     case GGL_SET:
239         mLogicOp = LOGIC_OP;
240         break;
241     case GGL_AND:
242     case GGL_AND_REVERSE:
243     case GGL_AND_INVERTED:
244     case GGL_XOR:
245     case GGL_OR:
246     case GGL_NOR:
247     case GGL_EQUIV:
248     case GGL_OR_REVERSE:
249     case GGL_OR_INVERTED:
250     case GGL_NAND:
251         mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
252         break;
253     case GGL_NOOP:
254     case GGL_INVERT:
255         mLogicOp = LOGIC_OP|LOGIC_OP_DST;
256         break;
257     case GGL_COPY_INVERTED:
258         mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
259         break;
260     };
261 }
262 
decodeTMUNeeds(const needs_t & needs,context_t const * c)263 void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
264 {
265     uint8_t replaced=0;
266     mTextureMachine.mask = 0;
267     mTextureMachine.activeUnits = 0;
268     for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
269         texture_unit_t& tmu = mTextureMachine.tmu[i];
270         if (replaced == 0xF) {
271             // all components are replaced, skip this TMU.
272             tmu.format_idx = 0;
273             tmu.mask = 0;
274             tmu.replaced = replaced;
275             continue;
276         }
277         tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
278         tmu.format = c->formats[tmu.format_idx];
279         tmu.bits = tmu.format.size*8;
280         tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
281         tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
282         tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
283         tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
284         tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
285                 && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
286 
287         // 5551 linear filtering is not supported
288         if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
289             tmu.linear = 0;
290 
291         tmu.mask = 0;
292         tmu.replaced = replaced;
293 
294         if (tmu.format_idx) {
295             mTextureMachine.activeUnits++;
296             if (tmu.format.c[0].h)    tmu.mask |= 0x1;
297             if (tmu.format.c[1].h)    tmu.mask |= 0x2;
298             if (tmu.format.c[2].h)    tmu.mask |= 0x4;
299             if (tmu.format.c[3].h)    tmu.mask |= 0x8;
300             if (tmu.env == GGL_REPLACE) {
301                 replaced |= tmu.mask;
302             } else if (tmu.env == GGL_DECAL) {
303                 if (!tmu.format.c[GGLFormat::ALPHA].h) {
304                     // if we don't have alpha, decal does nothing
305                     tmu.mask = 0;
306                 } else {
307                     // decal always ignores At
308                     tmu.mask &= ~(1<<GGLFormat::ALPHA);
309                 }
310             }
311         }
312         mTextureMachine.mask |= tmu.mask;
313         //printf("%d: mask=%08lx, replaced=%08lx\n",
314         //    i, int(tmu.mask), int(tmu.replaced));
315     }
316     mTextureMachine.replaced = replaced;
317     mTextureMachine.directTexture = 0;
318     //printf("replaced=%08lx\n", mTextureMachine.replaced);
319 }
320 
321 
init_textures(tex_coord_t * coords,const reg_t & x,const reg_t & y)322 void GGLAssembler::init_textures(
323         tex_coord_t* coords,
324         const reg_t& x, const reg_t& y)
325 {
326     context_t const* c = mBuilderContext.c;
327     const needs_t& needs = mBuilderContext.needs;
328     int Rctx = mBuilderContext.Rctx;
329     int Rx = x.reg;
330     int Ry = y.reg;
331 
332     if (mTextureMachine.mask) {
333         comment("compute texture coordinates");
334     }
335 
336     // init texture coordinates for each tmu
337     const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
338     const bool multiTexture = mTextureMachine.activeUnits > 1;
339     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
340         const texture_unit_t& tmu = mTextureMachine.tmu[i];
341         if (tmu.format_idx == 0)
342             continue;
343         if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
344             (tmu.twrap == GGL_NEEDS_WRAP_11))
345         {
346             // 1:1 texture
347             pointer_t& txPtr = coords[i].ptr;
348             txPtr.setTo(obtainReg(), tmu.bits);
349             CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
350             ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
351             CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
352             ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
353             // merge base & offset
354             CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
355             SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
356             CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
357             base_offset(txPtr, txPtr, Rx);
358         } else {
359             Scratch scratches(registerFile());
360             reg_t& s = coords[i].s;
361             reg_t& t = coords[i].t;
362             // s = (x * dsdx)>>16 + ydsdy
363             // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
364             // t = (x * dtdx)>>16 + ydtdy
365             // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
366             s.setTo(obtainReg());
367             t.setTo(obtainReg());
368             const int need_w = GGL_READ_NEEDS(W, needs.n);
369             if (need_w) {
370                 CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
371                 CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
372             } else {
373                 int ydsdy = scratches.obtain();
374                 int ydtdy = scratches.obtain();
375                 CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
376                 CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
377                 CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
378                 CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
379                 MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
380                 MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
381             }
382 
383             if ((mOptLevel&1)==0) {
384                 CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
385                 CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
386                 recycleReg(s.reg);
387                 recycleReg(t.reg);
388             }
389         }
390 
391         // direct texture?
392         if (!multiTexture && !mBlending && !mDithering && !mFog &&
393             cb_format_idx == tmu.format_idx && !tmu.linear &&
394             mTextureMachine.replaced == tmu.mask)
395         {
396                 mTextureMachine.directTexture = i + 1;
397         }
398     }
399 }
400 
build_textures(fragment_parts_t & parts,Scratch & regs)401 void GGLAssembler::build_textures(  fragment_parts_t& parts,
402                                     Scratch& regs)
403 {
404     context_t const* c = mBuilderContext.c;
405     const needs_t& needs = mBuilderContext.needs;
406     int Rctx = mBuilderContext.Rctx;
407 
408     // We don't have a way to spill registers automatically
409     // spill depth and AA regs, when we know we may have to.
410     // build the spill list...
411     uint32_t spill_list = 0;
412     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
413         const texture_unit_t& tmu = mTextureMachine.tmu[i];
414         if (tmu.format_idx == 0)
415             continue;
416         if (tmu.linear) {
417             // we may run out of register if we have linear filtering
418             // at 1 or 4 bytes / pixel on any texture unit.
419             if (tmu.format.size == 1) {
420                 // if depth and AA enabled, we'll run out of 1 register
421                 if (parts.z.reg > 0 && parts.covPtr.reg > 0)
422                     spill_list |= 1<<parts.covPtr.reg;
423             }
424             if (tmu.format.size == 4) {
425                 // if depth or AA enabled, we'll run out of 1 or 2 registers
426                 if (parts.z.reg > 0)
427                     spill_list |= 1<<parts.z.reg;
428                 if (parts.covPtr.reg > 0)
429                     spill_list |= 1<<parts.covPtr.reg;
430             }
431         }
432     }
433 
434     Spill spill(registerFile(), *this, spill_list);
435 
436     const bool multiTexture = mTextureMachine.activeUnits > 1;
437     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
438         const texture_unit_t& tmu = mTextureMachine.tmu[i];
439         if (tmu.format_idx == 0)
440             continue;
441 
442         pointer_t& txPtr = parts.coords[i].ptr;
443         pixel_t& texel = parts.texel[i];
444 
445         // repeat...
446         if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
447             (tmu.twrap == GGL_NEEDS_WRAP_11))
448         { // 1:1 textures
449             comment("fetch texel");
450             texel.setTo(regs.obtain(), &tmu.format);
451             load(txPtr, texel, WRITE_BACK);
452         } else {
453             Scratch scratches(registerFile());
454             reg_t& s = parts.coords[i].s;
455             reg_t& t = parts.coords[i].t;
456             if ((mOptLevel&1)==0) {
457                 comment("reload s/t (multitexture or linear filtering)");
458                 s.reg = scratches.obtain();
459                 t.reg = scratches.obtain();
460                 CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
461                 CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
462             }
463 
464             comment("compute repeat/clamp");
465             int u       = scratches.obtain();
466             int v       = scratches.obtain();
467             int width   = scratches.obtain();
468             int height  = scratches.obtain();
469             int U = 0;
470             int V = 0;
471 
472             CONTEXT_LOAD(width,  generated_vars.texture[i].width);
473             CONTEXT_LOAD(height, generated_vars.texture[i].height);
474 
475             int FRAC_BITS = 0;
476             if (tmu.linear) {
477                 // linear interpolation
478                 if (tmu.format.size == 1) {
479                     // for 8-bits textures, we can afford
480                     // 7 bits of fractional precision at no
481                     // additional cost (we can't do 8 bits
482                     // because filter8 uses signed 16 bits muls)
483                     FRAC_BITS = 7;
484                 } else if (tmu.format.size == 2) {
485                     // filter16() is internally limited to 4 bits, so:
486                     // FRAC_BITS=2 generates less instructions,
487                     // FRAC_BITS=3,4,5 creates unpleasant artifacts,
488                     // FRAC_BITS=6+ looks good
489                     FRAC_BITS = 6;
490                 } else if (tmu.format.size == 4) {
491                     // filter32() is internally limited to 8 bits, so:
492                     // FRAC_BITS=4 looks good
493                     // FRAC_BITS=5+ looks better, but generates 3 extra ipp
494                     FRAC_BITS = 6;
495                 } else {
496                     // for all other cases we use 4 bits.
497                     FRAC_BITS = 4;
498                 }
499             }
500             wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
501             wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
502 
503             if (tmu.linear) {
504                 comment("compute linear filtering offsets");
505                 // pixel size scale
506                 const int shift = 31 - gglClz(tmu.format.size);
507                 U = scratches.obtain();
508                 V = scratches.obtain();
509 
510                 // sample the texel center
511                 SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
512                 SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
513 
514                 // get the fractionnal part of U,V
515                 AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
516                 AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
517 
518                 // compute width-1 and height-1
519                 SUB(AL, 0, width,  width,  imm(1));
520                 SUB(AL, 0, height, height, imm(1));
521 
522                 // get the integer part of U,V and clamp/wrap
523                 // and compute offset to the next texel
524                 if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
525                     // u has already been REPEATed
526                     MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
527                     MOV(MI, 0, u, width);
528                     CMP(AL, u, width);
529                     MOV(LT, 0, width, imm(1 << shift));
530                     if (shift)
531                         MOV(GE, 0, width, reg_imm(width, LSL, shift));
532                     RSB(GE, 0, width, width, imm(0));
533                 } else {
534                     // u has not been CLAMPed yet
535                     // algorithm:
536                     // if ((u>>4) >= width)
537                     //      u = width<<4
538                     //      width = 0
539                     // else
540                     //      width = 1<<shift
541                     // u = u>>4; // get integer part
542                     // if (u<0)
543                     //      u = 0
544                     //      width = 0
545                     // generated_vars.rt = width
546 
547                     CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
548                     MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
549                     MOV(LE, 0, width, imm(0));
550                     MOV(GT, 0, width, imm(1 << shift));
551                     MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
552                     MOV(MI, 0, u, imm(0));
553                     MOV(MI, 0, width, imm(0));
554                 }
555                 CONTEXT_STORE(width, generated_vars.rt);
556 
557                 const int stride = width;
558                 CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
559                 if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
560                     // v has already been REPEATed
561                     MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
562                     MOV(MI, 0, v, height);
563                     CMP(AL, v, height);
564                     MOV(LT, 0, height, imm(1 << shift));
565                     if (shift)
566                         MOV(GE, 0, height, reg_imm(height, LSL, shift));
567                     RSB(GE, 0, height, height, imm(0));
568                     MUL(AL, 0, height, stride, height);
569                 } else {
570                     // u has not been CLAMPed yet
571                     CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
572                     MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
573                     MOV(LE, 0, height, imm(0));
574                     if (shift) {
575                         MOV(GT, 0, height, reg_imm(stride, LSL, shift));
576                     } else {
577                         MOV(GT, 0, height, stride);
578                     }
579                     MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
580                     MOV(MI, 0, v, imm(0));
581                     MOV(MI, 0, height, imm(0));
582                 }
583                 CONTEXT_STORE(height, generated_vars.lb);
584             }
585 
586             scratches.recycle(width);
587             scratches.recycle(height);
588 
589             // iterate texture coordinates...
590             comment("iterate s,t");
591             int dsdx = scratches.obtain();
592             int dtdx = scratches.obtain();
593             CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
594             CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
595             ADD(AL, 0, s.reg, s.reg, dsdx);
596             ADD(AL, 0, t.reg, t.reg, dtdx);
597             if ((mOptLevel&1)==0) {
598                 CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
599                 CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
600                 scratches.recycle(s.reg);
601                 scratches.recycle(t.reg);
602             }
603             scratches.recycle(dsdx);
604             scratches.recycle(dtdx);
605 
606             // merge base & offset...
607             comment("merge base & offset");
608             texel.setTo(regs.obtain(), &tmu.format);
609             txPtr.setTo(texel.reg, tmu.bits);
610             int stride = scratches.obtain();
611             CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
612             CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data);
613             SMLABB(AL, u, v, stride, u);    // u+v*stride
614             base_offset(txPtr, txPtr, u);
615 
616             // load texel
617             if (!tmu.linear) {
618                 comment("fetch texel");
619                 load(txPtr, texel, 0);
620             } else {
621                 // recycle registers we don't need anymore
622                 scratches.recycle(u);
623                 scratches.recycle(v);
624                 scratches.recycle(stride);
625 
626                 comment("fetch texel, bilinear");
627                 switch (tmu.format.size) {
628                 case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
629                 case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
630                 case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
631                 case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
632                 }
633             }
634         }
635     }
636 }
637 
build_iterate_texture_coordinates(const fragment_parts_t & parts)638 void GGLAssembler::build_iterate_texture_coordinates(
639     const fragment_parts_t& parts)
640 {
641     const bool multiTexture = mTextureMachine.activeUnits > 1;
642     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
643         const texture_unit_t& tmu = mTextureMachine.tmu[i];
644         if (tmu.format_idx == 0)
645             continue;
646 
647         if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
648             (tmu.twrap == GGL_NEEDS_WRAP_11))
649         { // 1:1 textures
650             const pointer_t& txPtr = parts.coords[i].ptr;
651             ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
652         } else {
653             Scratch scratches(registerFile());
654             int s = parts.coords[i].s.reg;
655             int t = parts.coords[i].t.reg;
656             if ((mOptLevel&1)==0) {
657                 s = scratches.obtain();
658                 t = scratches.obtain();
659                 CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
660                 CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
661             }
662             int dsdx = scratches.obtain();
663             int dtdx = scratches.obtain();
664             CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
665             CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
666             ADD(AL, 0, s, s, dsdx);
667             ADD(AL, 0, t, t, dtdx);
668             if ((mOptLevel&1)==0) {
669                 CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
670                 CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
671             }
672         }
673     }
674 }
675 
filter8(const fragment_parts_t & parts,pixel_t & texel,const texture_unit_t & tmu,int U,int V,pointer_t & txPtr,int FRAC_BITS)676 void GGLAssembler::filter8(
677         const fragment_parts_t& parts,
678         pixel_t& texel, const texture_unit_t& tmu,
679         int U, int V, pointer_t& txPtr,
680         int FRAC_BITS)
681 {
682     if (tmu.format.components != GGL_ALPHA &&
683         tmu.format.components != GGL_LUMINANCE)
684     {
685         // this is a packed format, and we don't support
686         // linear filtering (it's probably RGB 332)
687         // Should not happen with OpenGL|ES
688         LDRB(AL, texel.reg, txPtr.reg);
689         return;
690     }
691 
692     // ------------------------
693     // about ~22 cycles / pixel
694     Scratch scratches(registerFile());
695 
696     int pixel= scratches.obtain();
697     int d    = scratches.obtain();
698     int u    = scratches.obtain();
699     int k    = scratches.obtain();
700     int rt   = scratches.obtain();
701     int lb   = scratches.obtain();
702 
703     // RB -> U * V
704 
705     CONTEXT_LOAD(rt, generated_vars.rt);
706     CONTEXT_LOAD(lb, generated_vars.lb);
707 
708     int offset = pixel;
709     ADD(AL, 0, offset, lb, rt);
710     LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
711     SMULBB(AL, u, U, V);
712     SMULBB(AL, d, pixel, u);
713     RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
714 
715     // LB -> (1-U) * V
716     RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
717     LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
718     SMULBB(AL, u, U, V);
719     SMLABB(AL, d, pixel, u, d);
720     SUB(AL, 0, k, k, u);
721 
722     // LT -> (1-U)*(1-V)
723     RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
724     LDRB(AL, pixel, txPtr.reg);
725     SMULBB(AL, u, U, V);
726     SMLABB(AL, d, pixel, u, d);
727 
728     // RT -> U*(1-V)
729     LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
730     SUB(AL, 0, u, k, u);
731     SMLABB(AL, texel.reg, pixel, u, d);
732 
733     for (int i=0 ; i<4 ; i++) {
734         if (!texel.format.c[i].h) continue;
735         texel.format.c[i].h = FRAC_BITS*2+8;
736         texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits in enough
737     }
738     texel.format.size = 4;
739     texel.format.bitsPerPixel = 32;
740     texel.flags |= CLEAR_LO;
741 }
742 
filter16(const fragment_parts_t & parts,pixel_t & texel,const texture_unit_t & tmu,int U,int V,pointer_t & txPtr,int FRAC_BITS)743 void GGLAssembler::filter16(
744         const fragment_parts_t& parts,
745         pixel_t& texel, const texture_unit_t& tmu,
746         int U, int V, pointer_t& txPtr,
747         int FRAC_BITS)
748 {
749     // compute the mask
750     // XXX: it would be nice if the mask below could be computed
751     // automatically.
752     uint32_t mask = 0;
753     int shift = 0;
754     int prec = 0;
755     switch (tmu.format_idx) {
756         case GGL_PIXEL_FORMAT_RGB_565:
757             // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
758             // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
759             mask = 0x07E0F81F;
760             shift = 16;
761             prec = 5;
762             break;
763         case GGL_PIXEL_FORMAT_RGBA_4444:
764             // 0000,1111,0000,1111 | 0000,1111,0000,1111
765             mask = 0x0F0F0F0F;
766             shift = 12;
767             prec = 4;
768             break;
769         case GGL_PIXEL_FORMAT_LA_88:
770             // 0000,0000,1111,1111 | 0000,0000,1111,1111
771             // AALL -> 00AA | 00LL
772             mask = 0x00FF00FF;
773             shift = 8;
774             prec = 8;
775             break;
776         default:
777             // unsupported format, do something sensical...
778             LOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
779             LDRH(AL, texel.reg, txPtr.reg);
780             return;
781     }
782 
783     const int adjust = FRAC_BITS*2 - prec;
784     const int round  = 0;
785 
786     // update the texel format
787     texel.format.size = 4;
788     texel.format.bitsPerPixel = 32;
789     texel.flags |= CLEAR_HI|CLEAR_LO;
790     for (int i=0 ; i<4 ; i++) {
791         if (!texel.format.c[i].h) continue;
792         const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
793         texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
794         texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
795     }
796 
797     // ------------------------
798     // about ~40 cycles / pixel
799     Scratch scratches(registerFile());
800 
801     int pixel= scratches.obtain();
802     int d    = scratches.obtain();
803     int u    = scratches.obtain();
804     int k    = scratches.obtain();
805 
806     // RB -> U * V
807     int offset = pixel;
808     CONTEXT_LOAD(offset, generated_vars.rt);
809     CONTEXT_LOAD(u, generated_vars.lb);
810     ADD(AL, 0, offset, offset, u);
811 
812     LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
813     SMULBB(AL, u, U, V);
814     ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
815     build_and_immediate(pixel, pixel, mask, 32);
816     if (adjust) {
817         if (round)
818             ADD(AL, 0, u, u, imm(1<<(adjust-1)));
819         MOV(AL, 0, u, reg_imm(u, LSR, adjust));
820     }
821     MUL(AL, 0, d, pixel, u);
822     RSB(AL, 0, k, u, imm(1<<prec));
823 
824     // LB -> (1-U) * V
825     CONTEXT_LOAD(offset, generated_vars.lb);
826     RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
827     LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
828     SMULBB(AL, u, U, V);
829     ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
830     build_and_immediate(pixel, pixel, mask, 32);
831     if (adjust) {
832         if (round)
833             ADD(AL, 0, u, u, imm(1<<(adjust-1)));
834         MOV(AL, 0, u, reg_imm(u, LSR, adjust));
835     }
836     MLA(AL, 0, d, pixel, u, d);
837     SUB(AL, 0, k, k, u);
838 
839     // LT -> (1-U)*(1-V)
840     RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
841     LDRH(AL, pixel, txPtr.reg);
842     SMULBB(AL, u, U, V);
843     ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
844     build_and_immediate(pixel, pixel, mask, 32);
845     if (adjust) {
846         if (round)
847             ADD(AL, 0, u, u, imm(1<<(adjust-1)));
848         MOV(AL, 0, u, reg_imm(u, LSR, adjust));
849     }
850     MLA(AL, 0, d, pixel, u, d);
851 
852     // RT -> U*(1-V)
853     CONTEXT_LOAD(offset, generated_vars.rt);
854     LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
855     SUB(AL, 0, u, k, u);
856     ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
857     build_and_immediate(pixel, pixel, mask, 32);
858     MLA(AL, 0, texel.reg, pixel, u, d);
859 }
860 
filter24(const fragment_parts_t & parts,pixel_t & texel,const texture_unit_t & tmu,int U,int V,pointer_t & txPtr,int FRAC_BITS)861 void GGLAssembler::filter24(
862         const fragment_parts_t& parts,
863         pixel_t& texel, const texture_unit_t& tmu,
864         int U, int V, pointer_t& txPtr,
865         int FRAC_BITS)
866 {
867     // not supported yet (currently disabled)
868     load(txPtr, texel, 0);
869 }
870 
filter32(const fragment_parts_t & parts,pixel_t & texel,const texture_unit_t & tmu,int U,int V,pointer_t & txPtr,int FRAC_BITS)871 void GGLAssembler::filter32(
872         const fragment_parts_t& parts,
873         pixel_t& texel, const texture_unit_t& tmu,
874         int U, int V, pointer_t& txPtr,
875         int FRAC_BITS)
876 {
877     const int adjust = FRAC_BITS*2 - 8;
878     const int round  = 0;
879 
880     // ------------------------
881     // about ~38 cycles / pixel
882     Scratch scratches(registerFile());
883 
884     int pixel= scratches.obtain();
885     int dh   = scratches.obtain();
886     int u    = scratches.obtain();
887     int k    = scratches.obtain();
888 
889     int temp = scratches.obtain();
890     int dl   = scratches.obtain();
891     int mask = scratches.obtain();
892 
893     MOV(AL, 0, mask, imm(0xFF));
894     ORR(AL, 0, mask, mask, imm(0xFF0000));
895 
896     // RB -> U * V
897     int offset = pixel;
898     CONTEXT_LOAD(offset, generated_vars.rt);
899     CONTEXT_LOAD(u, generated_vars.lb);
900     ADD(AL, 0, offset, offset, u);
901 
902     LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
903     SMULBB(AL, u, U, V);
904     AND(AL, 0, temp, mask, pixel);
905     if (adjust) {
906         if (round)
907             ADD(AL, 0, u, u, imm(1<<(adjust-1)));
908         MOV(AL, 0, u, reg_imm(u, LSR, adjust));
909     }
910     MUL(AL, 0, dh, temp, u);
911     AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
912     MUL(AL, 0, dl, temp, u);
913     RSB(AL, 0, k, u, imm(0x100));
914 
915     // LB -> (1-U) * V
916     CONTEXT_LOAD(offset, generated_vars.lb);
917     RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
918     LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
919     SMULBB(AL, u, U, V);
920     AND(AL, 0, temp, mask, pixel);
921     if (adjust) {
922         if (round)
923             ADD(AL, 0, u, u, imm(1<<(adjust-1)));
924         MOV(AL, 0, u, reg_imm(u, LSR, adjust));
925     }
926     MLA(AL, 0, dh, temp, u, dh);
927     AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
928     MLA(AL, 0, dl, temp, u, dl);
929     SUB(AL, 0, k, k, u);
930 
931     // LT -> (1-U)*(1-V)
932     RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
933     LDR(AL, pixel, txPtr.reg);
934     SMULBB(AL, u, U, V);
935     AND(AL, 0, temp, mask, pixel);
936     if (adjust) {
937         if (round)
938             ADD(AL, 0, u, u, imm(1<<(adjust-1)));
939         MOV(AL, 0, u, reg_imm(u, LSR, adjust));
940     }
941     MLA(AL, 0, dh, temp, u, dh);
942     AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
943     MLA(AL, 0, dl, temp, u, dl);
944 
945     // RT -> U*(1-V)
946     CONTEXT_LOAD(offset, generated_vars.rt);
947     LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
948     SUB(AL, 0, u, k, u);
949     AND(AL, 0, temp, mask, pixel);
950     MLA(AL, 0, dh, temp, u, dh);
951     AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
952     MLA(AL, 0, dl, temp, u, dl);
953 
954     AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
955     AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
956     ORR(AL, 0, texel.reg, dh, dl);
957 }
958 
build_texture_environment(component_t & fragment,const fragment_parts_t & parts,int component,Scratch & regs)959 void GGLAssembler::build_texture_environment(
960         component_t& fragment,
961         const fragment_parts_t& parts,
962         int component,
963         Scratch& regs)
964 {
965     const uint32_t component_mask = 1<<component;
966     const bool multiTexture = mTextureMachine.activeUnits > 1;
967     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
968         texture_unit_t& tmu = mTextureMachine.tmu[i];
969 
970         if (tmu.mask & component_mask) {
971             // replace or modulate with this texture
972             if ((tmu.replaced & component_mask) == 0) {
973                 // not replaced by a later tmu...
974 
975                 Scratch scratches(registerFile());
976                 pixel_t texel(parts.texel[i]);
977                 if (multiTexture &&
978                     tmu.swrap == GGL_NEEDS_WRAP_11 &&
979                     tmu.twrap == GGL_NEEDS_WRAP_11)
980                 {
981                     texel.reg = scratches.obtain();
982                     texel.flags |= CORRUPTIBLE;
983                     comment("fetch texel (multitexture 1:1)");
984                     load(parts.coords[i].ptr, texel, WRITE_BACK);
985                  }
986 
987                 component_t incoming(fragment);
988                 modify(fragment, regs);
989 
990                 switch (tmu.env) {
991                 case GGL_REPLACE:
992                     extract(fragment, texel, component);
993                     break;
994                 case GGL_MODULATE:
995                     modulate(fragment, incoming, texel, component);
996                     break;
997                 case GGL_DECAL:
998                     decal(fragment, incoming, texel, component);
999                     break;
1000                 case GGL_BLEND:
1001                     blend(fragment, incoming, texel, component, i);
1002                     break;
1003                 case GGL_ADD:
1004                     add(fragment, incoming, texel, component);
1005                     break;
1006                 }
1007             }
1008         }
1009     }
1010 }
1011 
1012 // ---------------------------------------------------------------------------
1013 
wrapping(int d,int coord,int size,int tx_wrap,int tx_linear)1014 void GGLAssembler::wrapping(
1015             int d,
1016             int coord, int size,
1017             int tx_wrap, int tx_linear)
1018 {
1019     // notes:
1020     // if tx_linear is set, we need 4 extra bits of precision on the result
1021     // SMULL/UMULL is 3 cycles
1022     Scratch scratches(registerFile());
1023     int c = coord;
1024     if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
1025         // UMULL takes 4 cycles (interlocked), and we can get away with
1026         // 2 cycles using SMULWB, but we're loosing 16 bits of precision
1027         // out of 32 (this is not a problem because the iterator keeps
1028         // its full precision)
1029         // UMULL(AL, 0, size, d, c, size);
1030         // note: we can't use SMULTB because it's signed.
1031         MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
1032         SMULWB(AL, d, d, size);
1033     } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
1034         if (tx_linear) {
1035             // 1 cycle
1036             MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
1037         } else {
1038             // 4 cycles (common case)
1039             MOV(AL, 0, d, reg_imm(coord, ASR, 16));
1040             BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
1041             CMP(AL, d, size);
1042             SUB(GE, 0, d, size, imm(1));
1043         }
1044     }
1045 }
1046 
1047 // ---------------------------------------------------------------------------
1048 
modulate(component_t & dest,const component_t & incoming,const pixel_t & incomingTexel,int component)1049 void GGLAssembler::modulate(
1050         component_t& dest,
1051         const component_t& incoming,
1052         const pixel_t& incomingTexel, int component)
1053 {
1054     Scratch locals(registerFile());
1055     integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
1056     extract(texel, incomingTexel, component);
1057 
1058     const int Nt = texel.size();
1059         // Nt should always be less than 10 bits because it comes
1060         // from the TMU.
1061 
1062     int Ni = incoming.size();
1063         // Ni could be big because it comes from previous MODULATEs
1064 
1065     if (Nt == 1) {
1066         // texel acts as a bit-mask
1067         // dest = incoming & ((texel << incoming.h)-texel)
1068         RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
1069         AND(AL, 0, dest.reg, dest.reg, incoming.reg);
1070         dest.l = incoming.l;
1071         dest.h = incoming.h;
1072         dest.flags |= (incoming.flags & CLEAR_LO);
1073     } else if (Ni == 1) {
1074         MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
1075         AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
1076         dest.l = 0;
1077         dest.h = Nt;
1078     } else {
1079         int inReg = incoming.reg;
1080         int shift = incoming.l;
1081         if ((Nt + Ni) > 32) {
1082             // we will overflow, reduce the precision of Ni to 8 bits
1083             // (Note Nt cannot be more than 10 bits which happens with
1084             // 565 textures and GGL_LINEAR)
1085             shift += Ni-8;
1086             Ni = 8;
1087         }
1088 
1089         // modulate by the component with the lowest precision
1090         if (Nt >= Ni) {
1091             if (shift) {
1092                 // XXX: we should be able to avoid this shift
1093                 // when shift==16 && Nt<16 && Ni<16, in which
1094                 // we could use SMULBT below.
1095                 MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
1096                 inReg = dest.reg;
1097                 shift = 0;
1098             }
1099             // operation:           (Cf*Ct)/((1<<Ni)-1)
1100             // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
1101             // this operation doesn't change texel's size
1102             ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
1103             if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
1104             else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
1105             dest.l = Ni;
1106             dest.h = Nt + Ni;
1107         } else {
1108             if (shift && (shift != 16)) {
1109                 // if shift==16, we can use 16-bits mul instructions later
1110                 MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
1111                 inReg = dest.reg;
1112                 shift = 0;
1113             }
1114             // operation:           (Cf*Ct)/((1<<Nt)-1)
1115             // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
1116             // this operation doesn't change incoming's size
1117             Scratch scratches(registerFile());
1118             int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
1119             if (t == inReg)
1120                 t = scratches.obtain();
1121             ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
1122             if (Nt<16 && Ni<16) {
1123                 if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
1124                 else            SMULBB(AL, dest.reg, t, inReg);
1125             } else              MUL(AL, 0, dest.reg, t, inReg);
1126             dest.l = Nt;
1127             dest.h = Nt + Ni;
1128         }
1129 
1130         // low bits are not valid
1131         dest.flags |= CLEAR_LO;
1132 
1133         // no need to keep more than 8 bits/component
1134         if (dest.size() > 8)
1135             dest.l = dest.h-8;
1136     }
1137 }
1138 
decal(component_t & dest,const component_t & incoming,const pixel_t & incomingTexel,int component)1139 void GGLAssembler::decal(
1140         component_t& dest,
1141         const component_t& incoming,
1142         const pixel_t& incomingTexel, int component)
1143 {
1144     // RGBA:
1145     // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
1146     // Av = Af
1147     Scratch locals(registerFile());
1148     integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
1149     integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
1150     extract(texel, incomingTexel, component);
1151     extract(factor, incomingTexel, GGLFormat::ALPHA);
1152 
1153     // no need to keep more than 8-bits for decal
1154     int Ni = incoming.size();
1155     int shift = incoming.l;
1156     if (Ni > 8) {
1157         shift += Ni-8;
1158         Ni = 8;
1159     }
1160     integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
1161     if (shift) {
1162         MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
1163         incomingNorm.reg = dest.reg;
1164         incomingNorm.flags |= CORRUPTIBLE;
1165     }
1166     ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
1167     build_blendOneMinusFF(dest, factor, incomingNorm, texel);
1168 }
1169 
blend(component_t & dest,const component_t & incoming,const pixel_t & incomingTexel,int component,int tmu)1170 void GGLAssembler::blend(
1171         component_t& dest,
1172         const component_t& incoming,
1173         const pixel_t& incomingTexel, int component, int tmu)
1174 {
1175     // RGBA:
1176     // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
1177     // Av = At*Af
1178 
1179     if (component == GGLFormat::ALPHA) {
1180         modulate(dest, incoming, incomingTexel, component);
1181         return;
1182     }
1183 
1184     Scratch locals(registerFile());
1185     integer_t color(locals.obtain(), 8, CORRUPTIBLE);
1186     integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
1187     LDRB(AL, color.reg, mBuilderContext.Rctx,
1188             immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
1189     extract(factor, incomingTexel, component);
1190 
1191     // no need to keep more than 8-bits for blend
1192     int Ni = incoming.size();
1193     int shift = incoming.l;
1194     if (Ni > 8) {
1195         shift += Ni-8;
1196         Ni = 8;
1197     }
1198     integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
1199     if (shift) {
1200         MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
1201         incomingNorm.reg = dest.reg;
1202         incomingNorm.flags |= CORRUPTIBLE;
1203     }
1204     ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
1205     build_blendOneMinusFF(dest, factor, incomingNorm, color);
1206 }
1207 
add(component_t & dest,const component_t & incoming,const pixel_t & incomingTexel,int component)1208 void GGLAssembler::add(
1209         component_t& dest,
1210         const component_t& incoming,
1211         const pixel_t& incomingTexel, int component)
1212 {
1213     // RGBA:
1214     // Cv = Cf + Ct;
1215     Scratch locals(registerFile());
1216 
1217     component_t incomingTemp(incoming);
1218 
1219     // use "dest" as a temporary for extracting the texel, unless "dest"
1220     // overlaps "incoming".
1221     integer_t texel(dest.reg, 32, CORRUPTIBLE);
1222     if (dest.reg == incomingTemp.reg)
1223         texel.reg = locals.obtain();
1224     extract(texel, incomingTexel, component);
1225 
1226     if (texel.s < incomingTemp.size()) {
1227         expand(texel, texel, incomingTemp.size());
1228     } else if (texel.s > incomingTemp.size()) {
1229         if (incomingTemp.flags & CORRUPTIBLE) {
1230             expand(incomingTemp, incomingTemp, texel.s);
1231         } else {
1232             incomingTemp.reg = locals.obtain();
1233             expand(incomingTemp, incoming, texel.s);
1234         }
1235     }
1236 
1237     if (incomingTemp.l) {
1238         ADD(AL, 0, dest.reg, texel.reg,
1239                 reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
1240     } else {
1241         ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
1242     }
1243     dest.l = 0;
1244     dest.h = texel.size();
1245     component_sat(dest);
1246 }
1247 
1248 // ----------------------------------------------------------------------------
1249 
1250 }; // namespace android
1251 
1252