• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20
21///**
22//******************************************************************************
23//*
24//* @brief :Evaluate best intra chroma mode (among VERT, HORZ and DC)
25//*                and do the prediction.
26//*
27//* @par Description
28//*   This function evaluates the first three intra chroma modes, computes the
29//*   corresponding SADs and returns the buffer predicted with the best mode.
30//*
31//* @param[in] pu1_src
32//*  UWORD8 pointer to the source
33//*
34//** @param[in] pu1_ngbr_pels
35//*  UWORD8 pointer to neighbouring pels
36//*
37//* @param[out] pu1_dst
38//*  UWORD8 pointer to the destination
39//*
40//* @param[in] src_strd
41//*  integer source stride
42//*
43//* @param[in] dst_strd
44//*  integer destination stride
45//*
46//* @param[in] u4_n_avblty
47//* availability of neighbouring pixels
48//*
49//* @param[out] u4_intra_mode
50//* Pointer to the variable in which best mode is returned
51//*
52//* @param[out] pu4_sadmin
53//* Pointer to the variable in which minimum sad is returned
54//*
55//* @param[in] u4_valid_intra_modes
56//* Says what all modes are valid
57//*
58//*
59//* @return      none
60//*
61//******************************************************************************
62//*/
63//
64//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
65//                                      UWORD8 *pu1_ngbr_pels_i16,
66//                                      UWORD8 *pu1_dst,
67//                                      UWORD32 src_strd,
68//                                      UWORD32 dst_strd,
69//                                      WORD32 u4_n_avblty,
70//                                      UWORD32 *u4_intra_mode,
71//                                      WORD32 *pu4_sadmin,
72//                                       UWORD32 u4_valid_intra_modes)
73//
74.text
75.p2align 2
76.include "ih264_neon_macros.s"
77
78.global ih264e_evaluate_intra_chroma_modes_av8
79
ih264e_evaluate_intra_chroma_modes_av8:

//------------------------------------------------------------------------------
// Evaluates the DC, HORZ and VERT intra chroma predictors for one block of
// 8 rows x 16 bytes, computes the SAD of each against the source, writes the
// smallest SAD and the chosen mode through the output pointers and stores the
// winning prediction to pu1_dst.
//
// Register arguments on entry:
//   x0 = pu1_src
//   x1 = pu1_ngbr_pels_i16
//   x2 = pu1_dst
//   x3 = src_strd
//   x4 = dst_strd
//   x5 = u4_n_avblty
//   x6 = u4_intra_mode   (pointer, written)
//   x7 = pu4_sadmin      (pointer, written)
// Stack argument:
//   [sp] on entry = u4_valid_intra_modes (bit 0 = DC, bit 1 = HORZ, bit 2 = VERT)
//
// Neighbour buffer layout as used by this routine:
//   [x1 +  0 .. x1 + 15]  left neighbours, one halfword per row, stored
//                         bottom-to-top (halfword 7 = row 0, halfword 0 = row 7)
//   [x1 + 18 .. x1 + 33]  top neighbours (16 bytes)
//   Each halfword is handled as a byte pair (presumably the interleaved
//   Cb/Cr samples -- confirm against the caller).
//
// Mode codes written to *u4_intra_mode: 0 = DC, 1 = HORZ, 2 = VERT.
//
// Working registers:
//   x14/x15/x17  copies of x6/x7/x4 kept while x4/x6/x7 are reused as scratch
//   x16          u4_valid_intra_modes
//   x8/x9/x10    SAD of VERT / HORZ / DC
//   v28,v29      DC prediction for rows 0-3 (bytes 0-7, bytes 8-15)
//   v30,v31      DC prediction for rows 4-7 (bytes 0-7, bytes 8-15)
//   v10-v15 are used as scratch, which is why push_v_regs/pop_v_regs
//   bracket the body.
//
// No value is returned (the C prototype is void).
//------------------------------------------------------------------------------

    push_v_regs                          // macro from ih264_neon_macros.s
    stp       x19, x20, [sp, #-16]!      // x19/x20 are callee-saved and used as scratch below

    // u4_valid_intra_modes is the 9th argument and therefore lives on the
    // caller's stack. The #80 offset is 16 bytes for x19/x20 plus the space
    // taken by push_v_regs (presumably 64 bytes for d8-d15 -- the macro is
    // defined outside this file). The full 8-byte slot is loaded; only
    // bits 0..2 are examined later.
    ldr       x16, [sp, #80]
    mov       x17, x4                    // keep dst_strd
    // NOTE: the original code also copied x5 into x18 here. x18 is the
    // platform register under AAPCS64 and must not be written by portable
    // code; the copy was never read, so it has been dropped.
    mov       x14, x6                    // keep u4_intra_mode pointer
    mov       x15, x7                    // keep pu4_sadmin pointer

    // Dispatch on bits 0 and 2 of u4_n_avblty (mask 5 is not encodable as a
    // logical immediate, hence the register operand):
    //   0 -> none_available, 1 -> left_only_available,
    //   4 -> top_only_available, otherwise (5) -> all_available.
    mov       x19, #5
    ands      x6, x5, x19
    beq       none_available
    cmp       x6, #1
    beq       left_only_available
    cmp       x6, #4
    beq       top_only_available

all_available:
    // v0 = left rows 7..4, v1 = left rows 3..0, v2 = top bytes 0-7,
    // v3 = top bytes 8-15.
    ld1       {v0.8b, v1.8b}, [x1]
    add       x6, x1, #18
    ld1       {v2.8b, v3.8b}, [x6]

    // Widen to 16 bit, then reduce with 32-bit pairwise adds: each 32-bit lane
    // carries an (even byte, odd byte) pair of 16-bit partial sums. Four
    // bytes of at most 255 are summed per half, so no carry crosses between
    // the halves. After two rounds halfword 0 holds the sum of the even
    // bytes and halfword 1 the sum of the odd bytes.
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s
    addp      v2.4s, v2.4s, v2.4s
    addp      v3.4s, v3.4s, v3.4s

    // Lower-left quadrant: rounded mean of the lower four left rows only.
    rshrn     v5.8b, v0.8h, #2
    dup       v21.8h, v5.h[0]
    // Upper-right quadrant: rounded mean of top bytes 8-15 only.
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]
    // Upper-left quadrant: rounded mean of upper left rows and top bytes 0-7.
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]         // v20 = { upper-left, upper-right }
    // Lower-right quadrant: rounded mean of lower left rows and top bytes 8-15.
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    // Only the low 64 bits of v28-v31 are consumed later (.8b / .2s views).
    mov       v31.d[0], v23.d[0]         // rows 4-7, bytes 8-15
    mov       v28.d[0], v20.d[0]         // rows 0-3, bytes 0-7
    mov       v29.d[0], v20.d[1]         // rows 0-3, bytes 8-15
    mov       v30.d[0], v21.d[0]         // rows 4-7, bytes 0-7
    b         sad_comp

left_only_available:
    // DC from the left column only: the upper four rows feed rows 0-3 and
    // the lower four rows feed rows 4-7; both 8-byte halves of a row share
    // the same value.
    ld1       {v0.8b, v1.8b}, [x1]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2

    dup       v28.8h, v1.h[0]
    dup       v29.8h, v1.h[0]
    dup       v30.8h, v0.h[0]
    dup       v31.8h, v0.h[0]
    b         sad_comp

top_only_available:
    // DC from the top row only: top bytes 0-7 feed the left half of every
    // row and top bytes 8-15 feed the right half.
    add       x6, x1, #18
    ld1       {v0.8b, v1.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    addp      v0.4s, v0.4s, v0.4s
    addp      v1.4s, v1.4s, v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v28.8h, v0.h[0]            // left-half DC
    dup       v30.8h, v1.h[0]            // right-half DC (both 64-bit halves)
    mov       v29.d[0], v30.d[1]         // v29 = right-half DC
    mov       v30.d[0], v28.d[0]         // v30 low = left-half DC, high still right-half
    mov       v31.d[0], v30.d[1]         // v31 = right-half DC
    b         sad_comp

none_available:
    // No neighbours: predict every byte as 128.
    mov       w20, #128
    dup       v28.16b, w20
    dup       v29.16b, w20
    dup       v30.16b, w20
    dup       v31.16b, w20

sad_comp:
    // SAD accumulators (8 x u16 each):
    //   v16/v18 = VERT, v26/v14 = HORZ, v22/v24 = DC
    //   (first register: bytes 0-7 of a row, second: bytes 8-15).
    // The top and left neighbours are read here regardless of the
    // availability bits; u4_valid_intra_modes decides further down whether
    // the resulting SADs are considered.
    add       x6, x1, #18
    ld1       {v10.8b, v11.8b}, [x6]     // VERT predictor: the top row
    ld1       {v27.8h}, [x1]             // HORZ predictors: one halfword per row

    //---- row 0 ----
    dup       v20.8h, v27.h[7]
    dup       v21.8h, v27.h[7]
    ld1       {v0.8b, v1.8b}, [x0], x3
    uabdl     v16.8h, v0.8b, v10.8b      // VERT (uabdl initialises the accumulators)
    uabdl     v18.8h, v1.8b, v11.8b
    uabdl     v26.8h, v0.8b, v20.8b      // HORZ
    uabdl     v14.8h, v1.8b, v21.8b
    ld1       {v2.8b, v3.8b}, [x0], x3   // prefetch row 1
    uabdl     v22.8h, v0.8b, v28.8b      // DC
    uabdl     v24.8h, v1.8b, v29.8b

    //---- row 1 ----
    dup       v20.8h, v27.h[6]
    dup       v21.8h, v27.h[6]
    uabal     v16.8h, v2.8b, v10.8b      // VERT
    uabal     v18.8h, v3.8b, v11.8b
    ld1       {v4.8b, v5.8b}, [x0], x3   // prefetch row 2
    uabal     v26.8h, v2.8b, v20.8b      // HORZ
    uabal     v14.8h, v3.8b, v21.8b
    uabal     v22.8h, v2.8b, v28.8b      // DC
    uabal     v24.8h, v3.8b, v29.8b

    //---- row 2 ----
    dup       v20.8h, v27.h[5]
    dup       v21.8h, v27.h[5]
    uabal     v16.8h, v4.8b, v10.8b      // VERT
    uabal     v18.8h, v5.8b, v11.8b
    ld1       {v6.8b, v7.8b}, [x0], x3   // prefetch row 3
    uabal     v26.8h, v4.8b, v20.8b      // HORZ
    uabal     v14.8h, v5.8b, v21.8b
    uabal     v22.8h, v4.8b, v28.8b      // DC
    uabal     v24.8h, v5.8b, v29.8b

    //---- row 3 ----
    dup       v20.8h, v27.h[4]
    dup       v21.8h, v27.h[4]
    uabal     v16.8h, v6.8b, v10.8b      // VERT
    uabal     v18.8h, v7.8b, v11.8b
    uabal     v26.8h, v6.8b, v20.8b      // HORZ
    uabal     v14.8h, v7.8b, v21.8b
    uabal     v22.8h, v6.8b, v28.8b      // DC
    uabal     v24.8h, v7.8b, v29.8b

    //---- row 4 (DC switches to v30/v31 for the lower four rows) ----
    ld1       {v0.8b, v1.8b}, [x0], x3
    dup       v20.8h, v27.h[3]
    dup       v21.8h, v27.h[3]
    uabal     v16.8h, v0.8b, v10.8b      // VERT
    uabal     v18.8h, v1.8b, v11.8b
    uabal     v26.8h, v0.8b, v20.8b      // HORZ
    uabal     v14.8h, v1.8b, v21.8b
    ld1       {v2.8b, v3.8b}, [x0], x3   // prefetch row 5
    uabal     v22.8h, v0.8b, v30.8b      // DC
    uabal     v24.8h, v1.8b, v31.8b

    //---- row 5 ----
    dup       v20.8h, v27.h[2]
    dup       v21.8h, v27.h[2]
    uabal     v16.8h, v2.8b, v10.8b      // VERT
    uabal     v18.8h, v3.8b, v11.8b
    uabal     v26.8h, v2.8b, v20.8b      // HORZ
    uabal     v14.8h, v3.8b, v21.8b
    ld1       {v4.8b, v5.8b}, [x0], x3   // prefetch row 6
    uabal     v22.8h, v2.8b, v30.8b      // DC
    uabal     v24.8h, v3.8b, v31.8b

    //---- row 6 ----
    dup       v20.8h, v27.h[1]
    dup       v21.8h, v27.h[1]
    uabal     v16.8h, v4.8b, v10.8b      // VERT
    uabal     v18.8h, v5.8b, v11.8b
    uabal     v26.8h, v4.8b, v20.8b      // HORZ
    uabal     v14.8h, v5.8b, v21.8b
    ld1       {v6.8b, v7.8b}, [x0], x3   // prefetch row 7
    uabal     v22.8h, v4.8b, v30.8b      // DC
    uabal     v24.8h, v5.8b, v31.8b

    //---- row 7 ----
    dup       v20.8h, v27.h[0]
    dup       v21.8h, v27.h[0]
    uabal     v16.8h, v6.8b, v10.8b      // VERT
    uabal     v18.8h, v7.8b, v11.8b
    uabal     v26.8h, v6.8b, v20.8b      // HORZ
    uabal     v14.8h, v7.8b, v21.8b
    uabal     v22.8h, v6.8b, v30.8b      // DC
    uabal     v24.8h, v7.8b, v31.8b

    // Horizontal reductions: fold the two accumulators of each mode, fold
    // the upper half onto the lower half, widen to 32 bit and add the final
    // pair. The per-lane totals stay far below 2^16 (at most 8 rows * 255
    // per lane before folding).

    // VERT -> x8
    add       v16.8h, v16.8h, v18.8h
    mov       v18.d[0], v16.d[1]
    add       v16.4h, v16.4h, v18.4h
    uaddlp    v16.2s, v16.4h
    addp      v16.2s, v16.2s, v16.2s
    smov      x8, v16.s[0]

    // HORZ -> x9
    add       v26.8h, v26.8h, v14.8h
    mov       v14.d[0], v26.d[1]
    add       v26.4h, v26.4h, v14.4h
    uaddlp    v26.2s, v26.4h
    addp      v26.2s, v26.2s, v26.2s
    smov      x9, v26.s[0]

    // DC -> x10
    add       v24.8h, v22.8h, v24.8h
    mov       v25.d[0], v24.d[1]
    add       v24.4h, v24.4h, v25.4h
    uaddlp    v24.2s, v24.4h
    addp      v24.2s, v24.2s, v24.2s
    smov      x10, v24.s[0]

    // Replace the SAD of every mode whose bit is clear in
    // u4_valid_intra_modes with the sentinel 1 << 30 so it cannot win
    // against any real SAD.
    mov       x11, #1
    mov       x0, x16                    // u4_valid_intra_modes
    lsl       x11, x11, #30

    ands      x7, x0, #04                // VERT allowed?
    csel      x8, x11, x8, eq

    ands      x6, x0, #02                // HORZ allowed?
    csel      x9, x11, x9, eq

    ands      x6, x0, #01                // DC allowed?
    csel      x10, x11, x10, eq

    // Restore the arguments that were used as scratch above.
    mov       x4, x17                    // dst_strd
    mov       x6, x14                    // u4_intra_mode pointer
    mov       x7, x15                    // pu4_sadmin pointer

    // Mode selection. Ties are resolved in the order DC, HORZ, VERT:
    //   DC   wins when DC <= HORZ and DC <= VERT
    //   HORZ wins when HORZ < DC  and HORZ <= VERT
    //   VERT wins otherwise
    cmp       x10, x9
    bgt       not_dc
    cmp       x10, x8
    bgt       do_vert

    // DC selected: v28-v31 already hold the prediction.
    str       w10, [x7]                  // *pu4_sadmin = DC SAD
    mov       w10, #0
    str       w10, [x6]                  // *u4_intra_mode = 0 (DC)
    b         do_dc_vert

not_dc:
    cmp       x9, x8
    bgt       do_vert

    // HORZ selected: every row is its left-neighbour halfword replicated
    // across the 16 output bytes.
    str       w9, [x7]                   // *pu4_sadmin = HORZ SAD
    mov       w10, #1
    str       w10, [x6]                  // *u4_intra_mode = 1 (HORZ)
    ld1       {v0.8h}, [x1]

    dup       v10.8h, v0.h[7]            // row 0
    dup       v11.8h, v0.h[6]            // row 1
    dup       v12.8h, v0.h[5]            // row 2
    dup       v13.8h, v0.h[4]            // row 3
    st1       {v10.8h}, [x2], x4
    dup       v14.8h, v0.h[3]            // row 4
    st1       {v11.8h}, [x2], x4
    dup       v15.8h, v0.h[2]            // row 5
    st1       {v12.8h}, [x2], x4
    dup       v16.8h, v0.h[1]            // row 6
    st1       {v13.8h}, [x2], x4
    dup       v17.8h, v0.h[0]            // row 7
    st1       {v14.8h}, [x2], x4
    st1       {v15.8h}, [x2], x4
    st1       {v16.8h}, [x2], x4
    st1       {v17.8h}, [x2], x4

    b         end_func

do_vert:
    // VERT selected: load the top row into both register pairs so the
    // store sequence shared with DC can be reused.
    str       w8, [x7]                   // *pu4_sadmin = VERT SAD
    mov       w8, #2
    str       w8, [x6]                   // *u4_intra_mode = 2 (VERT)
    add       x6, x1, #18
    ld1       {v28.8b, v29.8b}, [x6]
    ld1       {v30.8b, v31.8b}, [x6]

do_dc_vert:
    // Rows 0-3 come from v28/v29, rows 4-7 from v30/v31 (16 bytes per row).
    st1       {v28.2s, v29.2s}, [x2], x4 // row 0
    st1       {v28.2s, v29.2s}, [x2], x4 // row 1
    st1       {v28.2s, v29.2s}, [x2], x4 // row 2
    st1       {v28.2s, v29.2s}, [x2], x4 // row 3
    st1       {v30.2s, v31.2s}, [x2], x4 // row 4
    st1       {v30.2s, v31.2s}, [x2], x4 // row 5
    st1       {v30.2s, v31.2s}, [x2], x4 // row 6
    st1       {v30.2s, v31.2s}, [x2], x4 // row 7

end_func:
    // Undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
465
466
467