• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_intra_pred_luma_16x16_av8.s
24//*
25//* @brief
26//*  Contains function definitions for intra 16x16 Luma prediction .
27//*
28//* @author
29//*  Ittiam
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_intra_pred_luma_16x16_mode_vert_av8()
34//*  - ih264_intra_pred_luma_16x16_mode_horz_av8()
35//*  - ih264_intra_pred_luma_16x16_mode_dc_av8()
36//*  - ih264_intra_pred_luma_16x16_mode_plane_av8()
37//*
38//* @remarks
39//*  None
40//*
41//*******************************************************************************
42//*/
43
44///* All the functions here are replicated from ih264_intra_pred_filters.c
45//
46
47///**
48///**
49///**
50//
51
52
53.text
54.p2align 2
55.include "ih264_neon_macros.s"
56.extern ih264_gai1_intrapred_luma_plane_coeffs
57
58
59
60///**
61//*******************************************************************************
62//*
63//*ih264_intra_pred_luma_16x16_mode_vert
64//*
65//* @brief
66//*   Perform Intra prediction for  luma_16x16 mode:vertical
67//*
68//* @par Description:
69//* Perform Intra prediction for  luma_16x16 mode:Vertical ,described in sec 8.3.3.1
70//*
71//* @param[in] pu1_src
72//*  UWORD8 pointer to the source
73//*
74//* @param[out] pu1_dst
75//*  UWORD8 pointer to the destination
76//*
77//* @param[in] src_strd
78//*  integer source stride
79//*
80//* @param[in] dst_strd
81//*  integer destination stride
82//*
83//* @param[in] ui_neighboravailability
84//* availability of neighbouring pixels(Not used in this function)
85//*
86//* @returns
87//*
88//* @remarks
89//*  None
90//*
91//*******************************************************************************
92//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
93//                                        UWORD8 *pu1_dst,
94//                                        WORD32 src_strd,
95//                                        WORD32 dst_strd,
96//                                        WORD32 ui_neighboravailability)
97
98//**************Variables Vs Registers*****************************************
99//    x0 => *pu1_src
100//    x1 => *pu1_dst
101//    x2 =>  src_strd
102//    x3 =>  dst_strd
103//   x4 =>  ui_neighboravailability
104
105
//------------------------------------------------------------------------------
// void ih264_intra_pred_luma_16x16_mode_vert_av8(UWORD8 *pu1_src,
//                                                UWORD8 *pu1_dst,
//                                                WORD32 src_strd,
//                                                WORD32 dst_strd,
//                                                WORD32 ui_neighboravailability)
//
// Intra 16x16 vertical prediction: the 16 bytes at pu1_src + 17 are copied
// unchanged into each of the 16 destination rows.
//
// In:     x0 = pu1_src
//         x1 = pu1_dst
//         w3 = dst_strd (byte step between destination rows)
//         x2 (src_strd) and x4 (ui_neighboravailability) are not read
// Writes: x0, x1, x3, v0, v1, plus whatever push_v_regs / pop_v_regs touch
//         (macros come from ih264_neon_macros.s, not visible in this file)
//------------------------------------------------------------------------------
    .global ih264_intra_pred_luma_16x16_mode_vert_av8

ih264_intra_pred_luma_16x16_mode_vert_av8:

    push_v_regs

    // dst_strd arrives as a 32-bit int; the upper half of x3 is unspecified
    // by the procedure call standard, so widen it before x3 is used as a
    // 64-bit post-index offset.
    sxtw      x3, w3

    add       x0, x0, #17               // x0 = pu1_src + 17
    ld1       {v0.8b, v1.8b}, [x0]      // v0:v1 = 16 reference bytes

    // 16 rows: store the same 16 bytes, then step x1 by dst_strd.
    .rept     16
    st1       {v0.8b, v1.8b}, [x1], x3
    .endr

    pop_v_regs
    ret
135
136
137
138
139
140///******************************************************************************
141
142
143///**
144//*******************************************************************************
145//*
146//*ih264_intra_pred_luma_16x16_mode_horz
147//*
148//* @brief
149//*  Perform Intra prediction for  luma_16x16 mode:horizontal
150//*
151//* @par Description:
152//*  Perform Intra prediction for  luma_16x16 mode:horizontal ,described in sec 8.3.3.2
153//*
154//* @param[in] pu1_src
155//*  UWORD8 pointer to the source
156//*
157//* @param[out] pu1_dst
158//*  UWORD8 pointer to the destination
159//*
160//* @param[in] src_strd
161//*  integer source stride
162//*
163//* @param[in] dst_strd
164//*  integer destination stride
165//*
166//* @param[in] ui_neighboravailability
167//* availability of neighbouring pixels(Not used in this function)
168//*
169//* @returns
170//*
171//* @remarks
172//*  None
173//*
174//*******************************************************************************
175//*/
176//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
177//                                         UWORD8 *pu1_dst,
178//                                         WORD32 src_strd,
179//                                         WORD32 dst_strd,
180//                                         WORD32 ui_neighboravailability)
181//**************Variables Vs Registers*****************************************
182//    x0 => *pu1_src
183//    x1 => *pu1_dst
184//    x2 =>  src_strd
185//    x3 =>  dst_strd
186//   x4 =>  ui_neighboravailability
187
//------------------------------------------------------------------------------
// void ih264_intra_pred_luma_16x16_mode_horz_av8(UWORD8 *pu1_src,
//                                                UWORD8 *pu1_dst,
//                                                WORD32 src_strd,
//                                                WORD32 dst_strd,
//                                                WORD32 ui_neighboravailability)
//
// Intra 16x16 horizontal prediction: destination row r (r = 0..15) is filled
// with 16 copies of the byte pu1_src[15 - r].
//
// In:     x0 = pu1_src
//         x1 = pu1_dst
//         w3 = dst_strd (byte step between destination rows)
//         x2 (src_strd) and x4 (ui_neighboravailability) are not read
// Writes: x1, x3, v0, v10-v25, plus whatever push_v_regs / pop_v_regs touch.
//         v10-v15 are callee-saved (low 64 bits) under AAPCS64; presumably
//         push_v_regs / pop_v_regs preserve them - confirm against
//         ih264_neon_macros.s.
//------------------------------------------------------------------------------
    .global ih264_intra_pred_luma_16x16_mode_horz_av8

ih264_intra_pred_luma_16x16_mode_horz_av8:

    push_v_regs

    // dst_strd arrives as a 32-bit int; the upper half of x3 is unspecified
    // by the procedure call standard, so widen it before x3 is used as a
    // 64-bit post-index offset.
    sxtw      x3, w3

    ld1       {v0.16b}, [x0]            // v0 = pu1_src[0..15]

    // Broadcasts run a few rows ahead of the stores that consume them.
    dup       v10.16b, v0.b[15]         // row 0
    dup       v11.16b, v0.b[14]         // row 1
    dup       v12.16b, v0.b[13]         // row 2
    dup       v13.16b, v0.b[12]         // row 3
    st1       {v10.16b}, [x1], x3
    dup       v14.16b, v0.b[11]         // row 4
    st1       {v11.16b}, [x1], x3
    dup       v15.16b, v0.b[10]         // row 5
    st1       {v12.16b}, [x1], x3
    dup       v16.16b, v0.b[9]          // row 6
    st1       {v13.16b}, [x1], x3
    dup       v17.16b, v0.b[8]          // row 7
    st1       {v14.16b}, [x1], x3
    dup       v18.16b, v0.b[7]          // row 8
    st1       {v15.16b}, [x1], x3
    dup       v19.16b, v0.b[6]          // row 9
    st1       {v16.16b}, [x1], x3
    dup       v20.16b, v0.b[5]          // row 10
    st1       {v17.16b}, [x1], x3
    dup       v21.16b, v0.b[4]          // row 11
    st1       {v18.16b}, [x1], x3
    dup       v22.16b, v0.b[3]          // row 12
    st1       {v19.16b}, [x1], x3
    dup       v23.16b, v0.b[2]          // row 13
    st1       {v20.16b}, [x1], x3
    dup       v24.16b, v0.b[1]          // row 14
    st1       {v21.16b}, [x1], x3
    dup       v25.16b, v0.b[0]          // row 15
    st1       {v22.16b}, [x1], x3
    st1       {v23.16b}, [x1], x3
    st1       {v24.16b}, [x1], x3
    st1       {v25.16b}, [x1], x3

    pop_v_regs
    ret
235
236
237
238
239
240
241
242///******************************************************************************
243
244
245///**
246//*******************************************************************************
247//*
248//*ih264_intra_pred_luma_16x16_mode_dc
249//*
250//* @brief
251//*  Perform Intra prediction for  luma_16x16 mode:DC
252//*
253//* @par Description:
254//*  Perform Intra prediction for  luma_16x16 mode:DC ,described in sec 8.3.3.3
255//*
256//* @param[in] pu1_src
257//*  UWORD8 pointer to the source
258//*
259//* @param[out] pu1_dst
260//*  UWORD8 pointer to the destination
261//*
262//* @param[in] src_strd
263//*  integer source stride
264//*
265//* @param[in] dst_strd
266//*  integer destination stride
267//*
268//* @param[in] ui_neighboravailability
269//*  availability of neighbouring pixels
270//*
271//* @returns
272//*
273//* @remarks
274//*  None
275//*
276//*******************************************************************************/
277//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
278//                                       UWORD8 *pu1_dst,
279//                                       WORD32 src_strd,
280//                                       WORD32 dst_strd,
281//                                       WORD32 ui_neighboravailability)
282
283//**************Variables Vs Registers*****************************************
284//    x0 => *pu1_src
285//    x1 => *pu1_dst
286//    x2 =>  src_strd
287//    x3 =>  dst_strd
288//   x4 =>  ui_neighboravailability
289
//------------------------------------------------------------------------------
// void ih264_intra_pred_luma_16x16_mode_dc_av8(UWORD8 *pu1_src,
//                                              UWORD8 *pu1_dst,
//                                              WORD32 src_strd,
//                                              WORD32 dst_strd,
//                                              WORD32 ui_neighboravailability)
//
// Intra 16x16 DC prediction: every byte of the 16x16 destination block is set
// to one value:
//   - bit 0 (0x01) of ui_neighboravailability set: the 16 bytes at pu1_src
//     (left edge) take part in the average;
//   - bit 2 (0x04) set: the 16 bytes at pu1_src + 17 (top edge) take part;
//   - one edge used  : (sum + 8)  >> 4
//   - both edges used: (sum + 16) >> 5
//   - no edge used   : 128
//
// In:     x0 = pu1_src
//         x1 = pu1_dst
//         w3 = dst_strd (byte step between destination rows)
//         w4 = ui_neighboravailability
//         x2 (src_strd) is not read
// Writes: x1, x3, x6, x10, x11, x15, v0-v3, v10, v20, flags, plus whatever
//         push_v_regs / pop_v_regs touch (macros from ih264_neon_macros.s,
//         not visible in this file).
//------------------------------------------------------------------------------
    .global ih264_intra_pred_luma_16x16_mode_dc_av8

ih264_intra_pred_luma_16x16_mode_dc_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!     // kept from the original; x19/x20 are not written below

    // dst_strd arrives as a 32-bit int; the upper half of x3 is unspecified
    // by the procedure call standard, so widen it before x3 is used as a
    // 64-bit post-index offset.
    sxtw      x3, w3

    sub       v0.16b, v0.16b, v0.16b    // v0 = 0 (left edge contribution)
    sub       v1.16b, v1.16b, v1.16b    // v1 = 0 (top edge contribution)
    mov       w10, #0                   // rounding term: +8 per edge used
    mov       w11 , #3                  // shift amount : 3, +1 per edge used
    ands      x6, x4, #0x01
    beq       top_available             //LEFT NOT AVAILABLE
    ld1       {v0.16b}, [x0]            // v0 = 16 left-edge bytes
    add       w10, w10, #8
    add       w11, w11, #1
top_available:                          // (label marks the top-availability check)
    ands      x6, x4, #0x04
    beq       none_available
    add       x6, x0, #17
    ld1       {v1.16b}, [x6]            // v1 = 16 top-edge bytes
    add       w10, w10, #8
    add       w11, w11, #1
    b         summation
none_available:
    // Top edge is not used. w10 is non-zero only if the left edge was loaded
    // above, so it is the exact "was anything summed?" flag. Testing it
    // (instead of comparing the whole of x4 with 0) avoids depending on the
    // unspecified upper 32 bits of the argument register, and yields 128
    // rather than 0 when only unrelated availability bits are set.
    cbnz      w10, summation
    mov       w15, #128
    dup       v20.16b, w15              // no edge available: predict 128
    b         store
summation:
    uaddl     v2.8h, v0.8b, v1.8b       // widen + add low 8 bytes of both edges
    uaddl2    v3.8h, v0.16b, v1.16b     // widen + add high 8 bytes of both edges
    dup       v10.8h, w10               // rounding term in every lane
    neg       w11, w11
    dup       v20.8h, w11               // negative count => uqshl shifts right
    add       v0.8h, v2.8h, v3.8h
    mov       v1.d[0], v0.d[1]
    add       v0.4h, v0.4h, v1.4h       // fold 8 partial sums into 4
    addp      v0.4h, v0.4h , v0.4h      // 4 -> 2
    addp      v0.4h, v0.4h , v0.4h      // 2 -> 1: lane 0 = sum of all edge bytes
    add       v0.4h, v0.4h, v10.4h      // + rounding
    uqshl     v0.8h, v0.8h, v20.8h      // >> (4 or 5)
    sqxtun    v0.8b, v0.8h
    dup       v20.16b, v0.b[0]          // broadcast the DC value

store:

    // 16 rows: store the DC vector, then step x1 by dst_strd.
    .rept     16
    st1       { v20.16b}, [x1], x3
    .endr

end_func:

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret





///******************************************************************************
364
365
366
367
368
369///******************************************************************************
370
371
372///**
373//*******************************************************************************
374//*
375//*ih264_intra_pred_luma_16x16_mode_plane
376//*
377//* @brief
378//*  Perform Intra prediction for  luma_16x16 mode:PLANE
379//*
380//* @par Description:
381//*  Perform Intra prediction for  luma_16x16 mode:PLANE ,described in sec 8.3.3.4
382//*
383//* @param[in] pu1_src
384//*  UWORD8 pointer to the source
385//*
386//* @param[out] pu1_dst
387//*  UWORD8 pointer to the destination
388//*
389//* @param[in] src_strd
390//*  integer source stride
391//*
392//* @param[in] dst_strd
393//*  integer destination stride
394//*
395//* @param[in] ui_neighboravailability
396//*  availability of neighbouring pixels
397//*
398//* @returns
399//*
400//* @remarks
401//*  None
402//*
403//*******************************************************************************/
404//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
405//                                        UWORD8 *pu1_dst,
406//                                        WORD32 src_strd,
407//                                        WORD32 dst_strd,
408//                                        WORD32 ui_neighboravailability)
409
410//**************Variables Vs Registers*****************************************
411//    x0 => *pu1_src
412//    x1 => *pu1_dst
413//    x2 =>  src_strd
414//    x3 =>  dst_strd
415//   x4 =>  ui_neighboravailability
416
//------------------------------------------------------------------------------
// void ih264_intra_pred_luma_16x16_mode_plane_av8(UWORD8 *pu1_src,
//                                                 UWORD8 *pu1_dst,
//                                                 WORD32 src_strd,
//                                                 WORD32 dst_strd,
//                                                 WORD32 ui_neighboravailability)
//
// Intra 16x16 plane prediction. With s[] = pu1_src[] and t[] = the 16 bytes
// read from ih264_gai1_intrapred_luma_plane_coeffs:
//
//   H   = sum(i = 0..7) t[i] * (s[25 + i] - s[23 - i])      (NEON, v0)
//   V   = sum(k = 1..8) k    * (s[8 - k]  - s[8 + k])       (scalar, x12)
//   i_a = 16 * (s[0] + s[32])
//   i_b = (5 * H + 32) >> 6
//   i_c = (5 * V + 32) >> 6
//
// Row 0 starts from i_a - 8*i_b - 7*i_c + i_b * t[0..15]; every following row
// adds i_c. Each output byte is (value + 16) >> 5, saturated to 0..255.
// NOTE(review): t[] is presumably {1, 2, ..., 16}, which would make the row
// formula i_a + i_b*(x - 7) + i_c*(y - 7); the table is defined elsewhere -
// confirm.
//
// In:     x0 = pu1_src
//         x1 = pu1_dst
//         w3 = dst_strd (byte step between destination rows)
//         x2 (src_strd) and x4 (ui_neighboravailability) are overwritten
//         without being read
// Writes: x0-x10, x12, x14, x20 (saved/restored), v0, v2, v4, v6, v7, v16,
//         v18, v20-v23, v26, v28, v30, plus whatever push_v_regs /
//         pop_v_regs touch (macros from ih264_neon_macros.s, not visible in
//         this file).
//------------------------------------------------------------------------------
    .global ih264_intra_pred_luma_16x16_mode_plane_av8
ih264_intra_pred_luma_16x16_mode_plane_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!     // x20 is used as scratch below

    // dst_strd arrives as a 32-bit int; the upper half of x3 is unspecified
    // by the procedure call standard, so widen it before x3 is used as a
    // 64-bit post-index offset.
    sxtw      x3, w3

    mov       x2, x1                    // x2 = destination row pointer
    add       x1, x0, #17
    add       x0, x0, #15               // x0 = pu1_src + 15
    mov       x8, #9
    sub       x1, x1, #1                // x1 = pu1_src + 16
    mov       x10, x1                   //top_left
    mov       x4, #-1                   // x4 reused as the constant -1
    ld1       {v2.2s}, [x1], x8         // v2 = s[16..23]; x1 = pu1_src + 25

    adrp      x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
    ldr       x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]

    ld1       {v0.2s}, [x1]             // v0 = s[25..32]
    rev64     v2.8b, v2.8b              // v2 = s[23], s[22], ..., s[16]
    ld1       {v6.2s, v7.2s}, [x7]      // v6:v7 = t[0..15]
    usubl     v0.8h, v0.8b, v2.8b       // lane i = s[25 + i] - s[23 - i]
    uxtl      v16.8h, v6.8b             // v16 = t[0..7]  as 16-bit
    mul       v0.8h, v0.8h , v16.8h     // weighted horizontal differences
    uxtl      v18.8h, v7.8b             // v18 = t[8..15] as 16-bit
    add       x7, x0, x4, lsl #3        // x7 = pu1_src + 7 (walks down to s[0])
    sub       x0, x7, x4, lsl #1        // x0 = pu1_src + 9 (walks up to s[15])
    sub       x20, x4, #0x0
    neg       x14, x20                  // x14 = 1; not read again in this function
    addp      v0.8h, v0.8h, v1.8h       // lanes 0-3 = pair sums of v0; lanes 4-7 (from v1) are never used
    ldrb      w8, [x7], #-1             // s[7]
    sxtw      x8, w8
    ldrb      w9, [x0], #1              // s[9]
    sxtw      x9, w9
    saddlp    v0.2s, v0.4h
    sub       x12, x8, x9               // V  = 1 * (s[7] - s[9])
    ldrb      w8, [x7], #-1             // s[6]
    sxtw      x8, w8
    saddlp    v0.1d, v0.2s              // d0 = H
    ldrb      w9, [x0], #1              // s[10]
    sxtw      x9, w9
    sub       x8, x8, x9
    shl       v2.2s, v0.2s, #2
    add       x12, x12, x8, lsl #1      // V += 2 * (s[6] - s[10])
    add       v0.2s, v0.2s , v2.2s      // lane 0 = 5 * H
    ldrb      w8, [x7], #-1             // s[5]
    sxtw      x8, w8
    ldrb      w9, [x0], #1              // s[11]
    sxtw      x9, w9
    srshr     v0.2s, v0.2s, #6          // i_b = D0[0]
    sub       x8, x8, x9
    ldrb      w5, [x7], #-1             // s[4]
    sxtw      x5, w5
    add       x8, x8, x8, lsl #1
    dup       v4.8h, v0.h[0]            // v4 = i_b in every lane
    add       x12, x12, x8              // V += 3 * (s[5] - s[11])
    ldrb      w9, [x0], #1              // s[12]
    sxtw      x9, w9
    mul       v0.8h, v4.8h , v16.8h     // v0 = i_b * t[0..7]
    sub       x5, x5, x9
    mul       v2.8h, v4.8h , v18.8h     // v2 = i_b * t[8..15]
    add       x12, x12, x5, lsl #2      // V += 4 * (s[4] - s[12])
    ldrb      w8, [x7], #-1             // s[3]
    sxtw      x8, w8
    ldrb      w9, [x0], #1              // s[13]
    sxtw      x9, w9
    sub       x8, x8, x9
    ldrb      w5, [x7], #-1             // s[2]
    sxtw      x5, w5
    add       x8, x8, x8, lsl #2
    ldrb      w6, [x0], #1              // s[14]
    sxtw      x6, w6
    add       x12, x12, x8              // V += 5 * (s[3] - s[13])
    ldrb      w8, [x7], #-1             // s[1]
    sxtw      x8, w8
    ldrb      w9, [x0], #1              // s[15]
    sxtw      x9, w9
    sub       x5, x5, x6
    sub       x8, x8, x9
    add       x5, x5, x5, lsl #1
    sub       x20, x8, x8, lsl #3       // x20 = -7 * (s[1] - s[15])
    neg       x8, x20                   // x8  =  7 * (s[1] - s[15])
    add       x12, x12, x5, lsl #1      // V += 6 * (s[2] - s[14])
    ldrb      w5, [x7], #-1             // s[0]
    sxtw      x5, w5
    ldrb      w6, [x10]                 //top_left
    sxtw      x6, w6
    add       x12, x12, x8              // V += 7 * (s[1] - s[15])
    sub       x9, x5, x6
    ldrb      w6, [x1, #7]              // s[32]
    sxtw      x6, w6
    add       x12, x12, x9, lsl #3      // i_c = x12
    add       x8, x5, x6
    add       x12, x12, x12, lsl #2     // 5 * V
    lsl       x8, x8, #4                // i_a = x8
    add       x12, x12, #0x20
    lsr       x12, x12, #6              // only the low 16 bits are consumed by the dup below
    shl       v28.8h, v4.8h, #3         // 8 * i_b
    dup       v6.8h, w12                // v6  = i_c in every lane
    dup       v30.8h, w8                // v30 = i_a in every lane
    shl       v26.8h, v6.8h, #3         // 8 * i_c
    sub       v30.8h, v30.8h , v28.8h
    sub       v30.8h, v30.8h , v26.8h   // i_a - 8*i_b - 8*i_c
    add       v28.8h, v30.8h , v6.8h    // i_a - 8*i_b - 7*i_c
    add       v26.8h, v28.8h , v0.8h    // row 0, columns 0..7
    add       v28.8h, v28.8h , v2.8h    // row 0, columns 8..15

    // 16 rows. Narrowing of the next row is interleaved with the store of the
    // previous one, alternating between v20:v21 and v22:v23.
    sqrshrun  v20.8b, v26.8h, #5
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 0
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 1
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 2
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 3
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 4
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 5
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 6
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 7
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 8
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 9
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 10
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 11
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 12
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 13
    sqrshrun  v21.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 14
    sqrshrun  v23.8b, v28.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 15

end_func_plane:

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
606
607