• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20
21@/*
22@//----------------------------------------------------------------------------
23@// File Name            : impeg2_inter_pred.s
24@//
25@// Description          : This file has motion compensation related
26@//                        interpolation functions on Neon + CortexA-8 platform
27@//
28@// Reference Document   :
29@//
30@// Revision History     :
31@//      Date            Author                  Detail Description
32@//   ------------    ----------------    ----------------------------------
33@//   18 jun 2010     S Hamsalekha              Created
34@//
35@//-------------------------------------------------------------------------
36@*/
37
38@/*
39@// ----------------------------------------------------------------------------
40@// Include Files
41@// ----------------------------------------------------------------------------
42@*/
43.text
44.p2align 2
45
46
47@/*
48@// ----------------------------------------------------------------------------
49@// Struct/Union Types and Define
50@// ----------------------------------------------------------------------------
51@*/
52
53
54@/*
55@// ----------------------------------------------------------------------------
56@// Static Global Data section variables
57@// ----------------------------------------------------------------------------
58@*/
59@// -------------------------- NONE --------------------------------------------
60
61
62@/*
63@// ----------------------------------------------------------------------------
64@// Static Prototype Functions
65@// ----------------------------------------------------------------------------
66@*/
67@// -------------------------- NONE --------------------------------------------
68
69@/*
70@// ----------------------------------------------------------------------------
71@// Exported functions
72@// ----------------------------------------------------------------------------
73@*/
74
75@//---------------------------------------------------------------------------
76@// Function Name      :   impeg2_copy_mb_a9q()
77@//
78@// Detail Description : Copies one MB worth of data from src to the dst
79@//
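80@// Inputs             : r0 - pointer to src buffer descriptor (y, u, v plane pointers at offsets 0, 4, 8)
81@//                      r1 - pointer to dst buffer descriptor (same layout)
82@//                      r2 - source luma stride in bytes (halved for u and v)
83@//                      r3 - destination luma stride in bytes (halved for u and v)
84@// Registers Used     : r4, r5, r14, d0, d1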
85@//
86@// Stack Usage        : 12 bytes
87@//
88@// Outputs            :
89@//
90@// Return Data        : None
91@//
92@// Programming Note   : <program limitation>
93@//-----------------------------------------------------------------------------
94@*/
95
96
97
98        .global impeg2_copy_mb_a9q
99
100
impeg2_copy_mb_a9q:
    @ Copies one macroblock: a 16x16 luma block followed by two 8x8 chroma
    @ blocks.  r0 / r1 point at the src / dst descriptors, which hold the
    @ y, u and v plane pointers at byte offsets 0, 4 and 8.  r2 / r3 are the
    @ src / dst luma strides; the chroma planes are stepped with half of them.
    @ The .rept blocks expand at assembly time, so the copy stays fully
    @ unrolled with no run-time loop counter.

    push            {r4, r5, lr}

    @ ---- y plane: 16 rows of 16 bytes ----
    ldr             r4, [r0]            @ r4 = src->y
    ldr             r5, [r1]            @ r5 = dst->y
    .rept           16
    vld1.8          {d0, d1}, [r4], r2  @ fetch one row, advance src by its stride
    vst1.8          {d0, d1}, [r5], r3  @ write the row, advance dst by its stride
    .endr

    mov             r2, r2, lsr #1      @ chroma src stride = luma src stride / 2
    mov             r3, r3, lsr #1      @ chroma dst stride = luma dst stride / 2

    @ ---- u plane: 8 rows of 8 bytes ----
    ldr             r4, [r0, #4]        @ r4 = src->u
    ldr             r5, [r1, #4]        @ r5 = dst->u
    .rept           8
    vld1.8          {d0}, [r4], r2      @ fetch one row, advance src
    vst1.8          {d0}, [r5], r3      @ write the row, advance dst
    .endr

    @ ---- v plane: 8 rows of 8 bytes ----
    ldr             r4, [r0, #8]        @ r4 = src->v
    ldr             r5, [r1, #8]        @ r5 = dst->v
    .rept           8
    vld1.8          {d0}, [r4], r2      @ fetch one row, advance src
    vst1.8          {d0}, [r5], r3      @ write the row, advance dst
    .endr

    pop             {r4, r5, pc}        @ restore scratch registers and return
192
193
194
195
196@/*
197@//---------------------------------------------------------------------------
198@// Function Name      :   impeg2_mc_fullx_halfy_8x8_a9q()
199@//
200@// Detail Description : This function pastes the reference block in the
201@//                      current frame buffer. This function is called for
202@//                      blocks that are not coded and have motion vectors
203@//                      with half-pel resolution in y (full-pel in x).
204@//
205@// Inputs             : r0 - out     : Current Block Pointer
206@//                      r1 - ref     : Reference Block Pointer
207@//                      r2 - ref_wid : Reference Block Width
208@//                      r3 - out_wid : Current Block Width
209@//
210@// Registers Used     : r14, d0-d9
211@//
212@// Stack Usage        : 20 bytes (4 for r14 + 16 for d8-d9)
213@//
214@// Outputs            : The Motion Compensated Block
215@//
216@// Return Data        : None
217@//
218@// Programming Note   : <program limitation>
219@//-----------------------------------------------------------------------------
220@*/
221
222        .global impeg2_mc_fullx_halfy_8x8_a9q
223
impeg2_mc_fullx_halfy_8x8_a9q:
    @ Vertical half-pel prediction for an 8x8 block: output row k is the
    @ rounded average of reference rows k and k+1, so nine reference rows
    @ (R1..R9) are read and eight output rows (O1..O8) are written.
    @ r1 walks the odd reference rows and lr the even ones, each stepping
    @ two rows per load; the same split is used for the stores.

    push            {lr}
    vpush           {d8-d9}             @ d8/d9 are used below and must be preserved
    add             lr, r1, r2          @ lr -> R2
    mov             r2, r2, lsl #1      @ step = 2 * ref_wid

    @ Loads interleaved with the averaging (vrhadd = (a + b + 1) >> 1)
    vld1.8          {d0}, [r1], r2      @ d0 = R1
    vld1.8          {d2}, [lr], r2      @ d2 = R2
    vld1.8          {d4}, [r1], r2      @ d4 = R3
    vld1.8          {d6}, [lr], r2      @ d6 = R4
    vld1.8          {d1}, [r1], r2      @ d1 = R5
    vld1.8          {d3}, [lr], r2      @ d3 = R6
    vrhadd.u8       d9, d1, d6          @ d9 = O4 = avg(R5, R4)
    vld1.8          {d5}, [r1], r2      @ d5 = R7
    vrhadd.u8       q0, q0, q1          @ d0 = O1 = avg(R1, R2), d1 = O5 = avg(R5, R6)
    vld1.8          {d7}, [lr], r2      @ d7 = R8
    vrhadd.u8       q1, q1, q2          @ d2 = O2 = avg(R2, R3), d3 = O6 = avg(R6, R7)
    vld1.8          {d8}, [r1], r2      @ d8 = R9
    vrhadd.u8       q2, q2, q3          @ d4 = O3 = avg(R3, R4), d5 = O7 = avg(R7, R8)

    add             lr, r0, r3          @ lr -> output row 2
    mov             r3, r3, lsl #1      @ step = 2 * out_wid

    @ r0 writes the odd output rows, lr the even ones
    vst1.8          {d2}, [lr], r3      @ O2
    vrhadd.u8       d7, d7, d8          @ d7 = O8 = avg(R8, R9)
    vst1.8          {d0}, [r0], r3      @ O1
    vst1.8          {d9}, [lr], r3      @ O4
    vst1.8          {d4}, [r0], r3      @ O3
    vst1.8          {d3}, [lr], r3      @ O6
    vst1.8          {d1}, [r0], r3      @ O5
    vst1.8          {d7}, [lr], r3      @ O8
    vst1.8          {d5}, [r0], r3      @ O7

    vpop            {d8-d9}
    pop             {pc}
263
264
265
266
267
268
269@/*
270@//---------------------------------------------------------------------------
271@// Function Name      :   impeg2_mc_halfx_fully_8x8_a9q()
272@//
273@// Detail Description : This function pastes the reference block in the
274@//                      current frame buffer. This function is called for
275@//                      blocks that are not coded and have motion vectors
276@//                      with half-pel resolution in x and VopRoundingType is 0.
277@//
278@// Inputs             : r0 - out     : Current Block Pointer
279@//                      r1 - ref     : Reference Block Pointer
280@//                      r2 - ref_wid : Reference Block Width
281@//                      r3 - out_wid : Current Block Width
282@//
283@// Registers Used     : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
284
285@//
286@// Stack Usage        : 8 bytes
287@//
288@// Outputs            : The Motion Compensated Block
289@//
290@// Return Data        : None
291@//
292@// Programming Note   : <program limitation>
293@//-----------------------------------------------------------------------------
294@*/
295
296
297
298        .global impeg2_mc_halfx_fully_8x8_a9q
299
300
301
impeg2_mc_halfx_fully_8x8_a9q:
    @ Horizontal half-pel prediction for an 8x8 block: every output pixel is
    @ the rounded average of a reference pixel and its right-hand neighbour.
    @ Each row is loaded as 16 bytes so that vext can form the copy shifted
    @ by one pixel (bytes 1..8).  r1 / r0 handle rows 1-4 and lr / r12
    @ handle rows 5-8.

    push            {r12, lr}

    add             lr, r1, r2, lsl #2  @ lr  -> reference row 5
    add             r12, r0, r3, lsl #2 @ r12 -> output row 5

    vld1.8          {d0, d1}, [r1], r2  @ row 1 (16 bytes)
    vld1.8          {d2, d3}, [lr], r2  @ row 5
    vld1.8          {d4, d5}, [r1], r2  @ row 2
    vld1.8          {d6, d7}, [lr], r2  @ row 6

    vext.8          d24, d0, d1, #1     @ row 1 shifted by one pixel
    vext.8          d28, d2, d3, #1     @ row 5 shifted by one pixel
    vext.8          d16, d4, d5, #1     @ row 2 shifted by one pixel
    vext.8          d20, d6, d7, #1     @ row 6 shifted by one pixel

    @ Rows 3, 4, 7, 8 land in the upper halves of q12, q8, q14, q10 so that
    @ each q-wide vrhadd below averages two rows at once.
    vld1.8          {d25, d26}, [r1], r2 @ row 3
    vld1.8          {d29, d30}, [lr], r2 @ row 7
    vld1.8          {d17, d18}, [r1], r2 @ row 4
    vld1.8          {d21, d22}, [lr], r2 @ row 8

    vext.8          d1, d25, d26, #1    @ row 3 shifted by one pixel
    vext.8          d3, d29, d30, #1    @ row 7 shifted by one pixel
    vext.8          d5, d17, d18, #1    @ row 4 shifted by one pixel
    vext.8          d7, d21, d22, #1    @ row 8 shifted by one pixel

    vrhadd.u8       q0, q0, q12         @ d0 = out row 1, d1 = out row 3
    vrhadd.u8       q1, q1, q14         @ d2 = out row 5, d3 = out row 7
    vrhadd.u8       q2, q2, q8          @ d4 = out row 2, d5 = out row 4
    vrhadd.u8       q3, q3, q10         @ d6 = out row 6, d7 = out row 8

    vst1.8          {d0}, [r0], r3      @ out row 1
    vst1.8          {d2}, [r12], r3     @ out row 5
    vst1.8          {d4}, [r0], r3      @ out row 2
    vst1.8          {d6}, [r12], r3     @ out row 6
    vst1.8          {d1}, [r0], r3      @ out row 3
    vst1.8          {d3}, [r12], r3     @ out row 7
    vst1.8          {d5}, [r0], r3      @ out row 4
    vst1.8          {d7}, [r12], r3     @ out row 8

    pop             {r12, pc}
379
380
381
382
383
384
385
386
387@/*
388@//---------------------------------------------------------------------------
389@// Function Name      :   impeg2_mc_halfx_halfy_8x8_a9q()
390@//
391@// Detail Description : This function pastes the reference block in the
392@//                      current frame buffer. This function is called for
393@//                      blocks that are not coded and have motion vectors
394@//                      with half-pel resolution in x and y and VopRoundingType is 0.
395@//
396@// Inputs             : r0 - out     : Current Block Pointer
397@//                      r1 - ref     : Reference Block Pointer
398@//                      r2 - ref_wid : Reference Block Width
399@//                      r3 - out_wid : Current Block Width
400@//
401@// Registers Used     : r14, q0-q15
402
403@//
404@// Stack Usage        : 68 bytes (4 for r14 + 64 for d8-d15)
405@//
406@// Outputs            : The Motion Compensated Block
407@//
408@// Return Data        : None
409@//
410@// Programming Note   : <program limitation>
411@//-----------------------------------------------------------------------------
412@*/
413
414
415        .global impeg2_mc_halfx_halfy_8x8_a9q
416
impeg2_mc_halfx_halfy_8x8_a9q:
    @ Diagonal half-pel prediction for an 8x8 block.  For each of the nine
    @ reference rows the 16-bit horizontal sum H(k) = p[x] + p[x+1] is
    @ formed; output row k is then (H(k) + H(k+1) + 2) >> 2.
    @ r1 reads reference rows 1-4 and lr rows 5-9; r0 writes output rows
    @ 1-4 and lr (re-pointed later) rows 5-8.

    push            {lr}
    vpush           {d8-d15}            @ q4-q7 are used below and must be preserved

    add             lr, r1, r2, lsl #2  @ lr -> reference row 5

    vld1.8          {d0, d1}, [r1], r2  @ row 1 (16 bytes)
    vld1.8          {d2, d3}, [lr], r2  @ row 5
    vld1.8          {d4, d5}, [r1], r2  @ row 2
    vld1.8          {d6, d7}, [lr], r2  @ row 6

    vext.8          d1, d0, d1, #1      @ row 1 shifted by one pixel
    vext.8          d3, d2, d3, #1      @ row 5 shifted by one pixel
    vext.8          d5, d4, d5, #1      @ row 2 shifted by one pixel
    vext.8          d7, d6, d7, #1      @ row 6 shifted by one pixel

    vld1.8          {d8, d9}, [r1], r2  @ row 3
    vld1.8          {d10, d11}, [lr], r2 @ row 7
    vld1.8          {d12, d13}, [r1], r2 @ row 4
    vld1.8          {d14, d15}, [lr], r2 @ row 8

    vext.8          d9, d8, d9, #1      @ row 3 shifted by one pixel
    vld1.8          {d16, d17}, [lr], r2 @ row 9
    vext.8          d11, d10, d11, #1   @ row 7 shifted by one pixel
    vext.8          d13, d12, d13, #1   @ row 4 shifted by one pixel
    vext.8          d15, d14, d15, #1   @ row 8 shifted by one pixel
    vext.8          d17, d16, d17, #1   @ row 9 shifted by one pixel

    @ Horizontal pass: widen to 16 bits while adding pixel and neighbour
    vaddl.u8        q0, d0, d1          @ q0 = H(1)
    vaddl.u8        q1, d2, d3          @ q1 = H(5)
    vaddl.u8        q2, d4, d5          @ q2 = H(2)
    vaddl.u8        q3, d6, d7          @ q3 = H(6)
    vaddl.u8        q4, d8, d9          @ q4 = H(3)
    vaddl.u8        q5, d10, d11        @ q5 = H(7)
    vaddl.u8        q6, d12, d13        @ q6 = H(4)
    vaddl.u8        q7, d14, d15        @ q7 = H(8)
    vaddl.u8        q8, d16, d17        @ q8 = H(9)

    @ Vertical pass: add adjacent row sums, then round-shift back to 8 bits
    add             lr, r0, r3, lsl #2  @ lr -> output row 5

    vadd.u16        q9, q0, q2          @ H(1) + H(2)
    vadd.u16        q13, q1, q3         @ H(5) + H(6)
    vadd.u16        q10, q2, q4         @ H(2) + H(3)
    vadd.u16        q14, q3, q5         @ H(6) + H(7)

    vrshrn.u16      d18, q9, #2         @ out row 1
    vrshrn.u16      d26, q13, #2        @ out row 5
    vrshrn.u16      d20, q10, #2        @ out row 2
    vrshrn.u16      d28, q14, #2        @ out row 6

    vadd.u16        q11, q4, q6         @ H(3) + H(4)
    vst1.8          {d18}, [r0], r3     @ store out row 1
    vadd.u16        q15, q5, q7         @ H(7) + H(8)
    vst1.8          {d26}, [lr], r3     @ store out row 5
    vadd.u16        q12, q6, q1         @ H(4) + H(5)
    vst1.8          {d20}, [r0], r3     @ store out row 2
    vadd.u16        q7, q7, q8          @ H(8) + H(9)
    vst1.8          {d28}, [lr], r3     @ store out row 6

    vrshrn.u16      d22, q11, #2        @ out row 3
    vrshrn.u16      d30, q15, #2        @ out row 7
    vrshrn.u16      d24, q12, #2        @ out row 4
    vrshrn.u16      d14, q7, #2         @ out row 8

    vst1.8          {d22}, [r0], r3     @ store out row 3
    vst1.8          {d30}, [lr], r3     @ store out row 7
    vst1.8          {d24}, [r0], r3     @ store out row 4
    vst1.8          {d14}, [lr], r3     @ store out row 8

    vpop            {d8-d15}
    pop             {pc}
556
557
558
559
560
561@/*
562@//---------------------------------------------------------------------------
563@// Function Name      :   impeg2_mc_fullx_fully_8x8_a9q()
564@//
565@// Detail Description : This function pastes the reference block in the
566@//                      current frame buffer. This function is called for
567@//                      blocks that are not coded and have motion vectors
568@//                      with full-pel resolution in x and y (plain 8x8 copy).
569@//
570@// Inputs             : r0 - out     : Current Block Pointer
571@//                      r1 - ref     : Reference Block Pointer
572@//                      r2 - ref_wid : Reference Block Width
573@//                      r3 - out_wid : Current Block Width
574@//
575@// Registers Used     : r12, r14, d0-d3
576
577@//
578@// Stack Usage        : 8 bytes
579@//
580@// Outputs            : The Motion Compensated Block
581@//
582@// Return Data        : None
583@//
584@// Programming Note   : <program limitation>
585@//-----------------------------------------------------------------------------
586@*/
587
588
589        .global impeg2_mc_fullx_fully_8x8_a9q
impeg2_mc_fullx_fully_8x8_a9q:
    @ Straight 8x8 copy from the reference block to the output block.
    @ Two pointer pairs run in parallel: r1 -> r0 moves rows 1-4 while
    @ lr -> r12 moves rows 5-8, two rows per pair at a time.

    push            {r12, lr}

    add             lr, r1, r2, lsl #2  @ lr  -> reference row 5
    add             r12, r0, r3, lsl #2 @ r12 -> output row 5

    @ first half: rows 1, 2 and 5, 6
    vld1.8          {d0}, [r1], r2      @ d0 = row 1
    vld1.8          {d1}, [lr], r2      @ d1 = row 5
    vld1.8          {d2}, [r1], r2      @ d2 = row 2
    vld1.8          {d3}, [lr], r2      @ d3 = row 6

    vst1.8          {d0}, [r0], r3      @ row 1
    vst1.8          {d1}, [r12], r3     @ row 5
    vst1.8          {d2}, [r0], r3      @ row 2
    vst1.8          {d3}, [r12], r3     @ row 6

    @ second half: rows 3, 4 and 7, 8
    vld1.8          {d0}, [r1], r2      @ d0 = row 3
    vld1.8          {d1}, [lr], r2      @ d1 = row 7
    vld1.8          {d2}, [r1], r2      @ d2 = row 4
    vld1.8          {d3}, [lr], r2      @ d3 = row 8

    vst1.8          {d0}, [r0], r3      @ row 3
    vst1.8          {d1}, [r12], r3     @ row 7
    vst1.8          {d2}, [r0], r3      @ row 4
    vst1.8          {d3}, [r12], r3     @ row 8

    pop             {r12, pc}
637
638
639
640
641
642@/*
643@//---------------------------------------------------------------------------
644@// Function Name      :   impeg2_interpolate_a9q()
645@//
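646@// Detail Description : interpolates (rounded average of) two source buffers into the dest buf
647@//
648@// Inputs             : r0 - pointer to src1 descriptor (y, u, v plane pointers at offsets 0, 4, 8)
649@//                      r1 - pointer to src2 descriptor (same layout)
650@//                      r2 - pointer to dest buf descriptor (same layout)
651@//                      r3 - dst luma stride (halved for u and v)
652@// Registers Used     : r4, r5, r7, r12, r14, d0-d15
653@//
654@// Stack Usage        : 84 bytes (20 for core registers + 64 for d8-d15)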
655@//
656@// Outputs            : The Motion Compensated Block
657@//
658@// Return Data        : None
659@//
660@// Programming Note   : <program limitation>
661@//-----------------------------------------------------------------------------
662@*/
663
664
665        .global impeg2_interpolate_a9q
666
667
impeg2_interpolate_a9q:
    @ Averages two macroblock-sized source buffers with rounding and writes
    @ the result to the destination.  r0, r1 and r2 point at descriptors
    @ holding the y, u and v plane pointers at byte offsets 0, 4 and 8.
    @ Both sources are read back-to-back (post-increment by the transfer
    @ size), so their rows are packed: 16 bytes per luma row and 8 bytes per
    @ chroma row.  Only the destination is stepped by the stride in r3.

    push            {r4, r5, r7, r12, lr}
    vpush           {d8-d15}            @ q4-q7 are used below and must be preserved

    ldr             r4, [r0]            @ r4 = src1->y
    ldr             r5, [r1]            @ r5 = src2->y
    ldr             r7, [r2]            @ r7 = dst->y
    mov             r12, #4             @ 4 passes x 4 rows = 16 luma rows

interp_lumablocks_stride:
    vld1.8          {d0, d1}, [r4]!     @ src1 rows n .. n+3
    vld1.8          {d2, d3}, [r4]!
    vld1.8          {d4, d5}, [r4]!
    vld1.8          {d6, d7}, [r4]!

    vld1.8          {d8, d9}, [r5]!     @ src2 rows n .. n+3
    vld1.8          {d10, d11}, [r5]!
    vld1.8          {d12, d13}, [r5]!
    vld1.8          {d14, d15}, [r5]!

    vrhadd.u8       q0, q0, q4          @ (src1 + src2 + 1) >> 1, row n
    vrhadd.u8       q1, q1, q5          @ row n+1
    vrhadd.u8       q2, q2, q6          @ row n+2
    vrhadd.u8       q3, q3, q7          @ row n+3

    vst1.8          {d0, d1}, [r7], r3  @ write 4 rows, stepping dst by its stride
    vst1.8          {d2, d3}, [r7], r3
    vst1.8          {d4, d5}, [r7], r3
    vst1.8          {d6, d7}, [r7], r3

    subs            r12, r12, #1
    bne             interp_lumablocks_stride

    mov             r3, r3, lsr #1      @ chroma dst stride = luma stride / 2

    ldr             r4, [r0, #4]        @ r4 = src1->u
    ldr             r5, [r1, #4]        @ r5 = src2->u
    ldr             r7, [r2, #4]        @ r7 = dst->u
    mov             r12, #2             @ pass 1 handles u, pass 2 handles v

interp_chromablocks_stride:
    @ Each q register carries two packed 8-byte chroma rows
    vld1.8          {d0, d1}, [r4]!     @ src1 rows 1-2
    vld1.8          {d2, d3}, [r4]!     @ src1 rows 3-4
    vld1.8          {d4, d5}, [r4]!     @ src1 rows 5-6
    vld1.8          {d6, d7}, [r4]!     @ src1 rows 7-8

    vld1.8          {d8, d9}, [r5]!     @ src2 rows 1-2
    vld1.8          {d10, d11}, [r5]!   @ src2 rows 3-4
    vld1.8          {d12, d13}, [r5]!   @ src2 rows 5-6
    vld1.8          {d14, d15}, [r5]!   @ src2 rows 7-8

    vrhadd.u8       q0, q0, q4          @ averaged rows 1-2
    vrhadd.u8       q1, q1, q5          @ averaged rows 3-4
    vrhadd.u8       q2, q2, q6          @ averaged rows 5-6
    vrhadd.u8       q3, q3, q7          @ averaged rows 7-8

    vst1.8          {d0}, [r7], r3      @ row 1
    vst1.8          {d1}, [r7], r3      @ row 2
    vst1.8          {d2}, [r7], r3      @ row 3
    vst1.8          {d3}, [r7], r3      @ row 4
    vst1.8          {d4}, [r7], r3      @ row 5
    vst1.8          {d5}, [r7], r3      @ row 6
    vst1.8          {d6}, [r7], r3      @ row 7
    vst1.8          {d7}, [r7], r3      @ row 8

    @ Re-point at the v planes for the second pass (after the second pass
    @ these loads are simply unused).
    ldr             r4, [r0, #8]        @ r4 = src1->v
    ldr             r5, [r1, #8]        @ r5 = src2->v
    ldr             r7, [r2, #8]        @ r7 = dst->v

    subs            r12, r12, #1
    bne             interp_chromablocks_stride

    vpop            {d8-d15}
    pop             {r4, r5, r7, r12, pc}
803
804
805
806
807
808