/*
 * Alpha optimized IDCT-related routines
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are scheduled for pca56. They should work
 * reasonably on ev6, though.
 *
 * Syntax: GNU as for Alpha, run through the C preprocessor.
 * Register names (a0-a2, t0-t9, zero, sp, ra and the extra temporaries
 * ta, tb, te, tf, tg, th) are expected to be supplied by regdef.h.
 *
 * The instruction order inside both loops is the hand-written schedule;
 * instructions are laid out in groups of four.  Do not reorder them
 * without re-measuring.
 */

#include "regdef.h"

        .set noat
        .set noreorder
        .arch pca56
        .text

/************************************************************************
 * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
 *                                 ptrdiff_t line_size)
 *
 * Clamps every 16-bit coefficient of block to the range 0..255 and
 * stores it as one byte in pixels.
 *
 * Each loop pass reads 32 bytes of block (16 coefficients), writes
 * 8 bytes at pixels and 8 bytes at pixels + line_size, then advances
 * pixels by 2 * line_size.  The counter starts at 8 and drops by 2 per
 * pass, so the loop body runs four times.
 *
 * In:    a0 = block, a1 = pixels, a2 = line_size
 * Out:   none
 * Uses:  t0-t3 (coefficient quads), t8 (clamp ceiling), t9 (counter),
 *        ta (address of the second row of the pass); a0 and a1 are
 *        advanced.
 * Stack: none (.frame sp, 0, ra).
 *
 * NOTE(review): the loads are ldq and the stores are stl, so block is
 * presumably 8-byte aligned and every pixel row 4-byte aligned --
 * confirm against the callers.
 */
        .align 6
        .globl put_pixels_clamped_mvi_asm
        .ent put_pixels_clamped_mvi_asm
put_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

        lda     t8, -1          # all ones
        lda     t9, 8           # loop counter
        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff

        .align 4
1:      ldq     t0,  0(a0)      # coefficients  0..3
        ldq     t1,  8(a0)      # coefficients  4..7
        ldq     t2, 16(a0)      # coefficients  8..11
        ldq     t3, 24(a0)      # coefficients 12..15

        maxsw4  t0, zero, t0    # clamp below at 0
        subq    t9, 2, t9       # two rows handled per pass
        maxsw4  t1, zero, t1
        lda     a0, 32(a0)      # block += 16

        maxsw4  t2, zero, t2
        addq    a1, a2, ta      # ta = pixels + line_size
        maxsw4  t3, zero, t3
        minsw4  t0, t8, t0      # clamp above at 255

        minsw4  t1, t8, t1
        minsw4  t2, t8, t2
        minsw4  t3, t8, t3
        pkwb    t0, t0          # four words -> four bytes

        pkwb    t1, t1
        pkwb    t2, t2
        pkwb    t3, t3
        stl     t0, 0(a1)       # first row, bytes 0..3

        stl     t1, 4(a1)       # first row, bytes 4..7
        addq    ta, a2, a1      # pixels += 2 * line_size
        stl     t2, 0(ta)       # second row, bytes 0..3
        stl     t3, 4(ta)       # second row, bytes 4..7

        bne     t9, 1b
        ret
        .end put_pixels_clamped_mvi_asm

/************************************************************************
 * void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
 *                                 ptrdiff_t line_size)
 *
 * Adds every 16-bit coefficient of block to the matching byte of
 * pixels, clamps the sum to 0..255 and stores it back to pixels.
 *
 * Each loop pass handles 16 coefficients as four "quarters" of four
 * (quarters 0 and 1 belong to the row at pixels, quarters 2 and 3 to
 * the row at pixels + line_size).  The counter starts at 8 and drops
 * by 2 per pass, so the loop body runs four times.
 *
 * The trailing "q n" comments give the quarter q and the step n:
 *   0  unpkbw  pixel bytes -> 16-bit words
 *   1  and     save the coefficient sign bits
 *   2  bic     clear the sign bits
 *   3  addq    add pixels; with bit 15 cleared a lane holds at most
 *              0x7fff + 0xff, so no carry leaves its 16-bit lane
 *   4  xor     fold the saved sign bits back in (completes a per-lane
 *              16-bit add)
 *   5  maxsw4  clamp below at 0
 *   6  minsw4  clamp above at 255
 *   7  pkwb    words -> bytes
 *   8  stl     store
 *
 * In:    a0 = block, a1 = pixels, a2 = line_size
 * Out:   none
 * Uses:  t0-t9, ta, tb (data), te (address of the second row),
 *        tf (clamp ceiling), tg (sign-bit mask), th (counter);
 *        a0 and a1 are advanced.
 * Stack: none (.frame sp, 0, ra).
 *
 * NOTE(review): lane sums wrap modulo 2^16, so a coefficient close to
 * +32767 plus a pixel turns negative and clamps to 0.  Presumably the
 * callers never pass values that large -- confirm.
 * NOTE(review): ldq/ldl/stl are used, so block is presumably 8-byte
 * aligned and every pixel row 4-byte aligned -- confirm.
 */
        .align 6
        .globl add_pixels_clamped_mvi_asm
        .ent add_pixels_clamped_mvi_asm
add_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

        lda     t1, -1          # all ones
        lda     th, 8           # loop counter
        zap     t1, 0x33, tg    # 0xffff0000ffff0000
        nop

        srl     tg, 1, t0       # 0x7fff80007fff8000
        xor     tg, t0, tg      # 0x8000800080008000
        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff

        .align 4
1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
        ldl     t4, 4(a1)       # pix1
        addq    a1, a2, te      # pixels += line_size
        ldq     t0, 0(a0)       # shorts0

        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
        ldl     ta, 4(te)       # pix3
        ldq     t3,  8(a0)      # shorts1
        ldq     t6, 16(a0)      # shorts2

        ldq     t9, 24(a0)      # shorts3
        unpkbw  t1, t1          # 0 0 (quarter/op no.)
        and     t0, tg, t2      # 0 1
        unpkbw  t4, t4          # 1 0

        bic     t0, tg, t0      # 0 2
        unpkbw  t7, t7          # 2 0
        and     t3, tg, t5      # 1 1
        addq    t0, t1, t0      # 0 3

        xor     t0, t2, t0      # 0 4
        unpkbw  ta, ta          # 3 0
        and     t6, tg, t8      # 2 1
        maxsw4  t0, zero, t0    # 0 5

        bic     t3, tg, t3      # 1 2
        bic     t6, tg, t6      # 2 2
        minsw4  t0, tf, t0      # 0 6
        addq    t3, t4, t3      # 1 3

        pkwb    t0, t0          # 0 7
        xor     t3, t5, t3      # 1 4
        maxsw4  t3, zero, t3    # 1 5
        addq    t6, t7, t6      # 2 3

        xor     t6, t8, t6      # 2 4
        and     t9, tg, tb      # 3 1
        minsw4  t3, tf, t3      # 1 6
        bic     t9, tg, t9      # 3 2

        maxsw4  t6, zero, t6    # 2 5
        addq    t9, ta, t9      # 3 3
        stl     t0, 0(a1)       # 0 8
        minsw4  t6, tf, t6      # 2 6

        xor     t9, tb, t9      # 3 4
        maxsw4  t9, zero, t9    # 3 5
        lda     a0, 32(a0)      # block += 16;
        pkwb    t3, t3          # 1 7

        minsw4  t9, tf, t9      # 3 6
        subq    th, 2, th       # two rows handled per pass
        pkwb    t6, t6          # 2 7
        pkwb    t9, t9          # 3 7

        stl     t3, 4(a1)       # 1 8
        addq    te, a2, a1      # pixels += line_size
        stl     t6, 0(te)       # 2 8
        stl     t9, 4(te)       # 3 8

        bne     th, 1b
        ret
        .end add_pixels_clamped_mvi_asm