/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
#define INCLUDE_LIBYUV_MACROS_MSA_H_

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include <msa.h>
#include <stdint.h>

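/* MIPS Release 6 removed the lwl/lwr (and ldl/ldr) instructions and
   instead allows plain lw/ld/sw/sd to be used on misaligned addresses,
   so separate unaligned scalar load/store macros are defined below for
   R6 and pre-R6 ISAs. */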
#if (__mips_isa_rev >= 6)
#define LW(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
    uint32_t val_m;                                    \
    asm volatile("lw  %[val_m],  %[psrc_lw_m]  \n"     \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_lw_m] "m"(*psrc_lw_m));       \
    val_m;                                             \
  })

#if (__mips == 64)
#define LD(psrc)                                       \
  ({                                                   \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
    uint64_t val_m = 0;                                \
    asm volatile("ld  %[val_m],  %[psrc_ld_m]  \n"     \
                 : [val_m] "=r"(val_m)                 \
                 : [psrc_ld_m] "m"(*psrc_ld_m));       \
    val_m;                                             \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("sw  %[val_m],  %[pdst_sw_m]  \n"      \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#if (__mips == 64)
#define SD(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint64_t val_m = (val);                             \
    asm volatile("sd  %[val_m],  %[pdst_sd_m]  \n"      \
                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
                 : [val_m] "r"(val_m));                 \
  })
#else  // !(__mips == 64)
#define SD(val, pdst)                                        \
  ({                                                         \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
    uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    SW(val0_m, pdst_sd_m);                                   \
    SW(val1_m, pdst_sd_m + 4);                               \
  })
#endif  // !(__mips == 64)
#else   // !(__mips_isa_rev >= 6)
#define LW(psrc)                                \
  ({                                            \
    uint8_t* psrc_lw_m = (uint8_t*)(psrc);      \
    uint32_t val_lw_m;                          \
                                                \
    __asm__ volatile(                           \
        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
                                                \
        : [val_lw_m] "=&r"(val_lw_m)            \
        : [psrc_lw_m] "r"(psrc_lw_m));          \
                                                \
    val_lw_m;                                   \
  })

#if (__mips == 64)
#define LD(psrc)                                \
  ({                                            \
    uint8_t* psrc_ld_m = (uint8_t*)(psrc);      \
    uint64_t val_ld_m = 0;                      \
                                                \
    __asm__ volatile(                           \
        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
                                                \
        : [val_ld_m] "=&r"(val_ld_m)            \
        : [psrc_ld_m] "r"(psrc_ld_m));          \
                                                \
    val_ld_m;                                   \
  })
#else  // !(__mips == 64)
#define LD(psrc)                                                         \
  ({                                                                     \
    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
    uint32_t val0_m, val1_m;                                             \
    uint64_t val_m = 0;                                                  \
    val0_m = LW(psrc_ld_m);                                              \
    val1_m = LW(psrc_ld_m + 4);                                          \
    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
    val_m;                                                               \
  })
#endif  // (__mips == 64)

#define SW(val, pdst)                                   \
  ({                                                    \
    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
    uint32_t val_m = (val);                             \
    asm volatile("usw  %[val_m],  %[pdst_sw_m]  \n"     \
                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
                 : [val_m] "r"(val_m));                 \
  })

#define SD(val, pdst)                                        \
  ({                                                         \
    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
    uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
    SW(val0_m, pdst_sd_m);                                   \
    SW(val1_m, pdst_sd_m + 4);                               \
  })
#endif  // (__mips_isa_rev >= 6)
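
/* Usage sketch for the scalar macros above ('src' and 'dst' are
   illustrative byte pointers, assumed valid for 8 bytes):

     uint64_t v = LD(src);  // possibly-unaligned 64-bit load
     SD(v, dst);            // possibly-unaligned 64-bit store

   LW/SW do the same for 32-bit values. */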

// TODO(fbarchard): Consider removing __VA_ARGS__ versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
#define LD_UH(...) LD_H(const v8u16, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
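
/* Usage sketch: LD_UB/ST_UB move one 16-byte MSA vector at a time
   ('src' and 'dst' are illustrative pointers valid for 16 bytes):

     v16u8 row = LD_UB(src);
     ST_UB(row, dst);
*/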

/* Description : Load two vectors with 16 'byte' sized elements
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements into 'out0' from (psrc)
                 Load 16 byte elements into 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + stride);       \
  }
#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
  {                                                        \
    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
  }
#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
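
/* Usage sketch: gather four 16-byte rows of an image whose rows are
   'stride' bytes apart ('src' and 'stride' are illustrative):

     v16u8 r0, r1, r2, r3;
     LD_UB4(src, stride, r0, r1, r2, r3);
*/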

/* Description : Store two vectors of 16 'byte' sized elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
  {                                                      \
    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
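
/* Usage sketch: write four processed rows back with the same stride
   ('dst' is an illustrative destination pointer):

     ST_UB4(r0, r1, r2, r3, dst, stride);
*/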

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + stride);       \
  }
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
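
/* Usage sketch: store two rows of 16-bit pixels. Note that
   '(pdst) + stride' is pointer arithmetic, so 'stride' counts elements
   of the destination's type ('dst16' is an illustrative uint16_t*):

     ST_UH2(h0, h1, dst16, stride);
*/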

// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; likewise 'in2' & 'in3'
                 to 'out1' as per 'mask1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
  {                                                                   \
    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
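
/* Usage sketch: each control byte in 'mask0' selects one source byte,
   with indices 0..15 taken from 'in0' and 16..31 from 'in1', so a
   single shuffle can gather bytes from two rows (vectors here are
   illustrative):

     v16u8 out0, out1;
     VSHF_B2_UB(in0, in1, in2, in3, mask0, mask1, out0, out1);
*/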

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left half to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
  {                                                     \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
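
/* Usage sketch: interleaving two byte vectors is a common packing step,
   e.g. merging a U row and a V row into interleaved UV. Even output
   bytes come from the second argument ('u_row'/'v_row' illustrative):

     v16u8 uv_lo, uv_hi;
     ILVRL_B2_UB(v_row, u_row, uv_lo, uv_hi);
*/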

#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */

#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_