FFmpeg 4.3
h264chroma_msa.c
/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264chroma_mips.h"

static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
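
/* Note (editorial reading of the table above): each 16-byte row is a
 * VSHF.B control pattern that pairs every source pixel with its
 * right-hand neighbour, so that a following DOTP.UB computes
 * w0 * src[x] + w1 * src[x + 1] per output pixel.  Indices >= 16 select
 * bytes from the second shuffle operand, which is how rows held in
 * different registers are combined.  As used below, offset 0 serves the
 * 2- and 4-pixel-wide paths, 32 the 8-pixel-wide paths, and 48/64 the
 * 2-pixel-wide variants that pack several rows into one register. */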

static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
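
/* Note: with the two filter weights summing to 8 (x and 8 - x for H.264
 * chroma), the sequence "dotp, << 3, srari(., 6)" above evaluates
 * ((w0 * a + w1 * b) * 8 + 32) >> 6 == (w0 * a + w1 * b + 4) >> 3,
 * i.e. the rounded H.264 one-dimensional chroma interpolation;
 * __msa_sat_u_h(., 7) then clamps each result to the 8-bit range before
 * pckev_b packs the even bytes. */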

static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 src0, src1;
    v8u16 res_r;
    v4i32 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out;
    v8u16 res0_r, res1_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
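
/* Note: at width 4 the shuffle packs two picture rows into one vector
 * (the mask's indices >= 16 pull the second row from the second shuffle
 * operand), so every dot product above filters two rows at once and the
 * results are stored as 4-byte words via ST_W4/ST_W8. */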

static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
                                      int32_t stride, uint32_t coeff0,
                                      uint32_t coeff1, int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    for (row = height >> 2; row--;) {
        LD_UB4(src, stride, src0, src1, src2, src3);
        src += (4 * stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }

    if (0 != (height % 4)) {
        for (row = (height % 4); row--;) {
            src0 = LD_UB(src);
            src += stride;

            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);

            res0 = __msa_dotp_u_h(src0, coeff_vec);
            res0 <<= 3;
            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
            res0 = __msa_sat_u_h(res0, 7);
            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);

            ST_D1(res0, 0, dst);
            dst += stride;
        }
    }
}
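
/* Note: this "nonmult" fallback covers 8-pixel-wide blocks whose height
 * is neither 4 nor 8: complete groups of four rows go through the
 * vectorised loop, and the remaining height % 4 rows are filtered one at
 * a time with the same shuffle / dot-product / rounding sequence. */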

static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
    } else {
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
    }
}

static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
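
/* Note: the vertical filters need no shuffle mask: ILVR.B interleaves
 * each row with the row below it, so a single DOTP.UB forms the weighted
 * sum of every (current row, next row) pixel pair, followed by the same
 * (sum + 4) >> 3 rounding sequence as in the horizontal path. */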

static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v4i32 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 out;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
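
/* Note: an N-row vertical filter reads N + 1 source rows, hence the nine
 * loads for eight output rows above; row src4 is shared by the two
 * interleave groups. */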

static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
               src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res_vert;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    out0 = __msa_copy_u_h(res_vert, 0);
    out1 = __msa_copy_u_h(res_vert, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
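
/* Note: the 2-D (hv) paths run the horizontal dot product first, then
 * weight two adjacent horizontally filtered rows with the vertical
 * coefficients and apply a single rounded shift.  Since the horizontal
 * and vertical weights each sum to 8, the combined weight is 64, so
 * srari(., 6) alone provides the (... + 32) >> 6 rounding and the extra
 * << 3 used by the 1-D paths is not needed here. */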

static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v4i32 res;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);
    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
    v4i32 res0, res1;

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
    ST_W2(res0, 0, 1, dst, stride);
    ST_W2(res1, 0, 1, dst + 2 * stride, stride);
}

static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    LD_UB4(src, stride, src1, src2, src3, src4);
    src += (4 * stride);

    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);

    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);

    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
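
/* Note: in the 8-pixel-wide 2-D path every horizontally filtered row is
 * computed once and reused for two output rows: res_vt(i) accumulates
 * res_hz(i) * coeff_vt_vec1 on top of res_hz(i + 1) * coeff_vt_vec0,
 * which is why the first source row (res_hz0) is filtered separately
 * before the block of four. */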

static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b(res, dst_data);

    out0 = __msa_copy_u_h((v8i16) dst_data, 0);
    out1 = __msa_copy_u_h((v8i16) dst_data, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}
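
/* Note: the *_and_aver_dst_* variants implement the "avg" flavour of the
 * chroma MC functions: the existing destination pixels are loaded and
 * packed into a vector, the block is filtered exactly as in the "put"
 * paths, and AVER_U.B combines the two with the rounded average
 * (a + b + 1) >> 1 before storing. */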

static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst0 = __msa_aver_u_b(dst0, dst_data);

    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16i8 src0, src1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16i8 res, mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    LW2(dst, stride, load0, load1);

    INSERT_W2_UB(load0, load1, dst_data);

    src0 = __msa_vshf_b(mask, src1, src0);

    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    dst_data = __msa_aver_u_b((v16u8) res, dst_data);

    ST_W2(dst_data, 0, 1, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3;
    v16u8 out, dst_data = { 0 };
    v16i8 mask;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst_data);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2, tmp0, tmp1, res;
    v16u8 dst_data = { 0 };
    v8i16 out;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);
    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
    out0 = __msa_copy_u_h(out, 0);
    out1 = __msa_copy_u_h(out, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data = { 0 };

    LD_SB5(src, stride, src0, src1, src2, src3, src4);

    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16u8 src0, src1, src2, tmp0, tmp1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);

    LW2(dst, stride, load0, load1);

    INSERT_W2_UB(load0, load1, dst_data);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = __msa_aver_u_b(res, dst_data);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res0_r, res1_r;
    v16u8 out;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst0);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out0, out1;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               src4, src5, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 dst0 = { 0 };
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    out0 = LH(dst);
    out1 = LH(dst + stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    out0 = __msa_copy_u_h((v8i16) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1;
    v16u8 src0, src1, src2;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB3(src, stride, src0, src1, src2);
    LW2(dst, stride, tp0, tp1);
    INSERT_W2_UB(tp0, tp1, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b(dst0, dst_data);

    ST_W2(dst0, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
    out = __msa_aver_u_b(out, dst_data);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}
1590 
1592  int32_t stride,
1593  uint32_t coef_hor0,
1594  uint32_t coef_hor1,
1595  uint32_t coef_ver0,
1596  uint32_t coef_ver1)
1597 {
1598  uint32_t tp0, tp1, tp2, tp3;
1599  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1600  v16u8 dst0 = { 0 }, dst1 = { 0 };
1601  v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
1602  v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
1603  v16i8 mask;
1604  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1605  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1606  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1607  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1608  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1609 
1610  mask = LD_SB(&chroma_mask_arr[0]);
1611 
1612  LD_UB5(src, stride, src0, src1, src2, src3, src4);
1613  src += (5 * stride);
1614  LD_UB4(src, stride, src5, src6, src7, src8);
1615  LW4(dst, stride, tp0, tp1, tp2, tp3);
1616  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1617  LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1618  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1619  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1620  VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
1621  VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
1622  VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
1623  DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
1624  coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
1625  DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
1626  coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
1627  MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1628  res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1629  MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
1630  res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
1631  ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1632  ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
1633  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1634  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1635  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
1636  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
1637  ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1638 }
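/* ST_W8() scatters the eight 4-byte result rows held in res0/res1 back to the
 * destination, one 32-bit word per row. */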
1639 
1640 static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1641  int32_t stride,
1642  uint32_t coef_hor0,
1643  uint32_t coef_hor1,
1644  uint32_t coef_ver0,
1645  uint32_t coef_ver1,
1646  int32_t height)
1647 {
1648  if (2 == height) {
1649  avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
1650  coef_hor1, coef_ver0, coef_ver1);
1651  } else if (4 == height) {
1652  avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
1653  coef_hor1, coef_ver0, coef_ver1);
1654  } else if (8 == height) {
1655  avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
1656  coef_hor1, coef_ver0, coef_ver1);
1657  }
1658 }
1659 
1660 static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1661  int32_t stride,
1662  uint32_t coef_hor0,
1663  uint32_t coef_hor1,
1664  uint32_t coef_ver0,
1665  uint32_t coef_ver1)
1666 {
1667  uint64_t tp0, tp1, tp2, tp3;
1668  v16u8 src0, src1, src2, src3, src4, out0, out1;
1669  v8u16 res_hz0, res_hz1, res_hz2;
1670  v8u16 res_hz3, res_hz4;
1671  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1672  v16u8 dst0 = { 0 }, dst1 = { 0 };
1673  v16i8 mask;
1674  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1675  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1676  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1677  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1678  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1679 
1680  mask = LD_SB(&chroma_mask_arr[32]);
1681 
1682  src0 = LD_UB(src);
1683  src += stride;
1684  src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1685  res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1686  LD_UB4(src, stride, src1, src2, src3, src4);
1687  src += (4 * stride);
1688  LD4(dst, stride, tp0, tp1, tp2, tp3);
1689  INSERT_D2_UB(tp0, tp1, dst0);
1690  INSERT_D2_UB(tp2, tp3, dst1);
1691  VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1692  VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1693  DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1694  coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
1695  MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
1696  res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1697  res_vt0 += (res_hz0 * coeff_vt_vec1);
1698  res_vt1 += (res_hz1 * coeff_vt_vec1);
1699  res_vt2 += (res_hz2 * coeff_vt_vec1);
1700  res_vt3 += (res_hz3 * coeff_vt_vec1);
1701  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1702  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1703  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1704  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1705  ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1706 }
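/* The 8-wide kernels switch to chroma_mask_arr[32], which pairs neighbouring
 * bytes within a single register, so each source row is shuffled against
 * itself. Each row's horizontal dot product is computed once and used twice
 * vertically: as the bottom term of the current output row and as the top
 * term of the next one (note res_hz1..res_hz3 appearing both in the MUL4 and
 * in the following multiply-accumulate lines). */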
1707 
1708 static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1709  int32_t stride,
1710  uint32_t coef_hor0,
1711  uint32_t coef_hor1,
1712  uint32_t coef_ver0,
1713  uint32_t coef_ver1)
1714 {
1715  uint64_t tp0, tp1, tp2, tp3;
1716  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1717  v16u8 out0, out1, out2, out3;
1718  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1719  v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
1720  v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
1721  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1722  v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
1723  v16i8 mask;
1724  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1725  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1726  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1727  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1728  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1729 
1730  mask = LD_SB(&chroma_mask_arr[32]);
1731 
1732  LD_UB5(src, stride, src0, src1, src2, src3, src4);
1733  src += (5 * stride);
1734  LD_UB4(src, stride, src5, src6, src7, src8);
1735  src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1736  VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1737  VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1738  VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
1739  VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
1740  res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1741  DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1742  coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
1743  res_hz4);
1744  DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
1745  coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
1746  MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
1747  coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
1748  res_vt3);
1749  MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
1750  coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
1751  res_vt7);
1752  LD4(dst, stride, tp0, tp1, tp2, tp3);
1753  INSERT_D2_UB(tp0, tp1, dst0);
1754  INSERT_D2_UB(tp2, tp3, dst1);
1755  LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1756  INSERT_D2_UB(tp0, tp1, dst2);
1757  INSERT_D2_UB(tp2, tp3, dst3);
1758  res_vt0 += (res_hz0 * coeff_vt_vec1);
1759  res_vt1 += (res_hz1 * coeff_vt_vec1);
1760  res_vt2 += (res_hz2 * coeff_vt_vec1);
1761  res_vt3 += (res_hz3 * coeff_vt_vec1);
1762  res_vt4 += (res_hz4 * coeff_vt_vec1);
1763  res_vt5 += (res_hz5 * coeff_vt_vec1);
1764  res_vt6 += (res_hz6 * coeff_vt_vec1);
1765  res_vt7 += (res_hz7 * coeff_vt_vec1);
1766  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1767  SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
1768  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1769  SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
1770  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1771  PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
1772  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1773  AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1774  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1775 }
1776 
1777 static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1778  int32_t stride,
1779  uint32_t coef_hor0,
1780  uint32_t coef_hor1,
1781  uint32_t coef_ver0,
1782  uint32_t coef_ver1,
1783  int32_t height)
1784 {
1785  if (4 == height) {
1786  avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
1787  coef_hor1, coef_ver0, coef_ver1);
1788  } else if (8 == height) {
1789  avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
1790  coef_hor1, coef_ver0, coef_ver1);
1791  }
1792 }
1793 
1794 static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1795  int32_t height)
1796 {
1797  uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1798 
1799  if (8 == height) {
1800  LW4(src, stride, tp0, tp1, tp2, tp3);
1801  src += 4 * stride;
1802  LW4(src, stride, tp4, tp5, tp6, tp7);
1803  SW4(tp0, tp1, tp2, tp3, dst, stride);
1804  dst += 4 * stride;
1805  SW4(tp4, tp5, tp6, tp7, dst, stride);
1806  } else if (4 == height) {
1807  LW4(src, stride, tp0, tp1, tp2, tp3);
1808  SW4(tp0, tp1, tp2, tp3, dst, stride);
1809  } else if (2 == height) {
1810  LW2(src, stride, tp0, tp1);
1811  SW(tp0, dst);
1812  dst += stride;
1813  SW(tp1, dst);
1814  }
1815 }
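/* The x == 0 && y == 0 case needs no filtering, so the copy_width helpers
 * simply move the block: 32-bit loads/stores for the 4-wide case above,
 * 64-bit ones for the 8-wide case below. */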
1816 
1817 static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1818  int32_t height)
1819 {
1820  uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
1821 
1822  if (8 == height) {
1823  LD4(src, stride, src0, src1, src2, src3);
1824  src += 4 * stride;
1825  LD4(src, stride, src4, src5, src6, src7);
1826  SD4(src0, src1, src2, src3, dst, stride);
1827  dst += 4 * stride;
1828  SD4(src4, src5, src6, src7, dst, stride);
1829  } else if (4 == height) {
1830  LD4(src, stride, src0, src1, src2, src3);
1831  SD4(src0, src1, src2, src3, dst, stride);
1832  }
1833 }
1834 
1835 static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1836  int32_t height)
1837 {
1838  uint32_t tp0, tp1, tp2, tp3;
1839  v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
1840 
1841  if (8 == height) {
1842  LW4(src, stride, tp0, tp1, tp2, tp3);
1843  src += 4 * stride;
1844  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1845  LW4(src, stride, tp0, tp1, tp2, tp3);
1846  INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
1847  LW4(dst, stride, tp0, tp1, tp2, tp3);
1848  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1849  LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1850  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1851  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1852  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1853  } else if (4 == height) {
1854  LW4(src, stride, tp0, tp1, tp2, tp3);
1855  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1856  LW4(dst, stride, tp0, tp1, tp2, tp3);
1857  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1858  dst0 = __msa_aver_u_b(src0, dst0);
1859  ST_W4(dst0, 0, 1, 2, 3, dst, stride);
1860  } else if (2 == height) {
1861  LW2(src, stride, tp0, tp1);
1862  INSERT_W2_UB(tp0, tp1, src0);
1863  LW2(dst, stride, tp0, tp1);
1864  INSERT_W2_UB(tp0, tp1, dst0);
1865  dst0 = __msa_aver_u_b(src0, dst0);
1866  ST_W2(dst0, 0, 1, dst, stride);
1867  }
1868 }
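/* The avg_width helpers implement the unfiltered "avg" MC case: source and
 * destination pixels are combined with __msa_aver_u_b(), i.e. the rounded
 * average (s + d + 1) >> 1 applied bytewise. */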
1869 
1870 static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
1871  int32_t height)
1872 {
1873  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1874  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
1875  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1876 
1877  if (8 == height) {
1878  LD4(src, stride, tp0, tp1, tp2, tp3);
1879  src += 4 * stride;
1880  LD4(src, stride, tp4, tp5, tp6, tp7);
1881  INSERT_D2_UB(tp0, tp1, src0);
1882  INSERT_D2_UB(tp2, tp3, src1);
1883  INSERT_D2_UB(tp4, tp5, src2);
1884  INSERT_D2_UB(tp6, tp7, src3);
1885  LD4(dst, stride, tp0, tp1, tp2, tp3);
1886  LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
1887  INSERT_D2_UB(tp0, tp1, dst0);
1888  INSERT_D2_UB(tp2, tp3, dst1);
1889  INSERT_D2_UB(tp4, tp5, dst2);
1890  INSERT_D2_UB(tp6, tp7, dst3);
1891  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1892  dst2, dst3);
1893  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1894  } else if (4 == height) {
1895  LD4(src, stride, tp0, tp1, tp2, tp3);
1896  INSERT_D2_UB(tp0, tp1, src0);
1897  INSERT_D2_UB(tp2, tp3, src1);
1898  LD4(dst, stride, tp0, tp1, tp2, tp3);
1899  INSERT_D2_UB(tp0, tp1, dst0);
1900  INSERT_D2_UB(tp2, tp3, dst1);
1901  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1902  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1903  }
1904 }
1905 
1906 void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
1907  ptrdiff_t stride, int height, int x, int y)
1908 {
1909  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1910 
1911  if (x && y) {
1912  avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1913  } else if (x) {
1914  avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
1915  } else if (y) {
1916  avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
1917  } else {
1918  copy_width8_msa(src, dst, stride, height);
1919  }
1920 }
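/* x and y are the chroma fractional offsets in eighth-pel units (0..7), so
 * (x, 8 - x) and (y, 8 - y) are the horizontal and vertical bilinear weight
 * pairs. A zero offset in one direction reduces the 2-D filter to a 1-D one,
 * and a zero offset in both reduces it to a plain copy (or, in the avg
 * dispatchers below, to a rounded average with the destination). */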
1921 
1922 void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
1923  ptrdiff_t stride, int height, int x, int y)
1924 {
1925  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1926 
1927  if (x && y) {
1928  avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1929  } else if (x) {
1930  avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
1931  } else if (y) {
1932  avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
1933  } else {
1934  copy_width4_msa(src, dst, stride, height);
1935  }
1936 }
1937 
1938 void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
1939  ptrdiff_t stride, int height, int x, int y)
1940 {
1941  int32_t cnt;
1942 
1943  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1944 
1945  if (x && y) {
1946  avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1947  } else if (x) {
1948  avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
1949  } else if (y) {
1950  avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
1951  } else {
1952  for (cnt = height; cnt--;) {
1953  *((uint16_t *) dst) = *((uint16_t *) src);
1954 
1955  src += stride;
1956  dst += stride;
1957  }
1958  }
1959 }
1960 
1961 void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
1962  ptrdiff_t stride, int height, int x, int y)
1963 {
1964  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1965 
1966 
1967  if (x && y) {
1968  avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
1969  (8 - y), height);
1970  } else if (x) {
1971  avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
1972  } else if (y) {
1973  avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, (8 - y), height);
1974  } else {
1975  avg_width8_msa(src, dst, stride, height);
1976  }
1977 }
1978 
1979 void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
1980  ptrdiff_t stride, int height, int x, int y)
1981 {
1982  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1983 
1984  if (x && y) {
1985  avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
1986  (8 - y), height);
1987  } else if (x) {
1988  avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
1989  } else if (y) {
1990  avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, (8 - y), height);
1991  } else {
1992  avg_width4_msa(src, dst, stride, height);
1993  }
1994 }
1995 
1996 void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
1997  ptrdiff_t stride, int height, int x, int y)
1998 {
1999  int32_t cnt;
2000 
2001  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2002 
2003  if (x && y) {
2004  avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
2005  (8 - y), height);
2006  } else if (x) {
2007  avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
2008  } else if (y) {
2009  avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, (8 - y), height);
2010  } else {
2011  for (cnt = height; cnt--;) {
2012  dst[0] = (dst[0] + src[0] + 1) >> 1;
2013  dst[1] = (dst[1] + src[1] + 1) >> 1;
2014 
2015  src += stride;
2016  dst += stride;
2017  }
2018  }
2019 }
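
/* For illustration only: a minimal scalar model of the hv-plus-average path
 * that the MSA kernels above vectorize. This helper is hypothetical (it is
 * not part of FFmpeg) and assumes the same operand layout, coefficient
 * convention and rounding as the functions above. */
static void avc_chroma_hv_avg_ref(const uint8_t *src, uint8_t *dst,
                                  ptrdiff_t stride, int height, int width,
                                  int x, int y)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            int a = src[j],          b = src[j + 1];
            int c = src[j + stride], d = src[j + stride + 1];
            /* H.264 chroma bilinear interpolation with "+ 32, >> 6" rounding */
            int p = ((8 - x) * (8 - y) * a + x * (8 - y) * b +
                     (8 - x) * y * c + x * y * d + 32) >> 6;
            /* rounded average with the existing destination pixel */
            dst[j] = (dst[j] + p + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}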