FFmpeg 4.2.1
vp3dsp_idct_msa.c
/*
 * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp3dsp_mips.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "libavutil/intreadwrite.h"
#include "libavcodec/rnd_avg.h"

static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
{
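    /* type == 1 is the "put" path (ff_vp3_idct_put_msa): dst is
     * overwritten and a +128 bias recenters the result on the unsigned
     * pixel range.  Any other type is the "add" path
     * (ff_vp3_idct_add_msa, called with 2): the IDCT result is summed
     * with the pixels already in dst. */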
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
    v16u8 sign_l;
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
    v4i32 sign_t;
    v16i8 zero = {0};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
    v4i32 cnst8w = {8, 8, 8, 8};
    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
    v4i32 cnst128w = {128, 128, 128, 128};
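    /* The cnst*w vectors are the IDCT cosine terms in 16.16 fixed point,
     * i.e. round(cos(k * pi / 16) * 65536) for k = 1..7:
     * 64277 = C1, 60547 = C2, 54491 = C3, 46341 = C4 (sqrt(2)/2),
     * 36410 = C5, 25080 = C6, 12785 = C7.  Every product below is
     * shifted right by 16 to drop the fractional part. */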

    /* Extended input data */
    LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
    sign = __msa_clti_s_h(r0, 0);
    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
    sign = __msa_clti_s_h(r1, 0);
    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
    sign = __msa_clti_s_h(r2, 0);
    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
    sign = __msa_clti_s_h(r3, 0);
    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
    sign = __msa_clti_s_h(r4, 0);
    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
    sign = __msa_clti_s_h(r5, 0);
    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
    sign = __msa_clti_s_h(r6, 0);
    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
    sign = __msa_clti_s_h(r7, 0);
    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
    r7_l = (v4i32) __msa_ilvl_h(sign, r7);

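    /* First pass of the two-pass 1-D IDCT: one eight-point butterfly
     * applied lanewise across the eight coefficient vectors, split into
     * *_r (lanes 0-3) and *_l (lanes 4-7) halves.  With xN denoting the
     * eight inputs and CN the constants above, the butterfly is roughly:
     *     A = C1*x1 + C7*x7        B = C7*x1 - C1*x7
     *     C = C3*x3 + C5*x5        D = C3*x5 - C5*x3
     *     E = C4*(x0 + x4)         F = C4*(x0 - x4)
     *     G = C2*x2 + C6*x6        H = C6*x2 - C2*x6
     * followed by the add/subtract stage that writes the eight outputs. */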
    /* Right part */
    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_r + r4_r) * cnst46341w) >> 16;
    F = ((r0_r - r4_r) * cnst46341w) >> 16;
    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    r0_r = Gd + Cd;
    r7_r = Gd - Cd;
    r1_r = Add + Hd;
    r2_r = Add - Hd;
    r3_r = Ed + Dd;
    r4_r = Ed - Dd;
    r5_r = Fd + Bdd;
    r6_r = Fd - Bdd;

    /* Left part */
    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_l + r4_l) * cnst46341w) >> 16;
    F = ((r0_l - r4_l) * cnst46341w) >> 16;
    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    r0_l = Gd + Cd;
    r7_l = Gd - Cd;
    r1_l = Add + Hd;
    r2_l = Add - Hd;
    r3_l = Ed + Dd;
    r4_l = Ed - Dd;
    r5_l = Fd + Bdd;
    r6_l = Fd - Bdd;

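    /* Second pass: the 4x4 quadrants are transposed so the same
     * butterfly can run along the other dimension; after the transpose
     * the *_l vectors supply x4..x7 (e.g. r3_l is x7 for this group).
     * This pass also folds in rounding, the optional +128 bias, the
     * final >> 4, clipping, and a DC-only shortcut per lane. */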
    /* Row 0 to 3 */
    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
                       r0_r, r1_r, r2_r, r3_r);
    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
                       r0_l, r1_l, r2_l, r3_l);
    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_r + r0_l) * cnst46341w) >> 16;
    E += cnst8w;
    F = ((r0_r - r0_l) * cnst46341w) >> 16;
    F += cnst8w;
    if (type == 1) { // HACK
        E += cnst2048w;
        F += cnst2048w;
    }
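    /* For the "put" path the +2048 here becomes +128 after the final
     * >> 4, recentering the signed IDCT output on the unsigned pixel
     * range; the cnst8w added above is the rounding term for that
     * shift. */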
    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    A = (Gd + Cd) >> 4;
    B = (Gd - Cd) >> 4;
    C = (Add + Hd) >> 4;
    D = (Add - Hd) >> 4;
    E = (Ed + Dd) >> 4;
    F = (Ed - Dd) >> 4;
    G = (Fd + Bdd) >> 4;
    H = (Fd - Bdd) >> 4;
    if (type != 1) {
        LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
                   f0, f1, f2, f3);
        ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
                   f4, f5, f6, f7);
        ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
                   c0, c1, c2, c3);
        ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                   c4, c5, c6, c7);
        A += c0;
        B += c7;
        C += c1;
        D += c2;
        E += c3;
        F += c4;
        G += c5;
        H += c6;
    }
    A = CLIP_SW_0_255(A);
    B = CLIP_SW_0_255(B);
    C = CLIP_SW_0_255(C);
    D = CLIP_SW_0_255(D);
    E = CLIP_SW_0_255(E);
    F = CLIP_SW_0_255(F);
    G = CLIP_SW_0_255(G);
    H = CLIP_SW_0_255(H);
    sign_l = __msa_or_v((v16u8) r1_r, (v16u8) r2_r);
    sign_l = __msa_or_v(sign_l, (v16u8) r3_r);
    sign_l = __msa_or_v(sign_l, (v16u8) r0_l);
    sign_l = __msa_or_v(sign_l, (v16u8) r1_l);
    sign_l = __msa_or_v(sign_l, (v16u8) r2_l);
    sign_l = __msa_or_v(sign_l, (v16u8) r3_l);
    sign_t = __msa_ceqi_w((v4i32) sign_l, 0);
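    /* sign_t lanes are all-ones where every AC input of this group
     * (x1..x7; only r0_r, the x0/DC vector, is left out of the ORs
     * above) is zero.  Those lanes take the cheaper DC-only value
     * computed below: x0 already carries dc*C4 from the first pass, so
     * one more C4 multiply with rounding and a combined >> 20 (the
     * fixed-point >> 16 plus the final >> 4) matches the full
     * butterfly output. */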
    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
    if (type == 1) {
        Bdd = Add + cnst128w;
        Bdd = CLIP_SW_0_255(Bdd);
        Ad = Bdd;
        Bd = Bdd;
        Cd = Bdd;
        Dd = Bdd;
        Ed = Bdd;
        Fd = Bdd;
        Gd = Bdd;
        Hd = Bdd;
    } else {
        Ad = Add + c0;
        Bd = Add + c1;
        Cd = Add + c2;
        Dd = Add + c3;
        Ed = Add + c4;
        Fd = Add + c5;
        Gd = Add + c6;
        Hd = Add + c7;
        Ad = CLIP_SW_0_255(Ad);
        Bd = CLIP_SW_0_255(Bd);
        Cd = CLIP_SW_0_255(Cd);
        Dd = CLIP_SW_0_255(Dd);
        Ed = CLIP_SW_0_255(Ed);
        Fd = CLIP_SW_0_255(Fd);
        Gd = CLIP_SW_0_255(Gd);
        Hd = CLIP_SW_0_255(Hd);
    }
    Ad = (v4i32) __msa_and_v((v16u8) Ad, (v16u8) sign_t);
    Bd = (v4i32) __msa_and_v((v16u8) Bd, (v16u8) sign_t);
    Cd = (v4i32) __msa_and_v((v16u8) Cd, (v16u8) sign_t);
    Dd = (v4i32) __msa_and_v((v16u8) Dd, (v16u8) sign_t);
    Ed = (v4i32) __msa_and_v((v16u8) Ed, (v16u8) sign_t);
    Fd = (v4i32) __msa_and_v((v16u8) Fd, (v16u8) sign_t);
    Gd = (v4i32) __msa_and_v((v16u8) Gd, (v16u8) sign_t);
    Hd = (v4i32) __msa_and_v((v16u8) Hd, (v16u8) sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32) __msa_and_v((v16u8) A, (v16u8) sign_t);
    B = (v4i32) __msa_and_v((v16u8) B, (v16u8) sign_t);
    C = (v4i32) __msa_and_v((v16u8) C, (v16u8) sign_t);
    D = (v4i32) __msa_and_v((v16u8) D, (v16u8) sign_t);
    E = (v4i32) __msa_and_v((v16u8) E, (v16u8) sign_t);
    F = (v4i32) __msa_and_v((v16u8) F, (v16u8) sign_t);
    G = (v4i32) __msa_and_v((v16u8) G, (v16u8) sign_t);
    H = (v4i32) __msa_and_v((v16u8) H, (v16u8) sign_t);
    r0_r = Ad + A;
    r1_r = Bd + C;
    r2_r = Cd + D;
    r3_r = Dd + E;
    r0_l = Ed + F;
    r1_l = Fd + G;
    r2_l = Gd + H;
    r3_l = Hd + B;

    /* Row 4 to 7 */
    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
                       r4_r, r5_r, r6_r, r7_r);
    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
                       r4_l, r5_l, r6_l, r7_l);
    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r4_r + r4_l) * cnst46341w) >> 16;
    E += cnst8w;
    F = ((r4_r - r4_l) * cnst46341w) >> 16;
    F += cnst8w;
    if (type == 1) { // HACK
        E += cnst2048w;
        F += cnst2048w;
    }
    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    A = (Gd + Cd) >> 4;
    B = (Gd - Cd) >> 4;
    C = (Add + Hd) >> 4;
    D = (Add - Hd) >> 4;
    E = (Ed + Dd) >> 4;
    F = (Ed - Dd) >> 4;
    G = (Fd + Bdd) >> 4;
    H = (Fd - Bdd) >> 4;
    if (type != 1) {
        ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
                   c0, c1, c2, c3);
        ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                   c4, c5, c6, c7);
        A += c0;
        B += c7;
        C += c1;
        D += c2;
        E += c3;
        F += c4;
        G += c5;
        H += c6;
    }
    A = CLIP_SW_0_255(A);
    B = CLIP_SW_0_255(B);
    C = CLIP_SW_0_255(C);
    D = CLIP_SW_0_255(D);
    E = CLIP_SW_0_255(E);
    F = CLIP_SW_0_255(F);
    G = CLIP_SW_0_255(G);
    H = CLIP_SW_0_255(H);
    sign_l = __msa_or_v((v16u8) r5_r, (v16u8) r6_r);
    sign_l = __msa_or_v(sign_l, (v16u8) r7_r);
    sign_l = __msa_or_v(sign_l, (v16u8) r4_l);
    sign_l = __msa_or_v(sign_l, (v16u8) r5_l);
    sign_l = __msa_or_v(sign_l, (v16u8) r6_l);
    sign_l = __msa_or_v(sign_l, (v16u8) r7_l);
    sign_t = __msa_ceqi_w((v4i32) sign_l, 0);
    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
    if (type == 1) {
        Bdd = Add + cnst128w;
        Bdd = CLIP_SW_0_255(Bdd);
        Ad = Bdd;
        Bd = Bdd;
        Cd = Bdd;
        Dd = Bdd;
        Ed = Bdd;
        Fd = Bdd;
        Gd = Bdd;
        Hd = Bdd;
    } else {
        Ad = Add + c0;
        Bd = Add + c1;
        Cd = Add + c2;
        Dd = Add + c3;
        Ed = Add + c4;
        Fd = Add + c5;
        Gd = Add + c6;
        Hd = Add + c7;
        Ad = CLIP_SW_0_255(Ad);
        Bd = CLIP_SW_0_255(Bd);
        Cd = CLIP_SW_0_255(Cd);
        Dd = CLIP_SW_0_255(Dd);
        Ed = CLIP_SW_0_255(Ed);
        Fd = CLIP_SW_0_255(Fd);
        Gd = CLIP_SW_0_255(Gd);
        Hd = CLIP_SW_0_255(Hd);
    }
    Ad = (v4i32) __msa_and_v((v16u8) Ad, (v16u8) sign_t);
    Bd = (v4i32) __msa_and_v((v16u8) Bd, (v16u8) sign_t);
    Cd = (v4i32) __msa_and_v((v16u8) Cd, (v16u8) sign_t);
    Dd = (v4i32) __msa_and_v((v16u8) Dd, (v16u8) sign_t);
    Ed = (v4i32) __msa_and_v((v16u8) Ed, (v16u8) sign_t);
    Fd = (v4i32) __msa_and_v((v16u8) Fd, (v16u8) sign_t);
    Gd = (v4i32) __msa_and_v((v16u8) Gd, (v16u8) sign_t);
    Hd = (v4i32) __msa_and_v((v16u8) Hd, (v16u8) sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32) __msa_and_v((v16u8) A, (v16u8) sign_t);
    B = (v4i32) __msa_and_v((v16u8) B, (v16u8) sign_t);
    C = (v4i32) __msa_and_v((v16u8) C, (v16u8) sign_t);
    D = (v4i32) __msa_and_v((v16u8) D, (v16u8) sign_t);
    E = (v4i32) __msa_and_v((v16u8) E, (v16u8) sign_t);
    F = (v4i32) __msa_and_v((v16u8) F, (v16u8) sign_t);
    G = (v4i32) __msa_and_v((v16u8) G, (v16u8) sign_t);
    H = (v4i32) __msa_and_v((v16u8) H, (v16u8) sign_t);
    r4_r = Ad + A;
    r5_r = Bd + C;
    r6_r = Cd + D;
    r7_r = Dd + E;
    r4_l = Ed + F;
    r5_l = Fd + G;
    r6_l = Gd + H;
    r7_l = Hd + B;
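    /* Pack the results to bytes: mask selects the low byte of each
     * 32-bit lane, pairing the rows 0-3 group with the rows 4-7 group
     * so that each d* vector holds the eight pixels of one dst row. */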
    VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
    VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
    VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
    VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);

    /* The final sequence of stores overwrites the original dst */
    ST_D1(d0, 0, dst);
    ST_D1(d1, 0, dst + stride);
    ST_D1(d2, 0, dst + 2 * stride);
    ST_D1(d3, 0, dst + 3 * stride);
    ST_D1(d4, 0, dst + 4 * stride);
    ST_D1(d5, 0, dst + 5 * stride);
    ST_D1(d6, 0, dst + 6 * stride);
    ST_D1(d7, 0, dst + 7 * stride);
}

void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 1);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 2);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
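    /* DC-only IDCT: the two C4 multiplies and the final >> 4 of the
     * full transform collapse to a single >> 5, so every pixel gains
     * (block[0] + 15) >> 5, the same rounding as the C reference. */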
    int i = (block[0] + 15) >> 5;
    v4i32 dc = {i, i, i, i};
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    v16i8 zero = {0};

    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
               c4, c5, c6, c7);
    /* Right part */
    ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
               e0, e1, e2, e3);
    ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
               e4, e5, e6, e7);
    e0 += dc;
    e1 += dc;
    e2 += dc;
    e3 += dc;
    e4 += dc;
    e5 += dc;
    e6 += dc;
    e7 += dc;
    e0 = CLIP_SW_0_255(e0);
    e1 = CLIP_SW_0_255(e1);
    e2 = CLIP_SW_0_255(e2);
    e3 = CLIP_SW_0_255(e3);
    e4 = CLIP_SW_0_255(e4);
    e5 = CLIP_SW_0_255(e5);
    e6 = CLIP_SW_0_255(e6);
    e7 = CLIP_SW_0_255(e7);

    /* Left part */
    ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
               r0, r1, r2, r3);
    ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
               r4, r5, r6, r7);
    r0 += dc;
    r1 += dc;
    r2 += dc;
    r3 += dc;
    r4 += dc;
    r5 += dc;
    r6 += dc;
    r7 += dc;
    r0 = CLIP_SW_0_255(r0);
    r1 = CLIP_SW_0_255(r1);
    r2 = CLIP_SW_0_255(r2);
    r3 = CLIP_SW_0_255(r3);
    r4 = CLIP_SW_0_255(r4);
    r5 = CLIP_SW_0_255(r5);
    r6 = CLIP_SW_0_255(r6);
    r7 = CLIP_SW_0_255(r7);
    VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
    VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
    VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
    VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);

    /* The final sequence of stores overwrites the original dest */
    ST_D1(d0, 0, dest);
    ST_D1(d1, 0, dest + line_size);
    ST_D1(d2, 0, dest + 2 * line_size);
    ST_D1(d3, 0, dest + 3 * line_size);
    ST_D1(d4, 0, dest + 4 * line_size);
    ST_D1(d5, 0, dest + 5 * line_size);
    ST_D1(d6, 0, dest + 6 * line_size);
    ST_D1(d7, 0, dest + 7 * line_size);

    block[0] = 0;
}

void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    int nstride = -stride;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 zero = {0};
    v16i8 d0, d1, d2, d3;
    v8i16 c0, c1, c2, c3;
    v8i16 r0;
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int16_t temp_16[8];
    int temp_32[8];

    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
    r0 += cnst4h;
    r0 = r0 >> 3;
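    /* Scalar equivalent of the filter value computed above, with p
     * pointing at first_pixel:
     *     f = (p[-2 * stride] - p[stride]) + 3 * (p[0] - p[-stride]);
     *     f = bounding_values[(f + 4) >> 3];
     * f is then added to p[-stride] and subtracted from p[0], with
     * unsigned-byte clipping. */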
    /* Get filter_value from bounding_values one by one */
    ST_SH(r0, temp_16);
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 += e0;
    f1 += e1;
    g0 -= e0;
    g1 -= e1;
    f0 = CLIP_SW_0_255(f0);
    f1 = CLIP_SW_0_255(f1);
    g0 = CLIP_SW_0_255(g0);
    g1 = CLIP_SW_0_255(g1);
    VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);

    /* Final move to first_pixel */
    ST_D1(d1, 0, first_pixel + nstride);
    ST_D1(d2, 0, first_pixel);
}

void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
    v8i16 r0;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 zero = {0};
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int16_t temp_16[8];
    int temp_32[8];

    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
               c4, c5, c6, c7);
    TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
                       c0, c1, c2, c3, c4, c5, c6, c7);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
    r0 += cnst4h;
    r0 = r0 >> 3;
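    /* Same filter as the vertical case, but across the vertical edge:
     * the 8x8 transpose above turns the two pixel columns on each side
     * of the edge into the vectors c0..c3. */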

    /* Get filter_value from bounding_values one by one */
    ST_SH(r0, temp_16);
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 += e0;
    f1 += e1;
    g0 -= e0;
    g1 -= e1;
    f0 = CLIP_SW_0_255(f0);
    f1 = CLIP_SW_0_255(f1);
    g0 = CLIP_SW_0_255(g0);
    g1 = CLIP_SW_0_255(g1);
    VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
    /* Final move to first_pixel */
    ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
    ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
}

void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
    if (h == 8) {
        v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
        v16i8 c0, c1, c2, c3;
        v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
        v4i32 e0, e1, e2;
        v4i32 f0, f1, f2;
        v4u32 t0, t1, t2, t3;
        v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
        int32_t value = 0xfefefefe;
        v4i32 fmask = {value, value, value, value};
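        /* Per-byte non-rounding average, the vector form of
         * no_rnd_avg32(): avg = (a & b) + (((a ^ b) & 0xfefefefe) >> 1).
         * The 0xfefefefe mask clears each byte's lsb before the shift
         * so no carry leaks into the neighbouring byte. */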
589 
590  LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
591  VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
592  VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
593  a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
594  a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
595  a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
596  a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
597 
598  LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
599  VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
600  VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
601  b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
602  b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
603  b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
604  b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
605 
606  e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
607  e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
608  t0 = ((v4u32)e0) >> 1;
609  e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);
610  t0 = t0 + (v4u32)e2;
611 
612  e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
613  e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
614  t1 = ((v4u32)e1) >> 1;
615  e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);
616  t1 = t1 + (v4u32)e2;
617 
618  f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
619  f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
620  t2 = ((v4u32)f0) >> 1;
621  f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);
622  t2 = t2 + (v4u32)f2;
623 
624  f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
625  f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
626  t3 = ((v4u32)f1) >> 1;
627  f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
628  t3 = t3 + (v4u32)f2;
629 
630  ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
631  ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
632  } else {
633  int i;
634 
635  for (i = 0; i < h; i++) {
636  uint32_t a, b;
637 
638  a = AV_RN32(&src1[i * stride]);
639  b = AV_RN32(&src2[i * stride]);
640  AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
641  a = AV_RN32(&src1[i * stride + 4]);
642  b = AV_RN32(&src2[i * stride + 4]);
643  AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
644  }
645  }
646 }