FFmpeg 4.2.3
hevcpred_msa.c
/*
 * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/hevcdec.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "hevcpred_mips.h"

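/* HEVC angle parameters (intraPredAngle): intra_pred_angle_up covers the
 * mostly-vertical modes 18..34, intra_pred_angle_low the mostly-horizontal
 * modes 2..17. */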
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};

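/* Compute two rows of a 16-wide planar prediction: blend the unpacked top
 * row (src0_r/src0_l) and the per-row splatted left samples (vec0/vec1)
 * with the top-right (tmp0) and bottom-left (tmp1) reference samples,
 * round by 'round' bits and pack the two rows into res0/res1. */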
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
                              res0, res1, mul_val_b0, mul_val_b1, round)      \
{                                                                    \
    v8i16 res0_m, res1_m, res2_m, res3_m;                            \
                                                                     \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,       \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);          \
                                                                     \
    res0_m += mul_val_h1 * tmp0;                                     \
    res1_m += mul_val_h3 * tmp0;                                     \
    res2_m += mul_val_h1 * tmp0;                                     \
    res3_m += mul_val_h3 * tmp0;                                     \
                                                                     \
    res0_m += mul_val_b0 * src0_r;                                   \
    res1_m += mul_val_b0 * src0_l;                                   \
    res2_m += (mul_val_b0 - 1) * src0_r;                             \
    res3_m += (mul_val_b0 - 1) * src0_l;                             \
                                                                     \
    res0_m += mul_val_b1 * tmp1;                                     \
    res1_m += mul_val_b1 * tmp1;                                     \
    res2_m += (mul_val_b1 + 1) * tmp1;                               \
    res3_m += (mul_val_b1 + 1) * tmp1;                               \
                                                                     \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);              \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);         \
}

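/* Vertical prediction (mode 26): replicate the top reference row. For luma
 * blocks (flag == 0, sizes up to 16x16) the left edge column is filtered as
 * clip(top[0] + ((left[y] - left[-1]) >> 1)). */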
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint32_t col;
    uint32_t src_data;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data = LW(src_top);
    SW4(src_data, src_data, src_data, src_data, dst, stride);

    if (0 == flag) {
        src_data = LW(src_left);

        vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        vec2 = CLIP_SH_0_255(vec2);

        for (col = 0; col < 4; col++) {
            dst[stride * col] = (uint8_t) vec2[col];
        }
    }
}

static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint16_t val0, val1, val2, val3;
    uint64_t src_data1;
    v8i16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_data1 = LD(src_top);

    for (row = 8; row--;) {
        SD(src_data1, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src_data1 = LD(src_left);

        vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
        vec2 -= vec0;
        vec2 >>= 1;
        vec2 += vec1;
        vec2 = CLIP_SH_0_255(vec2);

        val0 = vec2[0];
        val1 = vec2[1];
        val2 = vec2[2];
        val3 = vec2[3];

        dst[0] = val0;
        dst[stride] = val1;
        dst[2 * stride] = val2;
        dst[3 * stride] = val3;

        val0 = vec2[4];
        val1 = vec2[5];
        val2 = vec2[6];
        val3 = vec2[7];

        dst[4 * stride] = val0;
        dst[5 * stride] = val1;
        dst[6 * stride] = val2;
        dst[7 * stride] = val3;
    }
}

static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}

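/* Horizontal prediction (mode 10): replicate the left reference column. For
 * luma blocks (flag == 0) the top row is filtered as
 * clip(left[0] + ((top[x] - top[-1]) >> 1)). */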
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint32_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x01010101;
    val1 = src_left[1] * 0x01010101;
    val2 = src_left[2] * 0x01010101;
    val3 = src_left[3] * 0x01010101;
    SW4(val0, val1, val2, val3, dst, stride);

    if (0 == flag) {
        val0 = LW(src_top);
        src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        src0_r = CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_w((v4i32) src0, 0);
        SW(val0, dst);
    }
}

static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          int32_t flag)
{
    uint64_t val0, val1, val2, val3;
    v16i8 src0;
    v8i16 src0_r, src_top_val, src_left_val;
    v16i8 zero = { 0 };

    val0 = src_left[0] * 0x0101010101010101;
    val1 = src_left[1] * 0x0101010101010101;
    val2 = src_left[2] * 0x0101010101010101;
    val3 = src_left[3] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst, stride);

    val0 = src_left[4] * 0x0101010101010101;
    val1 = src_left[5] * 0x0101010101010101;
    val2 = src_left[6] * 0x0101010101010101;
    val3 = src_left[7] * 0x0101010101010101;
    SD4(val0, val1, val2, val3, dst + 4 * stride, stride);

    if (0 == flag) {
        val0 = LD(src_top);
        src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
        src_top_val = __msa_fill_h(src_top[-1]);
        src_left_val = __msa_fill_h(src_left[0]);

        src0_r = (v8i16) __msa_ilvr_b(zero, src0);

        src0_r -= src_top_val;
        src0_r >>= 1;
        src0_r += src_left_val;
        src0_r = CLIP_SH_0_255(src0_r);
        src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
        val0 = __msa_copy_s_d((v2i64) src0, 0);
        SD(val0, dst);
    }
}

static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride,
                                            int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;
    v8i16 src0_r, src0_l, src_left_val, src_top_val;

    src_left_val = __msa_fill_h(src_left[0]);

    for (row = 4; row--;) {
        inp0 = src_left[0];
        inp1 = src_left[1];
        inp2 = src_left[2];
        inp3 = src_left[3];
        src_left += 4;

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
        tmp_dst += (4 * stride);
    }

    if (0 == flag) {
        src0 = LD_SB(src_top);
        src_top_val = __msa_fill_h(src_top[-1]);

        UNPCK_UB_SH(src0, src0_r, src0_l);
        SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);

        src0_r >>= 1;
        src0_l >>= 1;

        ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
        CLIP_SH2_0_255(src0_r, src0_l);
        src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
        ST_SB(src0, dst);
    }
}

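/* 32x32 horizontal prediction; no edge filtering is applied at this block
 * size. */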
static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint32_t row;
    uint8_t inp0, inp1, inp2, inp3;
    v16i8 src0, src1, src2, src3;

    for (row = 0; row < 8; row++) {
        inp0 = src_left[row * 4];
        inp1 = src_left[row * 4 + 1];
        inp2 = src_left[row * 4 + 2];
        inp3 = src_left[row * 4 + 3];

        src0 = __msa_fill_b(inp0);
        src1 = __msa_fill_b(inp1);
        src2 = __msa_fill_b(inp2);
        src3 = __msa_fill_b(inp3);

        ST_SB2(src0, src0, dst, 16);
        dst += stride;
        ST_SB2(src1, src1, dst, 16);
        dst += stride;
        ST_SB2(src2, src2, dst, 16);
        dst += stride;
        ST_SB2(src3, src3, dst, 16);
        dst += stride;
    }
}

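/* DC prediction: average all top and left reference samples. For luma
 * (flag == 0) the first row and column are smoothed towards the reference
 * samples: dst[0][0] = (left[0] + 2 * dc + top[0] + 2) >> 2, and
 * (ref + 3 * dc + 2) >> 2 along the edges. */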
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t addition = 0;
    uint32_t val0, val1, val2;
    v16i8 src = { 0 };
    v16u8 store;
    v16i8 zero = { 0 };
    v8u16 sum, vec0, vec1;

    val0 = LW(src_top);
    val1 = LW(src_left);
    INSERT_W2_SB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_w((v4i32) store, 0);
    SW4(val0, val0, val0, val0, dst, stride);

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;

        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
        val0 = __msa_copy_u_w((v4i32) store, 0);
        SW(val0, tmp_dst);

        val0 = src_left[1];
        val1 = src_left[2];
        val2 = src_left[3];

        addition *= 3;

        ADD2(val0, addition, val1, addition, val0, val1);
        val2 += addition;

        val0 += 2;
        val1 += 2;
        val2 += 2;
        val0 >>= 2;
        val1 >>= 2;
        val2 >>= 2;

        tmp_dst[stride * 1] = val0;
        tmp_dst[stride * 2] = val1;
        tmp_dst[stride * 3] = val2;
    }
}

static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;
        vec0 += vec0;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}

static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        vec0 += vec0;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}

static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride)
{
    uint32_t row;
    v16u8 src_above1, src_above2, store, src_left1, src_left2;
    v8u16 sum_above1, sum_above2;
    v8u16 sum_left1, sum_left2;
    v8u16 sum, sum_above, sum_left;

    LD_UB2(src_top, 16, src_above1, src_above2);
    LD_UB2(src_left, 16, src_left1, src_left2);
    HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
    HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
    sum_above = sum_above1 + sum_above2;
    sum_left = sum_left1 + sum_left2;
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
    store = (v16u8) __msa_splati_b((v16i8) sum, 0);

    for (row = 16; row--;) {
        ST_UB2(store, store, dst, 16);
        dst += stride;
        ST_UB2(store, store, dst, 16);
        dst += stride;
    }
}

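/* Planar prediction: each pixel is a bilinear blend of the top row with the
 * bottom-left sample and of the left column with the top-right sample,
 * rounded by log2(size) + 1 bits. */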
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);
    tmp1 = __msa_fill_h(src_left[4]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);
    tmp1 = __msa_fill_h(src_left[8]);

    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}

static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);
    tmp1 = __msa_fill_h(src_left[16]);

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}

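/* Helpers for 32x32 planar prediction: each computes one 16x16 quadrant.
 * 'offset' shifts the horizontal weights for the right-hand half. */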
static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[32]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}

static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
    dst += (16 * stride);
    src_left += 16;

    process_intra_lower_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_lower_16x16_msa((src_top + 16), src_left,
                                  (dst + 16), stride, 16);
}

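/* Angular prediction for the mostly-vertical modes 18..34. For each row the
 * reference position advances by 'angle' 1/32-pel steps: idx = pos >> 5
 * selects two adjacent reference samples and fact = pos & 31 weights them.
 * For negative angles, left samples are first projected onto an extension
 * of the top reference row using the inverse angle. */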
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, offset;
    uint64_t tmp0;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0;
    v16i8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 3;
    angle_loop = angle;

    ref = src_top - 1;
    if (angle < 0 && last < -1) {
        inv_angle_val = inv_angle[mode - 18];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}

static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last, offset;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t tmp0, tmp1, tmp2;
    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);

        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t tmp0;
    int32_t angle, angle_loop, offset;
    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle >> 1;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;

        top0 = LD_UB(ref);
        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    for (v_cnt = 4; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t last, offset;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    last = angle;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);
        tmp0 = ref[32];
        tmp1 = ref[33];
        tmp2 = ref[34];
        tmp3 = ref[35];

        ST_UB2(top0, top1, ref_tmp, 16);
        ref_tmp[32] = tmp0;
        ref_tmp[33] = tmp1;
        ref_tmp[34] = tmp2;
        ref_tmp[35] = tmp3;

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }

        ref = ref_tmp;
    }

    for (v_cnt = 16; v_cnt--;) {
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B2_UB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_UB(top5, top7, top4, top6, top5, top7, 1);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_SB2(dst2, dst3, dst, 16);
        dst += stride;
    }
}

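/* Angular prediction for the mostly-horizontal modes 2..17: the same
 * 1/32-pel interpolation, but walking the left reference array and
 * transposing the interpolated columns into the destination block. */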
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last, offset;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    uint64_t tmp0;
    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;
    v16u8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = angle >> 3;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];
        }

        ref = ref_tmp;
    }

    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_0_SH(diff0, diff2, diff4, diff6, diff1, diff3, diff5, diff7, 2);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    SRARI_H2_SH(diff1, diff3, 5);
    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);

    ST_W2(dst_val0, 0, 1, dst, stride);
    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}

static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last, offset, tmp0, tmp1, tmp2;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        SLDI_B2_SH(diff1, diff3, diff0, diff2, diff1, diff3, 2);
        SLDI_B2_SH(diff5, diff7, diff4, diff6, diff5, diff7, 2);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = (angle) >> 1;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        top0 = LD_SB(ref);
        tmp0 = LW(ref + 16);
        ST_SB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 4; v_cnt++) {
        dst_org = dst;

        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        LD_SB2(ref + idx0 + 1, 16, top0, top1);
        LD_SB2(ref + idx1 + 1, 16, top2, top3);
        LD_SB2(ref + idx2 + 1, 16, top4, top5);
        LD_SB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
        ILVRL_H2_SH(diff1, diff0, diff4, diff5);
        ILVRL_H2_SH(diff3, diff2, diff6, diff7);
        ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst_org += (8 * stride);
        ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}

static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
    v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
    v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
    int32_t angle, angle_loop, inv_angle_val, offset;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 32;
    const uint8_t *ref, *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last;

    angle = intra_pred_angle_low[mode - 2];
    last = angle;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        inv_angle_val = inv_angle[mode - 11];

        LD_SB2(ref, 16, top0, top1);
        tmp0 = LW(ref + 32);
        ST_SB2(top0, top1, ref_tmp, 16);
        SW(tmp0, ref_tmp + 32);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    for (v_cnt = 0; v_cnt < 16; v_cnt++) {
        dst_org = dst;
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top4 = LD_SB(ref + idx1 + 1);
        top1 = LD_SB(ref + idx0 + 17);
        top5 = LD_SB(ref + idx1 + 17);
        top3 = LD_SB(ref + idx0 + 33);
        top7 = LD_SB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        top2 = top1;
        top6 = top5;

        SLDI_B2_SB(top1, top3, top0, top2, top1, top3, 1);
        SLDI_B2_SB(top5, top7, top4, top6, top5, top7, 1);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
        ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);

        ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);
        ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
        dst_org += (8 * stride);

        dst += 2;
    }
}

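/* Plain 32x32 vertical replication of the top row; no edge filter is
 * applied at this block size. */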
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride)
{
    uint32_t row;
    v16u8 src1, src2;

    src1 = LD_UB(src);
    src2 = LD_UB(src + 16);

    for (row = 32; row--;) {
        ST_UB2(src1, src2, dst, 16);
        dst += dst_stride;
    }
}

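/* Exported planar prediction entry points, one per log2 block size (2..5). */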
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}

void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}

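/* Exported DC entry point; dispatches on log2 block size. c_idx doubles as
 * the edge-filter flag: filtering is applied only for luma (c_idx == 0),
 * and never at 32x32. */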
1809 void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
1810  const uint8_t *src_left,
1811  ptrdiff_t stride, int log2, int c_idx)
1812 {
1813  switch (log2) {
1814  case 2:
1815  hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
1816  break;
1817 
1818  case 3:
1819  hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
1820  break;
1821 
1822  case 4:
1823  hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
1824  break;
1825 
1826  case 5:
1827  hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
1828  break;
1829  }
1830 }
1831 
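/* Angular dispatchers: mode 10 is pure horizontal and mode 26 pure vertical
 * (handled by dedicated kernels); modes 18..34 predict from the top
 * ("upper") reference and modes 2..17 from the left ("lower") reference. */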
1832 void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
1833  const uint8_t *src_top,
1834  const uint8_t *src_left,
1835  ptrdiff_t stride, int c_idx, int mode)
1836 {
1837  if (mode == 10) {
1838  hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
1839  } else if (mode == 26) {
1840  hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
1841  } else if (mode >= 18) {
1842  hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
1843  dst, stride, mode);
1844  } else {
1845  hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
1846  dst, stride, mode);
1847  }
1848 }
1849 
1850 void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
1851  const uint8_t *src_top,
1852  const uint8_t *src_left,
1853  ptrdiff_t stride, int c_idx, int mode)
1854 {
1855  if (mode == 10) {
1856  hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
1857  } else if (mode == 26) {
1858  hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
1859  } else if (mode >= 18) {
1860  hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
1861  dst, stride, mode);
1862  } else {
1863  hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
1864  dst, stride, mode);
1865  }
1866 }
1867 
1868 void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
1869  const uint8_t *src_top,
1870  const uint8_t *src_left,
1871  ptrdiff_t stride, int c_idx, int mode)
1872 {
1873  if (mode == 10) {
1874  hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
1875  } else if (mode == 26) {
1876  hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
1877  } else if (mode >= 18) {
1878  hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
1879  dst, stride, mode);
1880  } else {
1881  hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
1882  dst, stride, mode);
1883  }
1884 }
1885 
1886 void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
1887  const uint8_t *src_top,
1888  const uint8_t *src_left,
1889  ptrdiff_t stride, int c_idx, int mode)
1890 {
1891  if (mode == 10) {
1892  hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
1893  } else if (mode == 26) {
1894  intra_predict_vert_32x32_msa(src_top, dst, stride);
1895  } else if (mode >= 18) {
1896  hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
1897  dst, stride, mode);
1898  } else {
1899  hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
1900  dst, stride, mode);
1901  }
1902 }
1903 
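/* Reference construction plus prediction for one 16x16 block: an MSA
 * counterpart of the generic intra_pred() in hevcpred_template.c. It
 * gathers the left/top neighbour samples, performs availability
 * substitution and smoothing, then calls the size-specific kernel. */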
1904 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1905 {
1906  v16u8 vec0;
1907  HEVCLocalContext *lc = s->HEVClc;
1908  int i;
1909  int hshift = s->ps.sps->hshift[c_idx];
1910  int vshift = s->ps.sps->vshift[c_idx];
1911  int size_in_luma_h = 16 << hshift;
1912  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1913  int size_in_luma_v = 16 << vshift;
1914  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
1915  int x = x0 >> hshift;
1916  int y = y0 >> vshift;
1917  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1918  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1919 
1920  int cur_tb_addr =
1921  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
1922 
1923  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1924  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1925 
1926  int min_pu_width = s->ps.sps->min_pu_width;
1927 
1928  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1929  lc->tu.intra_pred_mode;
1930  uint32_t a;
1931  uint8_t left_array[2 * 32 + 1];
1932  uint8_t filtered_left_array[2 * 32 + 1];
1933  uint8_t top_array[2 * 32 + 1];
1934  uint8_t filtered_top_array[2 * 32 + 1];
1935 
1936  uint8_t *left = left_array + 1;
1937  uint8_t *top = top_array + 1;
1938  uint8_t *filtered_left = filtered_left_array + 1;
1939  uint8_t *filtered_top = filtered_top_array + 1;
1940  int cand_bottom_left = lc->na.cand_bottom_left
1941  && cur_tb_addr >
1942  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1943  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1944  int cand_left = lc->na.cand_left;
1945  int cand_up_left = lc->na.cand_up_left;
1946  int cand_up = lc->na.cand_up;
1947  int cand_up_right = lc->na.cand_up_right
1948  && cur_tb_addr >
1949  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1950  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
1951 
1952  int bottom_left_size =
1953  (((y0 + 2 * size_in_luma_v) >
1954  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1955  2 * size_in_luma_v)) -
1956  (y0 + size_in_luma_v)) >> vshift;
1957  int top_right_size =
1958  (((x0 + 2 * size_in_luma_h) >
1959  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1960  (x0 + size_in_luma_h)) >> hshift;
1961 
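/* With constrained intra prediction, neighbours reconstructed from inter
 * blocks must not be used as references: each availability flag is
 * re-derived by checking pred_flag of the covering PUs, and both reference
 * arrays are pre-filled with the mid-grey value 128 as a fallback. */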
1962  if (s->ps.pps->constrained_intra_pred_flag == 1) {
1963  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1964  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
1965  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1966  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1967  if (!size_in_luma_pu_h)
1968  size_in_luma_pu_h++;
1969  if (cand_bottom_left == 1 && on_pu_edge_x) {
1970  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1971  int y_bottom_pu =
1972  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1973  int max =
1974  ((size_in_luma_pu_v) >
1975  (s->ps.sps->min_pu_height -
1976  y_bottom_pu) ? (s->ps.sps->min_pu_height -
1977  y_bottom_pu) : (size_in_luma_pu_v));
1978  cand_bottom_left = 0;
1979  for (i = 0; i < max; i += 2)
1980  cand_bottom_left |=
1981  ((s->ref->tab_mvf[(x_left_pu) +
1982  (y_bottom_pu +
1983  i) * min_pu_width]).pred_flag ==
1984  PF_INTRA);
1985  }
1986  if (cand_left == 1 && on_pu_edge_x) {
1987  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1988  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1989  int max =
1990  ((size_in_luma_pu_v) >
1991  (s->ps.sps->min_pu_height -
1992  y_left_pu) ? (s->ps.sps->min_pu_height -
1993  y_left_pu) : (size_in_luma_pu_v));
1994  cand_left = 0;
1995  for (i = 0; i < max; i += 2)
1996  cand_left |=
1997  ((s->ref->tab_mvf[(x_left_pu) +
1998  (y_left_pu +
1999  i) * min_pu_width]).pred_flag ==
2000  PF_INTRA);
2001  }
2002  if (cand_up_left == 1) {
2003  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2004  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2005  cand_up_left =
2006  (s->ref->tab_mvf[(x_left_pu) +
2007  (y_top_pu) * min_pu_width]).pred_flag ==
2008  PF_INTRA;
2009  }
2010  if (cand_up == 1 && on_pu_edge_y) {
2011  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2012  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2013  int max =
2014  ((size_in_luma_pu_h) >
2015  (s->ps.sps->min_pu_width -
2016  x_top_pu) ? (s->ps.sps->min_pu_width -
2017  x_top_pu) : (size_in_luma_pu_h));
2018  cand_up = 0;
2019  for (i = 0; i < max; i += 2)
2020  cand_up |=
2021  ((s->ref->tab_mvf[(x_top_pu + i) +
2022  (y_top_pu) *
2023  min_pu_width]).pred_flag == PF_INTRA);
2024  }
2025  if (cand_up_right == 1 && on_pu_edge_y) {
2026  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2027  int x_right_pu =
2028  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2029  int max =
2030  ((size_in_luma_pu_h) >
2031  (s->ps.sps->min_pu_width -
2032  x_right_pu) ? (s->ps.sps->min_pu_width -
2033  x_right_pu) : (size_in_luma_pu_h));
2034  cand_up_right = 0;
2035  for (i = 0; i < max; i += 2)
2036  cand_up_right |=
2037  ((s->ref->tab_mvf[(x_right_pu + i) +
2038  (y_top_pu) *
2039  min_pu_width]).pred_flag == PF_INTRA);
2040  }
2041 
2042  vec0 = (v16u8) __msa_ldi_b(128);
2043 
2044  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2045 
2046  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2047 
2048  top[-1] = 128;
2049  }
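/* Gather the available neighbour samples from the reconstructed frame.
 * Missing tails of the top-right and bottom-left runs are padded by
 * replicating the last valid sample four bytes at a time; multiplying a
 * byte by 0x01010101 splats it across a 32-bit word. */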
2050  if (cand_up_left) {
2051  left[-1] = src[(-1) + stride * (-1)];
2052  top[-1] = left[-1];
2053  }
2054  if (cand_up) {
2055  vec0 = LD_UB(src - stride);
2056  ST_UB(vec0, top);
2057  }
2058  if (cand_up_right) {
2059  vec0 = LD_UB(src - stride + 16);
2060  ST_UB(vec0, (top + 16));
2061 
2062  do {
2063  uint32_t pix =
2064  ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2065  0x01010101U);
2066  for (i = 0; i < (16 - top_right_size); i += 4)
2067  ((((union unaligned_32 *) (top + 16 + top_right_size +
2068  i))->l) = (pix));
2069  } while (0);
2070  }
2071  if (cand_left)
2072  for (i = 0; i < 16; i++)
2073  left[i] = src[(-1) + stride * (i)];
2074  if (cand_bottom_left) {
2075  for (i = 16; i < 16 + bottom_left_size; i++)
2076  left[i] = src[(-1) + stride * (i)];
2077  do {
2078  uint32_t pix =
2079  ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2080  0x01010101U);
2081  for (i = 0; i < (16 - bottom_left_size); i += 4)
2082  ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2083  i))->l) = (pix));
2084  } while (0);
2085  }
2086 
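/* Second constrained-intra pass: substitute reference samples that came
 * from non-intra PUs. The scan runs from the bottom-left reference upwards
 * and then along the top row, so each run of invalid samples inherits the
 * nearest valid (intra-coded) sample. */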
2087  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2088  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2089  || cand_up_right) {
2090  int size_max_x =
2091  x0 + ((2 * 16) << hshift) <
2092  s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2093  int size_max_y =
2094  y0 + ((2 * 16) << vshift) <
2095  s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
2096  int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2097  if (!cand_up_right) {
2098  size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2099  16 : (s->ps.sps->width - x0) >> hshift;
2100  }
2101  if (!cand_bottom_left) {
2102  size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2103  16 : (s->ps.sps->height - y0) >> vshift;
2104  }
2105  if (cand_bottom_left || cand_left || cand_up_left) {
2106  while (j > -1
2107  &&
2108  !((s->ref->tab_mvf[(((x0 +
2109  ((-1) << hshift)) >> s->ps.sps->
2110  log2_min_pu_size)) + (((y0 +
2111  ((j) <<
2112  vshift))
2113  >> s->ps.sps->
2114  log2_min_pu_size))
2115  * min_pu_width]).pred_flag ==
2116  PF_INTRA))
2117  j--;
2118  if (!
2119  ((s->ref->tab_mvf[(((x0 +
2120  ((-1) << hshift)) >> s->ps.sps->
2121  log2_min_pu_size)) + (((y0 + ((j)
2122  <<
2123  vshift))
2124  >> s->ps.sps->
2125  log2_min_pu_size))
2126  * min_pu_width]).pred_flag == PF_INTRA)) {
2127  j = 0;
2128  while (j < size_max_x
2129  &&
2130  !((s->ref->tab_mvf[(((x0 +
2131  ((j) << hshift)) >> s->ps.sps->
2132  log2_min_pu_size)) + (((y0 +
2133  ((-1) <<
2134  vshift))
2135  >> s->
2136  ps.sps->
2137  log2_min_pu_size))
2138  * min_pu_width]).pred_flag ==
2139  PF_INTRA))
2140  j++;
2141  for (i = j; i > (j) - (j + 1); i--)
2142  if (!
2143  ((s->ref->tab_mvf[(((x0 +
2144  ((i -
2145  1) << hshift)) >> s->ps.sps->
2146  log2_min_pu_size)) + (((y0 +
2147  ((-1) <<
2148  vshift))
2149  >> s->
2150  ps.sps->
2151  log2_min_pu_size))
2152  * min_pu_width]).pred_flag ==
2153  PF_INTRA))
2154  top[i - 1] = top[i];
2155  left[-1] = top[-1];
2156  }
2157  } else {
2158  j = 0;
2159  while (j < size_max_x
2160  &&
2161  !((s->ref->tab_mvf[(((x0 +
2162  ((j) << hshift)) >> s->ps.sps->
2163  log2_min_pu_size)) + (((y0 + ((-1)
2164  <<
2165  vshift))
2166  >> s->ps.sps->
2167  log2_min_pu_size))
2168  * min_pu_width]).pred_flag ==
2169  PF_INTRA))
2170  j++;
2171  if (j > 0)
2172  if (x0 > 0) {
2173  for (i = j; i > (j) - (j + 1); i--)
2174  if (!
2175  ((s->ref->tab_mvf[(((x0 +
2176  ((i -
2177  1) << hshift)) >>
2178  s->ps.sps->log2_min_pu_size))
2179  + (((y0 + ((-1)
2180  << vshift))
2181  >>
2182  s->ps.sps->log2_min_pu_size))
2183  *
2184  min_pu_width]).pred_flag ==
2185  PF_INTRA))
2186  top[i - 1] = top[i];
2187  } else {
2188  for (i = j; i > (j) - (j); i--)
2189  if (!
2190  ((s->ref->tab_mvf[(((x0 +
2191  ((i -
2192  1) << hshift)) >>
2193  s->ps.sps->log2_min_pu_size))
2194  + (((y0 + ((-1)
2195  << vshift))
2196  >>
2197  s->ps.sps->log2_min_pu_size))
2198  *
2199  min_pu_width]).pred_flag ==
2200  PF_INTRA))
2201  top[i - 1] = top[i];
2202  top[-1] = top[0];
2203  }
2204  left[-1] = top[-1];
2205  }
2206  left[-1] = top[-1];
2207  if (cand_bottom_left || cand_left) {
2208  a = ((left[-1]) * 0x01010101U);
2209  for (i = 0; i < (0) + (size_max_y); i += 4)
2210  if (!
2211  ((s->ref->tab_mvf[(((x0 +
2212  ((-1) << hshift)) >> s->ps.sps->
2213  log2_min_pu_size)) + (((y0 +
2214  ((i) <<
2215  vshift))
2216  >> s->ps.sps->
2217  log2_min_pu_size))
2218  * min_pu_width]).pred_flag ==
2219  PF_INTRA))
2220  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2221  else
2222  a = ((left[i + 3]) * 0x01010101U);
2223  }
2224  if (!cand_left) {
2225  vec0 = (v16u8) __msa_fill_b(left[-1]);
2226 
2227  ST_UB(vec0, left);
2228  }
2229  if (!cand_bottom_left) {
2230 
2231  vec0 = (v16u8) __msa_fill_b(left[15]);
2232 
2233  ST_UB(vec0, (left + 16));
2234  }
2235  if (x0 != 0 && y0 != 0) {
2236  a = ((left[size_max_y - 1]) * 0x01010101U);
2237  for (i = (size_max_y - 1);
2238  i > (size_max_y - 1) - (size_max_y); i -= 4)
2239  if (!
2240  ((s->ref->tab_mvf[(((x0 +
2241  ((-1) << hshift)) >> s->ps.sps->
2242  log2_min_pu_size)) + (((y0 +
2243  ((i -
2244  3) <<
2245  vshift))
2246  >> s->ps.sps->
2247  log2_min_pu_size))
2248  * min_pu_width]).pred_flag ==
2249  PF_INTRA))
2250  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2251  else
2252  a = ((left[i - 3]) * 0x01010101U);
2253  if (!
2254  ((s->ref->tab_mvf[(((x0 +
2255  ((-1) << hshift)) >> s->ps.sps->
2256  log2_min_pu_size)) + (((y0 + ((-1)
2257  <<
2258  vshift))
2259  >> s->ps.sps->
2260  log2_min_pu_size))
2261  * min_pu_width]).pred_flag == PF_INTRA))
2262  left[-1] = left[0];
2263  } else if (x0 == 0) {
2264  do {
2265  uint32_t pix = ((0) * 0x01010101U);
2266  for (i = 0; i < (size_max_y); i += 4)
2267  ((((union unaligned_32 *) (left + i))->l) = (pix));
2268  } while (0);
2269  } else {
2270  a = ((left[size_max_y - 1]) * 0x01010101U);
2271  for (i = (size_max_y - 1);
2272  i > (size_max_y - 1) - (size_max_y); i -= 4)
2273  if (!
2274  ((s->ref->tab_mvf[(((x0 +
2275  ((-1) << hshift)) >> s->ps.sps->
2276  log2_min_pu_size)) + (((y0 +
2277  ((i -
2278  3) <<
2279  vshift))
2280  >> s->ps.sps->
2281  log2_min_pu_size))
2282  * min_pu_width]).pred_flag ==
2283  PF_INTRA))
2284  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2285  else
2286  a = ((left[i - 3]) * 0x01010101U);
2287  }
2288  top[-1] = left[-1];
2289  if (y0 != 0) {
2290  a = ((left[-1]) * 0x01010101U);
2291  for (i = 0; i < (0) + (size_max_x); i += 4)
2292  if (!
2293  ((s->ref->tab_mvf[(((x0 +
2294  ((i) << hshift)) >> s->ps.sps->
2295  log2_min_pu_size)) + (((y0 + ((-1)
2296  <<
2297  vshift))
2298  >> s->ps.sps->
2299  log2_min_pu_size))
2300  * min_pu_width]).pred_flag ==
2301  PF_INTRA))
2302  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2303  else
2304  a = ((top[i + 3]) * 0x01010101U);
2305  }
2306  }
2307  }
2308 
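/* Fallback extension for references that are still unavailable: extend
 * from the first usable neighbour in the order left, top-left, top,
 * top-right; if no neighbour is available at all, both references are
 * filled with the mid-grey value 128. */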
2309  if (!cand_bottom_left) {
2310  if (cand_left) {
2311  vec0 = (v16u8) __msa_fill_b(left[15]);
2312 
2313  ST_UB(vec0, (left + 16));
2314 
2315  } else if (cand_up_left) {
2316  vec0 = (v16u8) __msa_fill_b(left[-1]);
2317 
2318  ST_UB2(vec0, vec0, left, 16);
2319 
2320  cand_left = 1;
2321  } else if (cand_up) {
2322  left[-1] = top[0];
2323 
2324  vec0 = (v16u8) __msa_fill_b(left[-1]);
2325 
2326  ST_UB2(vec0, vec0, left, 16);
2327 
2328  cand_up_left = 1;
2329  cand_left = 1;
2330  } else if (cand_up_right) {
2331  vec0 = (v16u8) __msa_fill_b(top[16]);
2332 
2333  ST_UB(vec0, top);
2334 
2335  left[-1] = top[16];
2336 
2337  ST_UB2(vec0, vec0, left, 16);
2338 
2339  cand_up = 1;
2340  cand_up_left = 1;
2341  cand_left = 1;
2342  } else {
2343  left[-1] = 128;
2344  vec0 = (v16u8) __msa_ldi_b(128);
2345 
2346  ST_UB2(vec0, vec0, top, 16);
2347  ST_UB2(vec0, vec0, left, 16);
2348  }
2349  }
2350 
2351  if (!cand_left) {
2352  vec0 = (v16u8) __msa_fill_b(left[16]);
2353  ST_UB(vec0, left);
2354  }
2355  if (!cand_up_left) {
2356  left[-1] = left[0];
2357  }
2358  if (!cand_up) {
2359  vec0 = (v16u8) __msa_fill_b(left[-1]);
2360  ST_UB(vec0, top);
2361  }
2362  if (!cand_up_right) {
2363  vec0 = (v16u8) __msa_fill_b(top[15]);
2364  ST_UB(vec0, (top + 16));
2365  }
2366 
2367  top[-1] = left[-1];
2368 
2369 
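/* Mode-dependent [1 2 1]/4 reference smoothing (luma only, or chroma in
 * 4:4:4), applied when the angular mode is far enough from pure
 * horizontal/vertical according to the per-size thresholds below. */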
2370  if (!s->ps.sps->intra_smoothing_disabled_flag
2371  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2372  if (mode != INTRA_DC && 16 != 4) {
2373  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2374  int min_dist_vert_hor =
2375  (((((int) (mode - 26U)) >=
2376  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2377  ((((int) (mode - 10U)) >=
2378  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2379  ? ((((int) (mode - 10U)) >=
2380  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381  : ((((int) (mode - 26U)) >=
2382  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2383  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2384  filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2385  filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2386  for (i = 2 * 16 - 2; i >= 0; i--)
2387  filtered_left[i] = (left[i + 1] + 2 * left[i] +
2388  left[i - 1] + 2) >> 2;
2389  filtered_top[-1] =
2390  filtered_left[-1] =
2391  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2392  for (i = 2 * 16 - 2; i >= 0; i--)
2393  filtered_top[i] = (top[i + 1] + 2 * top[i] +
2394  top[i - 1] + 2) >> 2;
2395  left = filtered_left;
2396  top = filtered_top;
2397  }
2398  }
2399  }
2400 
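/* Hand off to the 16x16 kernels: pred_planar[2], pred_dc with log2 size 4,
 * or pred_angular[2] for the remaining (angular) modes. */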
2401  switch (mode) {
2402  case INTRA_PLANAR:
2403  s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2404  (uint8_t *) left, stride);
2405  break;
2406  case INTRA_DC:
2407  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2408  (uint8_t *) left, stride, 4, c_idx);
2409  break;
2410  default:
2411  s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2412  (uint8_t *) left, stride, c_idx, mode);
2413  break;
2414  }
2415 }
2416 
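/* 32x32 variant of the routine above. The reference construction mirrors
 * the 16x16 path (dispatching through pred_planar[3] / pred_dc with log2
 * size 5 / pred_angular[3] at the end); in addition, this is the only
 * block size eligible for HEVC strong intra smoothing. */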
2417 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2418 {
2419  v16u8 vec0, vec1;
2420  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2421  v8i16 res0, res1, res2, res3;
2422  v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2423  v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2424  HEVCLocalContext *lc = s->HEVClc;
2425  int i;
2426  int hshift = s->ps.sps->hshift[c_idx];
2427  int vshift = s->ps.sps->vshift[c_idx];
2428  int size_in_luma_h = 32 << hshift;
2429  int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2430  int size_in_luma_v = 32 << vshift;
2431  int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2432  int x = x0 >> hshift;
2433  int y = y0 >> vshift;
2434  int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2435  int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2436 
2437  int cur_tb_addr =
2438  s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2439 
2440  ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2441  uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2442 
2443  int min_pu_width = s->ps.sps->min_pu_width;
2444 
2445  enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2446  lc->tu.intra_pred_mode;
2447  uint32_t a;
2448  uint8_t left_array[2 * 32 + 1];
2449  uint8_t filtered_left_array[2 * 32 + 1];
2450  uint8_t top_array[2 * 32 + 1];
2451  uint8_t filtered_top_array[2 * 32 + 1];
2452 
2453  uint8_t *left = left_array + 1;
2454  uint8_t *top = top_array + 1;
2455  uint8_t *filtered_left = filtered_left_array + 1;
2456  uint8_t *filtered_top = filtered_top_array + 1;
2457  int cand_bottom_left = lc->na.cand_bottom_left
2458  && cur_tb_addr >
2459  s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2460  (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2461  int cand_left = lc->na.cand_left;
2462  int cand_up_left = lc->na.cand_up_left;
2463  int cand_up = lc->na.cand_up;
2464  int cand_up_right = lc->na.cand_up_right
2465  && cur_tb_addr >
2466  s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2467  ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2468 
2469  int bottom_left_size =
2470  (((y0 + 2 * size_in_luma_v) >
2471  (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2472  2 * size_in_luma_v)) -
2473  (y0 + size_in_luma_v)) >> vshift;
2474  int top_right_size =
2475  (((x0 + 2 * size_in_luma_h) >
2476  (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2477  (x0 + size_in_luma_h)) >> hshift;
2478 
2479  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2480  int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2481  int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2482  int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2483  int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2484  if (!size_in_luma_pu_h)
2485  size_in_luma_pu_h++;
2486  if (cand_bottom_left == 1 && on_pu_edge_x) {
2487  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2488  int y_bottom_pu =
2489  ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2490  int max =
2491  ((size_in_luma_pu_v) >
2492  (s->ps.sps->min_pu_height -
2493  y_bottom_pu) ? (s->ps.sps->min_pu_height -
2494  y_bottom_pu) : (size_in_luma_pu_v));
2495  cand_bottom_left = 0;
2496  for (i = 0; i < max; i += 2)
2497  cand_bottom_left |=
2498  ((s->ref->tab_mvf[(x_left_pu) +
2499  (y_bottom_pu +
2500  i) * min_pu_width]).pred_flag ==
2501  PF_INTRA);
2502  }
2503  if (cand_left == 1 && on_pu_edge_x) {
2504  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2505  int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2506  int max =
2507  ((size_in_luma_pu_v) >
2508  (s->ps.sps->min_pu_height -
2509  y_left_pu) ? (s->ps.sps->min_pu_height -
2510  y_left_pu) : (size_in_luma_pu_v));
2511  cand_left = 0;
2512  for (i = 0; i < max; i += 2)
2513  cand_left |=
2514  ((s->ref->tab_mvf[(x_left_pu) +
2515  (y_left_pu +
2516  i) * min_pu_width]).pred_flag ==
2517  PF_INTRA);
2518  }
2519  if (cand_up_left == 1) {
2520  int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2521  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2522  cand_up_left =
2523  (s->ref->tab_mvf[(x_left_pu) +
2524  (y_top_pu) * min_pu_width]).pred_flag ==
2525  PF_INTRA;
2526  }
2527  if (cand_up == 1 && on_pu_edge_y) {
2528  int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2529  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2530  int max =
2531  ((size_in_luma_pu_h) >
2532  (s->ps.sps->min_pu_width -
2533  x_top_pu) ? (s->ps.sps->min_pu_width -
2534  x_top_pu) : (size_in_luma_pu_h));
2535  cand_up = 0;
2536  for (i = 0; i < max; i += 2)
2537  cand_up |=
2538  ((s->ref->tab_mvf[(x_top_pu + i) +
2539  (y_top_pu) *
2540  min_pu_width]).pred_flag == PF_INTRA);
2541  }
2542  if (cand_up_right == 1 && on_pu_edge_y) {
2543  int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2544  int x_right_pu =
2545  ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2546  int max =
2547  ((size_in_luma_pu_h) >
2548  (s->ps.sps->min_pu_width -
2549  x_right_pu) ? (s->ps.sps->min_pu_width -
2550  x_right_pu) : (size_in_luma_pu_h));
2551  cand_up_right = 0;
2552  for (i = 0; i < max; i += 2)
2553  cand_up_right |=
2554  ((s->ref->tab_mvf[(x_right_pu + i) +
2555  (y_top_pu) *
2556  min_pu_width]).pred_flag == PF_INTRA);
2557  }
2558  vec0 = (v16u8) __msa_ldi_b(128);
2559 
2560  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2561  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2562 
2563  top[-1] = 128;
2564  }
2565  if (cand_up_left) {
2566  left[-1] = src[(-1) + stride * (-1)];
2567  top[-1] = left[-1];
2568  }
2569  if (cand_up) {
2570  LD_UB2(src - stride, 16, vec0, vec1);
2571  ST_UB2(vec0, vec1, top, 16);
2572  }
2573 
2574  if (cand_up_right) {
2575  LD_UB2(src - stride + 32, 16, vec0, vec1);
2576  ST_UB2(vec0, vec1, (top + 32), 16);
2577  do {
2578  uint32_t pix =
2579  ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2580  0x01010101U);
2581  for (i = 0; i < (32 - top_right_size); i += 4)
2582  ((((union unaligned_32 *) (top + 32 + top_right_size +
2583  i))->l) = (pix));
2584  } while (0);
2585  }
2586  if (cand_left)
2587  for (i = 0; i < 32; i++)
2588  left[i] = src[(-1) + stride * (i)];
2589  if (cand_bottom_left) {
2590  for (i = 32; i < 32 + bottom_left_size; i++)
2591  left[i] = src[(-1) + stride * (i)];
2592  do {
2593  uint32_t pix =
2594  ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2595  0x01010101U);
2596  for (i = 0; i < (32 - bottom_left_size); i += 4)
2597  ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2598  i))->l) = (pix));
2599  } while (0);
2600  }
2601 
2602  if (s->ps.pps->constrained_intra_pred_flag == 1) {
2603  if (cand_bottom_left || cand_left || cand_up_left || cand_up
2604  || cand_up_right) {
2605  int size_max_x =
2606  x0 + ((2 * 32) << hshift) <
2607  s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2608  int size_max_y =
2609  y0 + ((2 * 32) << vshift) <
2610  s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2611  int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2612  if (!cand_up_right) {
2613  size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2614  32 : (s->ps.sps->width - x0) >> hshift;
2615  }
2616  if (!cand_bottom_left) {
2617  size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2618  32 : (s->ps.sps->height - y0) >> vshift;
2619  }
2620  if (cand_bottom_left || cand_left || cand_up_left) {
2621  while (j > -1
2622  &&
2623  !((s->ref->tab_mvf[(((x0 +
2624  ((-1) << hshift)) >> s->ps.sps->
2625  log2_min_pu_size)) + (((y0 +
2626  ((j) <<
2627  vshift))
2628  >> s->ps.sps->
2629  log2_min_pu_size))
2630  * min_pu_width]).pred_flag ==
2631  PF_INTRA))
2632  j--;
2633  if (!
2634  ((s->ref->tab_mvf[(((x0 +
2635  ((-1) << hshift)) >> s->ps.sps->
2636  log2_min_pu_size)) + (((y0 + ((j)
2637  <<
2638  vshift))
2639  >> s->ps.sps->
2640  log2_min_pu_size))
2641  * min_pu_width]).pred_flag == PF_INTRA)) {
2642  j = 0;
2643  while (j < size_max_x
2644  &&
2645  !((s->ref->tab_mvf[(((x0 +
2646  ((j) << hshift)) >> s->ps.sps->
2647  log2_min_pu_size)) + (((y0 +
2648  ((-1) <<
2649  vshift))
2650  >> s->
2651  ps.sps->
2652  log2_min_pu_size))
2653  * min_pu_width]).pred_flag ==
2654  PF_INTRA))
2655  j++;
2656  for (i = j; i > (j) - (j + 1); i--)
2657  if (!
2658  ((s->ref->tab_mvf[(((x0 +
2659  ((i -
2660  1) << hshift)) >> s->ps.sps->
2661  log2_min_pu_size)) + (((y0 +
2662  ((-1) <<
2663  vshift))
2664  >> s->
2665  ps.sps->
2666  log2_min_pu_size))
2667  * min_pu_width]).pred_flag ==
2668  PF_INTRA))
2669  top[i - 1] = top[i];
2670  left[-1] = top[-1];
2671  }
2672  } else {
2673  j = 0;
2674  while (j < size_max_x
2675  &&
2676  !((s->ref->tab_mvf[(((x0 +
2677  ((j) << hshift)) >> s->ps.sps->
2678  log2_min_pu_size)) + (((y0 + ((-1)
2679  <<
2680  vshift))
2681  >> s->ps.sps->
2682  log2_min_pu_size))
2683  * min_pu_width]).pred_flag ==
2684  PF_INTRA))
2685  j++;
2686  if (j > 0)
2687  if (x0 > 0) {
2688  for (i = j; i > (j) - (j + 1); i--)
2689  if (!
2690  ((s->ref->tab_mvf[(((x0 +
2691  ((i -
2692  1) << hshift)) >>
2693  s->ps.sps->log2_min_pu_size))
2694  + (((y0 + ((-1)
2695  << vshift))
2696  >>
2697  s->ps.sps->log2_min_pu_size))
2698  *
2699  min_pu_width]).pred_flag ==
2700  PF_INTRA))
2701  top[i - 1] = top[i];
2702  } else {
2703  for (i = j; i > (j) - (j); i--)
2704  if (!
2705  ((s->ref->tab_mvf[(((x0 +
2706  ((i -
2707  1) << hshift)) >>
2708  s->ps.sps->log2_min_pu_size))
2709  + (((y0 + ((-1)
2710  << vshift))
2711  >>
2712  s->ps.sps->log2_min_pu_size))
2713  *
2714  min_pu_width]).pred_flag ==
2715  PF_INTRA))
2716  top[i - 1] = top[i];
2717  top[-1] = top[0];
2718  }
2719  left[-1] = top[-1];
2720  }
2721  left[-1] = top[-1];
2722  if (cand_bottom_left || cand_left) {
2723  a = ((left[-1]) * 0x01010101U);
2724  for (i = 0; i < (0) + (size_max_y); i += 4)
2725  if (!
2726  ((s->ref->tab_mvf[(((x0 +
2727  ((-1) << hshift)) >> s->ps.sps->
2728  log2_min_pu_size)) + (((y0 +
2729  ((i) <<
2730  vshift))
2731  >> s->ps.sps->
2732  log2_min_pu_size))
2733  * min_pu_width]).pred_flag ==
2734  PF_INTRA))
2735  ((((union unaligned_32 *) (&left[i]))->l) = (a));
2736  else
2737  a = ((left[i + 3]) * 0x01010101U);
2738  }
2739  if (!cand_left) {
2740  vec0 = (v16u8) __msa_fill_b(left[-1]);
2741 
2742  ST_UB2(vec0, vec0, left, 16);
2743  }
2744  if (!cand_bottom_left) {
2745  vec0 = (v16u8) __msa_fill_b(left[31]);
2746 
2747  ST_UB2(vec0, vec0, (left + 32), 16);
2748  }
2749  if (x0 != 0 && y0 != 0) {
2750  a = ((left[size_max_y - 1]) * 0x01010101U);
2751  for (i = (size_max_y - 1);
2752  i > (size_max_y - 1) - (size_max_y); i -= 4)
2753  if (!
2754  ((s->ref->tab_mvf[(((x0 +
2755  ((-1) << hshift)) >> s->ps.sps->
2756  log2_min_pu_size)) + (((y0 +
2757  ((i -
2758  3) <<
2759  vshift))
2760  >> s->ps.sps->
2761  log2_min_pu_size))
2762  * min_pu_width]).pred_flag ==
2763  PF_INTRA))
2764  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2765  else
2766  a = ((left[i - 3]) * 0x01010101U);
2767  if (!
2768  ((s->ref->tab_mvf[(((x0 +
2769  ((-1) << hshift)) >> s->ps.sps->
2770  log2_min_pu_size)) + (((y0 + ((-1)
2771  <<
2772  vshift))
2773  >> s->ps.sps->
2774  log2_min_pu_size))
2775  * min_pu_width]).pred_flag == PF_INTRA))
2776  left[-1] = left[0];
2777  } else if (x0 == 0) {
2778  do {
2779  uint32_t pix = ((0) * 0x01010101U);
2780  for (i = 0; i < (size_max_y); i += 4)
2781  ((((union unaligned_32 *) (left + i))->l) = (pix));
2782  } while (0);
2783  } else {
2784  a = ((left[size_max_y - 1]) * 0x01010101U);
2785  for (i = (size_max_y - 1);
2786  i > (size_max_y - 1) - (size_max_y); i -= 4)
2787  if (!
2788  ((s->ref->tab_mvf[(((x0 +
2789  ((-1) << hshift)) >> s->ps.sps->
2790  log2_min_pu_size)) + (((y0 +
2791  ((i -
2792  3) <<
2793  vshift))
2794  >> s->ps.sps->
2795  log2_min_pu_size))
2796  * min_pu_width]).pred_flag ==
2797  PF_INTRA))
2798  ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2799  else
2800  a = ((left[i - 3]) * 0x01010101U);
2801  }
2802  top[-1] = left[-1];
2803  if (y0 != 0) {
2804  a = ((left[-1]) * 0x01010101U);
2805  for (i = 0; i < (0) + (size_max_x); i += 4)
2806  if (!
2807  ((s->ref->tab_mvf[(((x0 +
2808  ((i) << hshift)) >> s->ps.sps->
2809  log2_min_pu_size)) + (((y0 + ((-1)
2810  <<
2811  vshift))
2812  >> s->ps.sps->
2813  log2_min_pu_size))
2814  * min_pu_width]).pred_flag ==
2815  PF_INTRA))
2816  ((((union unaligned_32 *) (&top[i]))->l) = (a));
2817  else
2818  a = ((top[i + 3]) * 0x01010101U);
2819  }
2820  }
2821  }
2822 
2823  if (!cand_bottom_left) {
2824  if (cand_left) {
2825  vec0 = (v16u8) __msa_fill_b(left[31]);
2826 
2827  ST_UB2(vec0, vec0, (left + 32), 16);
2828  } else if (cand_up_left) {
2829  vec0 = (v16u8) __msa_fill_b(left[-1]);
2830 
2831  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2832 
2833  cand_left = 1;
2834  } else if (cand_up) {
2835  left[-1] = top[0];
2836 
2837  vec0 = (v16u8) __msa_fill_b(left[-1]);
2838 
2839  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2840 
2841  cand_up_left = 1;
2842  cand_left = 1;
2843  } else if (cand_up_right) {
2844  vec0 = (v16u8) __msa_fill_b(top[32]);
2845 
2846  ST_UB2(vec0, vec0, top, 16);
2847 
2848  left[-1] = top[32];
2849 
2850  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2851 
2852  cand_up = 1;
2853  cand_up_left = 1;
2854  cand_left = 1;
2855  } else {
2856  left[-1] = 128;
2857 
2858  vec0 = (v16u8) __msa_ldi_b(128);
2859 
2860  ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2861  ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2862  }
2863  }
2864 
2865  if (!cand_left) {
2866  vec0 = (v16u8) __msa_fill_b(left[32]);
2867 
2868  ST_UB2(vec0, vec0, left, 16);
2869  }
2870  if (!cand_up_left) {
2871  left[-1] = left[0];
2872  }
2873  if (!cand_up) {
2874  vec0 = (v16u8) __msa_fill_b(left[-1]);
2875 
2876  ST_UB2(vec0, vec0, top, 16);
2877  }
2878  if (!cand_up_right) {
2879  vec0 = (v16u8) __msa_fill_b(top[31]);
2880 
2881  ST_UB2(vec0, vec0, (top + 32), 16);
2882  }
2883 
2884  top[-1] = left[-1];
2885 
2886 
2887  if (!s->ps.sps->intra_smoothing_disabled_flag
2888  && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2889  if (mode != INTRA_DC && 32 != 4) {
2890  int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2891  int min_dist_vert_hor =
2892  (((((int) (mode - 26U)) >=
2893  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2894  ((((int) (mode - 10U)) >=
2895  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2896  ? ((((int) (mode - 10U)) >=
2897  0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898  : ((((int) (mode - 26U)) >=
2899  0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2900  if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2901  int threshold = 1 << (8 - 5);
2902  if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2903  && c_idx == 0
2904  && ((top[-1] + top[63] - 2 * top[31]) >=
2905  0 ? (top[-1] + top[63] -
2906  2 * top[31]) : (-(top[-1] + top[63] -
2907  2 * top[31]))) < threshold
2908  && ((left[-1] + left[63] - 2 * left[31]) >=
2909  0 ? (left[-1] + left[63] -
2910  2 * left[31]) : (-(left[-1] + left[63] -
2911  2 * left[31]))) < threshold) {
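/* Strong smoothing criterion: both references must be nearly linear,
 * i.e. |top[-1] + top[63] - 2 * top[31]| and the equivalent left-side
 * expression are below 1 << (bitdepth - 5), which is 8 for 8-bit input. */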
2912 
2913 
2914  filtered_top[-1] = top[-1];
2915  filtered_top[63] = top[63];
2916 
2917 
2918  for (i = 0; i < 63; i++) {
2919  filtered_top[i] =
2920  ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2921  }
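/* Strong smoothing replaces the reference with a bilinear ramp between its
 * two corner samples: filtered[i] = ((63 - i) * corner + (i + 1) * end
 * + 32) >> 6. The scalar loop above computes it directly; the MSA code
 * below recomputes it eight lanes at a time using the descending/ascending
 * weight vectors mul_val0 and mul_val1. */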
2922 
2923  tmp0 = __msa_fill_h(top[-1]);
2924  tmp1 = __msa_fill_h(top[63]);
2925 
2926  tmp2 = mul_val0 - 8;
2927  tmp3 = mul_val0 - 16;
2928  tmp4 = mul_val0 - 24;
2929  tmp5 = mul_val1 + 8;
2930  tmp6 = mul_val1 + 16;
2931  tmp7 = mul_val1 + 24;
2932 
2933  res0 = mul_val0 * tmp0;
2934  res1 = tmp2 * tmp0;
2935  res2 = tmp3 * tmp0;
2936  res3 = tmp4 * tmp0;
2937  res0 += mul_val1 * tmp1;
2938  res1 += tmp5 * tmp1;
2939  res2 += tmp6 * tmp1;
2940  res3 += tmp7 * tmp1;
2941 
2942  res0 = __msa_srari_h(res0, 6);
2943  res1 = __msa_srari_h(res1, 6);
2944  res2 = __msa_srari_h(res2, 6);
2945  res3 = __msa_srari_h(res3, 6);
2946 
2947  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2948  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2949 
2950  ST_UB2(vec0, vec1, filtered_top, 16);
2951 
2952  res0 = mul_val0 - 32;
2953  tmp2 = mul_val0 - 40;
2954  tmp3 = mul_val0 - 48;
2955  tmp4 = mul_val0 - 56;
2956  res3 = mul_val1 + 32;
2957  tmp5 = mul_val1 + 40;
2958  tmp6 = mul_val1 + 48;
2959  tmp7 = mul_val1 + 56;
2960 
2961  res0 = res0 * tmp0;
2962  res1 = tmp2 * tmp0;
2963  res2 = tmp3 * tmp0;
2964  res0 += res3 * tmp1;
2965  res3 = tmp4 * tmp0;
2966  res1 += tmp5 * tmp1;
2967  res2 += tmp6 * tmp1;
2968  res3 += tmp7 * tmp1;
2969 
2970  res0 = __msa_srari_h(res0, 6);
2971  res1 = __msa_srari_h(res1, 6);
2972  res2 = __msa_srari_h(res2, 6);
2973  res3 = __msa_srari_h(res3, 6);
2974 
2975  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2976  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2977 
2978  ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2979 
2980  filtered_top[63] = top[63];
2981 
2982  tmp0 = __msa_fill_h(left[-1]);
2983  tmp1 = __msa_fill_h(left[63]);
2984 
2985  tmp2 = mul_val0 - 8;
2986  tmp3 = mul_val0 - 16;
2987  tmp4 = mul_val0 - 24;
2988  tmp5 = mul_val1 + 8;
2989  tmp6 = mul_val1 + 16;
2990  tmp7 = mul_val1 + 24;
2991 
2992  res0 = mul_val0 * tmp0;
2993  res1 = tmp2 * tmp0;
2994  res2 = tmp3 * tmp0;
2995  res3 = tmp4 * tmp0;
2996  res0 += mul_val1 * tmp1;
2997  res1 += tmp5 * tmp1;
2998  res2 += tmp6 * tmp1;
2999  res3 += tmp7 * tmp1;
3000 
3001  res0 = __msa_srari_h(res0, 6);
3002  res1 = __msa_srari_h(res1, 6);
3003  res2 = __msa_srari_h(res2, 6);
3004  res3 = __msa_srari_h(res3, 6);
3005 
3006  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3007  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3008 
3009  ST_UB2(vec0, vec1, left, 16);
3010 
3011  res0 = mul_val0 - 32;
3012  tmp2 = mul_val0 - 40;
3013  tmp3 = mul_val0 - 48;
3014  tmp4 = mul_val0 - 56;
3015  res3 = mul_val1 + 32;
3016  tmp5 = mul_val1 + 40;
3017  tmp6 = mul_val1 + 48;
3018  tmp7 = mul_val1 + 56;
3019 
3020  res0 = res0 * tmp0;
3021  res1 = tmp2 * tmp0;
3022  res2 = tmp3 * tmp0;
3023  res0 += res3 * tmp1;
3024  res3 = tmp4 * tmp0;
3025  res1 += tmp5 * tmp1;
3026  res2 += tmp6 * tmp1;
3027  res3 += tmp7 * tmp1;
3028 
3029  res0 = __msa_srari_h(res0, 6);
3030  res1 = __msa_srari_h(res1, 6);
3031  res2 = __msa_srari_h(res2, 6);
3032  res3 = __msa_srari_h(res3, 6);
3033 
3034  vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3035  vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3036 
3037  ST_UB2(vec0, vec1, (left + 32), 16);
3038 
3039  left[63] = tmp1[0];
3040 
3041  top = filtered_top;
3042  } else {
3043  filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3044  filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3045  for (i = 2 * 32 - 2; i >= 0; i--)
3046  filtered_left[i] = (left[i + 1] + 2 * left[i] +
3047  left[i - 1] + 2) >> 2;
3048  filtered_top[-1] =
3049  filtered_left[-1] =
3050  (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3051  for (i = 2 * 32 - 2; i >= 0; i--)
3052  filtered_top[i] = (top[i + 1] + 2 * top[i] +
3053  top[i - 1] + 2) >> 2;
3054  left = filtered_left;
3055  top = filtered_top;
3056  }
3057  }
3058  }
3059  }
3060 
3061  switch (mode) {
3062  case INTRA_PLANAR:
3063  s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3064  (uint8_t *) left, stride);
3065  break;
3066  case INTRA_DC:
3067  s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3068  (uint8_t *) left, stride, 5, c_idx);
3069  break;
3070  default:
3071  s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3072  (uint8_t *) left, stride, c_idx, mode);
3073  break;
3074  }
3075 }