FFmpeg 4.2.2
swscale_vsx.c
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include <inttypes.h>
25 
26 #include "config.h"
27 #include "libswscale/swscale.h"
28 #include "libswscale/swscale_internal.h"
29 #include "libavutil/attributes.h"
30 #include "libavutil/cpu.h"
31 #include "yuv2rgb_altivec.h"
32 #include "libavutil/ppc/util_altivec.h"
33 
34 #if HAVE_VSX
35 #define vzero vec_splat_s32(0)
36 
37 #if !HAVE_BIGENDIAN
38 #define GET_LS(a,b,c,s) {\
39  ls = a;\
40  a = vec_vsx_ld(((b) << 1) + 16, s);\
41  }
42 
43 #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
44  vector signed short ls;\
45  vector signed int vf1, vf2, i1, i2;\
46  GET_LS(l1, x, perm, src);\
47  i1 = vec_mule(filter, ls);\
48  i2 = vec_mulo(filter, ls);\
49  vf1 = vec_mergeh(i1, i2);\
50  vf2 = vec_mergel(i1, i2);\
51  d1 = vec_add(d1, vf1);\
52  d2 = vec_add(d2, vf2);\
53  } while (0)
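// yuv2planeX_8 adds one input line's contribution to 8 output pixels: the
// vec_mule/vec_mulo pair forms the 16x16 -> 32-bit products of the even and
// odd lanes, mergeh/mergel put them back into element order, and d1/d2
// accumulate them, the vector counterpart of the scalar
// val += src[j][i] * filter[j] accumulation in yuv2planeX.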
54 
55 #define LOAD_FILTER(vf,f) {\
56  vf = vec_vsx_ld(joffset, f);\
57 }
58 #define LOAD_L1(ll1,s,p){\
59  ll1 = vec_vsx_ld(xoffset, s);\
60 }
61 
62 // The 3 in the (a << 3) below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
63 
64 // The neat trick: we only care about half the elements,
65 // high or low depending on (i << 3) % 16 (it's 0 or 8 here),
66 // and we're going to use vec_mule, so we choose
67 // carefully how to "unpack" the elements into the even slots.
68 #define GET_VF4(a, vf, f) {\
69  vf = (vector signed short)vec_vsx_ld(a << 3, f);\
70  vf = vec_mergeh(vf, (vector signed short)vzero);\
71 }
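// GET_VF4 serves the filterSize == 4 case of the hScale loop in
// swscale_ppc_template.c (included below): the four coefficients (and,
// separately, the four source pixels) are spread into the even halfword
// lanes, so a single vec_mule yields the four 32-bit products, which a
// horizontal sum then reduces. A rough scalar sketch of that path (names
// follow the usual hScale convention, not this file):
//
//     int sum = 0;
//     for (j = 0; j < 4; j++)
//         sum += src[filterPos[i] + j] * filter[4 * i + j];
//     dst[i] = FFMIN(sum >> 7, (1 << 15) - 1);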
72 #define FIRST_LOAD(sv, pos, s, per) {}
73 #define UPDATE_PTR(s0, d0, s1, d1) {}
74 #define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
75  vf = vec_vsx_ld(pos + a, s);\
76 }
77 #define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
78 #define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
79  vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
80 }
81 
82 #define FUNC(name) name ## _vsx
83 #include "swscale_ppc_template.c"
84 #undef FUNC
85 
86 #undef vzero
87 
88 #endif /* !HAVE_BIGENDIAN */
89 
90 static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
91  const uint8_t *dither, int offset, int start)
92 {
93  int i;
94  for (i = start; i < dstW; i++) {
95  int val = (src[i] + dither[(i + offset) & 7]) >> 7;
96  dest[i] = av_clip_uint8(val);
97  }
98 }
99 
100 static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
101  const uint8_t *dither, int offset)
102 {
103  const int dst_u = -(uintptr_t)dest & 15;
104  int i, j;
105  LOCAL_ALIGNED(16, int16_t, val, [16]);
106  const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
107  vector int16_t vi, vileft, ditherleft, ditherright;
108  vector uint8_t vd;
109 
110  for (j = 0; j < 16; j++) {
111  val[j] = dither[(dst_u + offset + j) & 7];
112  }
113 
114  ditherleft = vec_ld(0, val);
115  ditherright = vec_ld(0, &val[8]);
116 
117  yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
118 
119  for (i = dst_u; i < dstW - 15; i += 16) {
120 
121  vi = vec_vsx_ld(0, &src[i]);
122  vi = vec_adds(ditherleft, vi);
123  vileft = vec_sra(vi, shifts);
124 
125  vi = vec_vsx_ld(0, &src[i + 8]);
126  vi = vec_adds(ditherright, vi);
127  vi = vec_sra(vi, shifts);
128 
129  vd = vec_packsu(vileft, vi);
130  vec_st(vd, 0, &dest[i]);
131  }
132 
133  yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
134 }
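// Like the other *_vsx routines below, yuv2plane1_8_vsx splits the work in
// three: a scalar prologue up to the first 16-byte-aligned destination byte
// (dst_u), an aligned vector body computing 16 pixels per iteration of the
// same (src[i] + dither) >> 7 formula as yuv2plane1_8_u, and a scalar
// epilogue for the remaining tail pixels.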
135 
136 #if !HAVE_BIGENDIAN
137 
138 #define output_pixel(pos, val) \
139  if (big_endian) { \
140  AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
141  } else { \
142  AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
143  }
144 
145 static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
146  int big_endian, int output_bits, int start)
147 {
148  int i;
149  int shift = 15 - output_bits;
150 
151  for (i = start; i < dstW; i++) {
152  int val = src[i] + (1 << (shift - 1));
153  output_pixel(&dest[i], val);
154  }
155 }
156 
157 static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
158  int big_endian, int output_bits)
159 {
160  const int dst_u = -(uintptr_t)dest & 7;
161  const int shift = 15 - output_bits;
162  const int add = (1 << (shift - 1));
163  const int clip = (1 << output_bits) - 1;
164  const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add};
165  const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
166  const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift);
167  const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
168  vector uint16_t v;
169  int i;
170 
171  yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
172 
173  for (i = dst_u; i < dstW - 7; i += 8) {
174  v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
175  v = vec_add(v, vadd);
176  v = vec_sr(v, vshift);
177  v = vec_min(v, vlargest);
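 // vswap is 8 only when big-endian output was requested, so this
 // rotate-by-8 of each 16-bit lane is a conditional byteswap.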
178  v = vec_rl(v, vswap);
179  vec_st(v, 0, &dest[i]);
180  }
181 
182  yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
183 }
184 
185 static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
186  const int16_t **src, uint16_t *dest, int dstW,
187  int big_endian, int output_bits, int start)
188 {
189  int i;
190  int shift = 11 + 16 - output_bits;
191 
192  for (i = start; i < dstW; i++) {
193  int val = 1 << (shift - 1);
194  int j;
195 
196  for (j = 0; j < filterSize; j++)
197  val += src[j][i] * filter[j];
198 
199  output_pixel(&dest[i], val);
200  }
201 }
202 
203 static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
204  const int16_t **src, uint16_t *dest, int dstW,
205  int big_endian, int output_bits)
206 {
207  const int dst_u = -(uintptr_t)dest & 7;
208  const int shift = 11 + 16 - output_bits;
209  const int add = (1 << (shift - 1));
210  const int clip = (1 << output_bits) - 1;
211  const uint16_t swap = big_endian ? 8 : 0;
212  const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
213  const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
214  const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
215  const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
216  const vector int16_t vzero = vec_splat_s16(0);
217  const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
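 // vperm undoes the even/odd split: after the vec_mule results are packed
 // next to the vec_mulo results, it re-interleaves the halfwords back into
 // the original pixel order before the store.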
218  vector int16_t vfilter[MAX_FILTER_SIZE], vin;
219  vector uint16_t v;
220  vector uint32_t vleft, vright, vtmp;
221  int i, j;
222 
223  for (i = 0; i < filterSize; i++) {
224  vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], filter[i],
225  filter[i], filter[i], filter[i], filter[i]};
226  }
227 
228  yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
229 
230  for (i = dst_u; i < dstW - 7; i += 8) {
231  vleft = vright = vadd;
232 
233  for (j = 0; j < filterSize; j++) {
234  vin = vec_vsx_ld(0, &src[j][i]);
235  vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
236  vleft = vec_add(vleft, vtmp);
237  vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
238  vright = vec_add(vright, vtmp);
239  }
240 
241  vleft = vec_sra(vleft, vshift);
242  vright = vec_sra(vright, vshift);
243  v = vec_packsu(vleft, vright);
244  v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
245  v = vec_min(v, vlargest);
246  v = vec_rl(v, vswap);
247  v = vec_perm(v, v, vperm);
248  vec_st(v, 0, &dest[i]);
249  }
250 
251  yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
252 }
253 
254 
255 #undef output_pixel
256 
257 #define output_pixel(pos, val, bias, signedness) \
258  if (big_endian) { \
259  AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
260  } else { \
261  AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
262  }
263 
264 static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
265  int big_endian, int output_bits, int start)
266 {
267  int i;
268  const int shift = 3;
269 
270  for (i = start; i < dstW; i++) {
271  int val = src[i] + (1 << (shift - 1));
272  output_pixel(&dest[i], val, 0, uint);
273  }
274 }
275 
276 static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
277  int big_endian, int output_bits)
278 {
279  const int dst_u = -(uintptr_t)dest & 7;
280  const int shift = 3;
281  const int add = (1 << (shift - 1));
282  const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
283  const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
284  const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
285  vector uint32_t v, v2;
286  vector uint16_t vd;
287  int i;
288 
289  yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
290 
291  for (i = dst_u; i < dstW - 7; i += 8) {
292  v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
293  v = vec_add(v, vadd);
294  v = vec_sr(v, vshift);
295 
296  v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
297  v2 = vec_add(v2, vadd);
298  v2 = vec_sr(v2, vshift);
299 
300  vd = vec_packsu(v, v2);
301  vd = vec_rl(vd, vswap);
302 
303  vec_st(vd, 0, &dest[i]);
304  }
305 
306  yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
307 }
308 
309 #if HAVE_POWER8
310 
311 static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
312  const int32_t **src, uint16_t *dest, int dstW,
313  int big_endian, int output_bits, int start)
314 {
315  int i;
316  int shift = 15;
317 
318  for (i = start; i < dstW; i++) {
319  int val = 1 << (shift - 1);
320  int j;
321 
322  /* range of val is [0,0x7FFFFFFF], so 31 bits, but with lanczos/spline
323  * filters (or anything with negative coeffs), the range can be slightly
324  * wider in both directions. To account for this overflow, we subtract
325  * a constant so it always fits in the signed range (assuming a
326  * reasonable filterSize), and re-add that at the end. */
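 /* Worked example of the bias: with shift == 15, the 0x40000000 subtracted
  * here reappears as 0x40000000 >> 15 == 0x8000, which is exactly the bias
  * that output_pixel() adds back after the shift. */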
327  val -= 0x40000000;
328  for (j = 0; j < filterSize; j++)
329  val += src[j][i] * (unsigned)filter[j];
330 
331  output_pixel(&dest[i], val, 0x8000, int);
332  }
333 }
334 
335 static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
336  const int32_t **src, uint16_t *dest, int dstW,
337  int big_endian, int output_bits)
338 {
339  const int dst_u = -(uintptr_t)dest & 7;
340  const int shift = 15;
341  const int bias = 0x8000;
342  const int add = (1 << (shift - 1)) - 0x40000000;
343  const uint16_t swap = big_endian ? 8 : 0;
344  const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
345  const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
346  const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
347  const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias, bias, bias, bias, bias, bias};
348  vector int32_t vfilter[MAX_FILTER_SIZE];
349  vector uint16_t v;
350  vector uint32_t vleft, vright, vtmp;
351  vector int32_t vin32l, vin32r;
352  int i, j;
353 
354  for (i = 0; i < filterSize; i++) {
355  vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i], filter[i]};
356  }
357 
358  yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
359 
360  for (i = dst_u; i < dstW - 7; i += 8) {
361  vleft = vright = vadd;
362 
363  for (j = 0; j < filterSize; j++) {
364  vin32l = vec_vsx_ld(0, &src[j][i]);
365  vin32r = vec_vsx_ld(0, &src[j][i + 4]);
366 
367  vtmp = (vector uint32_t) vec_mul(vin32l, vfilter[j]);
368  vleft = vec_add(vleft, vtmp);
369  vtmp = (vector uint32_t) vec_mul(vin32r, vfilter[j]);
370  vright = vec_add(vright, vtmp);
371  }
372 
373  vleft = vec_sra(vleft, vshift);
374  vright = vec_sra(vright, vshift);
375  v = (vector uint16_t) vec_packs((vector int32_t) vleft, (vector int32_t) vright);
376  v = vec_add(v, vbias);
377  v = vec_rl(v, vswap);
378  vec_st(v, 0, &dest[i]);
379  }
380 
381  yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
382 }
383 
384 #endif /* HAVE_POWER8 */
385 
386 #define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
387  yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
388  yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t)
389 
390 #define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
391 static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
392  uint8_t *dest, int dstW, \
393  const uint8_t *dither, int offset) \
394 { \
395  yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
396  (uint16_t *) dest, dstW, is_be, bits); \
397 }
398 
399 #define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) \
400 static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
401  const int16_t **src, uint8_t *dest, int dstW, \
402  const uint8_t *dither, int offset)\
403 { \
404  yuv2planeX_## template_size ## _vsx(filter, \
405  filterSize, (const typeX_t **) src, \
406  (uint16_t *) dest, dstW, is_be, bits); \
407 }
408 
409 yuv2NBPS( 9, BE, 1, nbps, int16_t)
410 yuv2NBPS( 9, LE, 0, nbps, int16_t)
411 yuv2NBPS(10, BE, 1, nbps, int16_t)
412 yuv2NBPS(10, LE, 0, nbps, int16_t)
413 yuv2NBPS(12, BE, 1, nbps, int16_t)
414 yuv2NBPS(12, LE, 0, nbps, int16_t)
415 yuv2NBPS(14, BE, 1, nbps, int16_t)
416 yuv2NBPS(14, LE, 0, nbps, int16_t)
417 
418 yuv2NBPS1(16, BE, 1, 16, int32_t)
419 yuv2NBPS1(16, LE, 0, 16, int32_t)
420 #if HAVE_POWER8
421 yuv2NBPSX(16, BE, 1, 16, int32_t)
422 yuv2NBPSX(16, LE, 0, 16, int32_t)
423 #endif
424 
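/*
 * WRITERGB is the shared tail of the RGB writers below: it clamps the 32-bit
 * R/G/B accumulators to [0, 1 << 30], shifts them down by 22, packs them to
 * bytes together with the alpha vector ad, and permutes them into the
 * requested packed layout, emitting 8 pixels per expansion. For the 24-bit
 * formats the two 16-byte stores appear to write 32 bytes while dest only
 * advances by 24, so 8 bytes of scratch get overwritten by the following
 * iteration, much like the x86 padding overwrite noted further down.
 */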
425 #define WRITERGB \
426  R_l = vec_max(R_l, zero32); \
427  R_r = vec_max(R_r, zero32); \
428  G_l = vec_max(G_l, zero32); \
429  G_r = vec_max(G_r, zero32); \
430  B_l = vec_max(B_l, zero32); \
431  B_r = vec_max(B_r, zero32); \
432 \
433  R_l = vec_min(R_l, rgbclip); \
434  R_r = vec_min(R_r, rgbclip); \
435  G_l = vec_min(G_l, rgbclip); \
436  G_r = vec_min(G_r, rgbclip); \
437  B_l = vec_min(B_l, rgbclip); \
438  B_r = vec_min(B_r, rgbclip); \
439 \
440  R_l = vec_sr(R_l, shift22); \
441  R_r = vec_sr(R_r, shift22); \
442  G_l = vec_sr(G_l, shift22); \
443  G_r = vec_sr(G_r, shift22); \
444  B_l = vec_sr(B_l, shift22); \
445  B_r = vec_sr(B_r, shift22); \
446 \
447  rd16 = vec_packsu(R_l, R_r); \
448  gd16 = vec_packsu(G_l, G_r); \
449  bd16 = vec_packsu(B_l, B_r); \
450  rd = vec_packsu(rd16, zero16); \
451  gd = vec_packsu(gd16, zero16); \
452  bd = vec_packsu(bd16, zero16); \
453 \
454  switch(target) { \
455  case AV_PIX_FMT_RGB24: \
456  out0 = vec_perm(rd, gd, perm3rg0); \
457  out0 = vec_perm(out0, bd, perm3tb0); \
458  out1 = vec_perm(rd, gd, perm3rg1); \
459  out1 = vec_perm(out1, bd, perm3tb1); \
460 \
461  vec_vsx_st(out0, 0, dest); \
462  vec_vsx_st(out1, 16, dest); \
463 \
464  dest += 24; \
465  break; \
466  case AV_PIX_FMT_BGR24: \
467  out0 = vec_perm(bd, gd, perm3rg0); \
468  out0 = vec_perm(out0, rd, perm3tb0); \
469  out1 = vec_perm(bd, gd, perm3rg1); \
470  out1 = vec_perm(out1, rd, perm3tb1); \
471 \
472  vec_vsx_st(out0, 0, dest); \
473  vec_vsx_st(out1, 16, dest); \
474 \
475  dest += 24; \
476  break; \
477  case AV_PIX_FMT_BGRA: \
478  out0 = vec_mergeh(bd, gd); \
479  out1 = vec_mergeh(rd, ad); \
480 \
481  tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
482  vec_vsx_st(tmp8, 0, dest); \
483  tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
484  vec_vsx_st(tmp8, 16, dest); \
485 \
486  dest += 32; \
487  break; \
488  case AV_PIX_FMT_RGBA: \
489  out0 = vec_mergeh(rd, gd); \
490  out1 = vec_mergeh(bd, ad); \
491 \
492  tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
493  vec_vsx_st(tmp8, 0, dest); \
494  tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
495  vec_vsx_st(tmp8, 16, dest); \
496 \
497  dest += 32; \
498  break; \
499  case AV_PIX_FMT_ARGB: \
500  out0 = vec_mergeh(ad, rd); \
501  out1 = vec_mergeh(gd, bd); \
502 \
503  tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
504  vec_vsx_st(tmp8, 0, dest); \
505  tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
506  vec_vsx_st(tmp8, 16, dest); \
507 \
508  dest += 32; \
509  break; \
510  case AV_PIX_FMT_ABGR: \
511  out0 = vec_mergeh(ad, bd); \
512  out1 = vec_mergeh(gd, rd); \
513 \
514  tmp8 = (vector uint8_t) vec_mergeh((vector uint16_t) out0, (vector uint16_t) out1); \
515  vec_vsx_st(tmp8, 0, dest); \
516  tmp8 = (vector uint8_t) vec_mergel((vector uint16_t) out0, (vector uint16_t) out1); \
517  vec_vsx_st(tmp8, 16, dest); \
518 \
519  dest += 32; \
520  break; \
521  }
522 
523 static av_always_inline void
524 yuv2rgb_full_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
525  const int16_t **lumSrc, int lumFilterSize,
526  const int16_t *chrFilter, const int16_t **chrUSrc,
527  const int16_t **chrVSrc, int chrFilterSize,
528  const int16_t **alpSrc, uint8_t *dest,
529  int dstW, int y, enum AVPixelFormat target, int hasAlpha)
530 {
531  vector int16_t vv;
532  vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
533  vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
534  vector int32_t tmp, tmp2, tmp3, tmp4;
535  vector uint16_t rd16, gd16, bd16;
536  vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
537  vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
538  const vector int32_t ystart = vec_splats(1 << 9);
539  const vector int32_t uvstart = vec_splats((1 << 9) - (128 << 19));
540  const vector uint16_t zero16 = vec_splat_u16(0);
541  const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
542  const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
543  const vector int32_t y_add = vec_splats(1 << 21);
544  const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
545  const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
546  const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
547  const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
548  const vector int32_t rgbclip = vec_splats(1 << 30);
549  const vector int32_t zero32 = vec_splat_s32(0);
550  const vector uint32_t shift22 = vec_splats(22U);
551  const vector uint32_t shift10 = vec_splat_u32(10);
552  int i, j;
553 
554  // Various permutations
555  const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
556  0x1, 0x11, 0,
557  0x2, 0x12, 0,
558  0x3, 0x13, 0,
559  0x4, 0x14, 0,
560  0x5 };
561  const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
562  0x6, 0x16, 0,
563  0x7, 0x17, 0 };
564  const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
565  0x3, 0x4, 0x11,
566  0x6, 0x7, 0x12,
567  0x9, 0xa, 0x13,
568  0xc, 0xd, 0x14,
569  0xf };
570  const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
571  0x2, 0x3, 0x16,
572  0x5, 0x6, 0x17 };
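 // For the 24-bit formats, perm3rg0/perm3rg1 interleave the R and G bytes
 // with a placeholder every third byte, and perm3tb0/perm3tb1 then drop the
 // B bytes into those placeholder slots, yielding tightly packed
 // R0 G0 B0 R1 G1 B1 ... across the two output vectors.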
573 
574  ad = vec_splats((uint8_t) 255);
575 
576  for (i = 0; i < lumFilterSize; i++)
577  vlumFilter[i] = vec_splats(lumFilter[i]);
578  for (i = 0; i < chrFilterSize; i++)
579  vchrFilter[i] = vec_splats(chrFilter[i]);
580 
581  for (i = 0; i < dstW; i += 8) {
582  vy32_l =
583  vy32_r = ystart;
584  vu32_l =
585  vu32_r =
586  vv32_l =
587  vv32_r = uvstart;
588 
589  for (j = 0; j < lumFilterSize; j++) {
590  vv = vec_ld(0, &lumSrc[j][i]);
591  tmp = vec_mule(vv, vlumFilter[j]);
592  tmp2 = vec_mulo(vv, vlumFilter[j]);
593  tmp3 = vec_mergeh(tmp, tmp2);
594  tmp4 = vec_mergel(tmp, tmp2);
595 
596  vy32_l = vec_adds(vy32_l, tmp3);
597  vy32_r = vec_adds(vy32_r, tmp4);
598  }
599 
600  for (j = 0; j < chrFilterSize; j++) {
601  vv = vec_ld(0, &chrUSrc[j][i]);
602  tmp = vec_mule(vv, vchrFilter[j]);
603  tmp2 = vec_mulo(vv, vchrFilter[j]);
604  tmp3 = vec_mergeh(tmp, tmp2);
605  tmp4 = vec_mergel(tmp, tmp2);
606 
607  vu32_l = vec_adds(vu32_l, tmp3);
608  vu32_r = vec_adds(vu32_r, tmp4);
609 
610  vv = vec_ld(0, &chrVSrc[j][i]);
611  tmp = vec_mule(vv, vchrFilter[j]);
612  tmp2 = vec_mulo(vv, vchrFilter[j]);
613  tmp3 = vec_mergeh(tmp, tmp2);
614  tmp4 = vec_mergel(tmp, tmp2);
615 
616  vv32_l = vec_adds(vv32_l, tmp3);
617  vv32_r = vec_adds(vv32_r, tmp4);
618  }
619 
620  vy32_l = vec_sra(vy32_l, shift10);
621  vy32_r = vec_sra(vy32_r, shift10);
622  vu32_l = vec_sra(vu32_l, shift10);
623  vu32_r = vec_sra(vu32_r, shift10);
624  vv32_l = vec_sra(vv32_l, shift10);
625  vv32_r = vec_sra(vv32_r, shift10);
626 
627  vy32_l = vec_sub(vy32_l, y_offset);
628  vy32_r = vec_sub(vy32_r, y_offset);
629  vy32_l = vec_mul(vy32_l, y_coeff);
630  vy32_r = vec_mul(vy32_r, y_coeff);
631  vy32_l = vec_add(vy32_l, y_add);
632  vy32_r = vec_add(vy32_r, y_add);
633 
634  R_l = vec_mul(vv32_l, v2r_coeff);
635  R_l = vec_add(R_l, vy32_l);
636  R_r = vec_mul(vv32_r, v2r_coeff);
637  R_r = vec_add(R_r, vy32_r);
638  G_l = vec_mul(vv32_l, v2g_coeff);
639  tmp32 = vec_mul(vu32_l, u2g_coeff);
640  G_l = vec_add(G_l, vy32_l);
641  G_l = vec_add(G_l, tmp32);
642  G_r = vec_mul(vv32_r, v2g_coeff);
643  tmp32 = vec_mul(vu32_r, u2g_coeff);
644  G_r = vec_add(G_r, vy32_r);
645  G_r = vec_add(G_r, tmp32);
646 
647  B_l = vec_mul(vu32_l, u2b_coeff);
648  B_l = vec_add(B_l, vy32_l);
649  B_r = vec_mul(vu32_r, u2b_coeff);
650  B_r = vec_add(B_r, vy32_r);
651 
652  WRITERGB
653  }
654 }
655 
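// SETUP computes the vertical blend buf0[]*alpha1 + buf1[]*alpha for 8
// samples, leaving the low four 32-bit sums in tmp3 and the high four in
// tmp4; the callers then shift the result down (and remove the chroma or
// alpha bias where needed).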
656 #define SETUP(x, buf0, alpha1, buf1, alpha) { \
657  x = vec_ld(0, buf0); \
658  tmp = vec_mule(x, alpha1); \
659  tmp2 = vec_mulo(x, alpha1); \
660  tmp3 = vec_mergeh(tmp, tmp2); \
661  tmp4 = vec_mergel(tmp, tmp2); \
662 \
663  x = vec_ld(0, buf1); \
664  tmp = vec_mule(x, alpha); \
665  tmp2 = vec_mulo(x, alpha); \
666  tmp5 = vec_mergeh(tmp, tmp2); \
667  tmp6 = vec_mergel(tmp, tmp2); \
668 \
669  tmp3 = vec_add(tmp3, tmp5); \
670  tmp4 = vec_add(tmp4, tmp6); \
671 }
672 
673 
674 static av_always_inline void
675 yuv2rgb_full_2_vsx_template(SwsContext *c, const int16_t *buf[2],
676  const int16_t *ubuf[2], const int16_t *vbuf[2],
677  const int16_t *abuf[2], uint8_t *dest, int dstW,
678  int yalpha, int uvalpha, int y,
679  enum AVPixelFormat target, int hasAlpha)
680 {
681  const int16_t *buf0 = buf[0], *buf1 = buf[1],
682  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
683  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
684  *abuf0 = hasAlpha ? abuf[0] : NULL,
685  *abuf1 = hasAlpha ? abuf[1] : NULL;
686  const int16_t yalpha1 = 4096 - yalpha;
687  const int16_t uvalpha1 = 4096 - uvalpha;
688  vector int16_t vy, vu, vv, A = vec_splat_s16(0);
689  vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
690  vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
691  vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
692  vector uint16_t rd16, gd16, bd16;
693  vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
694  const vector int16_t vyalpha1 = vec_splats(yalpha1);
695  const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
696  const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
697  const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
698  const vector uint16_t zero16 = vec_splat_u16(0);
699  const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
700  const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
701  const vector int32_t y_add = vec_splats(1 << 21);
702  const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
703  const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
704  const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
705  const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
706  const vector int32_t rgbclip = vec_splats(1 << 30);
707  const vector int32_t zero32 = vec_splat_s32(0);
708  const vector uint32_t shift19 = vec_splats(19U);
709  const vector uint32_t shift22 = vec_splats(22U);
710  const vector uint32_t shift10 = vec_splat_u32(10);
711  const vector int32_t dec128 = vec_splats(128 << 19);
712  const vector int32_t add18 = vec_splats(1 << 18);
713  int i;
714 
715  // Various permutations
716  const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
717  0x1, 0x11, 0,
718  0x2, 0x12, 0,
719  0x3, 0x13, 0,
720  0x4, 0x14, 0,
721  0x5 };
722  const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
723  0x6, 0x16, 0,
724  0x7, 0x17, 0 };
725  const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
726  0x3, 0x4, 0x11,
727  0x6, 0x7, 0x12,
728  0x9, 0xa, 0x13,
729  0xc, 0xd, 0x14,
730  0xf };
731  const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
732  0x2, 0x3, 0x16,
733  0x5, 0x6, 0x17 };
734 
735  av_assert2(yalpha <= 4096U);
736  av_assert2(uvalpha <= 4096U);
737 
738  for (i = 0; i < dstW; i += 8) {
739  SETUP(vy, &buf0[i], vyalpha1, &buf1[i], vyalpha);
740  vy32_l = vec_sra(tmp3, shift10);
741  vy32_r = vec_sra(tmp4, shift10);
742 
743  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
744  tmp3 = vec_sub(tmp3, dec128);
745  tmp4 = vec_sub(tmp4, dec128);
746  vu32_l = vec_sra(tmp3, shift10);
747  vu32_r = vec_sra(tmp4, shift10);
748 
749  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
750  tmp3 = vec_sub(tmp3, dec128);
751  tmp4 = vec_sub(tmp4, dec128);
752  vv32_l = vec_sra(tmp3, shift10);
753  vv32_r = vec_sra(tmp4, shift10);
754 
755  if (hasAlpha) {
756  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
757  tmp3 = vec_add(tmp3, add18);
758  tmp4 = vec_add(tmp4, add18);
759  tmp3 = vec_sra(tmp3, shift19);
760  tmp4 = vec_sra(tmp4, shift19);
761  A = vec_packs(tmp3, tmp4);
762  ad = vec_packsu(A, (vector int16_t) zero16);
763  } else {
764  ad = vec_splats((uint8_t) 255);
765  }
766 
767  vy32_l = vec_sub(vy32_l, y_offset);
768  vy32_r = vec_sub(vy32_r, y_offset);
769  vy32_l = vec_mul(vy32_l, y_coeff);
770  vy32_r = vec_mul(vy32_r, y_coeff);
771  vy32_l = vec_add(vy32_l, y_add);
772  vy32_r = vec_add(vy32_r, y_add);
773 
774  R_l = vec_mul(vv32_l, v2r_coeff);
775  R_l = vec_add(R_l, vy32_l);
776  R_r = vec_mul(vv32_r, v2r_coeff);
777  R_r = vec_add(R_r, vy32_r);
778  G_l = vec_mul(vv32_l, v2g_coeff);
779  tmp32 = vec_mul(vu32_l, u2g_coeff);
780  G_l = vec_add(G_l, vy32_l);
781  G_l = vec_add(G_l, tmp32);
782  G_r = vec_mul(vv32_r, v2g_coeff);
783  tmp32 = vec_mul(vu32_r, u2g_coeff);
784  G_r = vec_add(G_r, vy32_r);
785  G_r = vec_add(G_r, tmp32);
786 
787  B_l = vec_mul(vu32_l, u2b_coeff);
788  B_l = vec_add(B_l, vy32_l);
789  B_r = vec_mul(vu32_r, u2b_coeff);
790  B_r = vec_add(B_r, vy32_r);
791 
792  WRITERGB
793  }
794 }
795 
796 static av_always_inline void
797 yuv2rgb_2_vsx_template(SwsContext *c, const int16_t *buf[2],
798  const int16_t *ubuf[2], const int16_t *vbuf[2],
799  const int16_t *abuf[2], uint8_t *dest, int dstW,
800  int yalpha, int uvalpha, int y,
801  enum AVPixelFormat target, int hasAlpha)
802 {
803  const int16_t *buf0 = buf[0], *buf1 = buf[1],
804  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
805  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
806  *abuf0 = hasAlpha ? abuf[0] : NULL,
807  *abuf1 = hasAlpha ? abuf[1] : NULL;
808  const int16_t yalpha1 = 4096 - yalpha;
809  const int16_t uvalpha1 = 4096 - uvalpha;
810  vector int16_t vy, vu, vv, A = vec_splat_s16(0);
811  vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32;
812  vector int32_t R_l, R_r, G_l, G_r, B_l, B_r, vud32_l, vud32_r, vvd32_l, vvd32_r;
813  vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
814  vector uint16_t rd16, gd16, bd16;
815  vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
816  const vector int16_t vyalpha1 = vec_splats(yalpha1);
817  const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
818  const vector int16_t vyalpha = vec_splats((int16_t) yalpha);
819  const vector int16_t vuvalpha = vec_splats((int16_t) uvalpha);
820  const vector uint16_t zero16 = vec_splat_u16(0);
821  const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
822  const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
823  const vector int32_t y_add = vec_splats(1 << 21);
824  const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
825  const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
826  const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
827  const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
828  const vector int32_t rgbclip = vec_splats(1 << 30);
829  const vector int32_t zero32 = vec_splat_s32(0);
830  const vector uint32_t shift19 = vec_splats(19U);
831  const vector uint32_t shift22 = vec_splats(22U);
832  const vector uint32_t shift10 = vec_splat_u32(10);
833  const vector int32_t dec128 = vec_splats(128 << 19);
834  const vector int32_t add18 = vec_splats(1 << 18);
835  int i;
836 
837  // Various permutations
838  const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
839  0, 1, 2, 3,
840  4, 5, 6, 7,
841  4, 5, 6, 7 };
842  const vector uint8_t doubleright = (vector uint8_t) {8, 9, 10, 11,
843  8, 9, 10, 11,
844  12, 13, 14, 15,
845  12, 13, 14, 15 };
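 // doubleleft/doubleright duplicate each 32-bit chroma value, turning
 // {c0, c1, c2, c3} into {c0, c0, c1, c1} and {c2, c2, c3, c3}, so each
 // horizontally subsampled U/V sample is reused for two adjacent luma pixels.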
846  const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
847  0x1, 0x11, 0,
848  0x2, 0x12, 0,
849  0x3, 0x13, 0,
850  0x4, 0x14, 0,
851  0x5 };
852  const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
853  0x6, 0x16, 0,
854  0x7, 0x17, 0 };
855  const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
856  0x3, 0x4, 0x11,
857  0x6, 0x7, 0x12,
858  0x9, 0xa, 0x13,
859  0xc, 0xd, 0x14,
860  0xf };
861  const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
862  0x2, 0x3, 0x16,
863  0x5, 0x6, 0x17 };
864 
865  av_assert2(yalpha <= 4096U);
866  av_assert2(uvalpha <= 4096U);
867 
868  for (i = 0; i < (dstW + 1) >> 1; i += 8) {
869  SETUP(vy, &buf0[i * 2], vyalpha1, &buf1[i * 2], vyalpha);
870  vy32_l = vec_sra(tmp3, shift10);
871  vy32_r = vec_sra(tmp4, shift10);
872 
873  SETUP(vu, &ubuf0[i], vuvalpha1, &ubuf1[i], vuvalpha);
874  tmp3 = vec_sub(tmp3, dec128);
875  tmp4 = vec_sub(tmp4, dec128);
876  vu32_l = vec_sra(tmp3, shift10);
877  vu32_r = vec_sra(tmp4, shift10);
878 
879  SETUP(vv, &vbuf0[i], vuvalpha1, &vbuf1[i], vuvalpha);
880  tmp3 = vec_sub(tmp3, dec128);
881  tmp4 = vec_sub(tmp4, dec128);
882  vv32_l = vec_sra(tmp3, shift10);
883  vv32_r = vec_sra(tmp4, shift10);
884 
885  if (hasAlpha) {
886  SETUP(A, &abuf0[i], vyalpha1, &abuf1[i], vyalpha);
887  tmp3 = vec_add(tmp3, add18);
888  tmp4 = vec_add(tmp4, add18);
889  tmp3 = vec_sra(tmp3, shift19);
890  tmp4 = vec_sra(tmp4, shift19);
891  A = vec_packs(tmp3, tmp4);
892  ad = vec_packsu(A, (vector int16_t) zero16);
893  } else {
894  ad = vec_splats((uint8_t) 255);
895  }
896 
897  vy32_l = vec_sub(vy32_l, y_offset);
898  vy32_r = vec_sub(vy32_r, y_offset);
899  vy32_l = vec_mul(vy32_l, y_coeff);
900  vy32_r = vec_mul(vy32_r, y_coeff);
901  vy32_l = vec_add(vy32_l, y_add);
902  vy32_r = vec_add(vy32_r, y_add);
903 
904  // Use the first UV half
905  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
906  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
907  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
908  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
909 
910  R_l = vec_mul(vvd32_l, v2r_coeff);
911  R_l = vec_add(R_l, vy32_l);
912  R_r = vec_mul(vvd32_r, v2r_coeff);
913  R_r = vec_add(R_r, vy32_r);
914  G_l = vec_mul(vvd32_l, v2g_coeff);
915  tmp32 = vec_mul(vud32_l, u2g_coeff);
916  G_l = vec_add(G_l, vy32_l);
917  G_l = vec_add(G_l, tmp32);
918  G_r = vec_mul(vvd32_r, v2g_coeff);
919  tmp32 = vec_mul(vud32_r, u2g_coeff);
920  G_r = vec_add(G_r, vy32_r);
921  G_r = vec_add(G_r, tmp32);
922 
923  B_l = vec_mul(vud32_l, u2b_coeff);
924  B_l = vec_add(B_l, vy32_l);
925  B_r = vec_mul(vud32_r, u2b_coeff);
926  B_r = vec_add(B_r, vy32_r);
927 
928  WRITERGB
929 
930  // New Y for the second half
931  SETUP(vy, &buf0[i * 2 + 8], vyalpha1, &buf1[i * 2 + 8], vyalpha);
932  vy32_l = vec_sra(tmp3, shift10);
933  vy32_r = vec_sra(tmp4, shift10);
934 
935  vy32_l = vec_sub(vy32_l, y_offset);
936  vy32_r = vec_sub(vy32_r, y_offset);
937  vy32_l = vec_mul(vy32_l, y_coeff);
938  vy32_r = vec_mul(vy32_r, y_coeff);
939  vy32_l = vec_add(vy32_l, y_add);
940  vy32_r = vec_add(vy32_r, y_add);
941 
942  // Second UV half
943  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
944  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
945  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
946  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
947 
948  R_l = vec_mul(vvd32_l, v2r_coeff);
949  R_l = vec_add(R_l, vy32_l);
950  R_r = vec_mul(vvd32_r, v2r_coeff);
951  R_r = vec_add(R_r, vy32_r);
952  G_l = vec_mul(vvd32_l, v2g_coeff);
953  tmp32 = vec_mul(vud32_l, u2g_coeff);
954  G_l = vec_add(G_l, vy32_l);
955  G_l = vec_add(G_l, tmp32);
956  G_r = vec_mul(vvd32_r, v2g_coeff);
957  tmp32 = vec_mul(vud32_r, u2g_coeff);
958  G_r = vec_add(G_r, vy32_r);
959  G_r = vec_add(G_r, tmp32);
960 
961  B_l = vec_mul(vud32_l, u2b_coeff);
962  B_l = vec_add(B_l, vy32_l);
963  B_r = vec_mul(vud32_r, u2b_coeff);
964  B_r = vec_add(B_r, vy32_r);
965 
966  WRITERGB
967  }
968 }
969 
970 #undef SETUP
971 
972 static av_always_inline void
973 yuv2rgb_full_1_vsx_template(SwsContext *c, const int16_t *buf0,
974  const int16_t *ubuf[2], const int16_t *vbuf[2],
975  const int16_t *abuf0, uint8_t *dest, int dstW,
976  int uvalpha, int y, enum AVPixelFormat target,
977  int hasAlpha)
978 {
979  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
980  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
981  vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
982  vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
983  vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
984  vector uint16_t rd16, gd16, bd16;
985  vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
986  const vector uint16_t zero16 = vec_splat_u16(0);
987  const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
988  const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
989  const vector int32_t y_add = vec_splats(1 << 21);
990  const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
991  const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
992  const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
993  const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
994  const vector int32_t rgbclip = vec_splats(1 << 30);
995  const vector int32_t zero32 = vec_splat_s32(0);
996  const vector uint32_t shift2 = vec_splat_u32(2);
997  const vector uint32_t shift22 = vec_splats(22U);
998  const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
999  const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
1000  const vector int16_t mul4 = vec_splat_s16(4);
1001  const vector int16_t mul8 = vec_splat_s16(8);
1002  const vector int16_t add64 = vec_splat_s16(64);
1003  const vector uint16_t shift7 = vec_splat_u16(7);
1004  const vector int16_t max255 = vec_splat_s16(255);
1005  int i;
1006 
1007  // Various permutations
1008  const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
1009  0x1, 0x11, 0,
1010  0x2, 0x12, 0,
1011  0x3, 0x13, 0,
1012  0x4, 0x14, 0,
1013  0x5 };
1014  const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
1015  0x6, 0x16, 0,
1016  0x7, 0x17, 0 };
1017  const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
1018  0x3, 0x4, 0x11,
1019  0x6, 0x7, 0x12,
1020  0x9, 0xa, 0x13,
1021  0xc, 0xd, 0x14,
1022  0xf };
1023  const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
1024  0x2, 0x3, 0x16,
1025  0x5, 0x6, 0x17 };
1026 
1027  for (i = 0; i < dstW; i += 8) { // The x86 asm also overwrites padding bytes.
1028  vy = vec_ld(0, &buf0[i]);
1029  vy32_l = vec_unpackh(vy);
1030  vy32_r = vec_unpackl(vy);
1031  vy32_l = vec_sl(vy32_l, shift2);
1032  vy32_r = vec_sl(vy32_r, shift2);
1033 
1034  vu = vec_ld(0, &ubuf0[i]);
1035  vv = vec_ld(0, &vbuf0[i]);
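 // uvalpha < 2048 means this output line is nearest the first chroma line,
 // so only ubuf0/vbuf0 are used (after removing the 128 << 7 bias); in the
 // other branch the two chroma lines are summed first, hence the wider
 // 128 << 8 bias.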
1036  if (uvalpha < 2048) {
1037  vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub7);
1038  vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub7);
1039 
1040  tmp32 = vec_mule(vu, mul4);
1041  tmp32_2 = vec_mulo(vu, mul4);
1042  vu32_l = vec_mergeh(tmp32, tmp32_2);
1043  vu32_r = vec_mergel(tmp32, tmp32_2);
1044  tmp32 = vec_mule(vv, mul4);
1045  tmp32_2 = vec_mulo(vv, mul4);
1046  vv32_l = vec_mergeh(tmp32, tmp32_2);
1047  vv32_r = vec_mergel(tmp32, tmp32_2);
1048  } else {
1049  tmp16 = vec_ld(0, &ubuf1[i]);
1050  vu = vec_add(vu, tmp16);
1051  vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub8);
1052  tmp16 = vec_ld(0, &vbuf1[i]);
1053  vv = vec_add(vv, tmp16);
1054  vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub8);
1055 
1056  vu32_l = vec_mule(vu, mul8);
1057  vu32_r = vec_mulo(vu, mul8);
1058  vv32_l = vec_mule(vv, mul8);
1059  vv32_r = vec_mulo(vv, mul8);
1060  }
1061 
1062  if (hasAlpha) {
1063  A = vec_ld(0, &abuf0[i]);
1064  A = vec_add(A, add64);
1065  A = vec_sr(A, shift7);
1066  A = vec_max(A, max255);
1067  ad = vec_packsu(A, (vector int16_t) zero16);
1068  } else {
1069  ad = vec_splats((uint8_t) 255);
1070  }
1071 
1072  vy32_l = vec_sub(vy32_l, y_offset);
1073  vy32_r = vec_sub(vy32_r, y_offset);
1074  vy32_l = vec_mul(vy32_l, y_coeff);
1075  vy32_r = vec_mul(vy32_r, y_coeff);
1076  vy32_l = vec_add(vy32_l, y_add);
1077  vy32_r = vec_add(vy32_r, y_add);
1078 
1079  R_l = vec_mul(vv32_l, v2r_coeff);
1080  R_l = vec_add(R_l, vy32_l);
1081  R_r = vec_mul(vv32_r, v2r_coeff);
1082  R_r = vec_add(R_r, vy32_r);
1083  G_l = vec_mul(vv32_l, v2g_coeff);
1084  tmp32 = vec_mul(vu32_l, u2g_coeff);
1085  G_l = vec_add(G_l, vy32_l);
1086  G_l = vec_add(G_l, tmp32);
1087  G_r = vec_mul(vv32_r, v2g_coeff);
1088  tmp32 = vec_mul(vu32_r, u2g_coeff);
1089  G_r = vec_add(G_r, vy32_r);
1090  G_r = vec_add(G_r, tmp32);
1091 
1092  B_l = vec_mul(vu32_l, u2b_coeff);
1093  B_l = vec_add(B_l, vy32_l);
1094  B_r = vec_mul(vu32_r, u2b_coeff);
1095  B_r = vec_add(B_r, vy32_r);
1096 
1097  WRITERGB
1098  }
1099 }
1100 
1101 static av_always_inline void
1102 yuv2rgb_1_vsx_template(SwsContext *c, const int16_t *buf0,
1103  const int16_t *ubuf[2], const int16_t *vbuf[2],
1104  const int16_t *abuf0, uint8_t *dest, int dstW,
1105  int uvalpha, int y, enum AVPixelFormat target,
1106  int hasAlpha)
1107 {
1108  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1109  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1110  vector int16_t vy, vu, vv, A = vec_splat_s16(0), tmp16;
1111  vector int32_t vy32_l, vy32_r, vu32_l, vu32_r, vv32_l, vv32_r, tmp32, tmp32_2;
1112  vector int32_t vud32_l, vud32_r, vvd32_l, vvd32_r;
1113  vector int32_t R_l, R_r, G_l, G_r, B_l, B_r;
1114  vector uint16_t rd16, gd16, bd16;
1115  vector uint8_t rd, bd, gd, ad, out0, out1, tmp8;
1116  const vector uint16_t zero16 = vec_splat_u16(0);
1117  const vector int32_t y_offset = vec_splats(c->yuv2rgb_y_offset);
1118  const vector int32_t y_coeff = vec_splats(c->yuv2rgb_y_coeff);
1119  const vector int32_t y_add = vec_splats(1 << 21);
1120  const vector int32_t v2r_coeff = vec_splats(c->yuv2rgb_v2r_coeff);
1121  const vector int32_t v2g_coeff = vec_splats(c->yuv2rgb_v2g_coeff);
1122  const vector int32_t u2g_coeff = vec_splats(c->yuv2rgb_u2g_coeff);
1123  const vector int32_t u2b_coeff = vec_splats(c->yuv2rgb_u2b_coeff);
1124  const vector int32_t rgbclip = vec_splats(1 << 30);
1125  const vector int32_t zero32 = vec_splat_s32(0);
1126  const vector uint32_t shift2 = vec_splat_u32(2);
1127  const vector uint32_t shift22 = vec_splats(22U);
1128  const vector uint16_t sub7 = vec_splats((uint16_t) (128 << 7));
1129  const vector uint16_t sub8 = vec_splats((uint16_t) (128 << 8));
1130  const vector int16_t mul4 = vec_splat_s16(4);
1131  const vector int16_t mul8 = vec_splat_s16(8);
1132  const vector int16_t add64 = vec_splat_s16(64);
1133  const vector uint16_t shift7 = vec_splat_u16(7);
1134  const vector int16_t max255 = vec_splat_s16(255);
1135  int i;
1136 
1137  // Various permutations
1138  const vector uint8_t doubleleft = (vector uint8_t) {0, 1, 2, 3,
1139  0, 1, 2, 3,
1140  4, 5, 6, 7,
1141  4, 5, 6, 7 };
1142  const vector uint8_t doubleright = (vector uint8_t) {8, 9, 10, 11,
1143  8, 9, 10, 11,
1144  12, 13, 14, 15,
1145  12, 13, 14, 15 };
1146  const vector uint8_t perm3rg0 = (vector uint8_t) {0x0, 0x10, 0,
1147  0x1, 0x11, 0,
1148  0x2, 0x12, 0,
1149  0x3, 0x13, 0,
1150  0x4, 0x14, 0,
1151  0x5 };
1152  const vector uint8_t perm3rg1 = (vector uint8_t) { 0x15, 0,
1153  0x6, 0x16, 0,
1154  0x7, 0x17, 0 };
1155  const vector uint8_t perm3tb0 = (vector uint8_t) {0x0, 0x1, 0x10,
1156  0x3, 0x4, 0x11,
1157  0x6, 0x7, 0x12,
1158  0x9, 0xa, 0x13,
1159  0xc, 0xd, 0x14,
1160  0xf };
1161  const vector uint8_t perm3tb1 = (vector uint8_t) { 0x0, 0x15,
1162  0x2, 0x3, 0x16,
1163  0x5, 0x6, 0x17 };
1164 
1165  for (i = 0; i < (dstW + 1) >> 1; i += 8) { // The x86 asm also overwrites padding bytes.
1166  vy = vec_ld(0, &buf0[i * 2]);
1167  vy32_l = vec_unpackh(vy);
1168  vy32_r = vec_unpackl(vy);
1169  vy32_l = vec_sl(vy32_l, shift2);
1170  vy32_r = vec_sl(vy32_r, shift2);
1171 
1172  vu = vec_ld(0, &ubuf0[i]);
1173  vv = vec_ld(0, &vbuf0[i]);
1174  if (uvalpha < 2048) {
1175  vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub7);
1176  vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub7);
1177 
1178  tmp32 = vec_mule(vu, mul4);
1179  tmp32_2 = vec_mulo(vu, mul4);
1180  vu32_l = vec_mergeh(tmp32, tmp32_2);
1181  vu32_r = vec_mergel(tmp32, tmp32_2);
1182  tmp32 = vec_mule(vv, mul4);
1183  tmp32_2 = vec_mulo(vv, mul4);
1184  vv32_l = vec_mergeh(tmp32, tmp32_2);
1185  vv32_r = vec_mergel(tmp32, tmp32_2);
1186  } else {
1187  tmp16 = vec_ld(0, &ubuf1[i]);
1188  vu = vec_add(vu, tmp16);
1189  vu = (vector int16_t) vec_sub((vector uint16_t) vu, sub8);
1190  tmp16 = vec_ld(0, &vbuf1[i]);
1191  vv = vec_add(vv, tmp16);
1192  vv = (vector int16_t) vec_sub((vector uint16_t) vv, sub8);
1193 
1194  vu32_l = vec_mule(vu, mul8);
1195  vu32_r = vec_mulo(vu, mul8);
1196  vv32_l = vec_mule(vv, mul8);
1197  vv32_r = vec_mulo(vv, mul8);
1198  }
1199 
1200  if (hasAlpha) {
1201  A = vec_ld(0, &abuf0[i]);
1202  A = vec_add(A, add64);
1203  A = vec_sr(A, shift7);
1204  A = vec_max(A, max255);
1205  ad = vec_packsu(A, (vector int16_t) zero16);
1206  } else {
1207  ad = vec_splats((uint8_t) 255);
1208  }
1209 
1210  vy32_l = vec_sub(vy32_l, y_offset);
1211  vy32_r = vec_sub(vy32_r, y_offset);
1212  vy32_l = vec_mul(vy32_l, y_coeff);
1213  vy32_r = vec_mul(vy32_r, y_coeff);
1214  vy32_l = vec_add(vy32_l, y_add);
1215  vy32_r = vec_add(vy32_r, y_add);
1216 
1217  // Use the first UV half
1218  vud32_l = vec_perm(vu32_l, vu32_l, doubleleft);
1219  vud32_r = vec_perm(vu32_l, vu32_l, doubleright);
1220  vvd32_l = vec_perm(vv32_l, vv32_l, doubleleft);
1221  vvd32_r = vec_perm(vv32_l, vv32_l, doubleright);
1222 
1223  R_l = vec_mul(vvd32_l, v2r_coeff);
1224  R_l = vec_add(R_l, vy32_l);
1225  R_r = vec_mul(vvd32_r, v2r_coeff);
1226  R_r = vec_add(R_r, vy32_r);
1227  G_l = vec_mul(vvd32_l, v2g_coeff);
1228  tmp32 = vec_mul(vud32_l, u2g_coeff);
1229  G_l = vec_add(G_l, vy32_l);
1230  G_l = vec_add(G_l, tmp32);
1231  G_r = vec_mul(vvd32_r, v2g_coeff);
1232  tmp32 = vec_mul(vud32_r, u2g_coeff);
1233  G_r = vec_add(G_r, vy32_r);
1234  G_r = vec_add(G_r, tmp32);
1235 
1236  B_l = vec_mul(vud32_l, u2b_coeff);
1237  B_l = vec_add(B_l, vy32_l);
1238  B_r = vec_mul(vud32_r, u2b_coeff);
1239  B_r = vec_add(B_r, vy32_r);
1240 
1241  WRITERGB
1242 
1243  // New Y for the second half
1244  vy = vec_ld(16, &buf0[i * 2]);
1245  vy32_l = vec_unpackh(vy);
1246  vy32_r = vec_unpackl(vy);
1247  vy32_l = vec_sl(vy32_l, shift2);
1248  vy32_r = vec_sl(vy32_r, shift2);
1249 
1250  vy32_l = vec_sub(vy32_l, y_offset);
1251  vy32_r = vec_sub(vy32_r, y_offset);
1252  vy32_l = vec_mul(vy32_l, y_coeff);
1253  vy32_r = vec_mul(vy32_r, y_coeff);
1254  vy32_l = vec_add(vy32_l, y_add);
1255  vy32_r = vec_add(vy32_r, y_add);
1256 
1257  // Second UV half
1258  vud32_l = vec_perm(vu32_r, vu32_r, doubleleft);
1259  vud32_r = vec_perm(vu32_r, vu32_r, doubleright);
1260  vvd32_l = vec_perm(vv32_r, vv32_r, doubleleft);
1261  vvd32_r = vec_perm(vv32_r, vv32_r, doubleright);
1262 
1263  R_l = vec_mul(vvd32_l, v2r_coeff);
1264  R_l = vec_add(R_l, vy32_l);
1265  R_r = vec_mul(vvd32_r, v2r_coeff);
1266  R_r = vec_add(R_r, vy32_r);
1267  G_l = vec_mul(vvd32_l, v2g_coeff);
1268  tmp32 = vec_mul(vud32_l, u2g_coeff);
1269  G_l = vec_add(G_l, vy32_l);
1270  G_l = vec_add(G_l, tmp32);
1271  G_r = vec_mul(vvd32_r, v2g_coeff);
1272  tmp32 = vec_mul(vud32_r, u2g_coeff);
1273  G_r = vec_add(G_r, vy32_r);
1274  G_r = vec_add(G_r, tmp32);
1275 
1276  B_l = vec_mul(vud32_l, u2b_coeff);
1277  B_l = vec_add(B_l, vy32_l);
1278  B_r = vec_mul(vud32_r, u2b_coeff);
1279  B_r = vec_add(B_r, vy32_r);
1280 
1281  WRITERGB
1282  }
1283 }
1284 
1285 #undef WRITERGB
1286 
1287 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
1288 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1289  const int16_t **lumSrc, int lumFilterSize, \
1290  const int16_t *chrFilter, const int16_t **chrUSrc, \
1291  const int16_t **chrVSrc, int chrFilterSize, \
1292  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1293  int y) \
1294 { \
1295  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1296  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1297  alpSrc, dest, dstW, y, fmt, hasAlpha); \
1298 }
1299 
1300 #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
1301 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1302  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1303  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1304  int yalpha, int uvalpha, int y) \
1305 { \
1306  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1307  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
1308 }
1309 
1310 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
1311 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1312  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1313  const int16_t *abuf0, uint8_t *dest, int dstW, \
1314  int uvalpha, int y) \
1315 { \
1316  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, abuf0, dest, \
1317  dstW, uvalpha, y, fmt, hasAlpha); \
1318 }
1319 
1320 YUV2RGBWRAPPER(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1321 YUV2RGBWRAPPER(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1322 YUV2RGBWRAPPER(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1323 YUV2RGBWRAPPER(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1324 
1325 YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1326 YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1327 
1328 YUV2RGBWRAPPERX2(yuv2, rgb, bgrx32, AV_PIX_FMT_BGRA, 0)
1329 YUV2RGBWRAPPERX2(yuv2, rgb, rgbx32, AV_PIX_FMT_RGBA, 0)
1330 YUV2RGBWRAPPERX2(yuv2, rgb, xrgb32, AV_PIX_FMT_ARGB, 0)
1331 YUV2RGBWRAPPERX2(yuv2, rgb, xbgr32, AV_PIX_FMT_ABGR, 0)
1332 
1333 YUV2RGBWRAPPERX2(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
1334 YUV2RGBWRAPPERX2(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
1335 
1336 YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1337 YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1338 YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1339 YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1340 
1341 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1342 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1343 
1344 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1345 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1346 YUV2RGBWRAPPERX2(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1347 YUV2RGBWRAPPERX2(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1348 
1349 YUV2RGBWRAPPERX2(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1350 YUV2RGBWRAPPERX2(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1351 
1352 YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1353 YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1354 YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1355 YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1356 
1357 YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1358 YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1359 
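/*
 * write422 packs two vectors of 8 luma values plus 8 U and 8 V values down
 * to bytes and interleaves them with the yuyv/yvyu/uyvy shuffle masks,
 * storing 32 bytes (16 pixels of the chosen packed 4:2:2 layout) per call.
 */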
1360 static av_always_inline void
1361 write422(const vector int16_t vy1, const vector int16_t vy2,
1362  const vector int16_t vu, const vector int16_t vv,
1363  uint8_t *dest, const enum AVPixelFormat target)
1364 {
1365  vector uint8_t vd1, vd2, tmp;
1366  const vector uint8_t yuyv1 = (vector uint8_t) {
1367  0x0, 0x10, 0x1, 0x18,
1368  0x2, 0x11, 0x3, 0x19,
1369  0x4, 0x12, 0x5, 0x1a,
1370  0x6, 0x13, 0x7, 0x1b };
1371  const vector uint8_t yuyv2 = (vector uint8_t) {
1372  0x8, 0x14, 0x9, 0x1c,
1373  0xa, 0x15, 0xb, 0x1d,
1374  0xc, 0x16, 0xd, 0x1e,
1375  0xe, 0x17, 0xf, 0x1f };
1376  const vector uint8_t yvyu1 = (vector uint8_t) {
1377  0x0, 0x18, 0x1, 0x10,
1378  0x2, 0x19, 0x3, 0x11,
1379  0x4, 0x1a, 0x5, 0x12,
1380  0x6, 0x1b, 0x7, 0x13 };
1381  const vector uint8_t yvyu2 = (vector uint8_t) {
1382  0x8, 0x1c, 0x9, 0x14,
1383  0xa, 0x1d, 0xb, 0x15,
1384  0xc, 0x1e, 0xd, 0x16,
1385  0xe, 0x1f, 0xf, 0x17 };
1386  const vector uint8_t uyvy1 = (vector uint8_t) {
1387  0x10, 0x0, 0x18, 0x1,
1388  0x11, 0x2, 0x19, 0x3,
1389  0x12, 0x4, 0x1a, 0x5,
1390  0x13, 0x6, 0x1b, 0x7 };
1391  const vector uint8_t uyvy2 = (vector uint8_t) {
1392  0x14, 0x8, 0x1c, 0x9,
1393  0x15, 0xa, 0x1d, 0xb,
1394  0x16, 0xc, 0x1e, 0xd,
1395  0x17, 0xe, 0x1f, 0xf };
1396 
1397  vd1 = vec_packsu(vy1, vy2);
1398  vd2 = vec_packsu(vu, vv);
1399 
1400  switch (target) {
1401  case AV_PIX_FMT_YUYV422:
1402  tmp = vec_perm(vd1, vd2, yuyv1);
1403  vec_st(tmp, 0, dest);
1404  tmp = vec_perm(vd1, vd2, yuyv2);
1405  vec_st(tmp, 16, dest);
1406  break;
1407  case AV_PIX_FMT_YVYU422:
1408  tmp = vec_perm(vd1, vd2, yvyu1);
1409  vec_st(tmp, 0, dest);
1410  tmp = vec_perm(vd1, vd2, yvyu2);
1411  vec_st(tmp, 16, dest);
1412  break;
1413  case AV_PIX_FMT_UYVY422:
1414  tmp = vec_perm(vd1, vd2, uyvy1);
1415  vec_st(tmp, 0, dest);
1416  tmp = vec_perm(vd1, vd2, uyvy2);
1417  vec_st(tmp, 16, dest);
1418  break;
1419  }
1420 }
1421 
1422 static av_always_inline void
1423 yuv2422_X_vsx_template(SwsContext *c, const int16_t *lumFilter,
1424  const int16_t **lumSrc, int lumFilterSize,
1425  const int16_t *chrFilter, const int16_t **chrUSrc,
1426  const int16_t **chrVSrc, int chrFilterSize,
1427  const int16_t **alpSrc, uint8_t *dest, int dstW,
1428  int y, enum AVPixelFormat target)
1429 {
1430  int i, j;
1431  vector int16_t vy1, vy2, vu, vv;
1432  vector int32_t vy32[4], vu32[2], vv32[2], tmp, tmp2, tmp3, tmp4;
1433  vector int16_t vlumFilter[MAX_FILTER_SIZE], vchrFilter[MAX_FILTER_SIZE];
1434  const vector int32_t start = vec_splats(1 << 18);
1435  const vector uint32_t shift19 = vec_splats(19U);
1436 
1437  for (i = 0; i < lumFilterSize; i++)
1438  vlumFilter[i] = vec_splats(lumFilter[i]);
1439  for (i = 0; i < chrFilterSize; i++)
1440  vchrFilter[i] = vec_splats(chrFilter[i]);
1441 
1442  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1443  vy32[0] =
1444  vy32[1] =
1445  vy32[2] =
1446  vy32[3] =
1447  vu32[0] =
1448  vu32[1] =
1449  vv32[0] =
1450  vv32[1] = start;
1451 
1452  for (j = 0; j < lumFilterSize; j++) {
1453  vv = vec_ld(0, &lumSrc[j][i * 2]);
1454  tmp = vec_mule(vv, vlumFilter[j]);
1455  tmp2 = vec_mulo(vv, vlumFilter[j]);
1456  tmp3 = vec_mergeh(tmp, tmp2);
1457  tmp4 = vec_mergel(tmp, tmp2);
1458 
1459  vy32[0] = vec_adds(vy32[0], tmp3);
1460  vy32[1] = vec_adds(vy32[1], tmp4);
1461 
1462  vv = vec_ld(0, &lumSrc[j][(i + 4) * 2]);
1463  tmp = vec_mule(vv, vlumFilter[j]);
1464  tmp2 = vec_mulo(vv, vlumFilter[j]);
1465  tmp3 = vec_mergeh(tmp, tmp2);
1466  tmp4 = vec_mergel(tmp, tmp2);
1467 
1468  vy32[2] = vec_adds(vy32[2], tmp3);
1469  vy32[3] = vec_adds(vy32[3], tmp4);
1470  }
1471 
1472  for (j = 0; j < chrFilterSize; j++) {
1473  vv = vec_ld(0, &chrUSrc[j][i]);
1474  tmp = vec_mule(vv, vchrFilter[j]);
1475  tmp2 = vec_mulo(vv, vchrFilter[j]);
1476  tmp3 = vec_mergeh(tmp, tmp2);
1477  tmp4 = vec_mergel(tmp, tmp2);
1478 
1479  vu32[0] = vec_adds(vu32[0], tmp3);
1480  vu32[1] = vec_adds(vu32[1], tmp4);
1481 
1482  vv = vec_ld(0, &chrVSrc[j][i]);
1483  tmp = vec_mule(vv, vchrFilter[j]);
1484  tmp2 = vec_mulo(vv, vchrFilter[j]);
1485  tmp3 = vec_mergeh(tmp, tmp2);
1486  tmp4 = vec_mergel(tmp, tmp2);
1487 
1488  vv32[0] = vec_adds(vv32[0], tmp3);
1489  vv32[1] = vec_adds(vv32[1], tmp4);
1490  }
1491 
1492  for (j = 0; j < 4; j++) {
1493  vy32[j] = vec_sra(vy32[j], shift19);
1494  }
1495  for (j = 0; j < 2; j++) {
1496  vu32[j] = vec_sra(vu32[j], shift19);
1497  vv32[j] = vec_sra(vv32[j], shift19);
1498  }
1499 
1500  vy1 = vec_packs(vy32[0], vy32[1]);
1501  vy2 = vec_packs(vy32[2], vy32[3]);
1502  vu = vec_packs(vu32[0], vu32[1]);
1503  vv = vec_packs(vv32[0], vv32[1]);
1504 
1505  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1506  }
1507 }
1508 
1509 #define SETUP(x, buf0, buf1, alpha) { \
1510  x = vec_ld(0, buf0); \
1511  tmp = vec_mule(x, alpha); \
1512  tmp2 = vec_mulo(x, alpha); \
1513  tmp3 = vec_mergeh(tmp, tmp2); \
1514  tmp4 = vec_mergel(tmp, tmp2); \
1515 \
1516  x = vec_ld(0, buf1); \
1517  tmp = vec_mule(x, alpha); \
1518  tmp2 = vec_mulo(x, alpha); \
1519  tmp5 = vec_mergeh(tmp, tmp2); \
1520  tmp6 = vec_mergel(tmp, tmp2); \
1521 \
1522  tmp3 = vec_add(tmp3, tmp5); \
1523  tmp4 = vec_add(tmp4, tmp6); \
1524 \
1525  tmp3 = vec_sra(tmp3, shift19); \
1526  tmp4 = vec_sra(tmp4, shift19); \
1527  x = vec_packs(tmp3, tmp4); \
1528 }
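// This SETUP variant weights 8 samples from each of the two input lines by
// the same coefficient, adds the 32-bit products, shifts the sums right by
// 19 and packs them back to signed 16 bits for write422.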
1529 
1530 static av_always_inline void
1531 yuv2422_2_vsx_template(SwsContext *c, const int16_t *buf[2],
1532  const int16_t *ubuf[2], const int16_t *vbuf[2],
1533  const int16_t *abuf[2], uint8_t *dest, int dstW,
1534  int yalpha, int uvalpha, int y,
1535  enum AVPixelFormat target)
1536 {
1537  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1538  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1539  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1540  const int16_t yalpha1 = 4096 - yalpha;
1541  const int16_t uvalpha1 = 4096 - uvalpha;
1542  vector int16_t vy1, vy2, vu, vv;
1543  vector int32_t tmp, tmp2, tmp3, tmp4, tmp5, tmp6;
1544  const vector int16_t vyalpha1 = vec_splats(yalpha1);
1545  const vector int16_t vuvalpha1 = vec_splats(uvalpha1);
1546  const vector uint32_t shift19 = vec_splats(19U);
1547  int i;
1548  av_assert2(yalpha <= 4096U);
1549  av_assert2(uvalpha <= 4096U);
1550 
1551  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1552 
1553  SETUP(vy1, &buf0[i * 2], &buf1[i * 2], vyalpha1)
1554  SETUP(vy2, &buf0[(i + 4) * 2], &buf1[(i + 4) * 2], vyalpha1)
1555  SETUP(vu, &ubuf0[i], &ubuf1[i], vuvalpha1)
1556  SETUP(vv, &vbuf0[i], &vbuf1[i], vuvalpha1)
1557 
1558  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1559  }
1560 }
1561 
1562 #undef SETUP
1563 
1564 static av_always_inline void
1565 yuv2422_1_vsx_template(SwsContext *c, const int16_t *buf0,
1566  const int16_t *ubuf[2], const int16_t *vbuf[2],
1567  const int16_t *abuf0, uint8_t *dest, int dstW,
1568  int uvalpha, int y, enum AVPixelFormat target)
1569 {
1570  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1571  vector int16_t vy1, vy2, vu, vv, tmp;
1572  const vector int16_t add64 = vec_splats((int16_t) 64);
1573  const vector int16_t add128 = vec_splats((int16_t) 128);
1574  const vector uint16_t shift7 = vec_splat_u16(7);
1575  const vector uint16_t shift8 = vec_splat_u16(8);
1576  int i;
1577 
1578  if (uvalpha < 2048) {
1579  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1580  vy1 = vec_ld(0, &buf0[i * 2]);
1581  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1582  vu = vec_ld(0, &ubuf0[i]);
1583  vv = vec_ld(0, &vbuf0[i]);
1584 
1585  vy1 = vec_add(vy1, add64);
1586  vy2 = vec_add(vy2, add64);
1587  vu = vec_add(vu, add64);
1588  vv = vec_add(vv, add64);
1589 
1590  vy1 = vec_sra(vy1, shift7);
1591  vy2 = vec_sra(vy2, shift7);
1592  vu = vec_sra(vu, shift7);
1593  vv = vec_sra(vv, shift7);
1594 
1595  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1596  }
1597  } else {
1598  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1599  for (i = 0; i < ((dstW + 1) >> 1); i += 8) {
1600  vy1 = vec_ld(0, &buf0[i * 2]);
1601  vy2 = vec_ld(0, &buf0[(i + 4) * 2]);
1602  vu = vec_ld(0, &ubuf0[i]);
1603  tmp = vec_ld(0, &ubuf1[i]);
1604  vu = vec_adds(vu, tmp);
1605  vv = vec_ld(0, &vbuf0[i]);
1606  tmp = vec_ld(0, &vbuf1[i]);
1607  vv = vec_adds(vv, tmp);
1608 
1609  vy1 = vec_add(vy1, add64);
1610  vy2 = vec_add(vy2, add64);
1611  vu = vec_adds(vu, add128);
1612  vv = vec_adds(vv, add128);
1613 
1614  vy1 = vec_sra(vy1, shift7);
1615  vy2 = vec_sra(vy2, shift7);
1616  vu = vec_sra(vu, shift8);
1617  vv = vec_sra(vv, shift8);
1618 
1619  write422(vy1, vy2, vu, vv, &dest[i * 4], target);
1620  }
1621  }
1622 }
1623 
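/* The wrapper macros below instantiate the _X (multi-tap), _2 (two-line) and
 * _1 (single-line) entry points from the templates above, one set per packed
 * 4:2:2 byte order (YUYV, YVYU, UYVY). */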
1624 #define YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1625 static void name ## ext ## _X_vsx(SwsContext *c, const int16_t *lumFilter, \
1626  const int16_t **lumSrc, int lumFilterSize, \
1627  const int16_t *chrFilter, const int16_t **chrUSrc, \
1628  const int16_t **chrVSrc, int chrFilterSize, \
1629  const int16_t **alpSrc, uint8_t *dest, int dstW, \
1630  int y) \
1631 { \
1632  name ## base ## _X_vsx_template(c, lumFilter, lumSrc, lumFilterSize, \
1633  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
1634  alpSrc, dest, dstW, y, fmt); \
1635 }
1636 
1637 #define YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1638 YUV2PACKEDWRAPPERX(name, base, ext, fmt) \
1639 static void name ## ext ## _2_vsx(SwsContext *c, const int16_t *buf[2], \
1640  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1641  const int16_t *abuf[2], uint8_t *dest, int dstW, \
1642  int yalpha, int uvalpha, int y) \
1643 { \
1644  name ## base ## _2_vsx_template(c, buf, ubuf, vbuf, abuf, \
1645  dest, dstW, yalpha, uvalpha, y, fmt); \
1646 }
1647 
1648 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
1649 YUV2PACKEDWRAPPER2(name, base, ext, fmt) \
1650 static void name ## ext ## _1_vsx(SwsContext *c, const int16_t *buf0, \
1651  const int16_t *ubuf[2], const int16_t *vbuf[2], \
1652  const int16_t *abuf0, uint8_t *dest, int dstW, \
1653  int uvalpha, int y) \
1654 { \
1655  name ## base ## _1_vsx_template(c, buf0, ubuf, vbuf, \
1656  abuf0, dest, dstW, uvalpha, \
1657  y, fmt); \
1658 }
1659 
1660 YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
1661 YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
1662 YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
1663 
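/* Fast bilinear horizontal luma scaling. xpos walks the source in 16.16
 * fixed point; the upper bits select the source byte (gathered with
 * vec_perm) and bits 9..15 form the 7-bit blend factor, so each output is
 * src[xx] * 128 + (src[xx + 1] - src[xx]) * alpha, as in the scalar
 * hyscale_fast_c. The scalar tail loop at the end rewrites positions that
 * would otherwise read past srcW - 1. */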
1664 static void hyscale_fast_vsx(SwsContext *c, int16_t *dst, int dstWidth,
1665  const uint8_t *src, int srcW, int xInc)
1666 {
1667  int i;
1668  unsigned int xpos = 0, xx;
1669  vector uint8_t vin, vin2, vperm;
1670  vector int8_t vmul, valpha;
1671  vector int16_t vtmp, vtmp2, vtmp3, vtmp4;
1672  vector uint16_t vd_l, vd_r, vcoord16[2];
1673  vector uint32_t vcoord[4];
1674  const vector uint32_t vadd = (vector uint32_t) {
1675  0,
1676  xInc * 1,
1677  xInc * 2,
1678  xInc * 3,
1679  };
1680  const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
1681  0,
1682  xInc * 1,
1683  xInc * 2,
1684  xInc * 3,
1685  xInc * 4,
1686  xInc * 5,
1687  xInc * 6,
1688  xInc * 7,
1689  };
1690  const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
1691  const vector uint16_t vshift9 = vec_splat_u16(9);
1692  const vector uint8_t vzero = vec_splat_u8(0);
1693  const vector uint16_t vshift = vec_splat_u16(7);
1694 
1695  for (i = 0; i < dstWidth; i += 16) {
1696  vcoord16[0] = vec_splats((uint16_t) xpos);
1697  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1698 
1699  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1700  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1701 
1702  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1703  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1704  valpha = (vector int8_t) vec_pack(vcoord16[0], vcoord16[1]);
1705 
1706  xx = xpos >> 16;
1707  vin = vec_vsx_ld(0, &src[xx]);
1708 
1709  vcoord[0] = vec_splats(xpos & 0xffff);
1710  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1711  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1712  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1713 
1714  vcoord[0] = vec_add(vcoord[0], vadd);
1715  vcoord[1] = vec_add(vcoord[1], vadd);
1716  vcoord[2] = vec_add(vcoord[2], vadd);
1717  vcoord[3] = vec_add(vcoord[3], vadd);
1718 
1719  vcoord[0] = vec_sr(vcoord[0], vshift16);
1720  vcoord[1] = vec_sr(vcoord[1], vshift16);
1721  vcoord[2] = vec_sr(vcoord[2], vshift16);
1722  vcoord[3] = vec_sr(vcoord[3], vshift16);
1723 
1724  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1725  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1726  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1727 
1728  vin = vec_perm(vin, vin, vperm);
1729 
1730  vin2 = vec_vsx_ld(1, &src[xx]);
1731  vin2 = vec_perm(vin2, vin2, vperm);
1732 
1733  vmul = (vector int8_t) vec_sub(vin2, vin);
1734  vtmp = vec_mule(vmul, valpha);
1735  vtmp2 = vec_mulo(vmul, valpha);
1736  vtmp3 = vec_mergeh(vtmp, vtmp2);
1737  vtmp4 = vec_mergel(vtmp, vtmp2);
1738 
1739  vd_l = (vector uint16_t) vec_mergeh(vin, vzero);
1740  vd_r = (vector uint16_t) vec_mergel(vin, vzero);
1741  vd_l = vec_sl(vd_l, vshift);
1742  vd_r = vec_sl(vd_r, vshift);
1743 
1744  vd_l = vec_add(vd_l, (vector uint16_t) vtmp3);
1745  vd_r = vec_add(vd_r, (vector uint16_t) vtmp4);
1746 
1747  vec_st((vector int16_t) vd_l, 0, &dst[i]);
1748  vec_st((vector int16_t) vd_r, 0, &dst[i + 8]);
1749 
1750  xpos += xInc * 16;
1751  }
1752  for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--)
1753  dst[i] = src[srcW - 1] * 128;
1754 }
1755 
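/* HCSCALE applies the same gather-and-blend to one chroma plane: in[xx] is
 * weighted by (alpha ^ 127) and in[xx + 1] by alpha, reproducing the scalar
 * hcscale_fast_c formula, and the 16-bit results are stored to out[]. The
 * macro expects xx, vperm, valpha and valphaxor to be set up by the caller's
 * loop. */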
1756 #define HCSCALE(in, out) \
1757  vin = vec_vsx_ld(0, &in[xx]); \
1758  vin = vec_perm(vin, vin, vperm); \
1759 \
1760  vin2 = vec_vsx_ld(1, &in[xx]); \
1761  vin2 = vec_perm(vin2, vin2, vperm); \
1762 \
1763  vtmp = vec_mule(vin, valphaxor); \
1764  vtmp2 = vec_mulo(vin, valphaxor); \
1765  vtmp3 = vec_mergeh(vtmp, vtmp2); \
1766  vtmp4 = vec_mergel(vtmp, vtmp2); \
1767 \
1768  vtmp = vec_mule(vin2, valpha); \
1769  vtmp2 = vec_mulo(vin2, valpha); \
1770  vd_l = vec_mergeh(vtmp, vtmp2); \
1771  vd_r = vec_mergel(vtmp, vtmp2); \
1772 \
1773  vd_l = vec_add(vd_l, vtmp3); \
1774  vd_r = vec_add(vd_r, vtmp4); \
1775 \
1776  vec_st((vector int16_t) vd_l, 0, &out[i]); \
1777  vec_st((vector int16_t) vd_r, 0, &out[i + 8])
1778 
1779 static void hcscale_fast_vsx(SwsContext *c, int16_t *dst1, int16_t *dst2,
1780  int dstWidth, const uint8_t *src1,
1781  const uint8_t *src2, int srcW, int xInc)
1782 {
1783  int i;
1784  unsigned int xpos = 0, xx;
1785  vector uint8_t vin, vin2, vperm;
1786  vector uint8_t valpha, valphaxor;
1787  vector uint16_t vtmp, vtmp2, vtmp3, vtmp4;
1788  vector uint16_t vd_l, vd_r, vcoord16[2];
1789  vector uint32_t vcoord[4];
1790  const vector uint8_t vxor = vec_splats((uint8_t) 127);
1791  const vector uint32_t vadd = (vector uint32_t) {
1792  0,
1793  xInc * 1,
1794  xInc * 2,
1795  xInc * 3,
1796  };
1797  const vector uint16_t vadd16 = (vector uint16_t) { // Modulo math
1798  0,
1799  xInc * 1,
1800  xInc * 2,
1801  xInc * 3,
1802  xInc * 4,
1803  xInc * 5,
1804  xInc * 6,
1805  xInc * 7,
1806  };
1807  const vector uint32_t vshift16 = vec_splats((uint32_t) 16);
1808  const vector uint16_t vshift9 = vec_splat_u16(9);
1809 
1810  for (i = 0; i < dstWidth; i += 16) {
1811  vcoord16[0] = vec_splats((uint16_t) xpos);
1812  vcoord16[1] = vec_splats((uint16_t) (xpos + xInc * 8));
1813 
1814  vcoord16[0] = vec_add(vcoord16[0], vadd16);
1815  vcoord16[1] = vec_add(vcoord16[1], vadd16);
1816 
1817  vcoord16[0] = vec_sr(vcoord16[0], vshift9);
1818  vcoord16[1] = vec_sr(vcoord16[1], vshift9);
1819  valpha = vec_pack(vcoord16[0], vcoord16[1]);
1820  valphaxor = vec_xor(valpha, vxor);
1821 
1822  xx = xpos >> 16;
1823 
1824  vcoord[0] = vec_splats(xpos & 0xffff);
1825  vcoord[1] = vec_splats((xpos & 0xffff) + xInc * 4);
1826  vcoord[2] = vec_splats((xpos & 0xffff) + xInc * 8);
1827  vcoord[3] = vec_splats((xpos & 0xffff) + xInc * 12);
1828 
1829  vcoord[0] = vec_add(vcoord[0], vadd);
1830  vcoord[1] = vec_add(vcoord[1], vadd);
1831  vcoord[2] = vec_add(vcoord[2], vadd);
1832  vcoord[3] = vec_add(vcoord[3], vadd);
1833 
1834  vcoord[0] = vec_sr(vcoord[0], vshift16);
1835  vcoord[1] = vec_sr(vcoord[1], vshift16);
1836  vcoord[2] = vec_sr(vcoord[2], vshift16);
1837  vcoord[3] = vec_sr(vcoord[3], vshift16);
1838 
1839  vcoord16[0] = vec_pack(vcoord[0], vcoord[1]);
1840  vcoord16[1] = vec_pack(vcoord[2], vcoord[3]);
1841  vperm = vec_pack(vcoord16[0], vcoord16[1]);
1842 
1843  HCSCALE(src1, dst1);
1844  HCSCALE(src2, dst2);
1845 
1846  xpos += xInc * 16;
1847  }
1848  for (i = dstWidth - 1; (i * xInc) >> 16 >= srcW - 1; i--) {
1849  dst1[i] = src1[srcW - 1] * 128;
1850  dst2[i] = src2[srcW - 1] * 128;
1851  }
1852 }
1853 
1854 #undef HCSCALE
1855 
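/* Horizontal scaling of 8-bit input to 19-bit intermediates with an
 * arbitrary filter. Input bytes are zero-extended to int16, then multiplied
 * and accumulated eight taps at a time with vec_msums; vunusedtab provides a
 * permute mask that zeroes the trailing taps when filterSize is not a
 * multiple of 8, and vec_sums reduces the partial sums before the final
 * >> 3 and clip to (1 << 19) - 1. */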
1856 static void hScale8To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1857  const uint8_t *src, const int16_t *filter,
1858  const int32_t *filterPos, int filterSize)
1859 {
1860  int i, j;
1861  int32_t *dst = (int32_t *) _dst;
1862  vector int16_t vfilter, vin;
1863  vector uint8_t vin8;
1864  vector int32_t vout;
1865  const vector uint8_t vzero = vec_splat_u8(0);
1866  const vector uint8_t vunusedtab[8] = {
1867  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1868  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1869  (vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1870  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1871  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1872  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1873  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1874  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1875  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1876  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1877  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1878  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1879  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1880  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1881  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1882  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1883  };
1884  const vector uint8_t vunused = vunusedtab[filterSize % 8];
1885 
1886  if (filterSize == 1) {
1887  for (i = 0; i < dstW; i++) {
1888  int srcPos = filterPos[i];
1889  int val = 0;
1890  for (j = 0; j < filterSize; j++) {
1891  val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
1892  }
1893  dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
1894  }
1895  } else {
1896  for (i = 0; i < dstW; i++) {
1897  const int srcPos = filterPos[i];
1898  vout = vec_splat_s32(0);
1899  for (j = 0; j < filterSize; j += 8) {
1900  vin8 = vec_vsx_ld(0, &src[srcPos + j]);
1901  vin = (vector int16_t) vec_mergeh(vin8, vzero);
1902  if (j + 8 > filterSize) // Remove the unused elements on the last round
1903  vin = vec_perm(vin, (vector int16_t) vzero, vunused);
1904 
1905  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1906  vout = vec_msums(vin, vfilter, vout);
1907  }
1908  vout = vec_sums(vout, (vector int32_t) vzero);
1909  dst[i] = FFMIN(vout[3] >> 3, (1 << 19) - 1);
1910  }
1911  }
1912 }
1913 
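/* 16-bit input variant producing 19-bit intermediates. The unsigned 16-bit
 * samples cannot be fed to the signed vec_msums path used above, so the
 * filter taps are sign-extended to 32 bits (vec_unpackh/vec_unpackl), the
 * samples zero-extended, multiplied with vec_mul and accumulated with
 * saturating adds. The shift `sh` depends on the source bit depth, with
 * special cases for RGB/PAL8 and float input. */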
1914 static void hScale16To19_vsx(SwsContext *c, int16_t *_dst, int dstW,
1915  const uint8_t *_src, const int16_t *filter,
1916  const int32_t *filterPos, int filterSize)
1917 {
1918  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
1919  int i, j;
1920  int32_t *dst = (int32_t *) _dst;
1921  const uint16_t *src = (const uint16_t *) _src;
1922  int bits = desc->comp[0].depth - 1;
1923  int sh = bits - 4;
1924  vector int16_t vfilter, vin;
1925  vector int32_t vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
1926  const vector uint8_t vzero = vec_splat_u8(0);
1927  const vector uint8_t vunusedtab[8] = {
1928  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1929  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
1930  (vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
1931  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1932  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
1933  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1934  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
1935  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1936  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1937  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1938  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1939  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
1940  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1941  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
1942  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
1943  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
1944  };
1945  const vector uint8_t vunused = vunusedtab[filterSize % 8];
1946 
1947  if ((isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8) && desc->comp[0].depth < 16) {
1948  sh = 9;
1949  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
1950  sh = 16 - 1 - 4;
1951  }
1952 
1953  if (filterSize == 1) {
1954  for (i = 0; i < dstW; i++) {
1955  int srcPos = filterPos[i];
1956  int val = 0;
1957 
1958  for (j = 0; j < filterSize; j++) {
1959  val += src[srcPos + j] * filter[filterSize * i + j];
1960  }
1961  // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1962  dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1963  }
1964  } else {
1965  for (i = 0; i < dstW; i++) {
1966  const int srcPos = filterPos[i];
1967  vout = vec_splat_s32(0);
1968  for (j = 0; j < filterSize; j += 8) {
1969  vin = (vector int16_t) vec_vsx_ld(0, &src[srcPos + j]);
1970  if (j + 8 > filterSize) // Remove the unused elements on the last round
1971  vin = vec_perm(vin, (vector int16_t) vzero, vunused);
1972 
1973  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
1974  vfilter32_l = vec_unpackh(vfilter);
1975  vfilter32_r = vec_unpackl(vfilter);
1976 
1977  vtmp = (vector int32_t) vec_mergeh(vin, (vector int16_t) vzero);
1978  vtmp2 = (vector int32_t) vec_mergel(vin, (vector int16_t) vzero);
1979 
1980  vtmp = vec_mul(vtmp, vfilter32_l);
1981  vtmp2 = vec_mul(vtmp2, vfilter32_r);
1982 
1983  vout = vec_adds(vout, vtmp);
1984  vout = vec_adds(vout, vtmp2);
1985  }
1986  vout = vec_sums(vout, (vector int32_t) vzero);
1987  dst[i] = FFMIN(vout[3] >> sh, (1 << 19) - 1);
1988  }
1989  }
1990 }
1991 
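/* Same structure as hScale16To19_vsx above, but producing 15-bit
 * intermediates: the per-depth shift keeps the result below 1 << 15 and the
 * output is clipped accordingly. */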
1992 static void hScale16To15_vsx(SwsContext *c, int16_t *dst, int dstW,
1993  const uint8_t *_src, const int16_t *filter,
1994  const int32_t *filterPos, int filterSize)
1995 {
1996  const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
1997  int i, j;
1998  const uint16_t *src = (const uint16_t *) _src;
1999  int sh = desc->comp[0].depth - 1;
2000  vector int16_t vfilter, vin;
2001  vector int32_t vout, vtmp, vtmp2, vfilter32_l, vfilter32_r;
2002  const vector uint8_t vzero = vec_splat_u8(0);
2003  const vector uint8_t vunusedtab[8] = {
2004  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2005  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf},
2006  (vector uint8_t) {0x0, 0x1, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
2007  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2008  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x10, 0x10, 0x10, 0x10,
2009  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2010  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x10, 0x10,
2011  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2012  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2013  0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2014  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2015  0x8, 0x9, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10},
2016  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2017  0x8, 0x9, 0xa, 0xb, 0x10, 0x10, 0x10, 0x10},
2018  (vector uint8_t) {0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
2019  0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x10, 0x10},
2020  };
2021  const vector uint8_t vunused = vunusedtab[filterSize % 8];
2022 
2023  if (sh < 15) {
2024  sh = isAnyRGB(c->srcFormat) || c->srcFormat == AV_PIX_FMT_PAL8 ? 13 : (desc->comp[0].depth - 1);
2025  } else if (desc->flags & AV_PIX_FMT_FLAG_FLOAT) { /* float input is processed like uint 16bpc */
2026  sh = 16 - 1;
2027  }
2028 
2029  if (filterSize == 1) {
2030  for (i = 0; i < dstW; i++) {
2031  int srcPos = filterPos[i];
2032  int val = 0;
2033 
2034  for (j = 0; j < filterSize; j++) {
2035  val += src[srcPos + j] * filter[filterSize * i + j];
2036  }
2037  // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
2038  dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
2039  }
2040  } else {
2041  for (i = 0; i < dstW; i++) {
2042  const int srcPos = filterPos[i];
2043  vout = vec_splat_s32(0);
2044  for (j = 0; j < filterSize; j += 8) {
2045  vin = (vector int16_t) vec_vsx_ld(0, &src[srcPos + j]);
2046  if (j + 8 > filterSize) // Remove the unused elements on the last round
2047  vin = vec_perm(vin, (vector int16_t) vzero, vunused);
2048 
2049  vfilter = vec_vsx_ld(0, &filter[filterSize * i + j]);
2050  vfilter32_l = vec_unpackh(vfilter);
2051  vfilter32_r = vec_unpackl(vfilter);
2052 
2053  vtmp = (vector int32_t) vec_mergeh(vin, (vector int16_t) vzero);
2054  vtmp2 = (vector int32_t) vec_mergel(vin, (vector int16_t) vzero);
2055 
2056  vtmp = vec_mul(vtmp, vfilter32_l);
2057  vtmp2 = vec_mul(vtmp2, vfilter32_r);
2058 
2059  vout = vec_adds(vout, vtmp);
2060  vout = vec_adds(vout, vtmp2);
2061  }
2062  vout = vec_sums(vout, (vector int32_t) vzero);
2063  dst[i] = FFMIN(vout[3] >> sh, (1 << 15) - 1);
2064  }
2065  }
2066 }
2067 
2068 #endif /* !HAVE_BIGENDIAN */
2069 
2070 #endif /* HAVE_VSX */
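/* Runtime dispatch for the VSX code paths. AV_CPU_FLAG_VSX is required; the
 * 16-bit horizontal scalers and the packed RGB writers additionally need
 * POWER8, and most assignments are little-endian only. The vertical output
 * (yuv2plane/yuv2packed) functions are skipped when SWS_BITEXACT is set. */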
2071 
2072 av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
2073 {
2074 #if HAVE_VSX
2075  enum AVPixelFormat dstFormat = c->dstFormat;
2076  const int cpu_flags = av_get_cpu_flags();
2077  const unsigned char power8 = HAVE_POWER8 && cpu_flags & AV_CPU_FLAG_POWER8;
2078 
2079  if (!(cpu_flags & AV_CPU_FLAG_VSX))
2080  return;
2081 
2082 #if !HAVE_BIGENDIAN
2083  if (c->srcBpc == 8) {
2084  if (c->dstBpc <= 14) {
2085  c->hyScale = c->hcScale = hScale_real_vsx;
2086  if (c->flags & SWS_FAST_BILINEAR && c->dstW >= c->srcW && c->chrDstW >= c->chrSrcW) {
2087  c->hyscale_fast = hyscale_fast_vsx;
2088  c->hcscale_fast = hcscale_fast_vsx;
2089  }
2090  } else {
2091  c->hyScale = c->hcScale = hScale8To19_vsx;
2092  }
2093  } else {
2094  if (power8) {
2095  c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_vsx
2096  : hScale16To15_vsx;
2097  }
2098  }
2099  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
2100  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
2101  !c->needAlpha) {
2102  c->yuv2planeX = yuv2planeX_vsx;
2103  }
2104 #endif
2105 
2106  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
2107  switch (c->dstBpc) {
2108  case 8:
2109  c->yuv2plane1 = yuv2plane1_8_vsx;
2110  break;
2111 #if !HAVE_BIGENDIAN
2112  case 9:
2113  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx : yuv2plane1_9LE_vsx;
2114  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx : yuv2planeX_9LE_vsx;
2115  break;
2116  case 10:
2117  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx;
2118  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx : yuv2planeX_10LE_vsx;
2119  break;
2120  case 12:
2121  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx;
2122  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx : yuv2planeX_12LE_vsx;
2123  break;
2124  case 14:
2125  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx;
2126  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx : yuv2planeX_14LE_vsx;
2127  break;
2128  case 16:
2129  c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx;
2130 #if HAVE_POWER8
2131  if (cpu_flags & AV_CPU_FLAG_POWER8) {
2132  c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : yuv2planeX_16LE_vsx;
2133  }
2134 #endif /* HAVE_POWER8 */
2135  break;
2136 #endif /* !HAVE_BIGENDIAN */
2137  }
2138  }
2139 
2140  if (c->flags & SWS_BITEXACT)
2141  return;
2142 
2143 #if !HAVE_BIGENDIAN
2144  if (c->flags & SWS_FULL_CHR_H_INT) {
2145  switch (dstFormat) {
2146  case AV_PIX_FMT_RGB24:
2147  if (power8) {
2148  c->yuv2packed1 = yuv2rgb24_full_1_vsx;
2149  c->yuv2packed2 = yuv2rgb24_full_2_vsx;
2150  c->yuv2packedX = yuv2rgb24_full_X_vsx;
2151  }
2152  break;
2153  case AV_PIX_FMT_BGR24:
2154  if (power8) {
2155  c->yuv2packed1 = yuv2bgr24_full_1_vsx;
2156  c->yuv2packed2 = yuv2bgr24_full_2_vsx;
2157  c->yuv2packedX = yuv2bgr24_full_X_vsx;
2158  }
2159  break;
2160  case AV_PIX_FMT_BGRA:
2161  if (power8) {
2162  if (!c->needAlpha) {
2163  c->yuv2packed1 = yuv2bgrx32_full_1_vsx;
2164  c->yuv2packed2 = yuv2bgrx32_full_2_vsx;
2165  c->yuv2packedX = yuv2bgrx32_full_X_vsx;
2166  }
2167  }
2168  break;
2169  case AV_PIX_FMT_RGBA:
2170  if (power8) {
2171  if (!c->needAlpha) {
2172  c->yuv2packed1 = yuv2rgbx32_full_1_vsx;
2173  c->yuv2packed2 = yuv2rgbx32_full_2_vsx;
2174  c->yuv2packedX = yuv2rgbx32_full_X_vsx;
2175  }
2176  }
2177  break;
2178  case AV_PIX_FMT_ARGB:
2179  if (power8) {
2180  if (!c->needAlpha) {
2181  c->yuv2packed1 = yuv2xrgb32_full_1_vsx;
2182  c->yuv2packed2 = yuv2xrgb32_full_2_vsx;
2183  c->yuv2packedX = yuv2xrgb32_full_X_vsx;
2184  }
2185  }
2186  break;
2187  case AV_PIX_FMT_ABGR:
2188  if (power8) {
2189  if (!c->needAlpha) {
2190  c->yuv2packed1 = yuv2xbgr32_full_1_vsx;
2191  c->yuv2packed2 = yuv2xbgr32_full_2_vsx;
2192  c->yuv2packedX = yuv2xbgr32_full_X_vsx;
2193  }
2194  }
2195  break;
2196  }
2197  } else { /* !SWS_FULL_CHR_H_INT */
2198  switch (dstFormat) {
2199  case AV_PIX_FMT_YUYV422:
2200  c->yuv2packed1 = yuv2yuyv422_1_vsx;
2201  c->yuv2packed2 = yuv2yuyv422_2_vsx;
2202  c->yuv2packedX = yuv2yuyv422_X_vsx;
2203  break;
2204  case AV_PIX_FMT_YVYU422:
2205  c->yuv2packed1 = yuv2yvyu422_1_vsx;
2206  c->yuv2packed2 = yuv2yvyu422_2_vsx;
2207  c->yuv2packedX = yuv2yvyu422_X_vsx;
2208  break;
2209  case AV_PIX_FMT_UYVY422:
2210  c->yuv2packed1 = yuv2uyvy422_1_vsx;
2211  c->yuv2packed2 = yuv2uyvy422_2_vsx;
2212  c->yuv2packedX = yuv2uyvy422_X_vsx;
2213  break;
2214  case AV_PIX_FMT_BGRA:
2215  if (power8) {
2216  if (!c->needAlpha) {
2217  c->yuv2packed1 = yuv2bgrx32_1_vsx;
2218  c->yuv2packed2 = yuv2bgrx32_2_vsx;
2219  }
2220  }
2221  break;
2222  case AV_PIX_FMT_RGBA:
2223  if (power8) {
2224  if (!c->needAlpha) {
2225  c->yuv2packed1 = yuv2rgbx32_1_vsx;
2226  c->yuv2packed2 = yuv2rgbx32_2_vsx;
2227  }
2228  }
2229  break;
2230  case AV_PIX_FMT_ARGB:
2231  if (power8) {
2232  if (!c->needAlpha) {
2233  c->yuv2packed1 = yuv2xrgb32_1_vsx;
2234  c->yuv2packed2 = yuv2xrgb32_2_vsx;
2235  }
2236  }
2237  break;
2238  case AV_PIX_FMT_ABGR:
2239  if (power8) {
2240  if (!c->needAlpha) {
2241  c->yuv2packed1 = yuv2xbgr32_1_vsx;
2242  c->yuv2packed2 = yuv2xbgr32_2_vsx;
2243  }
2244  }
2245  break;
2246  case AV_PIX_FMT_RGB24:
2247  if (power8) {
2248  c->yuv2packed1 = yuv2rgb24_1_vsx;
2249  c->yuv2packed2 = yuv2rgb24_2_vsx;
2250  }
2251  break;
2252  case AV_PIX_FMT_BGR24:
2253  if (power8) {
2254  c->yuv2packed1 = yuv2bgr24_1_vsx;
2255  c->yuv2packed2 = yuv2bgr24_2_vsx;
2256  }
2257  break;
2258  }
2259  }
2260 #endif /* !HAVE_BIGENDIAN */
2261 
2262 #endif /* HAVE_VSX */
2263 }