FFmpeg  4.3
mpegaudiodsp_mips_float.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Bojan Zivkovic (bojan@mips.com)
30  *
31  * MPEG Audio decoder optimized for MIPS floating-point architecture
32  *
33  * This file is part of FFmpeg.
34  *
35  * FFmpeg is free software; you can redistribute it and/or
36  * modify it under the terms of the GNU Lesser General Public
37  * License as published by the Free Software Foundation; either
38  * version 2.1 of the License, or (at your option) any later version.
39  *
40  * FFmpeg is distributed in the hope that it will be useful,
41  * but WITHOUT ANY WARRANTY; without even the implied warranty of
42  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
43  * Lesser General Public License for more details.
44  *
45  * You should have received a copy of the GNU Lesser General Public
46  * License along with FFmpeg; if not, write to the Free Software
47  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
48  */
49 
50 /**
51  * @file
52  * Reference: libavcodec/mpegaudiodsp_template.c
53  * libavcodec/dct32.c
54  */
55 
56 #include <string.h>
57 
58 #include "libavutil/mips/asmdefs.h"
60 
61 #if HAVE_INLINE_ASM && HAVE_MIPSFPU
62 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
63 
/**
 * Apply the window to synth_buf and write 32 output samples, using MIPS FPU
 * inline assembly (the file header names libavcodec/mpegaudiodsp_template.c
 * as the reference C code).
 *
 * Outputs 0..15 and 17..31 are each a signed sum of 16 products
 * window[i] * synth_buf[j]; output 16 is a sum of 8 such products.
 * Output 0 is written first, then 15 loop iterations each write one sample
 * walking forward from output 1 and one walking backward from output 31,
 * and a final store writes output 16.
 *
 * @param synth_buf    input buffer; its first 32 floats are copied to
 *                     synth_buf[512..543] before use, so it needs room for
 *                     at least 544 floats
 * @param window       coefficients; indices 0..511 are read
 * @param dither_state the 32-bit word at this address is loaded as the
 *                     initial accumulator of output 0 and is then set to 0
 * @param samples      output; sample k is stored at samples[k * incr]
 * @param incr         distance between consecutive outputs, in floats
 */
64 static void ff_mpadsp_apply_window_mips_float(float *synth_buf, float *window,
65  int *dither_state, float *samples, ptrdiff_t incr)
66 {
67  register const float *w, *w2, *p;
68  int j;
69  float *samples2;
70  float sum, sum2;
71  /* temporary variables */
    /* incr1 is the output stride in bytes (4 bytes per stored word).
     * NOTE(review): incr is a ptrdiff_t but is narrowed to int here -- confirm
     * callers never pass a stride that does not fit. */
72  int incr1 = incr << 2;
73  int t_sample;
74  float in1, in2, in3, in4, in5, in6, in7, in8;
75  float *p2;
76 
77  /* copy to avoid wrap */
78  memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
79 
80  /**
81  * instructions are scheduled to minimize pipeline stall.
82  * use of round_sample function from the original code is
83  * changed with appropriate assembly instructions.
84  */
85 
86  __asm__ volatile (
    /* Output 0:
     *   sum  = word loaded from *dither_state (which is then zeroed)
     *   sum += window[64*k]      * synth_buf[16 + 64*k]   (k = 0..7, madd.s)
     *   sum -= window[32 + 64*k] * synth_buf[48 + 64*k]   (k = 0..7, nmsub.s)
     * Interleaved with that: t_sample = 31 * incr1, samples2 = samples +
     * t_sample (address of output 31), j = 4, and the loop pointers
     * w = window + 1, w2 = window + 31, p = synth_buf + 17,
     * p2 = synth_buf + 47 (the immediates 4/124/68/188 are byte offsets).
     * NOTE(review): lwc1 moves the raw bits of the int at *dither_state into
     * an FP register without an int-to-float conversion; this only matches
     * "sum = *dither_state" when the stored value is 0 -- confirm. */
87  "lwc1 %[sum], 0(%[dither_state]) \t\n"
88  "sll %[t_sample], %[incr1], 5 \t\n"
89  "sub %[t_sample], %[t_sample], %[incr1] \n\t"
90  "li %[j], 4 \t\n"
91  "lwc1 %[in1], 0(%[window]) \t\n"
92  "lwc1 %[in2], 16*4(%[synth_buf]) \t\n"
93  "sw $zero, 0(%[dither_state]) \t\n"
94  "lwc1 %[in3], 64*4(%[window]) \t\n"
95  "lwc1 %[in4], 80*4(%[synth_buf]) \t\n"
96  PTR_ADDU "%[samples2],%[samples], %[t_sample] \t\n"
97  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
98  "lwc1 %[in5], 128*4(%[window]) \t\n"
99  "lwc1 %[in6], 144*4(%[synth_buf]) \t\n"
100  "lwc1 %[in7], 192*4(%[window]) \t\n"
101  "madd.s %[sum], %[sum], %[in3], %[in4] \t\n"
102  "lwc1 %[in8], 208*4(%[synth_buf]) \t\n"
103  "lwc1 %[in1], 256*4(%[window]) \t\n"
104  "lwc1 %[in2], 272*4(%[synth_buf]) \t\n"
105  "madd.s %[sum], %[sum], %[in5], %[in6] \t\n"
106  "lwc1 %[in3], 320*4(%[window]) \t\n"
107  "lwc1 %[in4], 336*4(%[synth_buf]) \t\n"
108  "lwc1 %[in5], 384*4(%[window]) \t\n"
109  "madd.s %[sum], %[sum], %[in7], %[in8] \t\n"
110  "lwc1 %[in6], 400*4(%[synth_buf]) \t\n"
111  "lwc1 %[in7], 448*4(%[window]) \t\n"
112  "lwc1 %[in8], 464*4(%[synth_buf]) \t\n"
113  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
114  "lwc1 %[in1], 32*4(%[window]) \t\n"
115  "lwc1 %[in2], 48*4(%[synth_buf]) \t\n"
116  "madd.s %[sum], %[sum], %[in3], %[in4] \t\n"
117  "lwc1 %[in3], 96*4(%[window]) \t\n"
118  "lwc1 %[in4], 112*4(%[synth_buf]) \t\n"
119  "madd.s %[sum], %[sum], %[in5], %[in6] \t\n"
120  "lwc1 %[in5], 160*4(%[window]) \t\n"
121  "lwc1 %[in6], 176*4(%[synth_buf]) \t\n"
122  "madd.s %[sum], %[sum], %[in7], %[in8] \t\n"
123  "lwc1 %[in7], 224*4(%[window]) \t\n"
124  "lwc1 %[in8], 240*4(%[synth_buf]) \t\n"
125  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
126  "lwc1 %[in1], 288*4(%[window]) \t\n"
127  "lwc1 %[in2], 304*4(%[synth_buf]) \t\n"
128  "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n"
129  "lwc1 %[in3], 352*4(%[window]) \t\n"
130  "lwc1 %[in4], 368*4(%[synth_buf]) \t\n"
131  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
132  "lwc1 %[in5], 416*4(%[window]) \t\n"
133  "lwc1 %[in6], 432*4(%[synth_buf]) \t\n"
134  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
135  "lwc1 %[in7], 480*4(%[window]) \t\n"
136  "lwc1 %[in8], 496*4(%[synth_buf]) \t\n"
137  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
138  PTR_ADDU "%[w], %[window], 4 \t\n"
139  "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n"
140  PTR_ADDU "%[w2], %[window], 124 \t\n"
141  PTR_ADDIU "%[p], %[synth_buf], 68 \t\n"
142  PTR_ADDIU "%[p2], %[synth_buf], 188 \t\n"
143  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
144  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
145  "swc1 %[sum], 0(%[samples]) \t\n"
146  PTR_ADDU "%[samples], %[samples], %[incr1] \t\n"
147 
148  /* calculate two samples at the same time to avoid one memory
149  access per two sample */
150 
    /* Loop body, executed 15 times (j runs 4, 8, ..., 60; exit when j == 64).
     * With k = 0..7 in every sum:
     *   sum  =  SUM w[64*k]  * p[64*k]  -  SUM w[32 + 64*k]  * p2[64*k]
     *   sum2 = -SUM w2[64*k] * p[64*k]  -  SUM w2[32 + 64*k] * p2[64*k]
     * Each p[] / p2[] value is loaded once and shared by both sums.
     * sum is stored at samples (then samples += incr1), sum2 at samples2
     * (then samples2 -= incr1); w and p step forward by one float, w2 and
     * p2 step backward by one float. The "%=" suffix makes the label unique
     * per emitted copy of this asm statement. */
151  "ff_mpadsp_apply_window_loop%=: \t\n"
152  "lwc1 %[in1], 0(%[w]) \t\n"
153  "lwc1 %[in2], 0(%[p]) \t\n"
154  "lwc1 %[in3], 0(%[w2]) \t\n"
155  "lwc1 %[in4], 64*4(%[w]) \t\n"
156  "lwc1 %[in5], 64*4(%[p]) \t\n"
157  "lwc1 %[in6], 64*4(%[w2]) \t\n"
158  "mul.s %[sum], %[in1], %[in2] \t\n"
159  "mul.s %[sum2], %[in2], %[in3] \t\n"
160  "lwc1 %[in1], 128*4(%[w]) \t\n"
161  "lwc1 %[in2], 128*4(%[p]) \t\n"
162  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
    /* nmadd.s negates the running total here, so the nmsub.s steps that
     * follow keep sum2 equal to minus the sum of all products so far. */
163  "nmadd.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
164  "lwc1 %[in3], 128*4(%[w2]) \t\n"
165  "lwc1 %[in4], 192*4(%[w]) \t\n"
166  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
167  "lwc1 %[in5], 192*4(%[p]) \t\n"
168  "lwc1 %[in6], 192*4(%[w2]) \t\n"
169  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
170  "lwc1 %[in1], 256*4(%[w]) \t\n"
171  "lwc1 %[in2], 256*4(%[p]) \t\n"
172  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
173  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
174  "lwc1 %[in3], 256*4(%[w2]) \t\n"
175  "lwc1 %[in4], 320*4(%[w]) \t\n"
176  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
177  "lwc1 %[in5], 320*4(%[p]) \t\n"
178  "lwc1 %[in6], 320*4(%[w2]) \t\n"
179  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
180  "lwc1 %[in1], 384*4(%[w]) \t\n"
181  "lwc1 %[in2], 384*4(%[p]) \t\n"
182  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
183  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
184  "lwc1 %[in3], 384*4(%[w2]) \t\n"
185  "lwc1 %[in4], 448*4(%[w]) \t\n"
186  "madd.s %[sum], %[sum], %[in1], %[in2] \t\n"
187  "lwc1 %[in5], 448*4(%[p]) \t\n"
188  "lwc1 %[in6], 448*4(%[w2]) \t\n"
189  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
190  "madd.s %[sum], %[sum], %[in4], %[in5] \t\n"
191  "lwc1 %[in1], 32*4(%[w]) \t\n"
192  "lwc1 %[in2], 0(%[p2]) \t\n"
193  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
194  "lwc1 %[in3], 32*4(%[w2]) \t\n"
195  "lwc1 %[in4], 96*4(%[w]) \t\n"
196  "lwc1 %[in5], 64*4(%[p2]) \t\n"
197  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
198  "lwc1 %[in6], 96*4(%[w2]) \t\n"
199  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
200  "lwc1 %[in1], 160*4(%[w]) \t\n"
201  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
202  "lwc1 %[in2], 128*4(%[p2]) \t\n"
203  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
204  "lwc1 %[in3], 160*4(%[w2]) \t\n"
205  "lwc1 %[in4], 224*4(%[w]) \t\n"
206  "lwc1 %[in5], 192*4(%[p2]) \t\n"
207  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
208  "lwc1 %[in6], 224*4(%[w2]) \t\n"
209  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
210  "lwc1 %[in1], 288*4(%[w]) \t\n"
211  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
212  "lwc1 %[in2], 256*4(%[p2]) \t\n"
213  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
214  "lwc1 %[in3], 288*4(%[w2]) \t\n"
215  "lwc1 %[in4], 352*4(%[w]) \t\n"
216  "lwc1 %[in5], 320*4(%[p2]) \t\n"
217  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
218  "lwc1 %[in6], 352*4(%[w2]) \t\n"
219  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
220  "lwc1 %[in1], 416*4(%[w]) \t\n"
221  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
222  "lwc1 %[in2], 384*4(%[p2]) \t\n"
223  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
224  "lwc1 %[in3], 416*4(%[w2]) \t\n"
225  "lwc1 %[in4], 480*4(%[w]) \t\n"
226  "lwc1 %[in5], 448*4(%[p2]) \t\n"
227  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
228  "lwc1 %[in6], 480*4(%[w2]) \t\n"
229  "nmsub.s %[sum2], %[sum2], %[in2], %[in3] \t\n"
230  PTR_ADDIU "%[w], %[w], 4 \t\n"
231  "nmsub.s %[sum], %[sum], %[in4], %[in5] \t\n"
232  PTR_ADDIU "%[w2], %[w2], -4 \t\n"
233  "nmsub.s %[sum2], %[sum2], %[in5], %[in6] \t\n"
234  "addu %[j], %[j], 4 \t\n"
235  PTR_ADDIU "%[p], 4 \t\n"
236  "swc1 %[sum], 0(%[samples]) \t\n"
237  PTR_ADDIU "%[p2], -4 \t\n"
238  "swc1 %[sum2], 0(%[samples2]) \t\n"
239  PTR_ADDU "%[samples], %[samples], %[incr1] \t\n"
240  PTR_SUBU "%[samples2],%[samples2], %[incr1] \t\n"
241  "bne %[j], 64, ff_mpadsp_apply_window_loop%= \t\n"
242 
    /* Output 16 (samples has been advanced 16 times by now):
     *   sum = -SUM window[48 + 64*k] * synth_buf[32 + 64*k]   (k = 0..7) */
243  "lwc1 %[in1], 48*4(%[window]) \t\n"
244  "lwc1 %[in2], 32*4(%[synth_buf]) \t\n"
245  "lwc1 %[in3], 112*4(%[window]) \t\n"
246  "lwc1 %[in4], 96*4(%[synth_buf]) \t\n"
247  "lwc1 %[in5], 176*4(%[window]) \t\n"
248  "lwc1 %[in6], 160*4(%[synth_buf]) \t\n"
249  "mul.s %[sum], %[in1], %[in2] \t\n"
250  "lwc1 %[in7], 240*4(%[window]) \t\n"
251  "lwc1 %[in8], 224*4(%[synth_buf]) \t\n"
252  "lwc1 %[in1], 304*4(%[window]) \t\n"
253  "nmadd.s %[sum], %[sum], %[in3], %[in4] \t\n"
254  "lwc1 %[in2], 288*4(%[synth_buf]) \t\n"
255  "lwc1 %[in3], 368*4(%[window]) \t\n"
256  "lwc1 %[in4], 352*4(%[synth_buf]) \t\n"
257  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
258  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
259  "lwc1 %[in5], 432*4(%[window]) \t\n"
260  "lwc1 %[in6], 416*4(%[synth_buf]) \t\n"
261  "nmsub.s %[sum], %[sum], %[in1], %[in2] \t\n"
262  "lwc1 %[in7], 496*4(%[window]) \t\n"
263  "lwc1 %[in8], 480*4(%[synth_buf]) \t\n"
264  "nmsub.s %[sum], %[sum], %[in3], %[in4] \t\n"
265  "nmsub.s %[sum], %[sum], %[in5], %[in6] \t\n"
266  "nmsub.s %[sum], %[sum], %[in7], %[in8] \t\n"
267  "swc1 %[sum], 0(%[samples]) \t\n"
268 
    /* Operands: every scratch value is an early-clobber output ("=&") so it
     * never shares a register with the four read-only inputs; samples is
     * read-write ("+r"). The "memory" clobber covers the loads and stores
     * done through the pointer operands. */
269  : [sum] "=&f" (sum), [sum2] "=&f" (sum2),
270  [w2] "=&r" (w2), [w] "=&r" (w),
271  [p] "=&r" (p), [p2] "=&r" (p2), [j] "=&r" (j),
272  [samples] "+r" (samples), [samples2] "=&r" (samples2),
273  [in1] "=&f" (in1), [in2] "=&f" (in2),
274  [in3] "=&f" (in3), [in4] "=&f" (in4),
275  [in5] "=&f" (in5), [in6] "=&f" (in6),
276  [in7] "=&f" (in7), [in8] "=&f" (in8),
277  [t_sample] "=&r" (t_sample)
278  : [synth_buf] "r" (synth_buf), [window] "r" (window),
279  [dither_state] "r" (dither_state), [incr1] "r" (incr1)
280  : "memory"
281  );
282 }
283 
/**
 * MIPS FPU inline-assembly version of the dct32 transform (the file header
 * names libavcodec/dct32.c as the reference C code).
 *
 * Reads the 32 floats tab[0..31] and writes the 32 floats out[0..31].
 * The work is split over twelve asm statements that hand intermediate
 * values val0..val31 to each other through C locals. The even-indexed
 * outputs are stored from inside the asm statements; the odd-indexed ones
 * are written by the C assignments at the end of the function.
 *
 * @param out  destination, 32 floats
 * @param tab  source, 32 floats
 */
284 static void ff_dct32_mips_float(float *out, const float *tab)
285 {
286  float val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7,
287  val8 , val9 , val10, val11, val12, val13, val14, val15,
288  val16, val17, val18, val19, val20, val21, val22, val23,
289  val24, val25, val26, val27, val28, val29, val30, val31;
290  float fTmp1, fTmp2, fTmp3, fTmp4, fTmp5, fTmp6, fTmp7, fTmp8,
291  fTmp9, fTmp10, fTmp11;
292 
293  /**
294  * instructions are scheduled to minimize pipeline stall.
295  */
    /* Statement 1: loads tab pairs (0,31), (15,16), (7,24), (8,23), forms
     * their sums and constant-scaled differences, and combines the two
     * halves. Produces val0, val7, val8, val15, val16, val23, val24, val31.
     * The constants are materialised with the li.s assembler macro. */
296  __asm__ volatile (
297  "lwc1 %[fTmp1], 0*4(%[tab]) \n\t"
298  "lwc1 %[fTmp2], 31*4(%[tab]) \n\t"
299  "lwc1 %[fTmp3], 15*4(%[tab]) \n\t"
300  "lwc1 %[fTmp4], 16*4(%[tab]) \n\t"
301  "li.s %[fTmp7], 0.50241928618815570551 \n\t"
302  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
303  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
304  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
305  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
306  "li.s %[fTmp10], 0.50060299823519630134 \n\t"
307  "li.s %[fTmp11], 10.19000812354805681150 \n\t"
308  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
309  "add.s %[val0], %[fTmp5], %[fTmp6] \n\t"
310  "sub.s %[val15], %[fTmp5], %[fTmp6] \n\t"
311  "lwc1 %[fTmp1], 7*4(%[tab]) \n\t"
312  "lwc1 %[fTmp2], 24*4(%[tab]) \n\t"
313  "madd.s %[val16], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
314  "nmsub.s %[val31], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
315  "mul.s %[val15], %[val15], %[fTmp7] \n\t"
316  "lwc1 %[fTmp3], 8*4(%[tab]) \n\t"
317  "lwc1 %[fTmp4], 23*4(%[tab]) \n\t"
318  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
319  "mul.s %[val31], %[val31], %[fTmp7] \n\t"
320  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
321  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
322  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
323  "li.s %[fTmp7], 5.10114861868916385802 \n\t"
324  "li.s %[fTmp10], 0.67480834145500574602 \n\t"
325  "li.s %[fTmp11], 0.74453627100229844977 \n\t"
326  "add.s %[val7], %[fTmp5], %[fTmp6] \n\t"
327  "sub.s %[val8], %[fTmp5], %[fTmp6] \n\t"
328  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
329  "li.s %[fTmp1], 0.50979557910415916894 \n\t"
330  "sub.s %[fTmp2], %[val0], %[val7] \n\t"
331  "mul.s %[val8], %[val8], %[fTmp7] \n\t"
332  "madd.s %[val23], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
333  "nmsub.s %[val24], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
334  "add.s %[val0], %[val0], %[val7] \n\t"
335  "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t"
336  "sub.s %[fTmp2], %[val15], %[val8] \n\t"
337  "add.s %[val8], %[val15], %[val8] \n\t"
338  "mul.s %[val24], %[val24], %[fTmp7] \n\t"
339  "sub.s %[fTmp3], %[val16], %[val23] \n\t"
340  "add.s %[val16], %[val16], %[val23] \n\t"
341  "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
342  "sub.s %[fTmp4], %[val31], %[val24] \n\t"
343  "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t"
344  "add.s %[val24], %[val31], %[val24] \n\t"
345  "mul.s %[val31], %[fTmp1], %[fTmp4] \n\t"
346 
347  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
348  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
349  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
350  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
351  [val0] "=f" (val0), [val7] "=f" (val7),
352  [val8] "=f" (val8), [val15] "=f" (val15),
353  [val16] "=f" (val16), [val23] "=f" (val23),
354  [val24] "=f" (val24), [val31] "=f" (val31)
355  : [tab] "r" (tab)
356  : "memory"
357  );
358 
    /* Statement 2: same pattern on tab pairs (3,28), (12,19), (4,27),
     * (11,20). Produces val3, val4, val11, val12, val19, val20, val27,
     * val28. */
359  __asm__ volatile (
360  "lwc1 %[fTmp1], 3*4(%[tab]) \n\t"
361  "lwc1 %[fTmp2], 28*4(%[tab]) \n\t"
362  "lwc1 %[fTmp3], 12*4(%[tab]) \n\t"
363  "lwc1 %[fTmp4], 19*4(%[tab]) \n\t"
364  "li.s %[fTmp7], 0.64682178335999012954 \n\t"
365  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
366  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
367  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
368  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
369  "li.s %[fTmp10], 0.53104259108978417447 \n\t"
370  "li.s %[fTmp11], 1.48416461631416627724 \n\t"
371  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
372  "add.s %[val3], %[fTmp5], %[fTmp6] \n\t"
373  "sub.s %[val12], %[fTmp5], %[fTmp6] \n\t"
374  "lwc1 %[fTmp1], 4*4(%[tab]) \n\t"
375  "lwc1 %[fTmp2], 27*4(%[tab]) \n\t"
376  "madd.s %[val19], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
377  "nmsub.s %[val28], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
378  "mul.s %[val12], %[val12], %[fTmp7] \n\t"
379  "lwc1 %[fTmp3], 11*4(%[tab]) \n\t"
380  "lwc1 %[fTmp4], 20*4(%[tab]) \n\t"
381  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
382  "mul.s %[val28], %[val28], %[fTmp7] \n\t"
383  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
384  "li.s %[fTmp7], 0.78815462345125022473 \n\t"
385  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
386  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
387  "li.s %[fTmp10], 0.55310389603444452782 \n\t"
388  "li.s %[fTmp11], 1.16943993343288495515 \n\t"
389  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
390  "add.s %[val4], %[fTmp5], %[fTmp6] \n\t"
391  "sub.s %[val11], %[fTmp5], %[fTmp6] \n\t"
392  "li.s %[fTmp1], 2.56291544774150617881 \n\t"
393  "madd.s %[val20], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
394  "nmsub.s %[val27], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
395  "mul.s %[val11], %[val11], %[fTmp7] \n\t"
396  "sub.s %[fTmp2], %[val3], %[val4] \n\t"
397  "add.s %[val3], %[val3], %[val4] \n\t"
398  "sub.s %[fTmp4], %[val19], %[val20] \n\t"
399  "mul.s %[val27], %[val27], %[fTmp7] \n\t"
400  "sub.s %[fTmp3], %[val12], %[val11] \n\t"
401  "mul.s %[val4], %[fTmp1], %[fTmp2] \n\t"
402  "add.s %[val11], %[val12], %[val11] \n\t"
403  "add.s %[val19], %[val19], %[val20] \n\t"
404  "mul.s %[val20], %[fTmp1], %[fTmp4] \n\t"
405  "mul.s %[val12], %[fTmp1], %[fTmp3] \n\t"
406  "sub.s %[fTmp2], %[val28], %[val27] \n\t"
407  "add.s %[val27], %[val28], %[val27] \n\t"
408  "mul.s %[val28], %[fTmp1], %[fTmp2] \n\t"
409 
410  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
411  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
412  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
413  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
414  [val3] "=f" (val3), [val4] "=f" (val4),
415  [val11] "=f" (val11), [val12] "=f" (val12),
416  [val19] "=f" (val19), [val20] "=f" (val20),
417  [val27] "=f" (val27), [val28] "=f" (val28)
418  : [tab] "r" (tab)
419  : "memory"
420  );
421 
    /* Statement 3: merges the results of statements 1 and 2 for val0..val15.
     * For the pairs (val0,val3), (val7,val4), (val8,val11), (val15,val12)
     * one variable receives the sum and the other the difference multiplied
     * by fTmp1. fTmp1 is an output operand so that the constant loaded here
     * is still available to the next statement. */
422  __asm__ volatile (
423  "li.s %[fTmp1], 0.54119610014619698439 \n\t"
424  "sub.s %[fTmp2], %[val0], %[val3] \n\t"
425  "add.s %[val0], %[val0], %[val3] \n\t"
426  "sub.s %[fTmp3], %[val7], %[val4] \n\t"
427  "add.s %[val4], %[val7], %[val4] \n\t"
428  "sub.s %[fTmp4], %[val8], %[val11] \n\t"
429  "mul.s %[val3], %[fTmp1], %[fTmp2] \n\t"
430  "add.s %[val8], %[val8], %[val11] \n\t"
431  "mul.s %[val7], %[fTmp1], %[fTmp3] \n\t"
432  "sub.s %[fTmp2], %[val15], %[val12] \n\t"
433  "mul.s %[val11], %[fTmp1], %[fTmp4] \n\t"
434  "add.s %[val12], %[val15], %[val12] \n\t"
435  "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
436 
437  : [val0] "+f" (val0), [val3] "+f" (val3),
438  [val4] "+f" (val4), [val7] "+f" (val7),
439  [val8] "+f" (val8), [val11] "+f" (val11),
440  [val12] "+f" (val12), [val15] "+f" (val15),
441  [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
442  [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4)
443  :
444  );
445 
    /* Statement 4: the same merge for (val16,val19), (val23,val20),
     * (val24,val27), (val31,val28). fTmp1 is an input here: it still holds
     * the constant written by statement 3. */
446  __asm__ volatile (
447  "sub.s %[fTmp2], %[val16], %[val19] \n\t"
448  "add.s %[val16], %[val16], %[val19] \n\t"
449  "sub.s %[fTmp3], %[val23], %[val20] \n\t"
450  "add.s %[val20], %[val23], %[val20] \n\t"
451  "sub.s %[fTmp4], %[val24], %[val27] \n\t"
452  "mul.s %[val19], %[fTmp1], %[fTmp2] \n\t"
453  "add.s %[val24], %[val24], %[val27] \n\t"
454  "mul.s %[val23], %[fTmp1], %[fTmp3] \n\t"
455  "sub.s %[fTmp2], %[val31], %[val28] \n\t"
456  "mul.s %[val27], %[fTmp1], %[fTmp4] \n\t"
457  "add.s %[val28], %[val31], %[val28] \n\t"
458  "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t"
459 
460  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
461  [val16] "+f" (val16), [val19] "+f" (val19), [val20] "+f" (val20),
462  [val23] "+f" (val23), [val24] "+f" (val24), [val27] "+f" (val27),
463  [val28] "+f" (val28), [val31] "+f" (val31)
464  : [fTmp1] "f" (fTmp1)
465  );
466 
    /* Statement 5: same pattern as statement 1 on tab pairs (1,30), (14,17),
     * (6,25), (9,22). Produces val1, val6, val9, val14, val17, val22, val25,
     * val30. */
467  __asm__ volatile (
468  "lwc1 %[fTmp1], 1*4(%[tab]) \n\t"
469  "lwc1 %[fTmp2], 30*4(%[tab]) \n\t"
470  "lwc1 %[fTmp3], 14*4(%[tab]) \n\t"
471  "lwc1 %[fTmp4], 17*4(%[tab]) \n\t"
472  "li.s %[fTmp7], 0.52249861493968888062 \n\t"
473  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
474  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
475  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
476  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
477  "li.s %[fTmp10], 0.50547095989754365998 \n\t"
478  "li.s %[fTmp11], 3.40760841846871878570 \n\t"
479  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
480  "add.s %[val1], %[fTmp5], %[fTmp6] \n\t"
481  "sub.s %[val14], %[fTmp5], %[fTmp6] \n\t"
482  "lwc1 %[fTmp1], 6*4(%[tab]) \n\t"
483  "lwc1 %[fTmp2], 25*4(%[tab]) \n\t"
484  "madd.s %[val17], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
485  "nmsub.s %[val30], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
486  "mul.s %[val14], %[val14], %[fTmp7] \n\t"
487  "lwc1 %[fTmp3], 9*4(%[tab]) \n\t"
488  "lwc1 %[fTmp4], 22*4(%[tab]) \n\t"
489  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
490  "mul.s %[val30], %[val30], %[fTmp7] \n\t"
491  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
492  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
493  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
494  "li.s %[fTmp7], 1.72244709823833392782 \n\t"
495  "li.s %[fTmp10], 0.62250412303566481615 \n\t"
496  "li.s %[fTmp11], 0.83934964541552703873 \n\t"
497  "add.s %[val6], %[fTmp5], %[fTmp6] \n\t"
498  "sub.s %[val9], %[fTmp5], %[fTmp6] \n\t"
499  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
500  "li.s %[fTmp1], 0.60134488693504528054 \n\t"
501  "sub.s %[fTmp2], %[val1], %[val6] \n\t"
502  "add.s %[val1], %[val1], %[val6] \n\t"
503  "mul.s %[val9], %[val9], %[fTmp7] \n\t"
504  "madd.s %[val22], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
505  "nmsub.s %[val25], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
506  "mul.s %[val6], %[fTmp1], %[fTmp2] \n\t"
507  "sub.s %[fTmp2], %[val14], %[val9] \n\t"
508  "add.s %[val9], %[val14], %[val9] \n\t"
509  "mul.s %[val25], %[val25], %[fTmp7] \n\t"
510  "sub.s %[fTmp3], %[val17], %[val22] \n\t"
511  "add.s %[val17], %[val17], %[val22] \n\t"
512  "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t"
513  "sub.s %[fTmp2], %[val30], %[val25] \n\t"
514  "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t"
515  "add.s %[val25], %[val30], %[val25] \n\t"
516  "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t"
517 
518  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
519  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
520  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
521  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
522  [val1] "=f" (val1), [val6] "=f" (val6),
523  [val9] "=f" (val9), [val14] "=f" (val14),
524  [val17] "=f" (val17), [val22] "=f" (val22),
525  [val25] "=f" (val25), [val30] "=f" (val30)
526  : [tab] "r" (tab)
527  : "memory"
528  );
529 
    /* Statement 6: same pattern on tab pairs (2,29), (13,18), (5,26),
     * (10,21). Produces val2, val5, val10, val13, val18, val21, val26,
     * val29. After this statement all 32 entries of tab have been read. */
530  __asm__ volatile (
531  "lwc1 %[fTmp1], 2*4(%[tab]) \n\t"
532  "lwc1 %[fTmp2], 29*4(%[tab]) \n\t"
533  "lwc1 %[fTmp3], 13*4(%[tab]) \n\t"
534  "lwc1 %[fTmp4], 18*4(%[tab]) \n\t"
535  "li.s %[fTmp7], 0.56694403481635770368 \n\t"
536  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
537  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
538  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
539  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
540  "li.s %[fTmp10], 0.51544730992262454697 \n\t"
541  "li.s %[fTmp11], 2.05778100995341155085 \n\t"
542  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
543  "add.s %[val2], %[fTmp5], %[fTmp6] \n\t"
544  "sub.s %[val13], %[fTmp5], %[fTmp6] \n\t"
545  "lwc1 %[fTmp1], 5*4(%[tab]) \n\t"
546  "lwc1 %[fTmp2], 26*4(%[tab]) \n\t"
547  "madd.s %[val18], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
548  "nmsub.s %[val29], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
549  "mul.s %[val13], %[val13], %[fTmp7] \n\t"
550  "lwc1 %[fTmp3], 10*4(%[tab]) \n\t"
551  "lwc1 %[fTmp4], 21*4(%[tab]) \n\t"
552  "mul.s %[val29], %[val29], %[fTmp7] \n\t"
553  "add.s %[fTmp5], %[fTmp1], %[fTmp2] \n\t"
554  "sub.s %[fTmp8], %[fTmp1], %[fTmp2] \n\t"
555  "add.s %[fTmp6], %[fTmp3], %[fTmp4] \n\t"
556  "sub.s %[fTmp9], %[fTmp3], %[fTmp4] \n\t"
557  "li.s %[fTmp7], 1.06067768599034747134 \n\t"
558  "li.s %[fTmp10], 0.58293496820613387367 \n\t"
559  "li.s %[fTmp11], 0.97256823786196069369 \n\t"
560  "add.s %[val5], %[fTmp5], %[fTmp6] \n\t"
561  "sub.s %[val10], %[fTmp5], %[fTmp6] \n\t"
562  "mul.s %[fTmp8], %[fTmp8], %[fTmp10] \n\t"
563  "li.s %[fTmp1], 0.89997622313641570463 \n\t"
564  "sub.s %[fTmp2], %[val2], %[val5] \n\t"
565  "mul.s %[val10], %[val10], %[fTmp7] \n\t"
566  "madd.s %[val21], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
567  "nmsub.s %[val26], %[fTmp8], %[fTmp9], %[fTmp11] \n\t"
568  "add.s %[val2], %[val2], %[val5] \n\t"
569  "mul.s %[val5], %[fTmp1], %[fTmp2] \n\t"
570  "sub.s %[fTmp3], %[val13], %[val10] \n\t"
571  "add.s %[val10], %[val13], %[val10] \n\t"
572  "mul.s %[val26], %[val26], %[fTmp7] \n\t"
573  "sub.s %[fTmp4], %[val18], %[val21] \n\t"
574  "add.s %[val18], %[val18], %[val21] \n\t"
575  "mul.s %[val13], %[fTmp1], %[fTmp3] \n\t"
576  "sub.s %[fTmp2], %[val29], %[val26] \n\t"
577  "add.s %[val26], %[val29], %[val26] \n\t"
578  "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t"
579  "mul.s %[val29], %[fTmp1], %[fTmp2] \n\t"
580 
581  : [fTmp1] "=&f" (fTmp1), [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3),
582  [fTmp4] "=&f" (fTmp4), [fTmp5] "=&f" (fTmp5), [fTmp6] "=&f" (fTmp6),
583  [fTmp7] "=&f" (fTmp7), [fTmp8] "=&f" (fTmp8), [fTmp9] "=&f" (fTmp9),
584  [fTmp10] "=&f" (fTmp10), [fTmp11] "=&f" (fTmp11),
585  [val2] "=f" (val2), [val5] "=f" (val5),
586  [val10] "=f" (val10), [val13] "=f" (val13),
587  [val18] "=f" (val18), [val21] "=f" (val21),
588  [val26] "=f" (val26), [val29] "=f" (val29)
589  : [tab] "r" (tab)
590  : "memory"
591  );
592 
    /* Statement 7: merges the results of statements 5 and 6 for the pairs
     * (val1,val2), (val6,val5), (val9,val10), (val14,val13): sum in one
     * variable, fTmp1-scaled difference in the other. The new fTmp1
     * constant is exported for statement 8. */
593  __asm__ volatile (
594  "li.s %[fTmp1], 1.30656296487637652785 \n\t"
595  "sub.s %[fTmp2], %[val1], %[val2] \n\t"
596  "add.s %[val1], %[val1], %[val2] \n\t"
597  "sub.s %[fTmp3], %[val6], %[val5] \n\t"
598  "add.s %[val5], %[val6], %[val5] \n\t"
599  "sub.s %[fTmp4], %[val9], %[val10] \n\t"
600  "mul.s %[val2], %[fTmp1], %[fTmp2] \n\t"
601  "add.s %[val9], %[val9], %[val10] \n\t"
602  "mul.s %[val6], %[fTmp1], %[fTmp3] \n\t"
603  "sub.s %[fTmp2], %[val14], %[val13] \n\t"
604  "mul.s %[val10], %[fTmp1], %[fTmp4] \n\t"
605  "add.s %[val13], %[val14], %[val13] \n\t"
606  "mul.s %[val14], %[fTmp1], %[fTmp2] \n\t"
607 
608  : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
609  [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
610  [val1] "+f" (val1), [val2] "+f" (val2),
611  [val5] "+f" (val5), [val6] "+f" (val6),
612  [val9] "+f" (val9), [val10] "+f" (val10),
613  [val13] "+f" (val13), [val14] "+f" (val14)
614  :
615  );
616 
    /* Statement 8: the same merge for (val17,val18), (val22,val21),
     * (val25,val26), (val30,val29), reusing fTmp1 from statement 7. */
617  __asm__ volatile (
618  "sub.s %[fTmp2], %[val17], %[val18] \n\t"
619  "add.s %[val17], %[val17], %[val18] \n\t"
620  "sub.s %[fTmp3], %[val22], %[val21] \n\t"
621  "add.s %[val21], %[val22], %[val21] \n\t"
622  "sub.s %[fTmp4], %[val25], %[val26] \n\t"
623  "mul.s %[val18], %[fTmp1], %[fTmp2] \n\t"
624  "add.s %[val25], %[val25], %[val26] \n\t"
625  "mul.s %[val22], %[fTmp1], %[fTmp3] \n\t"
626  "sub.s %[fTmp2], %[val30], %[val29] \n\t"
627  "mul.s %[val26], %[fTmp1], %[fTmp4] \n\t"
628  "add.s %[val29], %[val30], %[val29] \n\t"
629  "mul.s %[val30], %[fTmp1], %[fTmp2] \n\t"
630 
631  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
632  [val17] "+f" (val17), [val18] "+f" (val18), [val21] "+f" (val21),
633  [val22] "+f" (val22), [val25] "+f" (val25), [val26] "+f" (val26),
634  [val29] "+f" (val29), [val30] "+f" (val30)
635  : [fTmp1] "f" (fTmp1)
636  );
637 
    /* Statement 9: last combination for val0..val7 and the first stores.
     * Loads a new fTmp1 constant (exported for statements 10-12) and writes
     * out[0], out[16], out[24], out[8], out[28], out[4], out[20], out[12].
     * NOTE(review): this statement stores through out with swc1 but
     * declares neither a memory output operand nor a "memory" clobber,
     * unlike the tab-loading statements above -- confirm that is safe. */
638  __asm__ volatile (
639  "li.s %[fTmp1], 0.70710678118654752439 \n\t"
640  "sub.s %[fTmp2], %[val0], %[val1] \n\t"
641  "add.s %[val0], %[val0], %[val1] \n\t"
642  "sub.s %[fTmp3], %[val3], %[val2] \n\t"
643  "add.s %[val2], %[val3], %[val2] \n\t"
644  "sub.s %[fTmp4], %[val4], %[val5] \n\t"
645  "mul.s %[val1], %[fTmp1], %[fTmp2] \n\t"
646  "swc1 %[val0], 0(%[out]) \n\t"
647  "mul.s %[val3], %[fTmp3], %[fTmp1] \n\t"
648  "add.s %[val4], %[val4], %[val5] \n\t"
649  "mul.s %[val5], %[fTmp1], %[fTmp4] \n\t"
650  "swc1 %[val1], 16*4(%[out]) \n\t"
651  "sub.s %[fTmp2], %[val7], %[val6] \n\t"
652  "add.s %[val2], %[val2], %[val3] \n\t"
653  "swc1 %[val3], 24*4(%[out]) \n\t"
654  "add.s %[val6], %[val7], %[val6] \n\t"
655  "mul.s %[val7], %[fTmp1], %[fTmp2] \n\t"
656  "swc1 %[val2], 8*4(%[out]) \n\t"
657  "add.s %[val6], %[val6], %[val7] \n\t"
658  "swc1 %[val7], 28*4(%[out]) \n\t"
659  "add.s %[val4], %[val4], %[val6] \n\t"
660  "add.s %[val6], %[val6], %[val5] \n\t"
661  "add.s %[val5], %[val5], %[val7] \n\t"
662  "swc1 %[val4], 4*4(%[out]) \n\t"
663  "swc1 %[val5], 20*4(%[out]) \n\t"
664  "swc1 %[val6], 12*4(%[out]) \n\t"
665 
666  : [fTmp1] "=f" (fTmp1), [fTmp2] "=&f" (fTmp2),
667  [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
668  [val0] "+f" (val0), [val1] "+f" (val1),
669  [val2] "+f" (val2), [val3] "+f" (val3),
670  [val4] "+f" (val4), [val5] "+f" (val5),
671  [val6] "+f" (val6), [val7] "+f" (val7)
672  : [out] "r" (out)
673  );
674 
    /* Statement 10: last combination for val8..val15, reusing fTmp1 from
     * statement 9, followed by a chain of in-place additions. Writes
     * out[2], out[18], out[10], out[26], out[6], out[22], out[14], out[30].
     * NOTE(review): like statement 9, this stores through out without a
     * "memory" clobber or memory output operand -- confirm. */
675  __asm__ volatile (
676  "sub.s %[fTmp2], %[val8], %[val9] \n\t"
677  "add.s %[val8], %[val8], %[val9] \n\t"
678  "sub.s %[fTmp3], %[val11], %[val10] \n\t"
679  "add.s %[val10], %[val11], %[val10] \n\t"
680  "sub.s %[fTmp4], %[val12], %[val13] \n\t"
681  "mul.s %[val9], %[fTmp1], %[fTmp2] \n\t"
682  "add.s %[val12], %[val12], %[val13] \n\t"
683  "mul.s %[val11], %[fTmp1], %[fTmp3] \n\t"
684  "sub.s %[fTmp2], %[val15], %[val14] \n\t"
685  "mul.s %[val13], %[fTmp1], %[fTmp4] \n\t"
686  "add.s %[val14], %[val15], %[val14] \n\t"
687  "add.s %[val10], %[val10], %[val11] \n\t"
688  "mul.s %[val15], %[fTmp1], %[fTmp2] \n\t"
689  "add.s %[val14], %[val14], %[val15] \n\t"
690  "add.s %[val12], %[val12], %[val14] \n\t"
691  "add.s %[val14], %[val14], %[val13] \n\t"
692  "add.s %[val13], %[val13], %[val15] \n\t"
693  "add.s %[val8], %[val8], %[val12] \n\t"
694  "add.s %[val12], %[val12], %[val10] \n\t"
695  "add.s %[val10], %[val10], %[val14] \n\t"
696  "add.s %[val14], %[val14], %[val9] \n\t"
697  "add.s %[val9], %[val9], %[val13] \n\t"
698  "add.s %[val13], %[val13], %[val11] \n\t"
699  "add.s %[val11], %[val11], %[val15] \n\t"
700  "swc1 %[val8], 2*4(%[out]) \n\t"
701  "swc1 %[val9], 18*4(%[out]) \n\t"
702  "swc1 %[val10], 10*4(%[out]) \n\t"
703  "swc1 %[val11], 26*4(%[out]) \n\t"
704  "swc1 %[val12], 6*4(%[out]) \n\t"
705  "swc1 %[val13], 22*4(%[out]) \n\t"
706  "swc1 %[val14], 14*4(%[out]) \n\t"
707  "swc1 %[val15], 30*4(%[out]) \n\t"
708 
709  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
710  [val8] "+f" (val8), [val9] "+f" (val9), [val10] "+f" (val10),
711  [val11] "+f" (val11), [val12] "+f" (val12), [val13] "+f" (val13),
712  [val14] "+f" (val14), [val15] "+f" (val15)
713  : [fTmp1] "f" (fTmp1), [out] "r" (out)
714  );
715 
    /* Statement 11: last combination for val16..val23 (fTmp1 still from
     * statement 9). Nothing is stored here; the results stay in the val*
     * locals for the C assignments at the end. */
716  __asm__ volatile (
717  "sub.s %[fTmp2], %[val16], %[val17] \n\t"
718  "add.s %[val16], %[val16], %[val17] \n\t"
719  "sub.s %[fTmp3], %[val19], %[val18] \n\t"
720  "add.s %[val18], %[val19], %[val18] \n\t"
721  "sub.s %[fTmp4], %[val20], %[val21] \n\t"
722  "mul.s %[val17], %[fTmp1], %[fTmp2] \n\t"
723  "add.s %[val20], %[val20], %[val21] \n\t"
724  "mul.s %[val19], %[fTmp1], %[fTmp3] \n\t"
725  "sub.s %[fTmp2], %[val23], %[val22] \n\t"
726  "mul.s %[val21], %[fTmp1], %[fTmp4] \n\t"
727  "add.s %[val22], %[val23], %[val22] \n\t"
728  "add.s %[val18], %[val18], %[val19] \n\t"
729  "mul.s %[val23], %[fTmp1], %[fTmp2] \n\t"
730  "add.s %[val22], %[val22], %[val23] \n\t"
731  "add.s %[val20], %[val20], %[val22] \n\t"
732  "add.s %[val22], %[val22], %[val21] \n\t"
733  "add.s %[val21], %[val21], %[val23] \n\t"
734 
735  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
736  [val16] "+f" (val16), [val17] "+f" (val17), [val18] "+f" (val18),
737  [val19] "+f" (val19), [val20] "+f" (val20), [val21] "+f" (val21),
738  [val22] "+f" (val22), [val23] "+f" (val23)
739  : [fTmp1] "f" (fTmp1)
740  );
741 
    /* Statement 12: last combination for val24..val31 (fTmp1 still from
     * statement 9), including the same in-place addition chain as
     * statement 10. Nothing is stored here either. */
742  __asm__ volatile (
743  "sub.s %[fTmp2], %[val24], %[val25] \n\t"
744  "add.s %[val24], %[val24], %[val25] \n\t"
745  "sub.s %[fTmp3], %[val27], %[val26] \n\t"
746  "add.s %[val26], %[val27], %[val26] \n\t"
747  "sub.s %[fTmp4], %[val28], %[val29] \n\t"
748  "mul.s %[val25], %[fTmp1], %[fTmp2] \n\t"
749  "add.s %[val28], %[val28], %[val29] \n\t"
750  "mul.s %[val27], %[fTmp1], %[fTmp3] \n\t"
751  "sub.s %[fTmp2], %[val31], %[val30] \n\t"
752  "mul.s %[val29], %[fTmp1], %[fTmp4] \n\t"
753  "add.s %[val30], %[val31], %[val30] \n\t"
754  "add.s %[val26], %[val26], %[val27] \n\t"
755  "mul.s %[val31], %[fTmp1], %[fTmp2] \n\t"
756  "add.s %[val30], %[val30], %[val31] \n\t"
757  "add.s %[val28], %[val28], %[val30] \n\t"
758  "add.s %[val30], %[val30], %[val29] \n\t"
759  "add.s %[val29], %[val29], %[val31] \n\t"
760  "add.s %[val24], %[val24], %[val28] \n\t"
761  "add.s %[val28], %[val28], %[val26] \n\t"
762  "add.s %[val26], %[val26], %[val30] \n\t"
763  "add.s %[val30], %[val30], %[val25] \n\t"
764  "add.s %[val25], %[val25], %[val29] \n\t"
765  "add.s %[val29], %[val29], %[val27] \n\t"
766  "add.s %[val27], %[val27], %[val31] \n\t"
767 
768  : [fTmp2] "=&f" (fTmp2), [fTmp3] "=&f" (fTmp3), [fTmp4] "=&f" (fTmp4),
769  [val24] "+f" (val24), [val25] "+f" (val25), [val26] "+f" (val26),
770  [val27] "+f" (val27), [val28] "+f" (val28), [val29] "+f" (val29),
771  [val30] "+f" (val30), [val31] "+f" (val31)
772  : [fTmp1] "f" (fTmp1)
773  );
774 
    /* Odd-indexed outputs: each is the sum of one value from val16..val23
     * and one from val24..val31, except out[31], which is val31 alone. */
775  out[ 1] = val16 + val24;
776  out[17] = val17 + val25;
777  out[ 9] = val18 + val26;
778  out[25] = val19 + val27;
779  out[ 5] = val20 + val28;
780  out[21] = val21 + val29;
781  out[13] = val22 + val30;
782  out[29] = val23 + val31;
783  out[ 3] = val24 + val20;
784  out[19] = val25 + val21;
785  out[11] = val26 + val22;
786  out[27] = val27 + val23;
787  out[ 7] = val28 + val18;
788  out[23] = val29 + val19;
789  out[15] = val30 + val17;
790  out[31] = val31;
791 }
792 
/**
 * IMDCT36 kernel for a single block, written as hand-scheduled MIPS FPU
 * inline assembly. The caller describes the overall operation as
 * "apply window & overlap with previous buffer".
 *
 * @param out  destination; 18 values are stored at a stride of 32 floats
 *             (float offsets 0, 32, ..., 544)
 * @param buf  overlap buffer; 18 values at a stride of 4 floats
 *             (float offsets 0, 4, ..., 68) are read and then overwritten
 * @param in   input coefficients in[0..17]; in[1..17] are modified in place
 *             by the first asm block
 * @param win  window coefficients; entries 0..17 and 20..37 are read
 *
 * Every asm block carries a "memory" clobber because it loads and stores
 * through the pointer operands, which are passed only as plain "r" inputs.
 */
793 static void imdct36_mips_float(float *out, float *buf, float *in, float *win)
794 {
795  float t0, t1, t2, t3, s0, s1, s2, s3;
796  float tmp[18];
797  /* temporary variables: scratch values for the asm blocks below;
       c1..c9 receive constants loaded with li.s inside the asm */
798  float in1, in2, in3, in4, in5, in6;
799  float out1, out2, out3, out4, out5;
800  float c1, c2, c3, c4, c5, c6, c7, c8, c9;
801 
802  /**
803  * all loops are unrolled totally, and instructions are scheduled to
804  * minimize pipeline stall. instructions of the first two loops are
805  * reorganized, in order to eliminate unnecessary readings and
806  * writings into array. values defined in macros and tables are
807  * eliminated - they are directly loaded in appropriate variables
808  */
809 
810  /* loop 1 and 2: in-place update of in[1..17]; in[0] is only read.
       * Every element first gets its predecessor added
       * (in[i] + in[i-1]); each odd-indexed element 3..17 additionally
       * gets the first-pass sum of the odd element two places below it.
       * Both passes are fused here, so each result is computed in
       * registers and stored once.
       */
811  __asm__ volatile (
812  "lwc1 %[in1], 17*4(%[in]) \t\n"
813  "lwc1 %[in2], 16*4(%[in]) \t\n"
814  "lwc1 %[in3], 15*4(%[in]) \t\n"
815  "lwc1 %[in4], 14*4(%[in]) \t\n"
816  "lwc1 %[in5], 13*4(%[in]) \t\n"
817  "lwc1 %[in6], 12*4(%[in]) \t\n"
818  "add.s %[out1], %[in1], %[in2] \t\n"
819  "add.s %[out2], %[in2], %[in3] \t\n"
820  "add.s %[out3], %[in3], %[in4] \t\n"
821  "add.s %[out4], %[in4], %[in5] \t\n"
822  "add.s %[out5], %[in5], %[in6] \t\n"
823  "lwc1 %[in1], 11*4(%[in]) \t\n"
824  "swc1 %[out2], 16*4(%[in]) \t\n"
825  "add.s %[out1], %[out1], %[out3] \t\n"
826  "swc1 %[out4], 14*4(%[in]) \t\n"
827  "add.s %[out3], %[out3], %[out5] \t\n"
828  "lwc1 %[in2], 10*4(%[in]) \t\n"
829  "lwc1 %[in3], 9*4(%[in]) \t\n"
830  "swc1 %[out1], 17*4(%[in]) \t\n"
831  "lwc1 %[in4], 8*4(%[in]) \t\n"
832  "swc1 %[out3], 15*4(%[in]) \t\n"
833  "add.s %[out1], %[in6], %[in1] \t\n"
834  "add.s %[out2], %[in1], %[in2] \t\n"
835  "add.s %[out3], %[in2], %[in3] \t\n"
836  "add.s %[out4], %[in3], %[in4] \t\n"
837  "lwc1 %[in5], 7*4(%[in]) \t\n"
838  "swc1 %[out1], 12*4(%[in]) \t\n"
839  "add.s %[out5], %[out5], %[out2] \t\n"
840  "swc1 %[out3], 10*4(%[in]) \t\n"
841  "add.s %[out2], %[out2], %[out4] \t\n"
842  "lwc1 %[in6], 6*4(%[in]) \t\n"
843  "lwc1 %[in1], 5*4(%[in]) \t\n"
844  "swc1 %[out5], 13*4(%[in]) \t\n"
845  "lwc1 %[in2], 4*4(%[in]) \t\n"
846  "swc1 %[out2], 11*4(%[in]) \t\n"
847  "add.s %[out5], %[in4], %[in5] \t\n"
848  "add.s %[out1], %[in5], %[in6] \t\n"
849  "add.s %[out2], %[in6], %[in1] \t\n"
850  "add.s %[out3], %[in1], %[in2] \t\n"
851  "lwc1 %[in3], 3*4(%[in]) \t\n"
852  "swc1 %[out5], 8*4(%[in]) \t\n"
853  "add.s %[out4], %[out4], %[out1] \t\n"
854  "swc1 %[out2], 6*4(%[in]) \t\n"
855  "add.s %[out1], %[out1], %[out3] \t\n"
856  "lwc1 %[in4], 2*4(%[in]) \t\n"
857  "lwc1 %[in5], 1*4(%[in]) \t\n"
858  "swc1 %[out4], 9*4(%[in]) \t\n"
859  "lwc1 %[in6], 0(%[in]) \t\n"
860  "swc1 %[out1], 7*4(%[in]) \t\n"
861  "add.s %[out4], %[in2], %[in3] \t\n"
862  "add.s %[out5], %[in3], %[in4] \t\n"
863  "add.s %[out1], %[in4], %[in5] \t\n"
864  "add.s %[out2], %[in5], %[in6] \t\n"
865  "swc1 %[out4], 4*4(%[in]) \t\n"
866  "add.s %[out3], %[out3], %[out5] \t\n"
867  "swc1 %[out1], 2*4(%[in]) \t\n"
868  "add.s %[out5], %[out5], %[out2] \t\n"
869  "swc1 %[out2], 1*4(%[in]) \t\n"
870  "swc1 %[out3], 5*4(%[in]) \t\n"
871  "swc1 %[out5], 3*4(%[in]) \t\n"
872 
873  : [in1] "=&f" (in1), [in2] "=&f" (in2),
874  [in3] "=&f" (in3), [in4] "=&f" (in4),
875  [in5] "=&f" (in5), [in6] "=&f" (in6),
876  [out1] "=&f" (out1), [out2] "=&f" (out2),
877  [out3] "=&f" (out3), [out4] "=&f" (out4),
878  [out5] "=&f" (out5)
879  : [in] "r" (in)
880  : "memory"
881  );
882 
883  /* loop 3: fills tmp[0..17] from the updated in[]. The even-indexed
       * inputs produce the even-indexed tmp entries first, then the
       * odd-indexed inputs produce the odd-indexed ones; the loads for
       * the odd pass start before the last even results are stored.
       * c1..c9 are loaded once with li.s and reused by both passes.
       */
884  __asm__ volatile (
885  "li.s %[c1], 0.5 \t\n"
886  "lwc1 %[in1], 8*4(%[in]) \t\n"
887  "lwc1 %[in2], 16*4(%[in]) \t\n"
888  "lwc1 %[in3], 4*4(%[in]) \t\n"
889  "lwc1 %[in4], 0(%[in]) \t\n"
890  "lwc1 %[in5], 12*4(%[in]) \t\n"
891  "li.s %[c2], 0.93969262078590838405 \t\n"
892  "add.s %[t2], %[in1], %[in2] \t\n"
893  "add.s %[t0], %[in1], %[in3] \t\n"
894  "li.s %[c3], -0.76604444311897803520 \t\n"
895  "madd.s %[t3], %[in4], %[in5], %[c1] \t\n"
896  "sub.s %[t1], %[in4], %[in5] \t\n"
897  "sub.s %[t2], %[t2], %[in3] \t\n"
898  "mul.s %[t0], %[t0], %[c2] \t\n"
899  "li.s %[c4], -0.17364817766693034885 \t\n"
900  "li.s %[c5], -0.86602540378443864676 \t\n"
901  "li.s %[c6], 0.98480775301220805936 \t\n"
902  "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n"
903  "add.s %[out2], %[t1], %[t2] \t\n"
904  "add.s %[t2], %[in2], %[in3] \t\n"
905  "sub.s %[t1], %[in1], %[in2] \t\n"
906  "sub.s %[out3], %[t3], %[t0] \t\n"
907  "swc1 %[out1], 6*4(%[tmp]) \t\n"
908  "swc1 %[out2], 16*4(%[tmp]) \t\n"
909  "mul.s %[t2], %[t2], %[c3] \t\n"
910  "mul.s %[t1], %[t1], %[c4] \t\n"
911  "add.s %[out1], %[t3], %[t0] \t\n"
912  "lwc1 %[in1], 10*4(%[in]) \t\n"
913  "lwc1 %[in2], 14*4(%[in]) \t\n"
914  "sub.s %[out3], %[out3], %[t2] \t\n"
915  "add.s %[out2], %[t3], %[t2] \t\n"
916  "add.s %[out1], %[out1], %[t1] \t\n"
917  "lwc1 %[in3], 2*4(%[in]) \t\n"
918  "lwc1 %[in4], 6*4(%[in]) \t\n"
919  "swc1 %[out3], 10*4(%[tmp]) \t\n"
920  "sub.s %[out2], %[out2], %[t1] \t\n"
921  "swc1 %[out1], 2*4(%[tmp]) \t\n"
922  "add.s %[out1], %[in1], %[in2] \t\n"
923  "add.s %[t2], %[in1], %[in3] \t\n"
924  "sub.s %[t3], %[in1], %[in2] \t\n"
925  "swc1 %[out2], 14*4(%[tmp]) \t\n"
926  "li.s %[c7], -0.34202014332566873304 \t\n"
927  "sub.s %[out1], %[out1], %[in3] \t\n"
928  "mul.s %[t2], %[t2], %[c6] \t\n"
929  "mul.s %[t3], %[t3], %[c7] \t\n"
930  "li.s %[c8], 0.86602540378443864676 \t\n"
931  "mul.s %[t0], %[in4], %[c8] \t\n"
932  "mul.s %[out1], %[out1], %[c5] \t\n"
933  "add.s %[t1], %[in2], %[in3] \t\n"
934  "li.s %[c9], -0.64278760968653932632 \t\n"
935  "add.s %[out2], %[t2], %[t3] \t\n"
936  "lwc1 %[in1], 9*4(%[in]) \t\n"
937  "swc1 %[out1], 4*4(%[tmp]) \t\n"
938  "mul.s %[t1], %[t1], %[c9] \t\n"
939  "lwc1 %[in2], 17*4(%[in]) \t\n"
940  "add.s %[out2], %[out2], %[t0] \t\n"
941  "lwc1 %[in3], 5*4(%[in]) \t\n"
942  "lwc1 %[in4], 1*4(%[in]) \t\n"
943  "add.s %[out3], %[t2], %[t1] \t\n"
944  "sub.s %[out1], %[t3], %[t1] \t\n"
945  "swc1 %[out2], 0(%[tmp]) \t\n"
946  "lwc1 %[in5], 13*4(%[in]) \t\n"
947  "add.s %[t2], %[in1], %[in2] \t\n"
948  "sub.s %[out3], %[out3], %[t0] \t\n"
949  "sub.s %[out1], %[out1], %[t0] \t\n"
950  "add.s %[t0], %[in1], %[in3] \t\n"
951  "madd.s %[t3], %[in4], %[in5], %[c1] \t\n"
952  "sub.s %[t2], %[t2], %[in3] \t\n"
953  "swc1 %[out3], 12*4(%[tmp]) \t\n"
954  "swc1 %[out1], 8*4(%[tmp]) \t\n"
955  "sub.s %[t1], %[in4], %[in5] \t\n"
956  "mul.s %[t0], %[t0], %[c2] \t\n"
957  "nmsub.s %[out1], %[t1], %[t2], %[c1] \t\n"
958  "add.s %[out2], %[t1], %[t2] \t\n"
959  "add.s %[t2], %[in2], %[in3] \t\n"
960  "sub.s %[t1], %[in1], %[in2] \t\n"
961  "sub.s %[out3], %[t3], %[t0] \t\n"
962  "swc1 %[out1], 7*4(%[tmp]) \t\n"
963  "swc1 %[out2], 17*4(%[tmp]) \t\n"
964  "mul.s %[t2], %[t2], %[c3] \t\n"
965  "mul.s %[t1], %[t1], %[c4] \t\n"
966  "add.s %[out1], %[t3], %[t0] \t\n"
967  "lwc1 %[in1], 11*4(%[in]) \t\n"
968  "lwc1 %[in2], 15*4(%[in]) \t\n"
969  "sub.s %[out3], %[out3], %[t2] \t\n"
970  "add.s %[out2], %[t3], %[t2] \t\n"
971  "add.s %[out1], %[out1], %[t1] \t\n"
972  "lwc1 %[in3], 3*4(%[in]) \t\n"
973  "lwc1 %[in4], 7*4(%[in]) \t\n"
974  "swc1 %[out3], 11*4(%[tmp]) \t\n"
975  "sub.s %[out2], %[out2], %[t1] \t\n"
976  "swc1 %[out1], 3*4(%[tmp]) \t\n"
977  "add.s %[out3], %[in1], %[in2] \t\n"
978  "add.s %[t2], %[in1], %[in3] \t\n"
979  "sub.s %[t3], %[in1], %[in2] \t\n"
980  "swc1 %[out2], 15*4(%[tmp]) \t\n"
981  "mul.s %[t0], %[in4], %[c8] \t\n"
982  "sub.s %[out3], %[out3], %[in3] \t\n"
983  "mul.s %[t2], %[t2], %[c6] \t\n"
984  "mul.s %[t3], %[t3], %[c7] \t\n"
985  "add.s %[t1], %[in2], %[in3] \t\n"
986  "mul.s %[out3], %[out3], %[c5] \t\n"
987  "add.s %[out1], %[t2], %[t3] \t\n"
988  "mul.s %[t1], %[t1], %[c9] \t\n"
989  "swc1 %[out3], 5*4(%[tmp]) \t\n"
990  "add.s %[out1], %[out1], %[t0] \t\n"
991  "add.s %[out2], %[t2], %[t1] \t\n"
992  "sub.s %[out3], %[t3], %[t1] \t\n"
993  "swc1 %[out1], 1*4(%[tmp]) \t\n"
994  "sub.s %[out2], %[out2], %[t0] \t\n"
995  "sub.s %[out3], %[out3], %[t0] \t\n"
996  "swc1 %[out2], 13*4(%[tmp]) \t\n"
997  "swc1 %[out3], 9*4(%[tmp]) \t\n"
998 
999  : [t0] "=&f" (t0), [t1] "=&f" (t1),
1000  [t2] "=&f" (t2), [t3] "=&f" (t3),
1001  [in1] "=&f" (in1), [in2] "=&f" (in2),
1002  [in3] "=&f" (in3), [in4] "=&f" (in4),
1003  [in5] "=&f" (in5),
1004  [out1] "=&f" (out1), [out2] "=&f" (out2),
1005  [out3] "=&f" (out3),
1006  [c1] "=&f" (c1), [c2] "=&f" (c2),
1007  [c3] "=&f" (c3), [c4] "=&f" (c4),
1008  [c5] "=&f" (c5), [c6] "=&f" (c6),
1009  [c7] "=&f" (c7), [c8] "=&f" (c8),
1010  [c9] "=&f" (c9)
1011  : [in] "r" (in), [tmp] "r" (tmp)
1012  : "memory"
1013  );
1014 
1015  /* loop 4: combines tmp[] with win[] and buf[]. Sums and differences
        * of tmp[] pairs (the odd-side ones scaled by the constants loaded
        * into c1/c2) give t0 and t1. For every k in 0..17 the block then
        * stores out[32*k] = t1 * win[k] + buf[4*k] (madd.s fd, fr, fs, ft
        * computes fs * ft + fr) and afterwards overwrites buf[4*k] with
        * t0 * win[20 + k]. Each t0/t1 pair serves two values of k, except
        * the final pair built from tmp[16] and tmp[17].
        * NOTE(review): t2 and t3 are declared as outputs of this asm
        * statement but are never referenced in its instructions.
        */
1016  __asm__ volatile (
1017  "lwc1 %[in1], 2*4(%[tmp]) \t\n"
1018  "lwc1 %[in2], 0(%[tmp]) \t\n"
1019  "lwc1 %[in3], 3*4(%[tmp]) \t\n"
1020  "lwc1 %[in4], 1*4(%[tmp]) \t\n"
1021  "li.s %[c1], 0.50190991877167369479 \t\n"
1022  "li.s %[c2], 5.73685662283492756461 \t\n"
1023  "add.s %[s0], %[in1], %[in2] \t\n"
1024  "sub.s %[s2], %[in1], %[in2] \t\n"
1025  "add.s %[s1], %[in3], %[in4] \t\n"
1026  "sub.s %[s3], %[in3], %[in4] \t\n"
1027  "lwc1 %[in1], 9*4(%[win]) \t\n"
1028  "lwc1 %[in2], 4*9*4(%[buf]) \t\n"
1029  "lwc1 %[in3], 8*4(%[win]) \t\n"
1030  "mul.s %[s1], %[s1], %[c1] \t\n"
1031  "mul.s %[s3], %[s3], %[c2] \t\n"
1032  "lwc1 %[in4], 4*8*4(%[buf]) \t\n"
1033  "lwc1 %[in5], 29*4(%[win]) \t\n"
1034  "lwc1 %[in6], 28*4(%[win]) \t\n"
1035  "add.s %[t0], %[s0], %[s1] \t\n"
1036  "sub.s %[t1], %[s0], %[s1] \t\n"
1037  "li.s %[c1], 0.51763809020504152469 \t\n"
1038  "li.s %[c2], 1.93185165257813657349 \t\n"
1039  "mul.s %[out3], %[in5], %[t0] \t\n"
1040  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1041  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1042  "mul.s %[out4], %[in6], %[t0] \t\n"
1043  "add.s %[t0], %[s2], %[s3] \t\n"
1044  "swc1 %[out3], 4*9*4(%[buf]) \t\n"
1045  "swc1 %[out1], 288*4(%[out]) \t\n"
1046  "swc1 %[out2], 256*4(%[out]) \t\n"
1047  "swc1 %[out4], 4*8*4(%[buf]) \t\n"
1048  "sub.s %[t1], %[s2], %[s3] \t\n"
1049  "lwc1 %[in1], 17*4(%[win]) \t\n"
1050  "lwc1 %[in2], 4*17*4(%[buf]) \t\n"
1051  "lwc1 %[in3], 0(%[win]) \t\n"
1052  "lwc1 %[in4], 0(%[buf]) \t\n"
1053  "lwc1 %[in5], 37*4(%[win]) \t\n"
1054  "lwc1 %[in6], 20*4(%[win]) \t\n"
1055  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1056  "lwc1 %[in1], 6*4(%[tmp]) \t\n"
1057  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1058  "mul.s %[out3], %[t0], %[in5] \t\n"
1059  "mul.s %[out4], %[t0], %[in6] \t\n"
1060  "swc1 %[out1], 544*4(%[out]) \t\n"
1061  "lwc1 %[in2], 4*4(%[tmp]) \t\n"
1062  "swc1 %[out2], 0(%[out]) \t\n"
1063  "swc1 %[out3], 4*17*4(%[buf]) \t\n"
1064  "swc1 %[out4], 0(%[buf]) \t\n"
1065  "lwc1 %[in3], 7*4(%[tmp]) \t\n"
1066  "add.s %[s0], %[in1], %[in2] \t\n"
1067  "sub.s %[s2], %[in1], %[in2] \t\n"
1068  "lwc1 %[in4], 5*4(%[tmp]) \t\n"
1069  "add.s %[s1], %[in3], %[in4] \t\n"
1070  "sub.s %[s3], %[in3], %[in4] \t\n"
1071  "lwc1 %[in1], 10*4(%[win]) \t\n"
1072  "lwc1 %[in2], 4*10*4(%[buf]) \t\n"
1073  "lwc1 %[in3], 7*4(%[win]) \t\n"
1074  "mul.s %[s1], %[s1], %[c1] \t\n"
1075  "mul.s %[s3], %[s3], %[c2] \t\n"
1076  "add.s %[t0], %[s0], %[s1] \t\n"
1077  "sub.s %[t1], %[s0], %[s1] \t\n"
1078  "lwc1 %[in4], 4*7*4(%[buf]) \t\n"
1079  "lwc1 %[in5], 30*4(%[win]) \t\n"
1080  "lwc1 %[in6], 27*4(%[win]) \t\n"
1081  "li.s %[c1], 0.55168895948124587824 \t\n"
1082  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1083  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1084  "mul.s %[out3], %[t0], %[in5] \t\n"
1085  "mul.s %[out4], %[t0], %[in6] \t\n"
1086  "add.s %[t0], %[s2], %[s3] \t\n"
1087  "swc1 %[out1], 320*4(%[out]) \t\n"
1088  "swc1 %[out2], 224*4(%[out]) \t\n"
1089  "swc1 %[out3], 4*10*4(%[buf]) \t\n"
1090  "swc1 %[out4], 4*7*4(%[buf]) \t\n"
1091  "sub.s %[t1], %[s2], %[s3] \t\n"
1092  "lwc1 %[in1], 16*4(%[win]) \t\n"
1093  "lwc1 %[in2], 4*16*4(%[buf]) \t\n"
1094  "lwc1 %[in3], 1*4(%[win]) \t\n"
1095  "lwc1 %[in4], 4*1*4(%[buf]) \t\n"
1096  "lwc1 %[in5], 36*4(%[win]) \t\n"
1097  "lwc1 %[in6], 21*4(%[win]) \t\n"
1098  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1099  "lwc1 %[in1], 10*4(%[tmp]) \t\n"
1100  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1101  "mul.s %[out3], %[in5], %[t0] \t\n"
1102  "mul.s %[out4], %[in6], %[t0] \t\n"
1103  "swc1 %[out1], 512*4(%[out]) \t\n"
1104  "lwc1 %[in2], 8*4(%[tmp]) \t\n"
1105  "swc1 %[out2], 32*4(%[out]) \t\n"
1106  "swc1 %[out3], 4*16*4(%[buf]) \t\n"
1107  "swc1 %[out4], 4*1*4(%[buf]) \t\n"
1108  "li.s %[c2], 1.18310079157624925896 \t\n"
1109  "add.s %[s0], %[in1], %[in2] \t\n"
1110  "sub.s %[s2], %[in1], %[in2] \t\n"
1111  "lwc1 %[in3], 11*4(%[tmp]) \t\n"
1112  "lwc1 %[in4], 9*4(%[tmp]) \t\n"
1113  "add.s %[s1], %[in3], %[in4] \t\n"
1114  "sub.s %[s3], %[in3], %[in4] \t\n"
1115  "lwc1 %[in1], 11*4(%[win]) \t\n"
1116  "lwc1 %[in2], 4*11*4(%[buf]) \t\n"
1117  "lwc1 %[in3], 6*4(%[win]) \t\n"
1118  "mul.s %[s1], %[s1], %[c1] \t\n"
1119  "mul.s %[s3], %[s3], %[c2] \t\n"
1120  "lwc1 %[in4], 4*6*4(%[buf]) \t\n"
1121  "lwc1 %[in5], 31*4(%[win]) \t\n"
1122  "lwc1 %[in6], 26*4(%[win]) \t\n"
1123  "add.s %[t0], %[s0], %[s1] \t\n"
1124  "sub.s %[t1], %[s0], %[s1] \t\n"
1125  "mul.s %[out3], %[t0], %[in5] \t\n"
1126  "mul.s %[out4], %[t0], %[in6] \t\n"
1127  "add.s %[t0], %[s2], %[s3] \t\n"
1128  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1129  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1130  "swc1 %[out3], 4*11*4(%[buf]) \t\n"
1131  "swc1 %[out4], 4*6*4(%[buf]) \t\n"
1132  "sub.s %[t1], %[s2], %[s3] \t\n"
1133  "swc1 %[out1], 352*4(%[out]) \t\n"
1134  "swc1 %[out2], 192*4(%[out]) \t\n"
1135  "lwc1 %[in1], 15*4(%[win]) \t\n"
1136  "lwc1 %[in2], 4*15*4(%[buf]) \t\n"
1137  "lwc1 %[in3], 2*4(%[win]) \t\n"
1138  "lwc1 %[in4], 4*2*4(%[buf]) \t\n"
1139  "lwc1 %[in5], 35*4(%[win]) \t\n"
1140  "lwc1 %[in6], 22*4(%[win]) \t\n"
1141  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1142  "lwc1 %[in1], 14*4(%[tmp]) \t\n"
1143  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1144  "mul.s %[out3], %[t0], %[in5] \t\n"
1145  "mul.s %[out4], %[t0], %[in6] \t\n"
1146  "swc1 %[out1], 480*4(%[out]) \t\n"
1147  "lwc1 %[in2], 12*4(%[tmp]) \t\n"
1148  "swc1 %[out2], 64*4(%[out]) \t\n"
1149  "swc1 %[out3], 4*15*4(%[buf]) \t\n"
1150  "swc1 %[out4], 4*2*4(%[buf]) \t\n"
1151  "lwc1 %[in3], 15*4(%[tmp]) \t\n"
1152  "add.s %[s0], %[in1], %[in2] \t\n"
1153  "sub.s %[s2], %[in1], %[in2] \t\n"
1154  "lwc1 %[in4], 13*4(%[tmp]) \t\n"
1155  "li.s %[c1], 0.61038729438072803416 \t\n"
1156  "li.s %[c2], 0.87172339781054900991 \t\n"
1157  "add.s %[s1], %[in3], %[in4] \t\n"
1158  "sub.s %[s3], %[in3], %[in4] \t\n"
1159  "lwc1 %[in1], 12*4(%[win]) \t\n"
1160  "lwc1 %[in2], 4*12*4(%[buf]) \t\n"
1161  "lwc1 %[in3], 5*4(%[win]) \t\n"
1162  "mul.s %[s1], %[s1], %[c1] \t\n"
1163  "mul.s %[s3], %[s3], %[c2] \t\n"
1164  "lwc1 %[in4], 4*5*4(%[buf]) \t\n"
1165  "lwc1 %[in5], 32*4(%[win]) \t\n"
1166  "lwc1 %[in6], 25*4(%[win]) \t\n"
1167  "add.s %[t0], %[s0], %[s1] \t\n"
1168  "sub.s %[t1], %[s0], %[s1] \t\n"
1169  "lwc1 %[s0], 16*4(%[tmp]) \t\n"
1170  "lwc1 %[s1], 17*4(%[tmp]) \t\n"
1171  "li.s %[c1], 0.70710678118654752439 \t\n"
1172  "mul.s %[out3], %[t0], %[in5] \t\n"
1173  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1174  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1175  "mul.s %[out4], %[t0], %[in6] \t\n"
1176  "add.s %[t0], %[s2], %[s3] \t\n"
1177  "swc1 %[out3], 4*12*4(%[buf]) \t\n"
1178  "swc1 %[out1], 384*4(%[out]) \t\n"
1179  "swc1 %[out2], 160*4(%[out]) \t\n"
1180  "swc1 %[out4], 4*5*4(%[buf]) \t\n"
1181  "sub.s %[t1], %[s2], %[s3] \t\n"
1182  "lwc1 %[in1], 14*4(%[win]) \t\n"
1183  "lwc1 %[in2], 4*14*4(%[buf]) \t\n"
1184  "lwc1 %[in3], 3*4(%[win]) \t\n"
1185  "lwc1 %[in4], 4*3*4(%[buf]) \t\n"
1186  "lwc1 %[in5], 34*4(%[win]) \t\n"
1187  "lwc1 %[in6], 23*4(%[win]) \t\n"
1188  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1189  "mul.s %[s1], %[s1], %[c1] \t\n"
1190  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1191  "mul.s %[out3], %[in5], %[t0] \t\n"
1192  "mul.s %[out4], %[in6], %[t0] \t\n"
1193  "swc1 %[out1], 448*4(%[out]) \t\n"
1194  "add.s %[t0], %[s0], %[s1] \t\n"
1195  "swc1 %[out2], 96*4(%[out]) \t\n"
1196  "swc1 %[out3], 4*14*4(%[buf]) \t\n"
1197  "swc1 %[out4], 4*3*4(%[buf]) \t\n"
1198  "sub.s %[t1], %[s0], %[s1] \t\n"
1199  "lwc1 %[in1], 13*4(%[win]) \t\n"
1200  "lwc1 %[in2], 4*13*4(%[buf]) \t\n"
1201  "lwc1 %[in3], 4*4(%[win]) \t\n"
1202  "lwc1 %[in4], 4*4*4(%[buf]) \t\n"
1203  "lwc1 %[in5], 33*4(%[win]) \t\n"
1204  "lwc1 %[in6], 24*4(%[win]) \t\n"
1205  "madd.s %[out1], %[in2], %[in1], %[t1] \t\n"
1206  "madd.s %[out2], %[in4], %[in3], %[t1] \t\n"
1207  "mul.s %[out3], %[t0], %[in5] \t\n"
1208  "mul.s %[out4], %[t0], %[in6] \t\n"
1209  "swc1 %[out1], 416*4(%[out]) \t\n"
1210  "swc1 %[out2], 128*4(%[out]) \t\n"
1211  "swc1 %[out3], 4*13*4(%[buf]) \t\n"
1212  "swc1 %[out4], 4*4*4(%[buf]) \t\n"
1213 
1214  : [c1] "=&f" (c1), [c2] "=&f" (c2),
1215  [in1] "=&f" (in1), [in2] "=&f" (in2),
1216  [in3] "=&f" (in3), [in4] "=&f" (in4),
1217  [in5] "=&f" (in5), [in6] "=&f" (in6),
1218  [out1] "=&f" (out1), [out2] "=&f" (out2),
1219  [out3] "=&f" (out3), [out4] "=&f" (out4),
1220  [t0] "=&f" (t0), [t1] "=&f" (t1),
1221  [t2] "=&f" (t2), [t3] "=&f" (t3),
1222  [s0] "=&f" (s0), [s1] "=&f" (s1),
1223  [s2] "=&f" (s2), [s3] "=&f" (s3)
1224  : [tmp] "r" (tmp), [win] "r" (win),
1225  [buf] "r" (buf), [out] "r" (out)
1226  : "memory"
1227  );
1228 }
1229 
1230 static void ff_imdct36_blocks_mips_float(float *out, float *buf, float *in,
1231  int count, int switch_point, int block_type)
1232 {
1233  int j;
1234  for (j=0 ; j < count; j++) {
1235  /* apply window & overlap with previous buffer */
1236 
1237  /* select window */
1238  int win_idx = (switch_point && j < 2) ? 0 : block_type;
1239  float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];
1240 
1241  imdct36_mips_float(out, buf, in, win);
1242 
1243  in += 18;
1244  buf += ((j&3) != 3 ? 1 : (72-3));
1245  out++;
1246  }
1247 }
1248 
1249 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
1250 #endif /* HAVE_INLINE_ASM && HAVE_MIPSFPU */
1251 
1253 {
1254 #if HAVE_INLINE_ASM && HAVE_MIPSFPU
1255 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
1256  s->apply_window_float = ff_mpadsp_apply_window_mips_float;
1257  s->imdct36_blocks_float = ff_imdct36_blocks_mips_float;
1258  s->dct32_float = ff_dct32_mips_float;
1259 #endif
1260 #endif
1261 }
out
FILE * out
Definition: movenc.c:54
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
MPADSPContext
Definition: mpegaudiodsp.h:27
t0
#define t0
Definition: regdef.h:28
samples
FFmpeg Automated Testing Environment ************************************Introduction Using FATE from your FFmpeg source directory Submitting the results to the FFmpeg result aggregation server Uploading new samples to the fate suite FATE makefile targets and variables Makefile targets Makefile variables Examples Introduction **************FATE is an extended regression suite on the client side and a means for results aggregation and presentation on the server side The first part of this document explains how you can use FATE from your FFmpeg source directory to test your ffmpeg binary The second part describes how you can run FATE to submit the results to FFmpeg’s FATE server In any way you can have a look at the publicly viewable FATE results by visiting this as it can be seen if some test on some platform broke with their recent contribution This usually happens on the platforms the developers could not test on The second part of this document describes how you can run FATE to submit your results to FFmpeg’s FATE server If you want to submit your results be sure to check that your combination of OS and compiler is not already listed on the above mentioned website In the third part you can find a comprehensive listing of FATE makefile targets and variables Using FATE from your FFmpeg source directory **********************************************If you want to run FATE on your machine you need to have the samples in place You can get the samples via the build target fate rsync Use this command from the top level source this will cause FATE to fail NOTE To use a custom wrapper to run the pass ‘ target exec’ to ‘configure’ or set the TARGET_EXEC Make variable Submitting the results to the FFmpeg result aggregation server ****************************************************************To submit your results to the server you should run fate through the shell script ‘tests fate sh’ from the FFmpeg sources This script needs to be invoked with a configuration file as 
its first argument tests fate sh path to fate_config A configuration file template with comments describing the individual configuration variables can be found at ‘doc fate_config sh template’ Create a configuration that suits your based on the configuration template The ‘slot’ configuration variable can be any string that is not yet but it is suggested that you name it adhering to the following pattern ‘ARCH OS COMPILER COMPILER VERSION’ The configuration file itself will be sourced in a shell therefore all shell features may be used This enables you to setup the environment as you need it for your build For your first test runs the ‘fate_recv’ variable should be empty or commented out This will run everything as normal except that it will omit the submission of the results to the server The following files should be present in $workdir as specified in the configuration it may help to try out the ‘ssh’ command with one or more ‘ v’ options You should get detailed output concerning your SSH configuration and the authentication process The only thing left is to automate the execution of the fate sh script and the synchronisation of the samples directory Uploading new samples to the fate suite *****************************************If you need a sample uploaded send a mail to samples request This is for developers who have an account on the fate suite server If you upload new samples
Definition: fate.txt:139
t1
#define t1
Definition: regdef.h:29
asmdefs.h
c1
static const uint64_t c1
Definition: murmur3.c:49
win
static float win(SuperEqualizerContext *s, float n, int N)
Definition: af_superequalizer.c:119
window
static SDL_Window * window
Definition: ffplay.c:368
s3
#define s3
Definition: regdef.h:40
tab
static const struct twinvq_data tab
Definition: twinvq_data.h:11135
s
#define s(width, name)
Definition: cbs_vp9.c:257
s1
#define s1
Definition: regdef.h:38
s2
#define s2
Definition: regdef.h:39
PTR_SUBU
#define PTR_SUBU
Definition: asmdefs.h:50
in
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! 
*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
Definition: audio_convert.c:326
t3
#define t3
Definition: regdef.h:31
PTR_ADDU
#define PTR_ADDU
Definition: asmdefs.h:47
w
FFmpeg Automated Testing Environment ************************************Introduction Using FATE from your FFmpeg source directory Submitting the results to the FFmpeg result aggregation server Uploading new samples to the fate suite FATE makefile targets and variables Makefile targets Makefile variables Examples Introduction **************FATE is an extended regression suite on the client side and a means for results aggregation and presentation on the server side The first part of this document explains how you can use FATE from your FFmpeg source directory to test your ffmpeg binary The second part describes how you can run FATE to submit the results to FFmpeg’s FATE server In any way you can have a look at the publicly viewable FATE results by visiting this as it can be seen if some test on some platform broke with their recent contribution This usually happens on the platforms the developers could not test on The second part of this document describes how you can run FATE to submit your results to FFmpeg’s FATE server If you want to submit your results be sure to check that your combination of OS and compiler is not already listed on the above mentioned website In the third part you can find a comprehensive listing of FATE makefile targets and variables Using FATE from your FFmpeg source directory **********************************************If you want to run FATE on your machine you need to have the samples in place You can get the samples via the build target fate rsync Use this command from the top level source this will cause FATE to fail NOTE To use a custom wrapper to run the pass ‘ target exec’ to ‘configure’ or set the TARGET_EXEC Make variable Submitting the results to the FFmpeg result aggregation server ****************************************************************To submit your results to the server you should run fate through the shell script ‘tests fate sh’ from the FFmpeg sources This script needs to be invoked with a configuration file as 
its first argument tests fate sh path to fate_config A configuration file template with comments describing the individual configuration variables can be found at ‘doc fate_config sh template’ Create a configuration that suits your based on the configuration template The ‘slot’ configuration variable can be any string that is not yet but it is suggested that you name it adhering to the following pattern ‘ARCH OS COMPILER COMPILER VERSION’ The configuration file itself will be sourced in a shell therefore all shell features may be used This enables you to setup the environment as you need it for your build For your first test runs the ‘fate_recv’ variable should be empty or commented out This will run everything as normal except that it will omit the submission of the results to the server The following files should be present in $workdir as specified in the configuration it may help to try out the ‘ssh’ command with one or more ‘ v’ options You should get detailed output concerning your SSH configuration and the authentication process The only thing left is to automate the execution of the fate sh script and the synchronisation of the samples directory Uploading new samples to the fate suite *****************************************If you need a sample uploaded send a mail to samples request This is for developers who have an account on the fate suite server If you upload new please make sure they are as small as space on each network bandwidth and so on benefit from smaller test cases Also keep in mind older checkouts use existing sample that means in practice generally do not remove or overwrite files as it likely would break older checkouts or releases Also all needed samples for a commit should be ideally before the push If you need an account for frequently uploading samples or you wish to help others by doing that send a mail to ffmpeg devel rsync vauL Duo ug o o w
Definition: fate.txt:150
c2
static const uint64_t c2
Definition: murmur3.c:50
ff_mpadsp_init_mipsfpu
void ff_mpadsp_init_mipsfpu(MPADSPContext *s)
Definition: mpegaudiodsp_mips_float.c:1252
t2
#define t2
Definition: regdef.h:30
PTR_ADDIU
#define PTR_ADDIU
Definition: asmdefs.h:48
mpegaudiodsp.h
s0
#define s0
Definition: regdef.h:37
ff_mdct_win_float
float ff_mdct_win_float[8][MDCT_BUF_SIZE]