| rev |
line source |
|
nengel@2
|
1 /*
|
|
nengel@2
|
2 * MMX optimized DSP utils
|
|
nengel@2
|
3 * Copyright (c) 2000, 2001 Fabrice Bellard
|
|
nengel@2
|
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
|
nengel@2
|
5 *
|
|
nengel@2
|
6 * This file is part of FFmpeg.
|
|
nengel@2
|
7 *
|
|
nengel@2
|
8 * FFmpeg is free software; you can redistribute it and/or
|
|
nengel@2
|
9 * modify it under the terms of the GNU Lesser General Public
|
|
nengel@2
|
10 * License as published by the Free Software Foundation; either
|
|
nengel@2
|
11 * version 2.1 of the License, or (at your option) any later version.
|
|
nengel@2
|
12 *
|
|
nengel@2
|
13 * FFmpeg is distributed in the hope that it will be useful,
|
|
nengel@2
|
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
nengel@2
|
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
nengel@2
|
16 * Lesser General Public License for more details.
|
|
nengel@2
|
17 *
|
|
nengel@2
|
18 * You should have received a copy of the GNU Lesser General Public
|
|
nengel@2
|
19 * License along with FFmpeg; if not, write to the Free Software
|
|
nengel@2
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
nengel@2
|
21 *
|
|
nengel@2
|
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
|
nengel@2
|
23 */
|
|
nengel@2
|
24
|
|
nengel@2
|
25 #include "libavutil/x86_cpu.h"
|
|
nengel@2
|
26 #include "libavutil/internal.h"
|
|
nengel@2
|
27 #include "libavcodec/dsputil.h"
|
|
nengel@2
|
28 #include "libavcodec/h264_dsp.h"
|
|
nengel@2
|
29 #include "dsputil_mmx.h"
|
|
nengel@2
|
30
|
|
nengel@2
|
31
|
|
nengel@2
|
32 //#undef NDEBUG
|
|
nengel@2
|
33 //#include <assert.h>
|
|
nengel@2
|
34
|
|
nengel@2
|
35 int mm_flags; /* multimedia extension flags */
|
|
nengel@2
|
36
|
|
nengel@2
|
37 /* pixel operations */
|
|
nengel@2
|
38 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
|
|
nengel@2
|
39 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
|
|
nengel@2
|
40
|
|
nengel@2
|
41 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
|
|
nengel@2
|
42 {0x8000000080000000ULL, 0x8000000080000000ULL};
|
|
nengel@2
|
43
|
|
nengel@2
|
44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
|
|
nengel@2
|
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
|
|
nengel@2
|
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
|
|
nengel@2
|
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
|
|
nengel@2
|
48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
|
|
nengel@2
|
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
|
|
nengel@2
|
50 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
|
|
nengel@2
|
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
|
|
nengel@2
|
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
|
|
nengel@2
|
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
|
|
nengel@2
|
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
|
|
nengel@2
|
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
|
|
nengel@2
|
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
|
|
nengel@2
|
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
|
|
nengel@2
|
58
|
|
nengel@2
|
59 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
|
|
nengel@2
|
60 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
|
|
nengel@2
|
61 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
|
|
nengel@2
|
62 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
|
|
nengel@2
|
63 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
|
|
nengel@2
|
64 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
|
|
nengel@2
|
65 DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
|
|
nengel@2
|
66 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
|
|
nengel@2
|
67
|
|
nengel@2
|
68 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
|
|
nengel@2
|
69 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
|
|
nengel@2
|
70
|
|
nengel@2
|
/* Assembler fragment: ".align 1 << ZEROBITS" followed by newline + tab. */
#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t"

/* Stand-alone statement that emits ASMALIGN(3). */
#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)

/* Zero the register regd by XORing it with itself. */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

/*
 * Fill every byte of regd with 0xFE: pcmpeqd of a register with itself
 * sets all bits, and adding each 0xFF byte to itself wraps to 0xFE.
 */
#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
        "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC

/* Non-PIC build: load the constants straight from memory. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))

#else

/*
 * PIC (shared library) build: synthesize the constants in the register
 * rather than referencing them in memory.
 * pcmpeqd -> -1 in every lane.
 */

/* all ones -> 0x0001 per word (psrlw 15) -> 0x01 per byte (packuswb) */
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* all ones -> 0x0001 per word (psrlw 15) -> 0x0002 per word (psllw 1) */
#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
        "psrlw $15, %%" #regd " \n\t" \
        "psllw $1, %%" #regd " \n\t"::)

#endif
|
|
nengel@2
|
99
|
|
nengel@2
|
/*
 * Byte-wise averages built from plain MMX instructions.  Each macro
 * expands to an asm string fragment; the arguments are register names.
 *
 * PAVGB_MMX_NO_RND / PAVGB_MMX:
 *   regr  receives the result (and serves as the temporary),
 *   rega  is left unmodified,
 *   regb  is trashed,
 *   regfe is supposed to contain 0xfefefefefefefefe.
 *
 * The masking with 0xfe before the 64-bit shift keeps bits from leaking
 * between neighbouring bytes.
 */

/* Rounds down: (a & b) + (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* Rounds up: (a | b) - (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

/*
 * Paired variants: average (rega, regb) into regr and (regc, regd) into
 * regp with the two instruction streams interleaved.  regb and regd are
 * trashed.
 * mm6 is supposed to contain 0xfefefefefefefefe
 */

/* Paired, rounding down. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/* Paired, rounding up. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
|
|
nengel@2
|
147
|
|
nengel@2
|
148 /***********************************/
|
|
nengel@2
|
149 /* MMX2 specific */
|
|
nengel@2
|
150
|
|
nengel@2
|
151 #define DEF(x) x ## _mmx2
|
|
nengel@2
|
152
|
|
nengel@2
|
153 /* Introduced only in MMX2 set */
|
|
nengel@2
|
154 #define PAVGB "pavgb"
|
|
nengel@2
|
155 #define OP_AVG PAVGB
|
|
nengel@2
|
156
|
|
nengel@2
|
157 #include "dsputil_mmx_avg_template.c"
|
|
nengel@2
|
158
|
|
nengel@2
|
159 #undef DEF
|
|
nengel@2
|
160 #undef PAVGB
|
|
nengel@2
|
161 #undef OP_AVG
|
|
nengel@2
|
162
|
|
nengel@2
|
163 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
|
|
nengel@2
|
164 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
|
|
nengel@2
|
165 #define put_pixels16_mmx2 put_pixels16_mmx
|
|
nengel@2
|
166 #define put_pixels8_mmx2 put_pixels8_mmx
|
|
nengel@2
|
167 #define put_pixels4_mmx2 put_pixels4_mmx
|
|
nengel@2
|
168 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
|
|
nengel@2
|
169 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
|
|
nengel@2
|
170 #define put_pixels16_3dnow put_pixels16_mmx
|
|
nengel@2
|
171 #define put_pixels8_3dnow put_pixels8_mmx
|
|
nengel@2
|
172 #define put_pixels4_3dnow put_pixels4_mmx
|
|
nengel@2
|
173 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
|
|
nengel@2
|
174 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
|
|
nengel@2
|
175
|
|
nengel@2
|
176 /***********************************/
|
|
nengel@2
|
177 /* standard MMX */
|
|
nengel@2
|
178
|
|
nengel@2
|
179 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
|
|
nengel@2
|
180 {
|
|
nengel@2
|
181 const DCTELEM *p;
|
|
nengel@2
|
182 uint8_t *pix;
|
|
nengel@2
|
183
|
|
nengel@2
|
184 /* read the pixels */
|
|
nengel@2
|
185 p = block;
|
|
nengel@2
|
186 pix = pixels;
|
|
nengel@2
|
187 /* unrolled loop */
|
|
nengel@2
|
188 __asm__ volatile(
|
|
nengel@2
|
189 "movq %3, %%mm0 \n\t"
|
|
nengel@2
|
190 "movq 8%3, %%mm1 \n\t"
|
|
nengel@2
|
191 "movq 16%3, %%mm2 \n\t"
|
|
nengel@2
|
192 "movq 24%3, %%mm3 \n\t"
|
|
nengel@2
|
193 "movq 32%3, %%mm4 \n\t"
|
|
nengel@2
|
194 "movq 40%3, %%mm5 \n\t"
|
|
nengel@2
|
195 "movq 48%3, %%mm6 \n\t"
|
|
nengel@2
|
196 "movq 56%3, %%mm7 \n\t"
|
|
nengel@2
|
197 "packuswb %%mm1, %%mm0 \n\t"
|
|
nengel@2
|
198 "packuswb %%mm3, %%mm2 \n\t"
|
|
nengel@2
|
199 "packuswb %%mm5, %%mm4 \n\t"
|
|
nengel@2
|
200 "packuswb %%mm7, %%mm6 \n\t"
|
|
nengel@2
|
201 "movq %%mm0, (%0) \n\t"
|
|
nengel@2
|
202 "movq %%mm2, (%0, %1) \n\t"
|
|
nengel@2
|
203 "movq %%mm4, (%0, %1, 2) \n\t"
|
|
nengel@2
|
204 "movq %%mm6, (%0, %2) \n\t"
|
|
nengel@2
|
205 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
|
|
nengel@2
|
206 :"memory");
|
|
nengel@2
|
207 pix += line_size*4;
|
|
nengel@2
|
208 p += 32;
|
|
nengel@2
|
209
|
|
nengel@2
|
210 // if here would be an exact copy of the code above
|
|
nengel@2
|
211 // compiler would generate some very strange code
|
|
nengel@2
|
212 // thus using "r"
|
|
nengel@2
|
213 __asm__ volatile(
|
|
nengel@2
|
214 "movq (%3), %%mm0 \n\t"
|
|
nengel@2
|
215 "movq 8(%3), %%mm1 \n\t"
|
|
nengel@2
|
216 "movq 16(%3), %%mm2 \n\t"
|
|
nengel@2
|
217 "movq 24(%3), %%mm3 \n\t"
|
|
nengel@2
|
218 "movq 32(%3), %%mm4 \n\t"
|
|
nengel@2
|
219 "movq 40(%3), %%mm5 \n\t"
|
|
nengel@2
|
220 "movq 48(%3), %%mm6 \n\t"
|
|
nengel@2
|
221 "movq 56(%3), %%mm7 \n\t"
|
|
nengel@2
|
222 "packuswb %%mm1, %%mm0 \n\t"
|
|
nengel@2
|
223 "packuswb %%mm3, %%mm2 \n\t"
|
|
nengel@2
|
224 "packuswb %%mm5, %%mm4 \n\t"
|
|
nengel@2
|
225 "packuswb %%mm7, %%mm6 \n\t"
|
|
nengel@2
|
226 "movq %%mm0, (%0) \n\t"
|
|
nengel@2
|
227 "movq %%mm2, (%0, %1) \n\t"
|
|
nengel@2
|
228 "movq %%mm4, (%0, %1, 2) \n\t"
|
|
nengel@2
|
229 "movq %%mm6, (%0, %2) \n\t"
|
|
nengel@2
|
230 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
|
|
nengel@2
|
231 :"memory");
|
|
nengel@2
|
232 }
|
|
nengel@2
|
233
|
|
nengel@2
|
234 DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
|
|
nengel@2
|
235 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
|
|
nengel@2
|
236
|
|
nengel@2
|
237 #define put_signed_pixels_clamped_mmx_half(off) \
|
|
nengel@2
|
238 "movq "#off"(%2), %%mm1 \n\t"\
|
|
nengel@2
|
239 "movq 16+"#off"(%2), %%mm2 \n\t"\
|
|
nengel@2
|
240 "movq 32+"#off"(%2), %%mm3 \n\t"\
|
|
nengel@2
|
241 "movq 48+"#off"(%2), %%mm4 \n\t"\
|
|
nengel@2
|
242 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
|
|
nengel@2
|
243 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
|
|
nengel@2
|
244 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
|
|
nengel@2
|
245 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
|
|
nengel@2
|
246 "paddb %%mm0, %%mm1 \n\t"\
|
|
nengel@2
|
247 "paddb %%mm0, %%mm2 \n\t"\
|
|
nengel@2
|
248 "paddb %%mm0, %%mm3 \n\t"\
|
|
nengel@2
|
249 "paddb %%mm0, %%mm4 \n\t"\
|
|
nengel@2
|
250 "movq %%mm1, (%0) \n\t"\
|
|
nengel@2
|
251 "movq %%mm2, (%0, %3) \n\t"\
|
|
nengel@2
|
252 "movq %%mm3, (%0, %3, 2) \n\t"\
|
|
nengel@2
|
253 "movq %%mm4, (%0, %1) \n\t"
|
|
nengel@2
|
254
|
|
nengel@2
|
255 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
|
|
nengel@2
|
256 {
|
|
nengel@2
|
257 x86_reg line_skip = line_size;
|
|
nengel@2
|
258 x86_reg line_skip3;
|
|
nengel@2
|
259
|
|
nengel@2
|
260 __asm__ volatile (
|
|
nengel@2
|
261 "movq "MANGLE(ff_vector128)", %%mm0 \n\t"
|
|
nengel@2
|
262 "lea (%3, %3, 2), %1 \n\t"
|
|
nengel@2
|
263 put_signed_pixels_clamped_mmx_half(0)
|
|
nengel@2
|
264 "lea (%0, %3, 4), %0 \n\t"
|
|
nengel@2
|
265 put_signed_pixels_clamped_mmx_half(64)
|
|
nengel@2
|
266 :"+&r" (pixels), "=&r" (line_skip3)
|
|
nengel@2
|
267 :"r" (block), "r"(line_skip)
|
|
nengel@2
|
268 :"memory");
|
|
nengel@2
|
269 }
|
|
nengel@2
|
270
|
|
nengel@2
|
271 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
|
|
nengel@2
|
272 {
|
|
nengel@2
|
273 const DCTELEM *p;
|
|
nengel@2
|
274 uint8_t *pix;
|
|
nengel@2
|
275 int i;
|
|
nengel@2
|
276
|
|
nengel@2
|
277 /* read the pixels */
|
|
nengel@2
|
278 p = block;
|
|
nengel@2
|
279 pix = pixels;
|
|
nengel@2
|
280 MOVQ_ZERO(mm7);
|
|
nengel@2
|
281 i = 4;
|
|
nengel@2
|
282 do {
|
|
nengel@2
|
283 __asm__ volatile(
|
|
nengel@2
|
284 "movq (%2), %%mm0 \n\t"
|
|
nengel@2
|
285 "movq 8(%2), %%mm1 \n\t"
|
|
nengel@2
|
286 "movq 16(%2), %%mm2 \n\t"
|
|
nengel@2
|
287 "movq 24(%2), %%mm3 \n\t"
|
|
nengel@2
|
288 "movq %0, %%mm4 \n\t"
|
|
nengel@2
|
289 "movq %1, %%mm6 \n\t"
|
|
nengel@2
|
290 "movq %%mm4, %%mm5 \n\t"
|
|
nengel@2
|
291 "punpcklbw %%mm7, %%mm4 \n\t"
|
|
nengel@2
|
292 "punpckhbw %%mm7, %%mm5 \n\t"
|
|
nengel@2
|
293 "paddsw %%mm4, %%mm0 \n\t"
|
|
nengel@2
|
294 "paddsw %%mm5, %%mm1 \n\t"
|
|
nengel@2
|
295 "movq %%mm6, %%mm5 \n\t"
|
|
nengel@2
|
296 "punpcklbw %%mm7, %%mm6 \n\t"
|
|
nengel@2
|
297 "punpckhbw %%mm7, %%mm5 \n\t"
|
|
nengel@2
|
298 "paddsw %%mm6, %%mm2 \n\t"
|
|
nengel@2
|
299 "paddsw %%mm5, %%mm3 \n\t"
|
|
nengel@2
|
300 "packuswb %%mm1, %%mm0 \n\t"
|
|
nengel@2
|
301 "packuswb %%mm3, %%mm2 \n\t"
|
|
nengel@2
|
302 "movq %%mm0, %0 \n\t"
|
|
nengel@2
|
303 "movq %%mm2, %1 \n\t"
|
|
nengel@2
|
304 :"+m"(*pix), "+m"(*(pix+line_size))
|
|
nengel@2
|
305 :"r"(p)
|
|
nengel@2
|
306 :"memory");
|
|
nengel@2
|
307 pix += line_size*2;
|
|
nengel@2
|
308 p += 16;
|
|
nengel@2
|
309 } while (--i);
|
|
nengel@2
|
310 }
|
|
nengel@2
|
311
|
|
nengel@2
|
312 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
313 {
|
|
nengel@2
|
314 __asm__ volatile(
|
|
nengel@2
|
315 "lea (%3, %3), %%"REG_a" \n\t"
|
|
nengel@2
|
316 ASMALIGN(3)
|
|
nengel@2
|
317 "1: \n\t"
|
|
nengel@2
|
318 "movq (%1), %%mm0 \n\t"
|
|
nengel@2
|
319 "movq (%1, %3), %%mm1 \n\t"
|
|
nengel@2
|
320 "movq %%mm0, (%2) \n\t"
|
|
nengel@2
|
321 "movq %%mm1, (%2, %3) \n\t"
|
|
nengel@2
|
322 "add %%"REG_a", %1 \n\t"
|
|
nengel@2
|
323 "add %%"REG_a", %2 \n\t"
|
|
nengel@2
|
324 "movq (%1), %%mm0 \n\t"
|
|
nengel@2
|
325 "movq (%1, %3), %%mm1 \n\t"
|
|
nengel@2
|
326 "movq %%mm0, (%2) \n\t"
|
|
nengel@2
|
327 "movq %%mm1, (%2, %3) \n\t"
|
|
nengel@2
|
328 "add %%"REG_a", %1 \n\t"
|
|
nengel@2
|
329 "add %%"REG_a", %2 \n\t"
|
|
nengel@2
|
330 "subl $4, %0 \n\t"
|
|
nengel@2
|
331 "jnz 1b \n\t"
|
|
nengel@2
|
332 : "+g"(h), "+r" (pixels), "+r" (block)
|
|
nengel@2
|
333 : "r"((x86_reg)line_size)
|
|
nengel@2
|
334 : "%"REG_a, "memory"
|
|
nengel@2
|
335 );
|
|
nengel@2
|
336 }
|
|
nengel@2
|
337
|
|
nengel@2
|
338 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
339 {
|
|
nengel@2
|
340 __asm__ volatile(
|
|
nengel@2
|
341 "1: \n\t"
|
|
nengel@2
|
342 "movdqu (%1), %%xmm0 \n\t"
|
|
nengel@2
|
343 "movdqu (%1,%3), %%xmm1 \n\t"
|
|
nengel@2
|
344 "movdqu (%1,%3,2), %%xmm2 \n\t"
|
|
nengel@2
|
345 "movdqu (%1,%4), %%xmm3 \n\t"
|
|
nengel@2
|
346 "movdqa %%xmm0, (%2) \n\t"
|
|
nengel@2
|
347 "movdqa %%xmm1, (%2,%3) \n\t"
|
|
nengel@2
|
348 "movdqa %%xmm2, (%2,%3,2) \n\t"
|
|
nengel@2
|
349 "movdqa %%xmm3, (%2,%4) \n\t"
|
|
nengel@2
|
350 "subl $4, %0 \n\t"
|
|
nengel@2
|
351 "lea (%1,%3,4), %1 \n\t"
|
|
nengel@2
|
352 "lea (%2,%3,4), %2 \n\t"
|
|
nengel@2
|
353 "jnz 1b \n\t"
|
|
nengel@2
|
354 : "+g"(h), "+r" (pixels), "+r" (block)
|
|
nengel@2
|
355 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
|
|
nengel@2
|
356 : "memory"
|
|
nengel@2
|
357 );
|
|
nengel@2
|
358 }
|
|
nengel@2
|
359
|
|
nengel@2
|
360 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
nengel@2
|
361 {
|
|
nengel@2
|
362 __asm__ volatile(
|
|
nengel@2
|
363 "1: \n\t"
|
|
nengel@2
|
364 "movdqu (%1), %%xmm0 \n\t"
|
|
nengel@2
|
365 "movdqu (%1,%3), %%xmm1 \n\t"
|
|
nengel@2
|
366 "movdqu (%1,%3,2), %%xmm2 \n\t"
|
|
nengel@2
|
367 "movdqu (%1,%4), %%xmm3 \n\t"
|
|
nengel@2
|
368 "pavgb (%2), %%xmm0 \n\t"
|
|
nengel@2
|
369 "pavgb (%2,%3), %%xmm1 \n\t"
|
|
nengel@2
|
370 "pavgb (%2,%3,2), %%xmm2 \n\t"
|
|
nengel@2
|
371 "pavgb (%2,%4), %%xmm3 \n\t"
|
|
nengel@2
|
372 "movdqa %%xmm0, (%2) \n\t"
|
|
nengel@2
|
373 "movdqa %%xmm1, (%2,%3) \n\t"
|
|
nengel@2
|
374 "movdqa %%xmm2, (%2,%3,2) \n\t"
|
|
nengel@2
|
375 "movdqa %%xmm3, (%2,%4) \n\t"
|
|
nengel@2
|
376 "subl $4, %0 \n\t"
|
|
nengel@2
|
377 "lea (%1,%3,4), %1 \n\t"
|
|
nengel@2
|
378 "lea (%2,%3,4), %2 \n\t"
|
|
nengel@2
|
379 "jnz 1b \n\t"
|
|
nengel@2
|
380 : "+g"(h), "+r" (pixels), "+r" (block)
|
|
nengel@2
|
381 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
|
|
nengel@2
|
382 : "memory"
|
|
nengel@2
|
383 );
|
|
nengel@2
|
384 }
|
|
nengel@2
|
385
|
|
nengel@2
|
386 static void clear_block_sse(DCTELEM *block)
|
|
nengel@2
|
387 {
|
|
nengel@2
|
388 __asm__ volatile(
|
|
nengel@2
|
389 "xorps %%xmm0, %%xmm0 \n"
|
|
nengel@2
|
390 "movaps %%xmm0, (%0) \n"
|
|
nengel@2
|
391 "movaps %%xmm0, 16(%0) \n"
|
|
nengel@2
|
392 "movaps %%xmm0, 32(%0) \n"
|
|
nengel@2
|
393 "movaps %%xmm0, 48(%0) \n"
|
|
nengel@2
|
394 "movaps %%xmm0, 64(%0) \n"
|
|
nengel@2
|
395 "movaps %%xmm0, 80(%0) \n"
|
|
nengel@2
|
396 "movaps %%xmm0, 96(%0) \n"
|
|
nengel@2
|
397 "movaps %%xmm0, 112(%0) \n"
|
|
nengel@2
|
398 :: "r"(block)
|
|
nengel@2
|
399 : "memory"
|
|
nengel@2
|
400 );
|
|
nengel@2
|
401 }
|
|
nengel@2
|
402
|
|
nengel@2
|
403 static void clear_blocks_sse(DCTELEM *blocks)
|
|
nengel@2
|
404 {\
|
|
nengel@2
|
405 __asm__ volatile(
|
|
nengel@2
|
406 "xorps %%xmm0, %%xmm0 \n"
|
|
nengel@2
|
407 "mov %1, %%"REG_a" \n"
|
|
nengel@2
|
408 "1: \n"
|
|
nengel@2
|
409 "movaps %%xmm0, (%0, %%"REG_a") \n"
|
|
nengel@2
|
410 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
|
|
nengel@2
|
411 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
|
|
nengel@2
|
412 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
|
|
nengel@2
|
413 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
|
|
nengel@2
|
414 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
|
|
nengel@2
|
415 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
|
|
nengel@2
|
416 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
|
|
nengel@2
|
417 "add $128, %%"REG_a" \n"
|
|
nengel@2
|
418 " js 1b \n"
|
|
nengel@2
|
419 : : "r" (((uint8_t *)blocks)+128*6),
|
|
nengel@2
|
420 "i" (-128*6)
|
|
nengel@2
|
421 : "%"REG_a
|
|
nengel@2
|
422 );
|
|
nengel@2
|
423 }
|
|
nengel@2
|
424
|
|
nengel@2
|
/**
 * Transpose a 4x4 block of bytes: dst[r][c] = src[c][r].
 *
 * Each row is moved as one 32-bit unit.  MMX registers mm0-mm3 are used
 * and no emms is issued here.
 *
 * @param dst        first destination row
 * @param src        first source row
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    uint32_t *const out0 = (uint32_t*)(dst + 0*dst_stride);
    uint32_t *const out1 = (uint32_t*)(dst + 1*dst_stride);
    uint32_t *const out2 = (uint32_t*)(dst + 2*dst_stride);
    uint32_t *const out3 = (uint32_t*)(dst + 3*dst_stride);
    uint32_t *const in0  = (uint32_t*)(src + 0*src_stride);
    uint32_t *const in1  = (uint32_t*)(src + 1*src_stride);
    uint32_t *const in2  = (uint32_t*)(src + 2*src_stride);
    uint32_t *const in3  = (uint32_t*)(src + 3*src_stride);

    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"      /* interleave rows 0 and 1 */
        "punpcklbw %%mm3, %%mm2 \n\t"      /* interleave rows 2 and 3 */
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"      /* mm0 = columns 0 and 1 */
        "punpckhwd %%mm2, %%mm1 \n\t"      /* mm1 = columns 2 and 3 */
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*out0),
          "=m" (*out1),
          "=m" (*out2),
          "=m" (*out3)
        : "m" (*in0),
          "m" (*in1),
          "m" (*in2),
          "m" (*in3)
    );
}
|
|
nengel@2
|
453
|
|
nengel@2
|
454 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
|
|
nengel@2
|
455 \
|
|
nengel@2
|
456 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
457 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
|
|
nengel@2
|
458 }\
|
|
nengel@2
|
459 \
|
|
nengel@2
|
460 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
461 uint64_t temp[8];\
|
|
nengel@2
|
462 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
463 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
|
|
nengel@2
|
464 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
|
|
nengel@2
|
465 }\
|
|
nengel@2
|
466 \
|
|
nengel@2
|
467 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
468 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
|
|
nengel@2
|
469 }\
|
|
nengel@2
|
470 \
|
|
nengel@2
|
471 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
472 uint64_t temp[8];\
|
|
nengel@2
|
473 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
474 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
|
|
nengel@2
|
475 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
|
|
nengel@2
|
476 }\
|
|
nengel@2
|
477 \
|
|
nengel@2
|
478 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
479 uint64_t temp[8];\
|
|
nengel@2
|
480 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
481 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
|
|
nengel@2
|
482 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
|
|
nengel@2
|
483 }\
|
|
nengel@2
|
484 \
|
|
nengel@2
|
485 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
486 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
|
|
nengel@2
|
487 }\
|
|
nengel@2
|
488 \
|
|
nengel@2
|
489 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
490 uint64_t temp[8];\
|
|
nengel@2
|
491 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
492 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
|
|
nengel@2
|
493 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
|
|
nengel@2
|
494 }\
|
|
nengel@2
|
495 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
496 uint64_t half[8 + 9];\
|
|
nengel@2
|
497 uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
|
nengel@2
|
498 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
499 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
500 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
|
|
nengel@2
|
501 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
|
nengel@2
|
502 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
|
|
nengel@2
|
503 }\
|
|
nengel@2
|
504 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
505 uint64_t half[8 + 9];\
|
|
nengel@2
|
506 uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
|
nengel@2
|
507 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
508 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
509 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
|
|
nengel@2
|
510 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
|
nengel@2
|
511 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
|
|
nengel@2
|
512 }\
|
|
nengel@2
|
513 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
514 uint64_t half[8 + 9];\
|
|
nengel@2
|
515 uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
|
nengel@2
|
516 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
517 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
518 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
|
|
nengel@2
|
519 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
|
nengel@2
|
520 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
|
|
nengel@2
|
521 }\
|
|
nengel@2
|
522 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
523 uint64_t half[8 + 9];\
|
|
nengel@2
|
524 uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
|
nengel@2
|
525 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
526 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
527 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
|
|
nengel@2
|
528 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
|
nengel@2
|
529 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
|
|
nengel@2
|
530 }\
|
|
nengel@2
|
531 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
532 uint64_t half[8 + 9];\
|
|
nengel@2
|
533 uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
|
nengel@2
|
534 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
535 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
536 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
|
nengel@2
|
537 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
|
|
nengel@2
|
538 }\
|
|
nengel@2
|
539 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
540 uint64_t half[8 + 9];\
|
|
nengel@2
|
541 uint8_t * const halfH= ((uint8_t*)half) + 64;\
|
|
nengel@2
|
542 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
543 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
544 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
|
|
nengel@2
|
545 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
|
|
nengel@2
|
546 }\
|
|
nengel@2
|
547 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
548 uint64_t half[8 + 9];\
|
|
nengel@2
|
549 uint8_t * const halfH= ((uint8_t*)half);\
|
|
nengel@2
|
550 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
551 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
|
|
nengel@2
|
552 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
|
|
nengel@2
|
553 }\
|
|
nengel@2
|
554 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
555 uint64_t half[8 + 9];\
|
|
nengel@2
|
556 uint8_t * const halfH= ((uint8_t*)half);\
|
|
nengel@2
|
557 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
558 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
|
|
nengel@2
|
559 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
|
|
nengel@2
|
560 }\
|
|
nengel@2
|
561 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
562 uint64_t half[9];\
|
|
nengel@2
|
563 uint8_t * const halfH= ((uint8_t*)half);\
|
|
nengel@2
|
564 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
|
|
nengel@2
|
565 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
|
|
nengel@2
|
566 }\
|
|
nengel@2
|
567 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
568 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
|
|
nengel@2
|
569 }\
|
|
nengel@2
|
570 \
|
|
nengel@2
|
571 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
572 uint64_t temp[32];\
|
|
nengel@2
|
573 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
574 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
|
|
nengel@2
|
575 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
|
|
nengel@2
|
576 }\
|
|
nengel@2
|
577 \
|
|
nengel@2
|
578 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
579 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
|
|
nengel@2
|
580 }\
|
|
nengel@2
|
581 \
|
|
nengel@2
|
582 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
583 uint64_t temp[32];\
|
|
nengel@2
|
584 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
585 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
|
|
nengel@2
|
586 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
|
|
nengel@2
|
587 }\
|
|
nengel@2
|
588 \
|
|
nengel@2
|
589 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
590 uint64_t temp[32];\
|
|
nengel@2
|
591 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
592 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
|
|
nengel@2
|
593 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
|
|
nengel@2
|
594 }\
|
|
nengel@2
|
595 \
|
|
nengel@2
|
596 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
597 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
|
|
nengel@2
|
598 }\
|
|
nengel@2
|
599 \
|
|
nengel@2
|
600 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
601 uint64_t temp[32];\
|
|
nengel@2
|
602 uint8_t * const half= (uint8_t*)temp;\
|
|
nengel@2
|
603 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
|
|
nengel@2
|
604 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
|
|
nengel@2
|
605 }\
|
|
nengel@2
|
606 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
607 uint64_t half[16*2 + 17*2];\
|
|
nengel@2
|
608 uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
|
nengel@2
|
609 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
610 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
611 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
|
|
nengel@2
|
612 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
|
nengel@2
|
613 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
|
|
nengel@2
|
614 }\
|
|
nengel@2
|
615 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
616 uint64_t half[16*2 + 17*2];\
|
|
nengel@2
|
617 uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
|
nengel@2
|
618 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
619 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
620 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
|
|
nengel@2
|
621 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
|
nengel@2
|
622 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
|
|
nengel@2
|
623 }\
|
|
nengel@2
|
624 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
625 uint64_t half[16*2 + 17*2];\
|
|
nengel@2
|
626 uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
|
nengel@2
|
627 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
628 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
629 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
|
|
nengel@2
|
630 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
|
nengel@2
|
631 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
|
|
nengel@2
|
632 }\
|
|
nengel@2
|
633 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
634 uint64_t half[16*2 + 17*2];\
|
|
nengel@2
|
635 uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
|
nengel@2
|
636 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
637 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
638 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
|
|
nengel@2
|
639 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
|
nengel@2
|
640 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
|
|
nengel@2
|
641 }\
|
|
nengel@2
|
642 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
643 uint64_t half[16*2 + 17*2];\
|
|
nengel@2
|
644 uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
|
nengel@2
|
645 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
646 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
647 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
|
nengel@2
|
648 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
|
|
nengel@2
|
649 }\
|
|
nengel@2
|
650 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
651 uint64_t half[16*2 + 17*2];\
|
|
nengel@2
|
652 uint8_t * const halfH= ((uint8_t*)half) + 256;\
|
|
nengel@2
|
653 uint8_t * const halfHV= ((uint8_t*)half);\
|
|
nengel@2
|
654 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
655 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
|
|
nengel@2
|
656 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
|
|
nengel@2
|
657 }\
|
|
nengel@2
|
658 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
659 uint64_t half[17*2];\
|
|
nengel@2
|
660 uint8_t * const halfH= ((uint8_t*)half);\
|
|
nengel@2
|
661 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
662 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
|
|
nengel@2
|
663 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
|
|
nengel@2
|
664 }\
|
|
nengel@2
|
665 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
666 uint64_t half[17*2];\
|
|
nengel@2
|
667 uint8_t * const halfH= ((uint8_t*)half);\
|
|
nengel@2
|
668 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
669 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
|
|
nengel@2
|
670 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
|
|
nengel@2
|
671 }\
|
|
nengel@2
|
672 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
|
|
nengel@2
|
673 uint64_t half[17*2];\
|
|
nengel@2
|
674 uint8_t * const halfH= ((uint8_t*)half);\
|
|
nengel@2
|
675 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
|
|
nengel@2
|
676 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
|
|
nengel@2
|
677 }
|
|
nengel@2
|
678
|
|
nengel@2
|
/*
 * Output-stage snippets for inline-asm templates.  Every macro stringifies
 * its arguments and expands to adjacent string literals:
 *   a    - operand holding the freshly computed value
 *   b    - operand the result is written to
 *   temp - scratch operand (unused by PUT_OP)
 *   size - suffix appended to "mov" (for example q -> "movq")
 */

/* Emits a single "mov<size> a, b" line. */
#define PUT_OP(a,b,temp, size) \
    "mov" #size " " #a ", " #b " \n\t"

/*
 * Emits "mov<size> b, temp", "pavgusb temp, a", "mov<size> a, b".
 * Assuming AT&T operand order (source first), this averages a with the
 * current contents of b and writes the result back to b.
 */
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"

/* Same three-line sequence as AVG_3DNOW_OP, but with "pavgb". */
#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
|
|
nengel@2
|
688
|
|
nengel@2
|
/*
 * PREFETCH(name, op) defines
 *     static void name(void *mem, int stride, int h)
 * which issues the instruction `op` once per row: first on the byte at mem,
 * then on mem + stride, and so on for h rows.  Only the address is handed to
 * the instruction through an "m" operand; the function itself does not read
 * or write the memory.
 *
 * A non-positive h issues nothing.  The former do/while(--h) form always
 * executed at least once and, for h <= 0, kept decrementing until the counter
 * wrapped around (signed overflow).  For h > 0 the sequence of issued
 * instructions is unchanged.
 */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    for(; h > 0; h--){\
        __asm__ volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }\
}
PREFETCH(prefetch_mmx2, prefetcht0)
#undef PREFETCH
|
|
nengel@2
|
699
|
|
nengel@2
|
700 #include "h264dsp_mmx.c"
|
|
nengel@2
|
701
|
|
nengel@2
|
702 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
|
nengel@2
|
703 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
|
|
nengel@2
|
704 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
|
|
nengel@2
|
705 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
|
|
nengel@2
|
706 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
|
|
nengel@2
|
707
|
|
nengel@2
|
708 void dsputil_init_mmx(DSPContext* c)
|
|
nengel@2
|
709 {
|
|
nengel@2
|
710 mm_flags = mm_support();
|
|
nengel@2
|
711
|
|
nengel@2
|
712 if (mm_flags & FF_MM_MMX) {
|
|
nengel@2
|
713 c->clear_block = clear_block_sse;
|
|
nengel@2
|
714 c->clear_blocks = clear_blocks_sse;
|
|
nengel@2
|
715 c->prefetch = prefetch_mmx2;
|
|
nengel@2
|
716
|
|
nengel@2
|
717
|
|
nengel@2
|
718 #define H264_QPEL_FUNCS(x, y, CPU)\
|
|
nengel@2
|
719 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
|
|
nengel@2
|
720 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
|
|
nengel@2
|
721 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
|
|
nengel@2
|
722 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
|
|
nengel@2
|
723
|
|
nengel@2
|
724 if((mm_flags & FF_MM_SSE2)){
|
|
nengel@2
|
725 c->put_pixels_tab[0][0] = put_pixels16_sse2;
|
|
nengel@2
|
726 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
|
|
nengel@2
|
727
|
|
nengel@2
|
728 }
|
|
nengel@2
|
729 if(mm_flags & FF_MM_SSE2){
|
|
nengel@2
|
730 H264_QPEL_FUNCS(0, 1, sse2);
|
|
nengel@2
|
731 H264_QPEL_FUNCS(0, 2, sse2);
|
|
nengel@2
|
732 H264_QPEL_FUNCS(0, 3, sse2);
|
|
nengel@2
|
733 H264_QPEL_FUNCS(1, 1, sse2);
|
|
nengel@2
|
734 H264_QPEL_FUNCS(1, 2, sse2);
|
|
nengel@2
|
735 H264_QPEL_FUNCS(1, 3, sse2);
|
|
nengel@2
|
736 H264_QPEL_FUNCS(2, 1, sse2);
|
|
nengel@2
|
737 H264_QPEL_FUNCS(2, 2, sse2);
|
|
nengel@2
|
738 H264_QPEL_FUNCS(2, 3, sse2);
|
|
nengel@2
|
739 H264_QPEL_FUNCS(3, 1, sse2);
|
|
nengel@2
|
740 H264_QPEL_FUNCS(3, 2, sse2);
|
|
nengel@2
|
741 H264_QPEL_FUNCS(3, 3, sse2);
|
|
nengel@2
|
742 }
|
|
nengel@2
|
743 #if HAVE_SSSE3
|
|
nengel@2
|
744 if(mm_flags & FF_MM_SSSE3){
|
|
nengel@2
|
745 H264_QPEL_FUNCS(1, 0, ssse3);
|
|
nengel@2
|
746 H264_QPEL_FUNCS(1, 1, ssse3);
|
|
nengel@2
|
747 H264_QPEL_FUNCS(1, 2, ssse3);
|
|
nengel@2
|
748 H264_QPEL_FUNCS(1, 3, ssse3);
|
|
nengel@2
|
749 H264_QPEL_FUNCS(2, 0, ssse3);
|
|
nengel@2
|
750 H264_QPEL_FUNCS(2, 1, ssse3);
|
|
nengel@2
|
751 H264_QPEL_FUNCS(2, 2, ssse3);
|
|
nengel@2
|
752 H264_QPEL_FUNCS(2, 3, ssse3);
|
|
nengel@2
|
753 H264_QPEL_FUNCS(3, 0, ssse3);
|
|
nengel@2
|
754 H264_QPEL_FUNCS(3, 1, ssse3);
|
|
nengel@2
|
755 H264_QPEL_FUNCS(3, 2, ssse3);
|
|
nengel@2
|
756 H264_QPEL_FUNCS(3, 3, ssse3);
|
|
nengel@2
|
757
|
|
nengel@2
|
758 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd;
|
|
nengel@2
|
759 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd;
|
|
nengel@2
|
760 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
|
|
nengel@2
|
761 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
|
|
nengel@2
|
762 }
|
|
nengel@2
|
763 #endif
|
|
nengel@2
|
764
|
|
nengel@2
|
765
|
|
nengel@2
|
766 }
|
|
nengel@2
|
767 }
|
|
nengel@2
|
768
|
|
nengel@2
|
769 void ff_h264dsp_init_x86(H264DSPContext *c)
|
|
nengel@2
|
770 {
|
|
nengel@2
|
771 mm_flags = mm_support();
|
|
nengel@2
|
772
|
|
nengel@2
|
773 if (mm_flags & FF_MM_MMX) {
|
|
nengel@2
|
774 c->h264_idct_dc_add=
|
|
nengel@2
|
775 c->h264_idct_add= ff_h264_idct_add_mmx;
|
|
nengel@2
|
776 c->h264_idct8_dc_add=
|
|
nengel@2
|
777 c->h264_idct8_add= ff_h264_idct8_add_mmx;
|
|
nengel@2
|
778
|
|
nengel@2
|
779 if (mm_flags & FF_MM_MMX2) {
|
|
nengel@2
|
780 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
|
|
nengel@2
|
781 c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
|
|
nengel@2
|
782 c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
|
|
nengel@2
|
783 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
|
|
nengel@2
|
784
|
|
nengel@2
|
785 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
|
|
nengel@2
|
786 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
|
|
nengel@2
|
787
|
|
nengel@2
|
788 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
|
|
nengel@2
|
789 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
|
|
nengel@2
|
790 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
|
|
nengel@2
|
791 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
|
|
nengel@2
|
792 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
|
|
nengel@2
|
793 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
|
|
nengel@2
|
794 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
|
|
nengel@2
|
795
|
|
nengel@2
|
796 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
|
|
nengel@2
|
797 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
|
|
nengel@2
|
798 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
|
|
nengel@2
|
799 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
|
|
nengel@2
|
800 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
|
|
nengel@2
|
801 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
|
|
nengel@2
|
802 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
|
|
nengel@2
|
803 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
|
|
nengel@2
|
804
|
|
nengel@2
|
805 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
|
|
nengel@2
|
806 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
|
|
nengel@2
|
807 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
|
|
nengel@2
|
808 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
|
|
nengel@2
|
809 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
|
|
nengel@2
|
810 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
|
|
nengel@2
|
811 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
|
|
nengel@2
|
812 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
|
|
nengel@2
|
813 }
|
|
nengel@2
|
814 if(mm_flags & FF_MM_SSE2){
|
|
nengel@2
|
815 c->h264_idct8_add = ff_h264_idct8_add_sse2;
|
|
nengel@2
|
816 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
|
|
nengel@2
|
817 }
|
|
nengel@2
|
818
|
|
nengel@2
|
819 }
|
|
nengel@2
|
820 }
|
|
nengel@2
|
821
|