Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
comparison libavcodec/x86/dsputil_mmx.c @ 3:0b056460c67d
changed code to use VSs
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Mon, 29 Oct 2012 16:44:27 +0100 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9ac291ad5518 |
|---|---|
| 1 /* | |
| 2 * MMX optimized DSP utils | |
| 3 * Copyright (c) 2000, 2001 Fabrice Bellard | |
| 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |
| 5 * | |
| 6 * This file is part of FFmpeg. | |
| 7 * | |
| 8 * FFmpeg is free software; you can redistribute it and/or | |
| 9 * modify it under the terms of the GNU Lesser General Public | |
| 10 * License as published by the Free Software Foundation; either | |
| 11 * version 2.1 of the License, or (at your option) any later version. | |
| 12 * | |
| 13 * FFmpeg is distributed in the hope that it will be useful, | |
| 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 16 * Lesser General Public License for more details. | |
| 17 * | |
| 18 * You should have received a copy of the GNU Lesser General Public | |
| 19 * License along with FFmpeg; if not, write to the Free Software | |
| 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 21 * | |
| 22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru> | |
| 23 */ | |
| 24 | |
| 25 #include "libavutil/x86_cpu.h" | |
| 26 #include "libavutil/internal.h" | |
| 27 #include "libavcodec/dsputil.h" | |
| 28 #include "libavcodec/h264_dsp.h" | |
| 29 #include "dsputil_mmx.h" | |
| 30 | |
| 31 | |
| 32 //#undef NDEBUG | |
| 33 //#include <assert.h> | |
| 34 | |
| 35 int mm_flags; /* multimedia extension flags; assigned from mm_support() in dsputil_init_mmx() */ | |
| 36 | |
| 37 /* pixel operations: ff_bone holds 0x01 in every byte, ff_wtwo holds 0x0002 in every 16-bit word */ | |
| 38 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; | |
| 39 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; | |
| 40 /* ff_pdw_80000000: 0x80000000 in each of four 32-bit lanes */ | |
| 41 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = | |
| 42 {0x8000000080000000ULL, 0x8000000080000000ULL}; | |
| 43 /* ff_pw_N: every 16-bit word holds N (N written in decimal); the xmm_reg variants are initialized with the same pattern in two 64-bit halves */ | |
| 44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL; | |
| 45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL; | |
| 46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; | |
| 47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; | |
| 48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; | |
| 49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; | |
| 50 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; | |
| 51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; | |
| 52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; | |
| 53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; | |
| 54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; | |
| 55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; | |
| 56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; | |
| 57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; | |
| 58 /* ff_pb_XX: every byte holds 0xXX (XX written in hexadecimal) */ | |
| 59 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL; | |
| 60 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL; | |
| 61 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; | |
| 62 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; | |
| 63 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; | |
| 64 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; | |
| 65 DECLARE_ALIGNED(8, const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL; | |
| 66 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; | |
| 67 /* ff_pd_N: two doubles, both equal to N */ | |
| 68 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; | |
| 69 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; | |
| 70 /* ASMALIGN(n): emits ".align 1 << n" into an asm string; JUMPALIGN(): a standalone asm statement containing ASMALIGN(3); MOVQ_ZERO(r): pxor r with itself, leaving r = 0 */ | |
| 71 #define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t" | |
| 72 #define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::) | |
| 73 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) | |
| 74 /* MOVQ_BFE(r): pcmpeqd sets every byte of r to 0xFF, then paddb r,r wraps each byte to 0xFE */ | |
| 75 #define MOVQ_BFE(regd) \ | |
| 76 __asm__ volatile ( \ | |
| 77 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ | |
| 78 "paddb %%" #regd ", %%" #regd " \n\t" ::) | |
| 79 /* MOVQ_BONE / MOVQ_WTWO put the ff_bone / ff_wtwo bit patterns into r: loaded from memory without PIC, computed in-register with PIC */ | |
| 80 #ifndef PIC | |
| 81 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) | |
| 82 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) | |
| 83 #else | |
| 84 // for shared library it's better to use this way for accessing constants | |
| 85 // pcmpeqd -> -1 (all bits set); psrlw $15 -> 1 in every word; packuswb -> 1 in every byte | |
| 86 #define MOVQ_BONE(regd) \ | |
| 87 __asm__ volatile ( \ | |
| 88 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
| 89 "psrlw $15, %%" #regd " \n\t" \ | |
| 90 "packuswb %%" #regd ", %%" #regd " \n\t" ::) | |
| 91 /* same start as MOVQ_BONE (1 in every word), then psllw $1 turns each 1 into 2 */ | |
| 92 #define MOVQ_WTWO(regd) \ | |
| 93 __asm__ volatile ( \ | |
| 94 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ | |
| 95 "psrlw $15, %%" #regd " \n\t" \ | |
| 96 "psllw $1, %%" #regd " \n\t"::) | |
| 97 | |
| 98 #endif | |
| 99 /* PAVGB_MMX_NO_RND: regr = (rega AND regb) + (((rega XOR regb) AND regfe) >> 1), i.e. the per-byte average rounded down */ | |
| 100 // using regr as temporary and for the output result | |
| 101 // first argument is unmodified and second is trashed | |
| 102 // regfe is supposed to contain 0xfefefefefefefefe | |
| 103 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ | |
| 104 "movq " #rega ", " #regr " \n\t"\ | |
| 105 "pand " #regb ", " #regr " \n\t"\ | |
| 106 "pxor " #rega ", " #regb " \n\t"\ | |
| 107 "pand " #regfe "," #regb " \n\t"\ | |
| 108 "psrlq $1, " #regb " \n\t"\ | |
| 109 "paddb " #regb ", " #regr " \n\t" | |
| 110 /* PAVGB_MMX: regr = (rega OR regb) - (((rega XOR regb) AND regfe) >> 1), i.e. the per-byte average rounded up; same operand roles as PAVGB_MMX_NO_RND */ | |
| 111 #define PAVGB_MMX(rega, regb, regr, regfe) \ | |
| 112 "movq " #rega ", " #regr " \n\t"\ | |
| 113 "por " #regb ", " #regr " \n\t"\ | |
| 114 "pxor " #rega ", " #regb " \n\t"\ | |
| 115 "pand " #regfe "," #regb " \n\t"\ | |
| 116 "psrlq $1, " #regb " \n\t"\ | |
| 117 "psubb " #regb ", " #regr " \n\t" | |
| 118 | |
| 119 // mm6 is supposed to contain 0xfefefefefefefefe; two-pair form of PAVGB_MMX_NO_RND: (rega, regb) -> regr and (regc, regd) -> regp, regb and regd are trashed | |
| 120 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ | |
| 121 "movq " #rega ", " #regr " \n\t"\ | |
| 122 "movq " #regc ", " #regp " \n\t"\ | |
| 123 "pand " #regb ", " #regr " \n\t"\ | |
| 124 "pand " #regd ", " #regp " \n\t"\ | |
| 125 "pxor " #rega ", " #regb " \n\t"\ | |
| 126 "pxor " #regc ", " #regd " \n\t"\ | |
| 127 "pand %%mm6, " #regb " \n\t"\ | |
| 128 "pand %%mm6, " #regd " \n\t"\ | |
| 129 "psrlq $1, " #regb " \n\t"\ | |
| 130 "psrlq $1, " #regd " \n\t"\ | |
| 131 "paddb " #regb ", " #regr " \n\t"\ | |
| 132 "paddb " #regd ", " #regp " \n\t" | |
| 133 /* two-pair form of PAVGB_MMX (rounded-up average): (rega, regb) -> regr and (regc, regd) -> regp; the 0xfe mask is taken from mm6 */ | |
| 134 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ | |
| 135 "movq " #rega ", " #regr " \n\t"\ | |
| 136 "movq " #regc ", " #regp " \n\t"\ | |
| 137 "por " #regb ", " #regr " \n\t"\ | |
| 138 "por " #regd ", " #regp " \n\t"\ | |
| 139 "pxor " #rega ", " #regb " \n\t"\ | |
| 140 "pxor " #regc ", " #regd " \n\t"\ | |
| 141 "pand %%mm6, " #regb " \n\t"\ | |
| 142 "pand %%mm6, " #regd " \n\t"\ | |
| 143 "psrlq $1, " #regd " \n\t"\ | |
| 144 "psrlq $1, " #regb " \n\t"\ | |
| 145 "psubb " #regb ", " #regr " \n\t"\ | |
| 146 "psubb " #regd ", " #regp " \n\t" | |
| 147 | |
| 148 /***********************************/ | |
| 149 /* MMX2 specific */ | |
| 150 /* DEF(x) pastes the _mmx2 suffix onto x; presumably used by the template included below to name what it defines (template not visible here) */ | |
| 151 #define DEF(x) x ## _mmx2 | |
| 152 | |
| 153 /* Introduced only in MMX2 set */ | |
| 154 #define PAVGB "pavgb" | |
| 155 #define OP_AVG PAVGB | |
| 156 /* the template is included while DEF, PAVGB and OP_AVG are defined; all three are undefined again right after */ | |
| 157 #include "dsputil_mmx_avg_template.c" | |
| 158 | |
| 159 #undef DEF | |
| 160 #undef PAVGB | |
| 161 #undef OP_AVG | |
| 162 /* the names below are macro aliases: the _mmx2 and _3dnow plain-copy names and all no_rnd plain-copy names resolve to the put_pixelsN_mmx routines */ | |
| 163 #define put_no_rnd_pixels16_mmx put_pixels16_mmx | |
| 164 #define put_no_rnd_pixels8_mmx put_pixels8_mmx | |
| 165 #define put_pixels16_mmx2 put_pixels16_mmx | |
| 166 #define put_pixels8_mmx2 put_pixels8_mmx | |
| 167 #define put_pixels4_mmx2 put_pixels4_mmx | |
| 168 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx | |
| 169 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx | |
| 170 #define put_pixels16_3dnow put_pixels16_mmx | |
| 171 #define put_pixels8_3dnow put_pixels8_mmx | |
| 172 #define put_pixels4_3dnow put_pixels4_mmx | |
| 173 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx | |
| 174 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx | |
| 175 | |
| 176 /***********************************/ | |
| 177 /* standard MMX */ | |
| 178 /* put_pixels_clamped_mmx: pack 64 16-bit coefficients from block to bytes with unsigned saturation (packuswb) and store them as 8 rows of 8 bytes at pixels, rows line_size bytes apart. The work is split into two asm statements of 4 rows each. */ | |
| 179 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | |
| 180 { | |
| 181 const DCTELEM *p; | |
| 182 uint8_t *pix; | |
| 183 | |
| 184 /* working copies of the source and destination pointers */ | |
| 185 p = block; | |
| 186 pix = pixels; | |
| 187 /* unrolled loop: first 4 rows, source addressed through the memory operand %3 */ | |
| 188 __asm__ volatile( | |
| 189 "movq %3, %%mm0 \n\t" | |
| 190 "movq 8%3, %%mm1 \n\t" | |
| 191 "movq 16%3, %%mm2 \n\t" | |
| 192 "movq 24%3, %%mm3 \n\t" | |
| 193 "movq 32%3, %%mm4 \n\t" | |
| 194 "movq 40%3, %%mm5 \n\t" | |
| 195 "movq 48%3, %%mm6 \n\t" | |
| 196 "movq 56%3, %%mm7 \n\t" | |
| 197 "packuswb %%mm1, %%mm0 \n\t" | |
| 198 "packuswb %%mm3, %%mm2 \n\t" | |
| 199 "packuswb %%mm5, %%mm4 \n\t" | |
| 200 "packuswb %%mm7, %%mm6 \n\t" | |
| 201 "movq %%mm0, (%0) \n\t" | |
| 202 "movq %%mm2, (%0, %1) \n\t" | |
| 203 "movq %%mm4, (%0, %1, 2) \n\t" | |
| 204 "movq %%mm6, (%0, %2) \n\t" | |
| 205 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) | |
| 206 :"memory"); | |
| 207 pix += line_size*4; | |
| 208 p += 32; | |
| 209 | |
| 210 // if this were an exact copy of the code above, the | |
| 211 // compiler would generate some very strange code, | |
| 212 // thus the source pointer is passed with an "r" constraint here | |
| 213 __asm__ volatile( | |
| 214 "movq (%3), %%mm0 \n\t" | |
| 215 "movq 8(%3), %%mm1 \n\t" | |
| 216 "movq 16(%3), %%mm2 \n\t" | |
| 217 "movq 24(%3), %%mm3 \n\t" | |
| 218 "movq 32(%3), %%mm4 \n\t" | |
| 219 "movq 40(%3), %%mm5 \n\t" | |
| 220 "movq 48(%3), %%mm6 \n\t" | |
| 221 "movq 56(%3), %%mm7 \n\t" | |
| 222 "packuswb %%mm1, %%mm0 \n\t" | |
| 223 "packuswb %%mm3, %%mm2 \n\t" | |
| 224 "packuswb %%mm5, %%mm4 \n\t" | |
| 225 "packuswb %%mm7, %%mm6 \n\t" | |
| 226 "movq %%mm0, (%0) \n\t" | |
| 227 "movq %%mm2, (%0, %1) \n\t" | |
| 228 "movq %%mm4, (%0, %1, 2) \n\t" | |
| 229 "movq %%mm6, (%0, %2) \n\t" | |
| 230 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) | |
| 231 :"memory"); | |
| 232 } | |
| 233 /* ff_vector128: eight 0x80 bytes; loaded into mm0 by put_signed_pixels_clamped_mmx */ | |
| 234 DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] = | |
| 235 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; | |
| 236 /* put_signed_pixels_clamped_mmx_half(off): pack the 32 words starting at off(%2) to bytes with signed saturation (packsswb), add mm0 to every byte and store four 8-byte rows at (%0), (%0,%3), (%0,%3,2) and (%0,%1) */ | |
| 237 #define put_signed_pixels_clamped_mmx_half(off) \ | |
| 238 "movq "#off"(%2), %%mm1 \n\t"\ | |
| 239 "movq 16+"#off"(%2), %%mm2 \n\t"\ | |
| 240 "movq 32+"#off"(%2), %%mm3 \n\t"\ | |
| 241 "movq 48+"#off"(%2), %%mm4 \n\t"\ | |
| 242 "packsswb 8+"#off"(%2), %%mm1 \n\t"\ | |
| 243 "packsswb 24+"#off"(%2), %%mm2 \n\t"\ | |
| 244 "packsswb 40+"#off"(%2), %%mm3 \n\t"\ | |
| 245 "packsswb 56+"#off"(%2), %%mm4 \n\t"\ | |
| 246 "paddb %%mm0, %%mm1 \n\t"\ | |
| 247 "paddb %%mm0, %%mm2 \n\t"\ | |
| 248 "paddb %%mm0, %%mm3 \n\t"\ | |
| 249 "paddb %%mm0, %%mm4 \n\t"\ | |
| 250 "movq %%mm1, (%0) \n\t"\ | |
| 251 "movq %%mm2, (%0, %3) \n\t"\ | |
| 252 "movq %%mm3, (%0, %3, 2) \n\t"\ | |
| 253 "movq %%mm4, (%0, %1) \n\t" | |
| 254 /* put_signed_pixels_clamped_mmx: saturate 64 16-bit coefficients from block to signed bytes, add 0x80 to each byte and store the result as 8 rows of 8 bytes at pixels, rows line_size bytes apart. line_skip3 is an asm-internal temporary set to 3*line_skip by the first lea; pixels is advanced by 4 rows between the two halves. */ | |
| 255 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | |
| 256 { | |
| 257 x86_reg line_skip = line_size; | |
| 258 x86_reg line_skip3; | |
| 259 | |
| 260 __asm__ volatile ( | |
| 261 "movq "MANGLE(ff_vector128)", %%mm0 \n\t" | |
| 262 "lea (%3, %3, 2), %1 \n\t" | |
| 263 put_signed_pixels_clamped_mmx_half(0) | |
| 264 "lea (%0, %3, 4), %0 \n\t" | |
| 265 put_signed_pixels_clamped_mmx_half(64) | |
| 266 :"+&r" (pixels), "=&r" (line_skip3) | |
| 267 :"r" (block), "r"(line_skip) | |
| 268 :"memory"); | |
| 269 } | |
| 270 /* add_pixels_clamped_mmx: add 64 16-bit coefficients from block to the bytes already stored at pixels (8 rows of 8, rows line_size bytes apart). Pixels are widened to words, added with signed saturation (paddsw) and packed back to bytes with unsigned saturation (packuswb). Each of the 4 loop iterations handles two rows. */ | |
| 271 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) | |
| 272 { | |
| 273 const DCTELEM *p; | |
| 274 uint8_t *pix; | |
| 275 int i; | |
| 276 | |
| 277 /* working copies of the pointers; mm7 is zeroed once and used by the loop as the high bytes when unpacking pixels to words */ | |
| 278 p = block; | |
| 279 pix = pixels; | |
| 280 MOVQ_ZERO(mm7); | |
| 281 i = 4; | |
| 282 do { | |
| 283 __asm__ volatile( | |
| 284 "movq (%2), %%mm0 \n\t" | |
| 285 "movq 8(%2), %%mm1 \n\t" | |
| 286 "movq 16(%2), %%mm2 \n\t" | |
| 287 "movq 24(%2), %%mm3 \n\t" | |
| 288 "movq %0, %%mm4 \n\t" | |
| 289 "movq %1, %%mm6 \n\t" | |
| 290 "movq %%mm4, %%mm5 \n\t" | |
| 291 "punpcklbw %%mm7, %%mm4 \n\t" | |
| 292 "punpckhbw %%mm7, %%mm5 \n\t" | |
| 293 "paddsw %%mm4, %%mm0 \n\t" | |
| 294 "paddsw %%mm5, %%mm1 \n\t" | |
| 295 "movq %%mm6, %%mm5 \n\t" | |
| 296 "punpcklbw %%mm7, %%mm6 \n\t" | |
| 297 "punpckhbw %%mm7, %%mm5 \n\t" | |
| 298 "paddsw %%mm6, %%mm2 \n\t" | |
| 299 "paddsw %%mm5, %%mm3 \n\t" | |
| 300 "packuswb %%mm1, %%mm0 \n\t" | |
| 301 "packuswb %%mm3, %%mm2 \n\t" | |
| 302 "movq %%mm0, %0 \n\t" | |
| 303 "movq %%mm2, %1 \n\t" | |
| 304 :"+m"(*pix), "+m"(*(pix+line_size)) | |
| 305 :"r"(p) | |
| 306 :"memory"); | |
| 307 pix += line_size*2; | |
| 308 p += 16; | |
| 309 } while (--i); | |
| 310 } | |
| 311 /* put_pixels8_mmx: copy h rows of 8 bytes from pixels to block, both with a row pitch of line_size bytes. Four rows are copied per loop iteration and the loop exits only when h reaches exactly 0 (subl $4 / jnz), so h must be a positive multiple of 4. The accumulator register (REG_a) holds 2*line_size and is clobbered. */ | |
| 312 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 313 { | |
| 314 __asm__ volatile( | |
| 315 "lea (%3, %3), %%"REG_a" \n\t" | |
| 316 ASMALIGN(3) | |
| 317 "1: \n\t" | |
| 318 "movq (%1), %%mm0 \n\t" | |
| 319 "movq (%1, %3), %%mm1 \n\t" | |
| 320 "movq %%mm0, (%2) \n\t" | |
| 321 "movq %%mm1, (%2, %3) \n\t" | |
| 322 "add %%"REG_a", %1 \n\t" | |
| 323 "add %%"REG_a", %2 \n\t" | |
| 324 "movq (%1), %%mm0 \n\t" | |
| 325 "movq (%1, %3), %%mm1 \n\t" | |
| 326 "movq %%mm0, (%2) \n\t" | |
| 327 "movq %%mm1, (%2, %3) \n\t" | |
| 328 "add %%"REG_a", %1 \n\t" | |
| 329 "add %%"REG_a", %2 \n\t" | |
| 330 "subl $4, %0 \n\t" | |
| 331 "jnz 1b \n\t" | |
| 332 : "+g"(h), "+r" (pixels), "+r" (block) | |
| 333 : "r"((x86_reg)line_size) | |
| 334 : "%"REG_a, "memory" | |
| 335 ); | |
| 336 } | |
| 337 /* put_pixels16_sse2: copy h rows of 16 bytes from pixels to block (row pitch line_size), four rows per iteration. Loads are unaligned (movdqu) but stores use movdqa, so every destination row must be 16-byte aligned. h must be a positive multiple of 4; the two lea instructions between subl and jnz do not modify the flags that jnz tests. */ | |
| 338 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 339 { | |
| 340 __asm__ volatile( | |
| 341 "1: \n\t" | |
| 342 "movdqu (%1), %%xmm0 \n\t" | |
| 343 "movdqu (%1,%3), %%xmm1 \n\t" | |
| 344 "movdqu (%1,%3,2), %%xmm2 \n\t" | |
| 345 "movdqu (%1,%4), %%xmm3 \n\t" | |
| 346 "movdqa %%xmm0, (%2) \n\t" | |
| 347 "movdqa %%xmm1, (%2,%3) \n\t" | |
| 348 "movdqa %%xmm2, (%2,%3,2) \n\t" | |
| 349 "movdqa %%xmm3, (%2,%4) \n\t" | |
| 350 "subl $4, %0 \n\t" | |
| 351 "lea (%1,%3,4), %1 \n\t" | |
| 352 "lea (%2,%3,4), %2 \n\t" | |
| 353 "jnz 1b \n\t" | |
| 354 : "+g"(h), "+r" (pixels), "+r" (block) | |
| 355 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
| 356 : "memory" | |
| 357 ); | |
| 358 } | |
| 359 /* avg_pixels16_sse2: for h rows of 16 bytes, replace each destination row in block with the per-byte rounded-up average (pavgb) of that row and the corresponding row of pixels. Four rows per iteration, so h must be a positive multiple of 4. Source rows are loaded unaligned; destination rows are used as pavgb memory operands and movdqa targets, so they must be 16-byte aligned. */ | |
| 360 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
| 361 { | |
| 362 __asm__ volatile( | |
| 363 "1: \n\t" | |
| 364 "movdqu (%1), %%xmm0 \n\t" | |
| 365 "movdqu (%1,%3), %%xmm1 \n\t" | |
| 366 "movdqu (%1,%3,2), %%xmm2 \n\t" | |
| 367 "movdqu (%1,%4), %%xmm3 \n\t" | |
| 368 "pavgb (%2), %%xmm0 \n\t" | |
| 369 "pavgb (%2,%3), %%xmm1 \n\t" | |
| 370 "pavgb (%2,%3,2), %%xmm2 \n\t" | |
| 371 "pavgb (%2,%4), %%xmm3 \n\t" | |
| 372 "movdqa %%xmm0, (%2) \n\t" | |
| 373 "movdqa %%xmm1, (%2,%3) \n\t" | |
| 374 "movdqa %%xmm2, (%2,%3,2) \n\t" | |
| 375 "movdqa %%xmm3, (%2,%4) \n\t" | |
| 376 "subl $4, %0 \n\t" | |
| 377 "lea (%1,%3,4), %1 \n\t" | |
| 378 "lea (%2,%3,4), %2 \n\t" | |
| 379 "jnz 1b \n\t" | |
| 380 : "+g"(h), "+r" (pixels), "+r" (block) | |
| 381 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) | |
| 382 : "memory" | |
| 383 ); | |
| 384 } | |
| 385 /* clear_block_sse: store zeros to the 128 bytes starting at block, using eight 16-byte movaps stores; movaps requires block to be 16-byte aligned. */ | |
| 386 static void clear_block_sse(DCTELEM *block) | |
| 387 { | |
| 388 __asm__ volatile( | |
| 389 "xorps %%xmm0, %%xmm0 \n" | |
| 390 "movaps %%xmm0, (%0) \n" | |
| 391 "movaps %%xmm0, 16(%0) \n" | |
| 392 "movaps %%xmm0, 32(%0) \n" | |
| 393 "movaps %%xmm0, 48(%0) \n" | |
| 394 "movaps %%xmm0, 64(%0) \n" | |
| 395 "movaps %%xmm0, 80(%0) \n" | |
| 396 "movaps %%xmm0, 96(%0) \n" | |
| 397 "movaps %%xmm0, 112(%0) \n" | |
| 398 :: "r"(block) | |
| 399 : "memory" | |
| 400 ); | |
| 401 } | |
| 402 /* clear_blocks_sse: store zeros to the 6*128 = 768 bytes starting at blocks. The asm receives the end address (%0) and walks a negative index in REG_a from -768 up to 0 in steps of 128; js exits the loop once the index is no longer negative. movaps requires blocks to be 16-byte aligned. The "memory" clobber tells the compiler that the asm writes memory it only sees as an input pointer, so accesses to blocks are not cached or moved across it. */ | |
| 403 static void clear_blocks_sse(DCTELEM *blocks) | |
| 404 { | |
| 405 __asm__ volatile( | |
| 406 "xorps %%xmm0, %%xmm0 \n" | |
| 407 "mov %1, %%"REG_a" \n" | |
| 408 "1: \n" | |
| 409 "movaps %%xmm0, (%0, %%"REG_a") \n" | |
| 410 "movaps %%xmm0, 16(%0, %%"REG_a") \n" | |
| 411 "movaps %%xmm0, 32(%0, %%"REG_a") \n" | |
| 412 "movaps %%xmm0, 48(%0, %%"REG_a") \n" | |
| 413 "movaps %%xmm0, 64(%0, %%"REG_a") \n" | |
| 414 "movaps %%xmm0, 80(%0, %%"REG_a") \n" | |
| 415 "movaps %%xmm0, 96(%0, %%"REG_a") \n" | |
| 416 "movaps %%xmm0, 112(%0, %%"REG_a") \n" | |
| 417 "add $128, %%"REG_a" \n" | |
| 418 " js 1b \n" | |
| 419 : : "r" (((uint8_t *)blocks)+128*6), | |
| 420 "i" (-128*6) | |
| 421 : "%"REG_a, "memory" | |
| 422 ); | |
| 423 } | |
| 424 /* transpose4x4: read four 4-byte rows from src (rows src_stride bytes apart), transpose them as a 4x4 byte matrix using byte and word interleaves, and write the four resulting 4-byte rows to dst (rows dst_stride bytes apart). Uses mm0-mm3. */ | |
| 425 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ | |
| 426 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... | |
| 427 "movd %4, %%mm0 \n\t" | |
| 428 "movd %5, %%mm1 \n\t" | |
| 429 "movd %6, %%mm2 \n\t" | |
| 430 "movd %7, %%mm3 \n\t" | |
| 431 "punpcklbw %%mm1, %%mm0 \n\t" | |
| 432 "punpcklbw %%mm3, %%mm2 \n\t" | |
| 433 "movq %%mm0, %%mm1 \n\t" | |
| 434 "punpcklwd %%mm2, %%mm0 \n\t" | |
| 435 "punpckhwd %%mm2, %%mm1 \n\t" | |
| 436 "movd %%mm0, %0 \n\t" | |
| 437 "punpckhdq %%mm0, %%mm0 \n\t" | |
| 438 "movd %%mm0, %1 \n\t" | |
| 439 "movd %%mm1, %2 \n\t" | |
| 440 "punpckhdq %%mm1, %%mm1 \n\t" | |
| 441 "movd %%mm1, %3 \n\t" | |
| 442 | |
| 443 : "=m" (*(uint32_t*)(dst + 0*dst_stride)), | |
| 444 "=m" (*(uint32_t*)(dst + 1*dst_stride)), | |
| 445 "=m" (*(uint32_t*)(dst + 2*dst_stride)), | |
| 446 "=m" (*(uint32_t*)(dst + 3*dst_stride)) | |
| 447 : "m" (*(uint32_t*)(src + 0*src_stride)), | |
| 448 "m" (*(uint32_t*)(src + 1*src_stride)), | |
| 449 "m" (*(uint32_t*)(src + 2*src_stride)), | |
| 450 "m" (*(uint32_t*)(src + 3*src_stride)) | |
| 451 ); | |
| 452 } | |
| 453 /* QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX): generates the static functions OPNAME##qpel8_mcXY_##MMX and OPNAME##qpel16_mcXY_##MMX (X, Y in 0..3), each built from the mpeg4_qpel{8,16}_{h,v}_lowpass and pixels{8,16}[_l2] helpers carrying the same MMX suffix. Intermediate results go through put##RND helpers into stack temporaries; only the final step uses OPNAME. ROUNDER and OP are not referenced in this macro body. The helpers and any invocation of QPEL_OP are not visible in this part of the file. */ | |
| 454 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ | |
| 455 /* mc00: plain OPNAME pixels8 call on 8 rows */ \ | |
| 456 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
| 457 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ | |
| 458 }\ | |
| 459 /* mc10: h_lowpass into a 64-byte temp, then pixels8_l2 of src and the temp */ \ | |
| 460 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 461 uint64_t temp[8];\ | |
| 462 uint8_t * const half= (uint8_t*)temp;\ | |
| 463 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
| 464 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ | |
| 465 }\ | |
| 466 /* mc20: h_lowpass written straight to dst */ \ | |
| 467 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 468 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ | |
| 469 }\ | |
| 470 /* mc30: as mc10, but pixels8_l2 is given src+1 */ \ | |
| 471 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 472 uint64_t temp[8];\ | |
| 473 uint8_t * const half= (uint8_t*)temp;\ | |
| 474 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ | |
| 475 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ | |
| 476 }\ | |
| 477 /* mc01: v_lowpass into a 64-byte temp, then pixels8_l2 of src and the temp */ \ | |
| 478 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 479 uint64_t temp[8];\ | |
| 480 uint8_t * const half= (uint8_t*)temp;\ | |
| 481 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ | |
| 482 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ | |
| 483 }\ | |
| 484 /* mc02: v_lowpass written straight to dst */ \ | |
| 485 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 486 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
| 487 }\ | |
| 488 /* mc03: as mc01, but pixels8_l2 is given src+stride */ \ | |
| 489 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 490 uint64_t temp[8];\ | |
| 491 uint8_t * const half= (uint8_t*)temp;\ | |
| 492 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ | |
| 493 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ | |
| 494 }\ | |
| 495 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 496 uint64_t half[8 + 9];\ | |
| 497 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
| 498 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 499 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 500 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
| 501 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
| 502 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
| 503 }\ | |
| 504 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 505 uint64_t half[8 + 9];\ | |
| 506 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
| 507 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 508 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 509 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
| 510 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
| 511 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
| 512 }\ | |
| 513 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 514 uint64_t half[8 + 9];\ | |
| 515 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
| 516 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 517 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 518 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
| 519 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
| 520 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
| 521 }\ | |
| 522 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 523 uint64_t half[8 + 9];\ | |
| 524 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
| 525 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 526 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 527 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
| 528 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
| 529 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
| 530 }\ | |
| 531 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 532 uint64_t half[8 + 9];\ | |
| 533 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
| 534 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 535 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 536 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
| 537 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ | |
| 538 }\ | |
| 539 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 540 uint64_t half[8 + 9];\ | |
| 541 uint8_t * const halfH= ((uint8_t*)half) + 64;\ | |
| 542 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 543 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 544 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ | |
| 545 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ | |
| 546 }\ | |
| 547 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 548 uint64_t half[8 + 9];\ | |
| 549 uint8_t * const halfH= ((uint8_t*)half);\ | |
| 550 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 551 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ | |
| 552 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
| 553 }\ | |
| 554 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 555 uint64_t half[8 + 9];\ | |
| 556 uint8_t * const halfH= ((uint8_t*)half);\ | |
| 557 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 558 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ | |
| 559 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
| 560 }\ | |
| 561 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 562 uint64_t half[9];\ | |
| 563 uint8_t * const halfH= ((uint8_t*)half);\ | |
| 564 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ | |
| 565 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ | |
| 566 }\ | |
| 567 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ | |
| 568 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ | |
| 569 }\ | |
| 570 /* the remaining qpel16 functions mirror their qpel8 counterparts, using the 16-wide helpers and larger temporaries */ \ | |
| 571 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 572 uint64_t temp[32];\ | |
| 573 uint8_t * const half= (uint8_t*)temp;\ | |
| 574 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
| 575 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ | |
| 576 }\ | |
| 577 \ | |
| 578 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 579 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ | |
| 580 }\ | |
| 581 \ | |
| 582 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 583 uint64_t temp[32];\ | |
| 584 uint8_t * const half= (uint8_t*)temp;\ | |
| 585 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ | |
| 586 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ | |
| 587 }\ | |
| 588 \ | |
| 589 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 590 uint64_t temp[32];\ | |
| 591 uint8_t * const half= (uint8_t*)temp;\ | |
| 592 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ | |
| 593 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ | |
| 594 }\ | |
| 595 \ | |
| 596 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 597 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ | |
| 598 }\ | |
| 599 \ | |
| 600 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 601 uint64_t temp[32];\ | |
| 602 uint8_t * const half= (uint8_t*)temp;\ | |
| 603 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ | |
| 604 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ | |
| 605 }\ | |
| 606 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 607 uint64_t half[16*2 + 17*2];\ | |
| 608 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
| 609 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 610 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 611 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
| 612 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
| 613 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
| 614 }\ | |
| 615 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 616 uint64_t half[16*2 + 17*2];\ | |
| 617 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
| 618 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 619 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 620 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
| 621 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
| 622 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
| 623 }\ | |
| 624 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 625 uint64_t half[16*2 + 17*2];\ | |
| 626 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
| 627 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 628 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 629 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
| 630 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
| 631 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
| 632 }\ | |
| 633 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 634 uint64_t half[16*2 + 17*2];\ | |
| 635 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
| 636 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 637 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 638 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
| 639 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
| 640 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
| 641 }\ | |
| 642 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 643 uint64_t half[16*2 + 17*2];\ | |
| 644 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
| 645 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 646 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 647 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
| 648 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ | |
| 649 }\ | |
| 650 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 651 uint64_t half[16*2 + 17*2];\ | |
| 652 uint8_t * const halfH= ((uint8_t*)half) + 256;\ | |
| 653 uint8_t * const halfHV= ((uint8_t*)half);\ | |
| 654 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 655 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ | |
| 656 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ | |
| 657 }\ | |
| 658 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 659 uint64_t half[17*2];\ | |
| 660 uint8_t * const halfH= ((uint8_t*)half);\ | |
| 661 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 662 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ | |
| 663 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
| 664 }\ | |
| 665 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 666 uint64_t half[17*2];\ | |
| 667 uint8_t * const halfH= ((uint8_t*)half);\ | |
| 668 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 669 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ | |
| 670 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
| 671 }\ | |
| 672 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ | |
| 673 uint64_t half[17*2];\ | |
| 674 uint8_t * const halfH= ((uint8_t*)half);\ | |
| 675 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ | |
| 676 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ | |
| 677 } | |
| 678 /* Asm-string builders for the final store step: PUT_OP emits a plain "mov<size> a, b" (temp is unused); the AVG_* variants load b into temp, average temp into a (pavgusb in AVG_3DNOW_OP, pavgb in AVG_MMX2_OP) and then store a to b. */ | |
| 679 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" | |
| 680 #define AVG_3DNOW_OP(a,b,temp, size) \ | |
| 681 "mov" #size " " #b ", " #temp " \n\t"\ | |
| 682 "pavgusb " #temp ", " #a " \n\t"\ | |
| 683 "mov" #size " " #a ", " #b " \n\t" | |
| 684 #define AVG_MMX2_OP(a,b,temp, size) \ | |
| 685 "mov" #size " " #b ", " #temp " \n\t"\ | |
| 686 "pavgb " #temp ", " #a " \n\t"\ | |
| 687 "mov" #size " " #a ", " #b " \n\t" | |
| 688 /* PREFETCH(name, op): defines static name(mem, stride, h), which issues instruction op on h addresses starting at mem and spaced stride bytes apart. The do/while loop always runs at least once and stops when --h reaches 0, so h must be at least 1. Instantiated once below as prefetch_mmx2 with prefetcht0, then undefined. */ | |
| 689 #define PREFETCH(name, op) \ | |
| 690 static void name(void *mem, int stride, int h){\ | |
| 691 const uint8_t *p= mem;\ | |
| 692 do{\ | |
| 693 __asm__ volatile(#op" %0" :: "m"(*p));\ | |
| 694 p+= stride;\ | |
| 695 }while(--h);\ | |
| 696 } | |
| 697 PREFETCH(prefetch_mmx2, prefetcht0) | |
| 698 #undef PREFETCH | |
| 699 | |
| 700 #include "h264dsp_mmx.c" | |
| 701 /* Deblocking prototypes: declared here but not referenced by the two init functions that follow. NOTE(review): the definitions presumably live in external assembly - confirm. */ | |
| 702 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | |
| 703 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | |
| 704 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); | |
| 705 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | |
| 706 void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | |
| 707 /* Fills function-pointer slots of the DSPContext c with x86 SIMD routines, selected by the CPU flags that mm_support() returns. Later assignments overwrite earlier ones for the same slot. */ | |
| 708 void dsputil_init_mmx(DSPContext* c) | |
| 709 { | |
| 710 mm_flags = mm_support(); /* result is kept in the file-level global mm_flags */ | |
| 711 /* Nothing below is installed unless the MMX flag is present. */ | |
| 712 if (mm_flags & FF_MM_MMX) { | |
| 713 c->clear_block = clear_block_sse; | |
| 714 c->clear_blocks = clear_blocks_sse; | |
| 715 c->prefetch = prefetch_mmx2; | |
| 716 /* NOTE(review): the three pointers above receive *_sse and *_mmx2 routines while only FF_MM_MMX has been tested - confirm those extensions are guaranteed on every target CPU. */ | |
| 717 /* H264_QPEL_FUNCS(x, y, CPU): fills slot x+y*4 of the put and avg qpel tables; row [0] gets the qpel16 routine and row [1] the qpel8 routine for the given CPU suffix. The macro is not #undef-ed afterwards. */ | |
| 718 #define H264_QPEL_FUNCS(x, y, CPU)\ | |
| 719 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\ | |
| 720 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\ | |
| 721 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\ | |
| 722 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; | |
| 723 /* SSE2: put/avg routines for slot [0][0] of the pixels tables. */ | |
| 724 if((mm_flags & FF_MM_SSE2)){ | |
| 725 c->put_pixels_tab[0][0] = put_pixels16_sse2; | |
| 726 c->avg_pixels_tab[0][0] = avg_pixels16_sse2; | |
| 727 | |
| 728 } | |
| 729 if(mm_flags & FF_MM_SSE2){ /* same flag as the test just above; fills qpel slots for x in 0..3 and y in 1..3, leaving the y == 0 slots untouched */ | |
| 730 H264_QPEL_FUNCS(0, 1, sse2); | |
| 731 H264_QPEL_FUNCS(0, 2, sse2); | |
| 732 H264_QPEL_FUNCS(0, 3, sse2); | |
| 733 H264_QPEL_FUNCS(1, 1, sse2); | |
| 734 H264_QPEL_FUNCS(1, 2, sse2); | |
| 735 H264_QPEL_FUNCS(1, 3, sse2); | |
| 736 H264_QPEL_FUNCS(2, 1, sse2); | |
| 737 H264_QPEL_FUNCS(2, 2, sse2); | |
| 738 H264_QPEL_FUNCS(2, 3, sse2); | |
| 739 H264_QPEL_FUNCS(3, 1, sse2); | |
| 740 H264_QPEL_FUNCS(3, 2, sse2); | |
| 741 H264_QPEL_FUNCS(3, 3, sse2); | |
| 742 } | |
| 743 #if HAVE_SSSE3 /* the SSSE3 section is compiled only when HAVE_SSSE3 is non-zero */ | |
| 744 if(mm_flags & FF_MM_SSSE3){ /* runs after the SSE2 block, so these replace the SSE2 entries for x in 1..3 and additionally fill their y == 0 slots */ | |
| 745 H264_QPEL_FUNCS(1, 0, ssse3); | |
| 746 H264_QPEL_FUNCS(1, 1, ssse3); | |
| 747 H264_QPEL_FUNCS(1, 2, ssse3); | |
| 748 H264_QPEL_FUNCS(1, 3, ssse3); | |
| 749 H264_QPEL_FUNCS(2, 0, ssse3); | |
| 750 H264_QPEL_FUNCS(2, 1, ssse3); | |
| 751 H264_QPEL_FUNCS(2, 2, ssse3); | |
| 752 H264_QPEL_FUNCS(2, 3, ssse3); | |
| 753 H264_QPEL_FUNCS(3, 0, ssse3); | |
| 754 H264_QPEL_FUNCS(3, 1, ssse3); | |
| 755 H264_QPEL_FUNCS(3, 2, ssse3); | |
| 756 H264_QPEL_FUNCS(3, 3, ssse3); | |
| 757 /* Chroma tables: slot 0 gets the mc8 routines, slot 1 the mc4 routines. */ | |
| 758 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_ssse3_rnd; | |
| 759 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_ssse3_rnd; | |
| 760 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3; | |
| 761 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3; | |
| 762 } | |
| 763 #endif | |
| 764 | |
| 765 | |
| 766 } | |
| 767 } | |
| 768 /* Fills function-pointer slots of the H264DSPContext c with x86 routines in three tiers: MMX first, then MMX2 and SSE2 overrides nested inside the MMX test. */ | |
| 769 void ff_h264dsp_init_x86(H264DSPContext *c) | |
| 770 { | |
| 771 mm_flags = mm_support(); /* refreshes the file-level global mm_flags */ | |
| 772 /* Nothing is assigned when the MMX flag is absent. */ | |
| 773 if (mm_flags & FF_MM_MMX) { | |
| 774 c->h264_idct_dc_add= /* chained assignment: dc_add and add both receive the MMX idct_add routine */ | |
| 775 c->h264_idct_add= ff_h264_idct_add_mmx; | |
| 776 c->h264_idct8_dc_add= /* chained assignment: idct8 dc_add and add both receive the MMX idct8_add routine */ | |
| 777 c->h264_idct8_add= ff_h264_idct8_add_mmx; | |
| 778 /* MMX2 replaces the two dc_add pointers set above and fills the remaining slots. */ | |
| 779 if (mm_flags & FF_MM_MMX2) { | |
| 780 c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; | |
| 781 c->h264_idct_add8 = ff_h264_idct_add8_mmx2; | |
| 782 c->h264_idct_add16 = ff_h264_idct_add16_mmx2; | |
| 783 c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; | |
| 784 | |
| 785 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; | |
| 786 c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; | |
| 787 /* loop filter entry points */ | |
| 788 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; | |
| 789 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; | |
| 790 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | |
| 791 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | |
| 792 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; | |
| 793 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; | |
| 794 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; | |
| 795 /* weight table: slots 0..7 take the routines named 16x16 down to 4x2 */ | |
| 796 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; | |
| 797 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; | |
| 798 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; | |
| 799 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; | |
| 800 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; | |
| 801 c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; | |
| 802 c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; | |
| 803 c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; | |
| 804 /* biweight table: same slot-to-size ordering as the weight table */ | |
| 805 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; | |
| 806 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; | |
| 807 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; | |
| 808 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; | |
| 809 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; | |
| 810 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; | |
| 811 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; | |
| 812 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; | |
| 813 } | |
| 814 if(mm_flags & FF_MM_SSE2){ /* tested independently of the MMX2 flag; sets h264_idct8_add and h264_idct8_add4, overriding any earlier assignment */ | |
| 815 c->h264_idct8_add = ff_h264_idct8_add_sse2; | |
| 816 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |
| 817 } | |
| 818 | |
| 819 } | |
| 820 } | |
| 821 |
