Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSsH264App

changeset 1:11d15c47beaf
add h264 decoder code
author: Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date: Mon, 27 Aug 2012 12:09:56 +0200
parents: 7d83462103bc
children: 897f711a7157
files: ffmpeg_smp/benchmark.sh ffmpeg_smp/h264dec/COPYING.GPLv3 ffmpeg_smp/h264dec/README.txt ffmpeg_smp/h264dec/configure.ac ffmpeg_smp/h264dec/h264dec.c ffmpeg_smp/h264dec/libavcodec/arm/aac.h ffmpeg_smp/h264dec/libavcodec/arm/asm.S ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S ffmpeg_smp/h264dec/libavcodec/arm/mathops.h ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S 
ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S ffmpeg_smp/h264dec/libavcodec/avcodec.h ffmpeg_smp/h264dec/libavcodec/cabac.c ffmpeg_smp/h264dec/libavcodec/cabac.h ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h ffmpeg_smp/h264dec/libavcodec/dsputil.c ffmpeg_smp/h264dec/libavcodec/dsputil.h 
ffmpeg_smp/h264dec/libavcodec/get_bits.h ffmpeg_smp/h264dec/libavcodec/golomb.c ffmpeg_smp/h264dec/libavcodec/golomb.h ffmpeg_smp/h264dec/libavcodec/h264.c ffmpeg_smp/h264dec/libavcodec/h264.h ffmpeg_smp/h264dec/libavcodec/h264_cell.c ffmpeg_smp/h264dec/libavcodec/h264_data.h ffmpeg_smp/h264dec/libavcodec/h264_deblock.c ffmpeg_smp/h264dec/libavcodec/h264_deblock.h ffmpeg_smp/h264dec/libavcodec/h264_dsp.c ffmpeg_smp/h264dec/libavcodec/h264_dsp.h ffmpeg_smp/h264dec/libavcodec/h264_entropy.c ffmpeg_smp/h264dec/libavcodec/h264_entropy.h ffmpeg_smp/h264dec/libavcodec/h264_idct.c ffmpeg_smp/h264dec/libavcodec/h264_idct.h ffmpeg_smp/h264dec/libavcodec/h264_mc.c ffmpeg_smp/h264dec/libavcodec/h264_mc.h ffmpeg_smp/h264dec/libavcodec/h264_misc.c ffmpeg_smp/h264dec/libavcodec/h264_misc.h ffmpeg_smp/h264dec/libavcodec/h264_nal.c ffmpeg_smp/h264dec/libavcodec/h264_nal.h ffmpeg_smp/h264dec/libavcodec/h264_numa.c ffmpeg_smp/h264dec/libavcodec/h264_ompss.c ffmpeg_smp/h264dec/libavcodec/h264_opencl.c ffmpeg_smp/h264dec/libavcodec/h264_opencl.h ffmpeg_smp/h264dec/libavcodec/h264_parser.c ffmpeg_smp/h264dec/libavcodec/h264_parser.h ffmpeg_smp/h264dec/libavcodec/h264_pred.c ffmpeg_smp/h264dec/libavcodec/h264_pred.h ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h ffmpeg_smp/h264dec/libavcodec/h264_ps.c ffmpeg_smp/h264dec/libavcodec/h264_ps.h ffmpeg_smp/h264dec/libavcodec/h264_pthread.c ffmpeg_smp/h264dec/libavcodec/h264_pthread.h ffmpeg_smp/h264dec/libavcodec/h264_rec.c ffmpeg_smp/h264dec/libavcodec/h264_rec.h ffmpeg_smp/h264dec/libavcodec/h264_refs.c ffmpeg_smp/h264dec/libavcodec/h264_refs.h ffmpeg_smp/h264dec/libavcodec/h264_sei.c ffmpeg_smp/h264dec/libavcodec/h264_sei.h ffmpeg_smp/h264dec/libavcodec/h264_seq.c ffmpeg_smp/h264dec/libavcodec/h264_types.h ffmpeg_smp/h264dec/libavcodec/mathops.h ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h 
ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h ffmpeg_smp/h264dec/libavcodec/raw.h ffmpeg_smp/h264dec/libavcodec/rectangle.h ffmpeg_smp/h264dec/libavcodec/scratch.c ffmpeg_smp/h264dec/libavcodec/simple_idct.c ffmpeg_smp/h264dec/libavcodec/simple_idct.h ffmpeg_smp/h264dec/libavcodec/utils.c ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c ffmpeg_smp/h264dec/libavcodec/x86/mathops.h ffmpeg_smp/h264dec/libavcodec/x86/mmx.h ffmpeg_smp/h264dec/libavutil/arm/bswap.h ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h ffmpeg_smp/h264dec/libavutil/arm/timer.h ffmpeg_smp/h264dec/libavutil/attributes.h ffmpeg_smp/h264dec/libavutil/bswap.h ffmpeg_smp/h264dec/libavutil/common.h ffmpeg_smp/h264dec/libavutil/error.h ffmpeg_smp/h264dec/libavutil/internal.h ffmpeg_smp/h264dec/libavutil/intreadwrite.h ffmpeg_smp/h264dec/libavutil/log.c ffmpeg_smp/h264dec/libavutil/log.h ffmpeg_smp/h264dec/libavutil/mem.c ffmpeg_smp/h264dec/libavutil/mem.h ffmpeg_smp/h264dec/libavutil/pixfmt.h ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h ffmpeg_smp/h264dec/libavutil/ppc/timer.h ffmpeg_smp/h264dec/libavutil/timer.h ffmpeg_smp/h264dec/libavutil/x86/bswap.h ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h ffmpeg_smp/h264dec/libavutil/x86/timer.h ffmpeg_smp/h264dec/libavutil/x86_cpu.h
diffstat: 169 files changed, 51940 insertions(+), 0 deletions(-) [+]
[-]

ffmpeg_smp/benchmark.sh 126

ffmpeg_smp/h264dec/COPYING.GPLv3 674

ffmpeg_smp/h264dec/README.txt 79

ffmpeg_smp/h264dec/configure.ac 171

ffmpeg_smp/h264dec/h264dec.c 288

ffmpeg_smp/h264dec/libavcodec/arm/aac.h 137

ffmpeg_smp/h264dec/libavcodec/arm/asm.S 72

ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c 32

ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S 61

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S 712

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h 33

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S 623

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c 112

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c 41

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c 121

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c 308

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c 36

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c 205

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c 1114

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S 1146

ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S 189

ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c 65

ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S 371

ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c 126

ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S 1883

ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S 180

ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c 75

ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S 362

ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S 118

ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S 388

ffmpeg_smp/h264dec/libavcodec/arm/mathops.h 116

ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S 303

ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c 38

ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h 27

ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c 101

ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S 117

ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c 120

ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S 151

ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S 486

ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S 703

ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S 433

ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S 373

ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S 117

ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S 420

ffmpeg_smp/h264dec/libavcodec/avcodec.h 407

ffmpeg_smp/h264dec/libavcodec/cabac.c 242

ffmpeg_smp/h264dec/libavcodec/cabac.h 206

ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c 140

ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h 233

ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c 1147

ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h 34

ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c 2633

ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h 17

ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c 355

ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c 266

ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h 80

ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c 725

ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h 97

ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c 332

ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h 8

ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c 74

ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h 59

ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c 650

ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c 408

ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h 141

ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c 802

ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h 48

ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c 1560

ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c 362

ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h 53

ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h 90

ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c 26

ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h 83

ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h 203

ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h 137

ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h 92

ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c 508

ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c 356

ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h 69

ffmpeg_smp/h264dec/libavcodec/dsputil.c 1057

ffmpeg_smp/h264dec/libavcodec/dsputil.h 465

ffmpeg_smp/h264dec/libavcodec/get_bits.h 325

ffmpeg_smp/h264dec/libavcodec/golomb.c 184

ffmpeg_smp/h264dec/libavcodec/golomb.h 410

ffmpeg_smp/h264dec/libavcodec/h264.c 215

ffmpeg_smp/h264dec/libavcodec/h264.h 76

ffmpeg_smp/h264dec/libavcodec/h264_cell.c 1242

ffmpeg_smp/h264dec/libavcodec/h264_data.h 243

ffmpeg_smp/h264dec/libavcodec/h264_deblock.c 507

ffmpeg_smp/h264dec/libavcodec/h264_deblock.h 8

ffmpeg_smp/h264dec/libavcodec/h264_dsp.c 320

ffmpeg_smp/h264dec/libavcodec/h264_dsp.h 83

ffmpeg_smp/h264dec/libavcodec/h264_entropy.c 2065

ffmpeg_smp/h264dec/libavcodec/h264_entropy.h 20

ffmpeg_smp/h264dec/libavcodec/h264_idct.c 270

ffmpeg_smp/h264dec/libavcodec/h264_idct.h 19

ffmpeg_smp/h264dec/libavcodec/h264_mc.c 272

ffmpeg_smp/h264dec/libavcodec/h264_mc.h 12

ffmpeg_smp/h264dec/libavcodec/h264_misc.c 944

ffmpeg_smp/h264dec/libavcodec/h264_misc.h 52

ffmpeg_smp/h264dec/libavcodec/h264_nal.c 628

ffmpeg_smp/h264dec/libavcodec/h264_nal.h 11

ffmpeg_smp/h264dec/libavcodec/h264_numa.c 33

ffmpeg_smp/h264dec/libavcodec/h264_ompss.c 400

ffmpeg_smp/h264dec/libavcodec/h264_parser.c 224

ffmpeg_smp/h264dec/libavcodec/h264_parser.h 10

ffmpeg_smp/h264dec/libavcodec/h264_pred.c 945

ffmpeg_smp/h264dec/libavcodec/h264_pred.h 90

ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c 1013

ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h 10

ffmpeg_smp/h264dec/libavcodec/h264_ps.c 462

ffmpeg_smp/h264dec/libavcodec/h264_ps.h 9

ffmpeg_smp/h264dec/libavcodec/h264_pthread.c 604

ffmpeg_smp/h264dec/libavcodec/h264_pthread.h 14

ffmpeg_smp/h264dec/libavcodec/h264_rec.c 412

ffmpeg_smp/h264dec/libavcodec/h264_rec.h 12

ffmpeg_smp/h264dec/libavcodec/h264_refs.c 461

ffmpeg_smp/h264dec/libavcodec/h264_refs.h 14

ffmpeg_smp/h264dec/libavcodec/h264_sei.c 191

ffmpeg_smp/h264dec/libavcodec/h264_sei.h 7

ffmpeg_smp/h264dec/libavcodec/h264_seq.c 220

ffmpeg_smp/h264dec/libavcodec/h264_types.h 658

ffmpeg_smp/h264dec/libavcodec/mathops.h 145

ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c 619

ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h 52

ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c 48

ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h 154

ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c 1021

ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c 783

ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c 232

ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h 79

ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h 46

ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h 105

ffmpeg_smp/h264dec/libavcodec/raw.h 39

ffmpeg_smp/h264dec/libavcodec/rectangle.h 92

ffmpeg_smp/h264dec/libavcodec/scratch.c 295

ffmpeg_smp/h264dec/libavcodec/simple_idct.c 372

ffmpeg_smp/h264dec/libavcodec/simple_idct.h 47

ffmpeg_smp/h264dec/libavcodec/utils.c 68

ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c 135

ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c 304

ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c 208

ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c 821

ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h 170

ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c 250

ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c 1741

ffmpeg_smp/h264dec/libavcodec/x86/mathops.h 67

ffmpeg_smp/h264dec/libavcodec/x86/mmx.h 267

ffmpeg_smp/h264dec/libavutil/arm/bswap.h 72

ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h 78

ffmpeg_smp/h264dec/libavutil/arm/timer.h 40

ffmpeg_smp/h264dec/libavutil/attributes.h 113

ffmpeg_smp/h264dec/libavutil/bswap.h 95

ffmpeg_smp/h264dec/libavutil/common.h 298

ffmpeg_smp/h264dec/libavutil/error.h 53

ffmpeg_smp/h264dec/libavutil/internal.h 168

ffmpeg_smp/h264dec/libavutil/intreadwrite.h 498

ffmpeg_smp/h264dec/libavutil/log.c 111

ffmpeg_smp/h264dec/libavutil/log.h 120

ffmpeg_smp/h264dec/libavutil/mem.c 127

ffmpeg_smp/h264dec/libavutil/mem.h 143

ffmpeg_smp/h264dec/libavutil/pixfmt.h 161

ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h 108

ffmpeg_smp/h264dec/libavutil/ppc/timer.h 47

ffmpeg_smp/h264dec/libavutil/timer.h 69

ffmpeg_smp/h264dec/libavutil/x86/bswap.h 61

ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h 97

ffmpeg_smp/h264dec/libavutil/x86/timer.h 35

ffmpeg_smp/h264dec/libavutil/x86_cpu.h 73 ffmpeg_smp/benchmark.sh 126 ffmpeg_smp/h264dec/COPYING.GPLv3 674 ffmpeg_smp/h264dec/README.txt 79 ffmpeg_smp/h264dec/configure.ac 171 ffmpeg_smp/h264dec/h264dec.c 288 ffmpeg_smp/h264dec/libavcodec/arm/aac.h 137 ffmpeg_smp/h264dec/libavcodec/arm/asm.S 72 ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c 32 ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S 61 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S 712 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h 33 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S 623 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c 112 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c 41 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c 121 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c 308 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c 36 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c 205 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c 1114 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S 1146 ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S 189 ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c 65 ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S 371 ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c 126 ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S 1883 ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S 180 ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c 75 ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S 362 ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S 118 ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S 388 ffmpeg_smp/h264dec/libavcodec/arm/mathops.h 116 ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S 303 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c 38 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h 27 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c 101 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S 117 ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c 120 ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S 151 
ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S 486 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S 703 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S 433 ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S 373 ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S 117 ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S 420 ffmpeg_smp/h264dec/libavcodec/avcodec.h 407 ffmpeg_smp/h264dec/libavcodec/cabac.c 242 ffmpeg_smp/h264dec/libavcodec/cabac.h 206 ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c 140 ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h 233 ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c 1147 ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h 34 ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c 2633 ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h 17 ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c 355 ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c 266 ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h 80 ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c 725 ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h 97 ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c 332 ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h 8 ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c 74 ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h 59 ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c 650 ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c 408 ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h 141 ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c 802 ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h 48 ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c 1560 ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c 362 ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h 53 ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h 90 ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c 26 ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h 83 ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h 203 
ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h 137 ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h 92 ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c 508 ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c 356 ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h 69 ffmpeg_smp/h264dec/libavcodec/dsputil.c 1057 ffmpeg_smp/h264dec/libavcodec/dsputil.h 465 ffmpeg_smp/h264dec/libavcodec/get_bits.h 325 ffmpeg_smp/h264dec/libavcodec/golomb.c 184 ffmpeg_smp/h264dec/libavcodec/golomb.h 410 ffmpeg_smp/h264dec/libavcodec/h264.c 215 ffmpeg_smp/h264dec/libavcodec/h264.h 76 ffmpeg_smp/h264dec/libavcodec/h264_cell.c 1242 ffmpeg_smp/h264dec/libavcodec/h264_data.h 243 ffmpeg_smp/h264dec/libavcodec/h264_deblock.c 507 ffmpeg_smp/h264dec/libavcodec/h264_deblock.h 8 ffmpeg_smp/h264dec/libavcodec/h264_dsp.c 320 ffmpeg_smp/h264dec/libavcodec/h264_dsp.h 83 ffmpeg_smp/h264dec/libavcodec/h264_entropy.c 2065 ffmpeg_smp/h264dec/libavcodec/h264_entropy.h 20 ffmpeg_smp/h264dec/libavcodec/h264_idct.c 270 ffmpeg_smp/h264dec/libavcodec/h264_idct.h 19 ffmpeg_smp/h264dec/libavcodec/h264_mc.c 272 ffmpeg_smp/h264dec/libavcodec/h264_mc.h 12 ffmpeg_smp/h264dec/libavcodec/h264_misc.c 944 ffmpeg_smp/h264dec/libavcodec/h264_misc.h 52 ffmpeg_smp/h264dec/libavcodec/h264_nal.c 628 ffmpeg_smp/h264dec/libavcodec/h264_nal.h 11 ffmpeg_smp/h264dec/libavcodec/h264_numa.c 33 ffmpeg_smp/h264dec/libavcodec/h264_ompss.c 400 ffmpeg_smp/h264dec/libavcodec/h264_parser.c 224 ffmpeg_smp/h264dec/libavcodec/h264_parser.h 10 ffmpeg_smp/h264dec/libavcodec/h264_pred.c 945 ffmpeg_smp/h264dec/libavcodec/h264_pred.h 90 ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c 1013 ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h 10 ffmpeg_smp/h264dec/libavcodec/h264_ps.c 462 ffmpeg_smp/h264dec/libavcodec/h264_ps.h 9 ffmpeg_smp/h264dec/libavcodec/h264_pthread.c 604 ffmpeg_smp/h264dec/libavcodec/h264_pthread.h 14 ffmpeg_smp/h264dec/libavcodec/h264_rec.c 412 ffmpeg_smp/h264dec/libavcodec/h264_rec.h 12 ffmpeg_smp/h264dec/libavcodec/h264_refs.c 461 
ffmpeg_smp/h264dec/libavcodec/h264_refs.h 14 ffmpeg_smp/h264dec/libavcodec/h264_sei.c 191 ffmpeg_smp/h264dec/libavcodec/h264_sei.h 7 ffmpeg_smp/h264dec/libavcodec/h264_seq.c 220 ffmpeg_smp/h264dec/libavcodec/h264_types.h 658 ffmpeg_smp/h264dec/libavcodec/mathops.h 145 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c 619 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h 52 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c 48 ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h 154 ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c 1021 ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c 783 ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c 232 ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h 79 ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h 46 ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h 105 ffmpeg_smp/h264dec/libavcodec/raw.h 39 ffmpeg_smp/h264dec/libavcodec/rectangle.h 92 ffmpeg_smp/h264dec/libavcodec/scratch.c 295 ffmpeg_smp/h264dec/libavcodec/simple_idct.c 372 ffmpeg_smp/h264dec/libavcodec/simple_idct.h 47 ffmpeg_smp/h264dec/libavcodec/utils.c 68 ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c 135 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c 304 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c 208 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c 821 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h 170 ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c 250 ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c 1741 ffmpeg_smp/h264dec/libavcodec/x86/mathops.h 67 ffmpeg_smp/h264dec/libavcodec/x86/mmx.h 267 ffmpeg_smp/h264dec/libavutil/arm/bswap.h 72 ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h 78 ffmpeg_smp/h264dec/libavutil/arm/timer.h 40 ffmpeg_smp/h264dec/libavutil/attributes.h 113 ffmpeg_smp/h264dec/libavutil/bswap.h 95 ffmpeg_smp/h264dec/libavutil/common.h 298 ffmpeg_smp/h264dec/libavutil/error.h 53 ffmpeg_smp/h264dec/libavutil/internal.h 168 ffmpeg_smp/h264dec/libavutil/intreadwrite.h 498 ffmpeg_smp/h264dec/libavutil/log.c 111 
ffmpeg_smp/h264dec/libavutil/log.h 120 ffmpeg_smp/h264dec/libavutil/mem.c 127 ffmpeg_smp/h264dec/libavutil/mem.h 143 ffmpeg_smp/h264dec/libavutil/pixfmt.h 161 ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h 108 ffmpeg_smp/h264dec/libavutil/ppc/timer.h 47 ffmpeg_smp/h264dec/libavutil/timer.h 69 ffmpeg_smp/h264dec/libavutil/x86/bswap.h 61 ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h 97 ffmpeg_smp/h264dec/libavutil/x86/timer.h 35 ffmpeg_smp/h264dec/libavutil/x86_cpu.h 73
ffmpeg_smp/benchmark.sh 126
ffmpeg_smp/h264dec/COPYING.GPLv3 674
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/ffmpeg_smp/benchmark.sh	Mon Aug 27 12:09:56 2012 +0200
     1.3 @@ -0,0 +1,126 @@
     1.4 +#! /bin/bash
     1.5 +
     1.6 +workers=(1 4 8 12 16 20 24 28 32)
     1.7 +cpus=(0 3 7 15 15 23 23 31 31)
     1.8 +nodes=(0 0 0 1 1 2 2 3 3)
     1.9 +
    1.10 +confs=( "1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17"		#small
    1.11 +		"1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15")    #large
    1.12 +
    1.13 +
    1.14 +
    1.15 +#confsmall=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 8" "3 6 10" "4 7 12" "4 8 15" "5 8 17")
    1.16 +# "7 10 21" "8 12 25" "10 15 29" "11 17 32")
    1.17 +#conflarge=("1 1 1" "1 2 2" "2 3 4" "2 4 5" "3 5 7" "3 6 9" "4 7 12" "4 8 13" "5 10 15")
    1.18 +#"5 12 21" "6 15 25" "7 17 30" "8 19 36")
    1.19 +
    1.20 +
    1.21 +configs=9
    1.22 +
    1.23 +average_ompss_2d=0
    1.24 +average_ompss_3d=0
    1.25 +average_pthread=0
    1.26 +average_serial=0
    1.27 +
    1.28 +iterations_low=4
    1.29 +iterations_high=8
    1.30 +
    1.31 +nframes=10000  # max frames limit for debug purpose
    1.32 +inputs=("14" "10")
    1.33 +inputs_vebose=("Big Bug Bunny 1920x1080 10000 frames" "Park Joy 3840x2160 2500 frames")
    1.34 +osargs=("-z 8 8" "-z 12 12 --static-3d")
    1.35 +
    1.36 +time_stamp=`date +%Y.%m.%d_%H.%M.%S`
    1.37 +outputdir="/home/stefan.hauser/ffmpeg_smp/ppopp_results/rx600s5-1t/$time_stamp"
    1.38 +ompss_2d="$outputdir/ompss_2d.txt"
    1.39 +ompss_3d="$outputdir/ompss_3d.txt"
    1.40 +pthread="$outputdir/pthread.txt"
    1.41 +serial="$outputdir/serial.txt"
    1.42 +
    1.43 +# Executes the experiments for a single configuration: $1 = configuration index, $2 = number of iterations, $3 = input index
    1.44 +function execute_single_conf {
    1.45 +	conf=$1
    1.46 +	iter=$2
    1.47 +	iidx=$3
    1.48 +
    1.49 +	average_ompss_2d=0
    1.50 +	average_ompss_3d=0
    1.51 +	average_pthread=0
    1.52 +
    1.53 +	echo "Workers: " ${workers[$conf]} | tee -a $ompss_2d $ompss_3d $pthread $serial
    1.54 +
    1.55 +	cd build-ss
    1.56 +	for ((i=1;i<=$iter;i+=1)); do
    1.57 +	    # OMPSS
    1.58 +	    #export CSS_NUM_CPUS=$worker
    1.59 +	    NX_PES=${workers[$conf]} numactl --interleave=0-${nodes[$conf]} time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -e $((${workers[$conf]}+1)) ${osargs[0]} 2> output
    1.60 +		runtime=$(cat output | grep real | sed s/^.*l.//g)
    1.61 +	    average_ompss_2d=$(echo "$average_ompss_2d + $runtime"|bc)
    1.62 +	    echo -n $runtime " " >> $ompss_2d
    1.63 +	done
    1.64 +
    1.65 +	for ((i=1;i<=$iter;i+=1)); do
    1.66 +		NX_PES=${workers[$conf]} numactl --interleave=0-${nodes[$conf]} time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -e $((${workers[$conf]}+1)) ${osargs[1]} 2> output
    1.67 +		runtime=$(cat output | grep real | sed s/^.*l.//g)
    1.68 +	    average_ompss_3d=$(echo "$average_ompss_3d + $runtime"|bc)
    1.69 +	    echo -n $runtime " " >> $ompss_3d
    1.70 +	done
    1.71 +	cd ..
    1.72 +
    1.73 +	cd build
    1.74 +	for ((i=1;i<=$iter;i+=1)); do
    1.75 +		# Pthreads
    1.76 +	    numactl --physcpubind=0-$((${cpus[$conf]})) time -p ./ffmpeg -i ${inputs[$iidx]} -n $nframes -t ${confs[$(($conf + $iidx * $configs))]} 2> output
    1.77 +		runtime=$(cat output | grep real | sed s/^.*l.//g)
    1.78 +	    average_pthread=$(echo "$average_pthread + $runtime"|bc)
    1.79 +	    echo -n $runtime " " >> $pthread
    1.80 +	done
    1.81 +	cd ..
    1.82 +
    1.83 +	echo "" | tee -a $pthread $ompss_2d $ompss_3d
    1.84 +	average_ompss_2d=$(echo "scale=5;$average_ompss_2d/$iter"|bc)
    1.85 +	average_ompss_3d=$(echo "scale=5;$average_ompss_3d/$iter"|bc)
    1.86 +	average_pthread=$(echo "scale=5;$average_pthread/$iter"|bc)
    1.87 +
    1.88 +	echo "time: " $average_ompss_2d >> $ompss_2d
    1.89 +	echo "time: " $average_ompss_3d >> $ompss_3d
    1.90 +	echo "time: " $average_pthread >> $pthread
    1.91 +	echo "time: " $average_serial >> $serial
    1.92 +}
    1.93 +
    1.94 +
    1.95 +mkdir $outputdir
    1.96 +
    1.97 +echo "Processing inputs ..."
    1.98 +
    1.99 +echo "h264dec Benchmark" | tee $ompss_2d $ompss_3d $pthread $serial
   1.100 +
   1.101 +for n in 0 1; do
   1.102 +	echo "Input: ${inputs_vebose[$n]}" | tee -a $ompss_2d $ompss_3d $pthread $serial
   1.103 +	echo "" | tee -a $ompss_2d $ompss_3d $pthread $serial
   1.104 +
   1.105 +	# Serial
   1.106 +	cd build
   1.107 +	numactl --physcpubind=0 time -p ./ffmpeg -i ${inputs[$n]} -n $nframes -s 2> output
   1.108 +	runtime=$(cat output | grep real | sed s/^.*l.//g)
   1.109 +	average_serial=$runtime
   1.110 +	cd ..
   1.111 +
   1.112 +	execute_single_conf 0 1 $n
   1.113 +
   1.114 +	#Parallel
   1.115 +	for ((confidx=1;confidx<=4;confidx+=1)); do
   1.116 +		execute_single_conf $confidx $iterations_low $n		
   1.117 +	done
   1.118 +
   1.119 +	for ((confidx=5;confidx<=$(($configs-1));confidx+=1)); do
   1.120 +		execute_single_conf $confidx $iterations_high $n		
   1.121 +	done
   1.122 +
   1.123 +	echo "-------------------" | tee -a $ompss_2d $ompss_3d $pthread $serial
   1.124 +done
   1.125 +
   1.126 +echo "FINISHED"
   1.127 +
   1.128 +rm build/output build-ss/output
   1.129 +

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/ffmpeg_smp/h264dec/COPYING.GPLv3	Mon Aug 27 12:09:56 2012 +0200
     2.3 @@ -0,0 +1,674 @@
     2.4 +                    GNU GENERAL PUBLIC LICENSE
     2.5 +                       Version 3, 29 June 2007
     2.6 +
     2.7 + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
     2.8 + Everyone is permitted to copy and distribute verbatim copies
     2.9 + of this license document, but changing it is not allowed.
    2.10 +
    2.11 +                            Preamble
    2.12 +
    2.13 +  The GNU General Public License is a free, copyleft license for
    2.14 +software and other kinds of works.
    2.15 +
    2.16 +  The licenses for most software and other practical works are designed
    2.17 +to take away your freedom to share and change the works.  By contrast,
    2.18 +the GNU General Public License is intended to guarantee your freedom to
    2.19 +share and change all versions of a program--to make sure it remains free
    2.20 +software for all its users.  We, the Free Software Foundation, use the
    2.21 +GNU General Public License for most of our software; it applies also to
    2.22 +any other work released this way by its authors.  You can apply it to
    2.23 +your programs, too.
    2.24 +
    2.25 +  When we speak of free software, we are referring to freedom, not
    2.26 +price.  Our General Public Licenses are designed to make sure that you
    2.27 +have the freedom to distribute copies of free software (and charge for
    2.28 +them if you wish), that you receive source code or can get it if you
    2.29 +want it, that you can change the software or use pieces of it in new
    2.30 +free programs, and that you know you can do these things.
    2.31 +
    2.32 +  To protect your rights, we need to prevent others from denying you
    2.33 +these rights or asking you to surrender the rights.  Therefore, you have
    2.34 +certain responsibilities if you distribute copies of the software, or if
    2.35 +you modify it: responsibilities to respect the freedom of others.
    2.36 +
    2.37 +  For example, if you distribute copies of such a program, whether
    2.38 +gratis or for a fee, you must pass on to the recipients the same
    2.39 +freedoms that you received.  You must make sure that they, too, receive
    2.40 +or can get the source code.  And you must show them these terms so they
    2.41 +know their rights.
    2.42 +
    2.43 +  Developers that use the GNU GPL protect your rights with two steps:
    2.44 +(1) assert copyright on the software, and (2) offer you this License
    2.45 +giving you legal permission to copy, distribute and/or modify it.
    2.46 +
    2.47 +  For the developers' and authors' protection, the GPL clearly explains
    2.48 +that there is no warranty for this free software.  For both users' and
    2.49 +authors' sake, the GPL requires that modified versions be marked as
    2.50 +changed, so that their problems will not be attributed erroneously to
    2.51 +authors of previous versions.
    2.52 +
    2.53 +  Some devices are designed to deny users access to install or run
    2.54 +modified versions of the software inside them, although the manufacturer
    2.55 +can do so.  This is fundamentally incompatible with the aim of
    2.56 +protecting users' freedom to change the software.  The systematic
    2.57 +pattern of such abuse occurs in the area of products for individuals to
    2.58 +use, which is precisely where it is most unacceptable.  Therefore, we
    2.59 +have designed this version of the GPL to prohibit the practice for those
    2.60 +products.  If such problems arise substantially in other domains, we
    2.61 +stand ready to extend this provision to those domains in future versions
    2.62 +of the GPL, as needed to protect the freedom of users.
    2.63 +
    2.64 +  Finally, every program is threatened constantly by software patents.
    2.65 +States should not allow patents to restrict development and use of
    2.66 +software on general-purpose computers, but in those that do, we wish to
    2.67 +avoid the special danger that patents applied to a free program could
    2.68 +make it effectively proprietary.  To prevent this, the GPL assures that
    2.69 +patents cannot be used to render the program non-free.
    2.70 +
    2.71 +  The precise terms and conditions for copying, distribution and
    2.72 +modification follow.
    2.73 +
    2.74 +                       TERMS AND CONDITIONS
    2.75 +
    2.76 +  0. Definitions.
    2.77 +
    2.78 +  "This License" refers to version 3 of the GNU General Public License.
    2.79 +
    2.80 +  "Copyright" also means copyright-like laws that apply to other kinds of
    2.81 +works, such as semiconductor masks.
    2.82 +
    2.83 +  "The Program" refers to any copyrightable work licensed under this
    2.84 +License.  Each licensee is addressed as "you".  "Licensees" and
    2.85 +"recipients" may be individuals or organizations.
    2.86 +
    2.87 +  To "modify" a work means to copy from or adapt all or part of the work
    2.88 +in a fashion requiring copyright permission, other than the making of an
    2.89 +exact copy.  The resulting work is called a "modified version" of the
    2.90 +earlier work or a work "based on" the earlier work.
    2.91 +
    2.92 +  A "covered work" means either the unmodified Program or a work based
    2.93 +on the Program.
    2.94 +
    2.95 +  To "propagate" a work means to do anything with it that, without
    2.96 +permission, would make you directly or secondarily liable for
    2.97 +infringement under applicable copyright law, except executing it on a
    2.98 +computer or modifying a private copy.  Propagation includes copying,
    2.99 +distribution (with or without modification), making available to the
   2.100 +public, and in some countries other activities as well.
   2.101 +
   2.102 +  To "convey" a work means any kind of propagation that enables other
   2.103 +parties to make or receive copies.  Mere interaction with a user through
   2.104 +a computer network, with no transfer of a copy, is not conveying.
   2.105 +
   2.106 +  An interactive user interface displays "Appropriate Legal Notices"
   2.107 +to the extent that it includes a convenient and prominently visible
   2.108 +feature that (1) displays an appropriate copyright notice, and (2)
   2.109 +tells the user that there is no warranty for the work (except to the
   2.110 +extent that warranties are provided), that licensees may convey the
   2.111 +work under this License, and how to view a copy of this License.  If
   2.112 +the interface presents a list of user commands or options, such as a
   2.113 +menu, a prominent item in the list meets this criterion.
   2.114 +
   2.115 +  1. Source Code.
   2.116 +
   2.117 +  The "source code" for a work means the preferred form of the work
   2.118 +for making modifications to it.  "Object code" means any non-source
   2.119 +form of a work.
   2.120 +
   2.121 +  A "Standard Interface" means an interface that either is an official
   2.122 +standard defined by a recognized standards body, or, in the case of
   2.123 +interfaces specified for a particular programming language, one that
   2.124 +is widely used among developers working in that language.
   2.125 +
   2.126 +  The "System Libraries" of an executable work include anything, other
   2.127 +than the work as a whole, that (a) is included in the normal form of
   2.128 +packaging a Major Component, but which is not part of that Major
   2.129 +Component, and (b) serves only to enable use of the work with that
   2.130 +Major Component, or to implement a Standard Interface for which an
   2.131 +implementation is available to the public in source code form.  A
   2.132 +"Major Component", in this context, means a major essential component
   2.133 +(kernel, window system, and so on) of the specific operating system
   2.134 +(if any) on which the executable work runs, or a compiler used to
   2.135 +produce the work, or an object code interpreter used to run it.
   2.136 +
   2.137 +  The "Corresponding Source" for a work in object code form means all
   2.138 +the source code needed to generate, install, and (for an executable
   2.139 +work) run the object code and to modify the work, including scripts to
   2.140 +control those activities.  However, it does not include the work's
   2.141 +System Libraries, or general-purpose tools or generally available free
   2.142 +programs which are used unmodified in performing those activities but
   2.143 +which are not part of the work.  For example, Corresponding Source
   2.144 +includes interface definition files associated with source files for
   2.145 +the work, and the source code for shared libraries and dynamically
   2.146 +linked subprograms that the work is specifically designed to require,
   2.147 +such as by intimate data communication or control flow between those
   2.148 +subprograms and other parts of the work.
   2.149 +
   2.150 +  The Corresponding Source need not include anything that users
   2.151 +can regenerate automatically from other parts of the Corresponding
   2.152 +Source.
   2.153 +
   2.154 +  The Corresponding Source for a work in source code form is that
   2.155 +same work.
   2.156 +
   2.157 +  2. Basic Permissions.
   2.158 +
   2.159 +  All rights granted under this License are granted for the term of
   2.160 +copyright on the Program, and are irrevocable provided the stated
   2.161 +conditions are met.  This License explicitly affirms your unlimited
   2.162 +permission to run the unmodified Program.  The output from running a
   2.163 +covered work is covered by this License only if the output, given its
   2.164 +content, constitutes a covered work.  This License acknowledges your
   2.165 +rights of fair use or other equivalent, as provided by copyright law.
   2.166 +
   2.167 +  You may make, run and propagate covered works that you do not
   2.168 +convey, without conditions so long as your license otherwise remains
   2.169 +in force.  You may convey covered works to others for the sole purpose
   2.170 +of having them make modifications exclusively for you, or provide you
   2.171 +with facilities for running those works, provided that you comply with
   2.172 +the terms of this License in conveying all material for which you do
   2.173 +not control copyright.  Those thus making or running the covered works
   2.174 +for you must do so exclusively on your behalf, under your direction
   2.175 +and control, on terms that prohibit them from making any copies of
   2.176 +your copyrighted material outside their relationship with you.
   2.177 +
   2.178 +  Conveying under any other circumstances is permitted solely under
   2.179 +the conditions stated below.  Sublicensing is not allowed; section 10
   2.180 +makes it unnecessary.
   2.181 +
   2.182 +  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
   2.183 +
   2.184 +  No covered work shall be deemed part of an effective technological
   2.185 +measure under any applicable law fulfilling obligations under article
   2.186 +11 of the WIPO copyright treaty adopted on 20 December 1996, or
   2.187 +similar laws prohibiting or restricting circumvention of such
   2.188 +measures.
   2.189 +
   2.190 +  When you convey a covered work, you waive any legal power to forbid
   2.191 +circumvention of technological measures to the extent such circumvention
   2.192 +is effected by exercising rights under this License with respect to
   2.193 +the covered work, and you disclaim any intention to limit operation or
   2.194 +modification of the work as a means of enforcing, against the work's
   2.195 +users, your or third parties' legal rights to forbid circumvention of
   2.196 +technological measures.
   2.197 +
   2.198 +  4. Conveying Verbatim Copies.
   2.199 +
   2.200 +  You may convey verbatim copies of the Program's source code as you
   2.201 +receive it, in any medium, provided that you conspicuously and
   2.202 +appropriately publish on each copy an appropriate copyright notice;
   2.203 +keep intact all notices stating that this License and any
   2.204 +non-permissive terms added in accord with section 7 apply to the code;
   2.205 +keep intact all notices of the absence of any warranty; and give all
   2.206 +recipients a copy of this License along with the Program.
   2.207 +
   2.208 +  You may charge any price or no price for each copy that you convey,
   2.209 +and you may offer support or warranty protection for a fee.
   2.210 +
   2.211 +  5. Conveying Modified Source Versions.
   2.212 +
   2.213 +  You may convey a work based on the Program, or the modifications to
   2.214 +produce it from the Program, in the form of source code under the
   2.215 +terms of section 4, provided that you also meet all of these conditions:
   2.216 +
   2.217 +    a) The work must carry prominent notices stating that you modified
   2.218 +    it, and giving a relevant date.
   2.219 +
   2.220 +    b) The work must carry prominent notices stating that it is
   2.221 +    released under this License and any conditions added under section
   2.222 +    7.  This requirement modifies the requirement in section 4 to
   2.223 +    "keep intact all notices".
   2.224 +
   2.225 +    c) You must license the entire work, as a whole, under this
   2.226 +    License to anyone who comes into possession of a copy.  This
   2.227 +    License will therefore apply, along with any applicable section 7
   2.228 +    additional terms, to the whole of the work, and all its parts,
   2.229 +    regardless of how they are packaged.  This License gives no
   2.230 +    permission to license the work in any other way, but it does not
   2.231 +    invalidate such permission if you have separately received it.
   2.232 +
   2.233 +    d) If the work has interactive user interfaces, each must display
   2.234 +    Appropriate Legal Notices; however, if the Program has interactive
   2.235 +    interfaces that do not display Appropriate Legal Notices, your
   2.236 +    work need not make them do so.
   2.237 +
   2.238 +  A compilation of a covered work with other separate and independent
   2.239 +works, which are not by their nature extensions of the covered work,
   2.240 +and which are not combined with it such as to form a larger program,
   2.241 +in or on a volume of a storage or distribution medium, is called an
   2.242 +"aggregate" if the compilation and its resulting copyright are not
   2.243 +used to limit the access or legal rights of the compilation's users
   2.244 +beyond what the individual works permit.  Inclusion of a covered work
   2.245 +in an aggregate does not cause this License to apply to the other
   2.246 +parts of the aggregate.
   2.247 +
   2.248 +  6. Conveying Non-Source Forms.
   2.249 +
   2.250 +  You may convey a covered work in object code form under the terms
   2.251 +of sections 4 and 5, provided that you also convey the
   2.252 +machine-readable Corresponding Source under the terms of this License,
   2.253 +in one of these ways:
   2.254 +
   2.255 +    a) Convey the object code in, or embodied in, a physical product
   2.256 +    (including a physical distribution medium), accompanied by the
   2.257 +    Corresponding Source fixed on a durable physical medium
   2.258 +    customarily used for software interchange.
   2.259 +
   2.260 +    b) Convey the object code in, or embodied in, a physical product
   2.261 +    (including a physical distribution medium), accompanied by a
   2.262 +    written offer, valid for at least three years and valid for as
   2.263 +    long as you offer spare parts or customer support for that product
   2.264 +    model, to give anyone who possesses the object code either (1) a
   2.265 +    copy of the Corresponding Source for all the software in the
   2.266 +    product that is covered by this License, on a durable physical
   2.267 +    medium customarily used for software interchange, for a price no
   2.268 +    more than your reasonable cost of physically performing this
   2.269 +    conveying of source, or (2) access to copy the
   2.270 +    Corresponding Source from a network server at no charge.
   2.271 +
   2.272 +    c) Convey individual copies of the object code with a copy of the
   2.273 +    written offer to provide the Corresponding Source.  This
   2.274 +    alternative is allowed only occasionally and noncommercially, and
   2.275 +    only if you received the object code with such an offer, in accord
   2.276 +    with subsection 6b.
   2.277 +
   2.278 +    d) Convey the object code by offering access from a designated
   2.279 +    place (gratis or for a charge), and offer equivalent access to the
   2.280 +    Corresponding Source in the same way through the same place at no
   2.281 +    further charge.  You need not require recipients to copy the
   2.282 +    Corresponding Source along with the object code.  If the place to
   2.283 +    copy the object code is a network server, the Corresponding Source
   2.284 +    may be on a different server (operated by you or a third party)
   2.285 +    that supports equivalent copying facilities, provided you maintain
   2.286 +    clear directions next to the object code saying where to find the
   2.287 +    Corresponding Source.  Regardless of what server hosts the
   2.288 +    Corresponding Source, you remain obligated to ensure that it is
   2.289 +    available for as long as needed to satisfy these requirements.
   2.290 +
   2.291 +    e) Convey the object code using peer-to-peer transmission, provided
   2.292 +    you inform other peers where the object code and Corresponding
   2.293 +    Source of the work are being offered to the general public at no
   2.294 +    charge under subsection 6d.
   2.295 +
   2.296 +  A separable portion of the object code, whose source code is excluded
   2.297 +from the Corresponding Source as a System Library, need not be
   2.298 +included in conveying the object code work.
   2.299 +
   2.300 +  A "User Product" is either (1) a "consumer product", which means any
   2.301 +tangible personal property which is normally used for personal, family,
   2.302 +or household purposes, or (2) anything designed or sold for incorporation
   2.303 +into a dwelling.  In determining whether a product is a consumer product,
   2.304 +doubtful cases shall be resolved in favor of coverage.  For a particular
   2.305 +product received by a particular user, "normally used" refers to a
   2.306 +typical or common use of that class of product, regardless of the status
   2.307 +of the particular user or of the way in which the particular user
   2.308 +actually uses, or expects or is expected to use, the product.  A product
   2.309 +is a consumer product regardless of whether the product has substantial
   2.310 +commercial, industrial or non-consumer uses, unless such uses represent
   2.311 +the only significant mode of use of the product.
   2.312 +
   2.313 +  "Installation Information" for a User Product means any methods,
   2.314 +procedures, authorization keys, or other information required to install
   2.315 +and execute modified versions of a covered work in that User Product from
   2.316 +a modified version of its Corresponding Source.  The information must
   2.317 +suffice to ensure that the continued functioning of the modified object
   2.318 +code is in no case prevented or interfered with solely because
   2.319 +modification has been made.
   2.320 +
   2.321 +  If you convey an object code work under this section in, or with, or
   2.322 +specifically for use in, a User Product, and the conveying occurs as
   2.323 +part of a transaction in which the right of possession and use of the
   2.324 +User Product is transferred to the recipient in perpetuity or for a
   2.325 +fixed term (regardless of how the transaction is characterized), the
   2.326 +Corresponding Source conveyed under this section must be accompanied
   2.327 +by the Installation Information.  But this requirement does not apply
   2.328 +if neither you nor any third party retains the ability to install
   2.329 +modified object code on the User Product (for example, the work has
   2.330 +been installed in ROM).
   2.331 +
   2.332 +  The requirement to provide Installation Information does not include a
   2.333 +requirement to continue to provide support service, warranty, or updates
   2.334 +for a work that has been modified or installed by the recipient, or for
   2.335 +the User Product in which it has been modified or installed.  Access to a
   2.336 +network may be denied when the modification itself materially and
   2.337 +adversely affects the operation of the network or violates the rules and
   2.338 +protocols for communication across the network.
   2.339 +
   2.340 +  Corresponding Source conveyed, and Installation Information provided,
   2.341 +in accord with this section must be in a format that is publicly
   2.342 +documented (and with an implementation available to the public in
   2.343 +source code form), and must require no special password or key for
   2.344 +unpacking, reading or copying.
   2.345 +
   2.346 +  7. Additional Terms.
   2.347 +
   2.348 +  "Additional permissions" are terms that supplement the terms of this
   2.349 +License by making exceptions from one or more of its conditions.
   2.350 +Additional permissions that are applicable to the entire Program shall
   2.351 +be treated as though they were included in this License, to the extent
   2.352 +that they are valid under applicable law.  If additional permissions
   2.353 +apply only to part of the Program, that part may be used separately
   2.354 +under those permissions, but the entire Program remains governed by
   2.355 +this License without regard to the additional permissions.
   2.356 +
   2.357 +  When you convey a copy of a covered work, you may at your option
   2.358 +remove any additional permissions from that copy, or from any part of
   2.359 +it.  (Additional permissions may be written to require their own
   2.360 +removal in certain cases when you modify the work.)  You may place
   2.361 +additional permissions on material, added by you to a covered work,
   2.362 +for which you have or can give appropriate copyright permission.
   2.363 +
   2.364 +  Notwithstanding any other provision of this License, for material you
   2.365 +add to a covered work, you may (if authorized by the copyright holders of
   2.366 +that material) supplement the terms of this License with terms:
   2.367 +
   2.368 +    a) Disclaiming warranty or limiting liability differently from the
   2.369 +    terms of sections 15 and 16 of this License; or
   2.370 +
   2.371 +    b) Requiring preservation of specified reasonable legal notices or
   2.372 +    author attributions in that material or in the Appropriate Legal
   2.373 +    Notices displayed by works containing it; or
   2.374 +
   2.375 +    c) Prohibiting misrepresentation of the origin of that material, or
   2.376 +    requiring that modified versions of such material be marked in
   2.377 +    reasonable ways as different from the original version; or
   2.378 +
   2.379 +    d) Limiting the use for publicity purposes of names of licensors or
   2.380 +    authors of the material; or
   2.381 +
   2.382 +    e) Declining to grant rights under trademark law for use of some
   2.383 +    trade names, trademarks, or service marks; or
   2.384 +
   2.385 +    f) Requiring indemnification of licensors and authors of that
   2.386 +    material by anyone who conveys the material (or modified versions of
   2.387 +    it) with contractual assumptions of liability to the recipient, for
   2.388 +    any liability that these contractual assumptions directly impose on
   2.389 +    those licensors and authors.
   2.390 +
   2.391 +  All other non-permissive additional terms are considered "further
   2.392 +restrictions" within the meaning of section 10.  If the Program as you
   2.393 +received it, or any part of it, contains a notice stating that it is
   2.394 +governed by this License along with a term that is a further
   2.395 +restriction, you may remove that term.  If a license document contains
   2.396 +a further restriction but permits relicensing or conveying under this
   2.397 +License, you may add to a covered work material governed by the terms
   2.398 +of that license document, provided that the further restriction does
   2.399 +not survive such relicensing or conveying.
   2.400 +
   2.401 +  If you add terms to a covered work in accord with this section, you
   2.402 +must place, in the relevant source files, a statement of the
   2.403 +additional terms that apply to those files, or a notice indicating
   2.404 +where to find the applicable terms.
   2.405 +
   2.406 +  Additional terms, permissive or non-permissive, may be stated in the
   2.407 +form of a separately written license, or stated as exceptions;
   2.408 +the above requirements apply either way.
   2.409 +
   2.410 +  8. Termination.
   2.411 +
   2.412 +  You may not propagate or modify a covered work except as expressly
   2.413 +provided under this License.  Any attempt otherwise to propagate or
   2.414 +modify it is void, and will automatically terminate your rights under
   2.415 +this License (including any patent licenses granted under the third
   2.416 +paragraph of section 11).
   2.417 +
   2.418 +  However, if you cease all violation of this License, then your
   2.419 +license from a particular copyright holder is reinstated (a)
   2.420 +provisionally, unless and until the copyright holder explicitly and
   2.421 +finally terminates your license, and (b) permanently, if the copyright
   2.422 +holder fails to notify you of the violation by some reasonable means
   2.423 +prior to 60 days after the cessation.
   2.424 +
   2.425 +  Moreover, your license from a particular copyright holder is
   2.426 +reinstated permanently if the copyright holder notifies you of the
   2.427 +violation by some reasonable means, this is the first time you have
   2.428 +received notice of violation of this License (for any work) from that
   2.429 +copyright holder, and you cure the violation prior to 30 days after
   2.430 +your receipt of the notice.
   2.431 +
   2.432 +  Termination of your rights under this section does not terminate the
   2.433 +licenses of parties who have received copies or rights from you under
   2.434 +this License.  If your rights have been terminated and not permanently
   2.435 +reinstated, you do not qualify to receive new licenses for the same
   2.436 +material under section 10.
   2.437 +
   2.438 +  9. Acceptance Not Required for Having Copies.
   2.439 +
   2.440 +  You are not required to accept this License in order to receive or
   2.441 +run a copy of the Program.  Ancillary propagation of a covered work
   2.442 +occurring solely as a consequence of using peer-to-peer transmission
   2.443 +to receive a copy likewise does not require acceptance.  However,
   2.444 +nothing other than this License grants you permission to propagate or
   2.445 +modify any covered work.  These actions infringe copyright if you do
   2.446 +not accept this License.  Therefore, by modifying or propagating a
   2.447 +covered work, you indicate your acceptance of this License to do so.
   2.448 +
   2.449 +  10. Automatic Licensing of Downstream Recipients.
   2.450 +
   2.451 +  Each time you convey a covered work, the recipient automatically
   2.452 +receives a license from the original licensors, to run, modify and
   2.453 +propagate that work, subject to this License.  You are not responsible
   2.454 +for enforcing compliance by third parties with this License.
   2.455 +
   2.456 +  An "entity transaction" is a transaction transferring control of an
   2.457 +organization, or substantially all assets of one, or subdividing an
   2.458 +organization, or merging organizations.  If propagation of a covered
   2.459 +work results from an entity transaction, each party to that
   2.460 +transaction who receives a copy of the work also receives whatever
   2.461 +licenses to the work the party's predecessor in interest had or could
   2.462 +give under the previous paragraph, plus a right to possession of the
   2.463 +Corresponding Source of the work from the predecessor in interest, if
   2.464 +the predecessor has it or can get it with reasonable efforts.
   2.465 +
   2.466 +  You may not impose any further restrictions on the exercise of the
   2.467 +rights granted or affirmed under this License.  For example, you may
   2.468 +not impose a license fee, royalty, or other charge for exercise of
   2.469 +rights granted under this License, and you may not initiate litigation
   2.470 +(including a cross-claim or counterclaim in a lawsuit) alleging that
   2.471 +any patent claim is infringed by making, using, selling, offering for
   2.472 +sale, or importing the Program or any portion of it.
   2.473 +
   2.474 +  11. Patents.
   2.475 +
   2.476 +  A "contributor" is a copyright holder who authorizes use under this
   2.477 +License of the Program or a work on which the Program is based.  The
   2.478 +work thus licensed is called the contributor's "contributor version".
   2.479 +
   2.480 +  A contributor's "essential patent claims" are all patent claims
   2.481 +owned or controlled by the contributor, whether already acquired or
   2.482 +hereafter acquired, that would be infringed by some manner, permitted
   2.483 +by this License, of making, using, or selling its contributor version,
   2.484 +but do not include claims that would be infringed only as a
   2.485 +consequence of further modification of the contributor version.  For
   2.486 +purposes of this definition, "control" includes the right to grant
   2.487 +patent sublicenses in a manner consistent with the requirements of
   2.488 +this License.
   2.489 +
   2.490 +  Each contributor grants you a non-exclusive, worldwide, royalty-free
   2.491 +patent license under the contributor's essential patent claims, to
   2.492 +make, use, sell, offer for sale, import and otherwise run, modify and
   2.493 +propagate the contents of its contributor version.
   2.494 +
   2.495 +  In the following three paragraphs, a "patent license" is any express
   2.496 +agreement or commitment, however denominated, not to enforce a patent
   2.497 +(such as an express permission to practice a patent or covenant not to
   2.498 +sue for patent infringement).  To "grant" such a patent license to a
   2.499 +party means to make such an agreement or commitment not to enforce a
   2.500 +patent against the party.
   2.501 +
   2.502 +  If you convey a covered work, knowingly relying on a patent license,
   2.503 +and the Corresponding Source of the work is not available for anyone
   2.504 +to copy, free of charge and under the terms of this License, through a
   2.505 +publicly available network server or other readily accessible means,
   2.506 +then you must either (1) cause the Corresponding Source to be so
   2.507 +available, or (2) arrange to deprive yourself of the benefit of the
   2.508 +patent license for this particular work, or (3) arrange, in a manner
   2.509 +consistent with the requirements of this License, to extend the patent
   2.510 +license to downstream recipients.  "Knowingly relying" means you have
   2.511 +actual knowledge that, but for the patent license, your conveying the
   2.512 +covered work in a country, or your recipient's use of the covered work
   2.513 +in a country, would infringe one or more identifiable patents in that
   2.514 +country that you have reason to believe are valid.
   2.515 +
   2.516 +  If, pursuant to or in connection with a single transaction or
   2.517 +arrangement, you convey, or propagate by procuring conveyance of, a
   2.518 +covered work, and grant a patent license to some of the parties
   2.519 +receiving the covered work authorizing them to use, propagate, modify
   2.520 +or convey a specific copy of the covered work, then the patent license
   2.521 +you grant is automatically extended to all recipients of the covered
   2.522 +work and works based on it.
   2.523 +
   2.524 +  A patent license is "discriminatory" if it does not include within
   2.525 +the scope of its coverage, prohibits the exercise of, or is
   2.526 +conditioned on the non-exercise of one or more of the rights that are
   2.527 +specifically granted under this License.  You may not convey a covered
   2.528 +work if you are a party to an arrangement with a third party that is
   2.529 +in the business of distributing software, under which you make payment
   2.530 +to the third party based on the extent of your activity of conveying
   2.531 +the work, and under which the third party grants, to any of the
   2.532 +parties who would receive the covered work from you, a discriminatory
   2.533 +patent license (a) in connection with copies of the covered work
   2.534 +conveyed by you (or copies made from those copies), or (b) primarily
   2.535 +for and in connection with specific products or compilations that
   2.536 +contain the covered work, unless you entered into that arrangement,
   2.537 +or that patent license was granted, prior to 28 March 2007.
   2.538 +
   2.539 +  Nothing in this License shall be construed as excluding or limiting
   2.540 +any implied license or other defenses to infringement that may
   2.541 +otherwise be available to you under applicable patent law.
   2.542 +
   2.543 +  12. No Surrender of Others' Freedom.
   2.544 +
   2.545 +  If conditions are imposed on you (whether by court order, agreement or
   2.546 +otherwise) that contradict the conditions of this License, they do not
   2.547 +excuse you from the conditions of this License.  If you cannot convey a
   2.548 +covered work so as to satisfy simultaneously your obligations under this
   2.549 +License and any other pertinent obligations, then as a consequence you may
   2.550 +not convey it at all.  For example, if you agree to terms that obligate you
   2.551 +to collect a royalty for further conveying from those to whom you convey
   2.552 +the Program, the only way you could satisfy both those terms and this
   2.553 +License would be to refrain entirely from conveying the Program.
   2.554 +
   2.555 +  13. Use with the GNU Affero General Public License.
   2.556 +
   2.557 +  Notwithstanding any other provision of this License, you have
   2.558 +permission to link or combine any covered work with a work licensed
   2.559 +under version 3 of the GNU Affero General Public License into a single
   2.560 +combined work, and to convey the resulting work.  The terms of this
   2.561 +License will continue to apply to the part which is the covered work,
   2.562 +but the special requirements of the GNU Affero General Public License,
   2.563 +section 13, concerning interaction through a network will apply to the
   2.564 +combination as such.
   2.565 +
   2.566 +  14. Revised Versions of this License.
   2.567 +
   2.568 +  The Free Software Foundation may publish revised and/or new versions of
   2.569 +the GNU General Public License from time to time.  Such new versions will
   2.570 +be similar in spirit to the present version, but may differ in detail to
   2.571 +address new problems or concerns.
   2.572 +
   2.573 +  Each version is given a distinguishing version number.  If the
   2.574 +Program specifies that a certain numbered version of the GNU General
   2.575 +Public License "or any later version" applies to it, you have the
   2.576 +option of following the terms and conditions either of that numbered
   2.577 +version or of any later version published by the Free Software
   2.578 +Foundation.  If the Program does not specify a version number of the
   2.579 +GNU General Public License, you may choose any version ever published
   2.580 +by the Free Software Foundation.
   2.581 +
   2.582 +  If the Program specifies that a proxy can decide which future
   2.583 +versions of the GNU General Public License can be used, that proxy's
   2.584 +public statement of acceptance of a version permanently authorizes you
   2.585 +to choose that version for the Program.
   2.586 +
   2.587 +  Later license versions may give you additional or different
   2.588 +permissions.  However, no additional obligations are imposed on any
   2.589 +author or copyright holder as a result of your choosing to follow a
   2.590 +later version.
   2.591 +
   2.592 +  15. Disclaimer of Warranty.
   2.593 +
   2.594 +  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
   2.595 +APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
   2.596 +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
   2.597 +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
   2.598 +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   2.599 +PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
   2.600 +IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
   2.601 +ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
   2.602 +
   2.603 +  16. Limitation of Liability.
   2.604 +
   2.605 +  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
   2.606 +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
   2.607 +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
   2.608 +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
   2.609 +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
   2.610 +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
   2.611 +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
   2.612 +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
   2.613 +SUCH DAMAGES.
   2.614 +
   2.615 +  17. Interpretation of Sections 15 and 16.
   2.616 +
   2.617 +  If the disclaimer of warranty and limitation of liability provided
   2.618 +above cannot be given local legal effect according to their terms,
   2.619 +reviewing courts shall apply local law that most closely approximates
   2.620 +an absolute waiver of all civil liability in connection with the
   2.621 +Program, unless a warranty or assumption of liability accompanies a
   2.622 +copy of the Program in return for a fee.
   2.623 +
   2.624 +                     END OF TERMS AND CONDITIONS
   2.625 +
   2.626 +            How to Apply These Terms to Your New Programs
   2.627 +
   2.628 +  If you develop a new program, and you want it to be of the greatest
   2.629 +possible use to the public, the best way to achieve this is to make it
   2.630 +free software which everyone can redistribute and change under these terms.
   2.631 +
   2.632 +  To do so, attach the following notices to the program.  It is safest
   2.633 +to attach them to the start of each source file to most effectively
   2.634 +state the exclusion of warranty; and each file should have at least
   2.635 +the "copyright" line and a pointer to where the full notice is found.
   2.636 +
   2.637 +    <one line to give the program's name and a brief idea of what it does.>
   2.638 +    Copyright (C) <year>  <name of author>
   2.639 +
   2.640 +    This program is free software: you can redistribute it and/or modify
   2.641 +    it under the terms of the GNU General Public License as published by
   2.642 +    the Free Software Foundation, either version 3 of the License, or
   2.643 +    (at your option) any later version.
   2.644 +
   2.645 +    This program is distributed in the hope that it will be useful,
   2.646 +    but WITHOUT ANY WARRANTY; without even the implied warranty of
   2.647 +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   2.648 +    GNU General Public License for more details.
   2.649 +
   2.650 +    You should have received a copy of the GNU General Public License
   2.651 +    along with this program.  If not, see <http://www.gnu.org/licenses/>.
   2.652 +
   2.653 +Also add information on how to contact you by electronic and paper mail.
   2.654 +
   2.655 +  If the program does terminal interaction, make it output a short
   2.656 +notice like this when it starts in an interactive mode:
   2.657 +
   2.658 +    <program>  Copyright (C) <year>  <name of author>
   2.659 +    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
   2.660 +    This is free software, and you are welcome to redistribute it
   2.661 +    under certain conditions; type `show c' for details.
   2.662 +
   2.663 +The hypothetical commands `show w' and `show c' should show the appropriate
   2.664 +parts of the General Public License.  Of course, your program's commands
   2.665 +might be different; for a GUI interface, you would use an "about box".
   2.666 +
   2.667 +  You should also get your employer (if you work as a programmer) or school,
   2.668 +if any, to sign a "copyright disclaimer" for the program, if necessary.
   2.669 +For more information on this, and how to apply and follow the GNU GPL, see
   2.670 +<http://www.gnu.org/licenses/>.
   2.671 +
   2.672 +  The GNU General Public License does not permit incorporating your program
   2.673 +into proprietary programs.  If your program is a subroutine library, you
   2.674 +may consider it more useful to permit linking proprietary applications with
   2.675 +the library.  If this is what you want to do, use the GNU Lesser General
   2.676 +Public License instead of this License.  But first, please read
   2.677 +<http://www.gnu.org/philosophy/why-not-lgpl.html>.

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/ffmpeg_smp/h264dec/README.txt	Mon Aug 27 12:09:56 2012 +0200
     3.3 @@ -0,0 +1,79 @@
     3.4 +App: h264dec
     3.5 +
     3.6 +This application decodes H.264 raw videos.
     3.7 +
     3.8 +Build Sequential/Pthreads:
     3.9 +
    3.10 +autoreconf -i -f
    3.11 +mkdir build
    3.12 +cd build
    3.13 +../configure --enable-ssse3 --enable-sdl2
    3.14 +make
    3.15 +
    3.16 +Build OmpSs:
    3.17 +
    3.18 +autoreconf -i -f
    3.19 +mkdir build-ss
    3.20 +cd build-ss
    3.21 +../configure CC=sscc --enable-ssse3 --enable-sdl2
    3.22 +make
    3.23 +
    3.24 +ssse3 enables assembler optimizations up to ssse3 (optional)
    3.25 +sdl2 enables a rudimentary viewing capability     (optional)
    3.26 +
    3.27 +Usage Sequential/Pthreads:
    3.28 +./h264dec -i $(INPUT_VIDEO) -s
    3.29 +./h264dec -i $(INPUT_VIDEO) -t $(THREADS)
    3.30 +
    3.31 +Usage OmpSs:
    3.32 +NX_PES=<numthreads> ./h264dec -i <inputfile> -e <num parallel entropy frames> -z <width> <height> --static-3d
    3.33 +
    3.34 +-e specifies the number of entropy decode pipeline buffers, which should ideally
    3.35 +be the same as the number of threads.
    3.36 +
    3.37 +-z sets the MB reconstruction grouped block size. A size between 6 by 6 and 10 by 10
    3.38 +was found to strike a good balance between overhead and parallelism, but this is machine and input
    3.39 +dependent.
    3.40 +
    3.41 +--static-3d performs overlapping wavefront decoding.
    3.42 +
    3.43 +General usage:
    3.44 +-d 				displays output
    3.45 +-f 				fullscreen
    3.46 +-o $(OUT_FILE)  write raw YUV
    3.47 +-v  			show framerate
    3.48 +
    3.49 +
    3.50 +The INPUT_VIDEOs are in "inputs_encore", but the decoder should be able to decode any raw H.264 stream that uses
    3.51 +one slice per frame, is non-interlaced, and uses CABAC and YUV420.
    3.52 +
    3.53 +
    3.54 +Integrated OmpSs player demo
    3.55 +----------------------------
    3.56 +NOTE: for the player demo SDL2 must be installed.
    3.57 +
    3.58 +1. Go to the OmpSs build directory (/home/cchi/Projects/ffmpeg_smp/build-ss)
    3.59 +
    3.60 +2. Launch the H.264 decoder with the desired options:
    3.61 +
    3.62 +NX_PES=<numthreads> ./h264dec -i <inputfile> -v (verbose) -e <num parallel entropy frames> -z <width> <height> -d (display) -f (fullscreen)
    3.63 +
    3.64 +Note that <num parallel entropy frames> should be equal to or higher than <numthreads> for optimal performance.
    3.65 +
    3.66 +Examples:
    3.67 +
    3.68 +NX_PES=7 ./h264dec -i ../../h264_movies/park_joy_2160px5.h264 -v -z 8 8 -df -e 9
    3.69 +NX_PES=7 ./h264dec -i ../../h264_movies/big_buck_bunny_1080p24.h264 -v -d  -z 6 6 -e 9
    3.70 +
    3.71 +Interacting with the program
    3.72 +----------------------------
    3.73 +<CTRL+F>    Fullscreen mode
    3.74 +<ESCAPE>    Window mode
    3.75 +<SPACE>     Pause/resume
    3.76 +<M>         Show/hide macroblock borders
    3.77 +<arrows>    Resize the macroblocks (while macroblock borders are shown)
    3.78 +<ALT+F4>    Close
    3.79 +
    3.80 +Force close in case of lockup
    3.81 +-----------------------------
    3.82 +On a terminal: killall -9 h264dec

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/ffmpeg_smp/h264dec/configure.ac	Mon Aug 27 12:09:56 2012 +0200
     4.3 @@ -0,0 +1,171 @@
     4.4 +#                                               -*- Autoconf -*-
     4.5 +# Process this file with autoconf to produce a configure script.
     4.6 +
     4.7 +AC_PREREQ(2.61)
     4.8 +AC_INIT([h264_mt], [0.1], [cchi@cs.tu-berlin.de])
     4.9 +#AM_INIT_AUTOMAKE(AC_PACKAGE_NAME, AC_PACKAGE_VERSION)
    4.10 +AM_INIT_AUTOMAKE([-Wall -Werror foreign])
    4.11 +
    4.12 +AC_CONFIG_SRCDIR([h264dec.c])
    4.13 +AC_PROG_RANLIB
    4.14 +
    4.15 +# Checks for programs.  $CC is quoted in the tests below: it may be empty or contain spaces (e.g. CC="ccache gcc").
    4.16 +AC_GNU_SOURCE
    4.17 +AC_PROG_CC
    4.18 +AM_CONDITIONAL([HAVE_OMPSS], [test "$CC" = "sscc"])
    4.19 +AC_DEFINE([OMPSS], [0], [Define to 1 on when using the OmpSs compiler sscc])
    4.20 +if test "$CC" = "sscc";then
    4.21 +AC_DEFINE([OMPSS], [1], [Define to 1 on when using the OmpSs compiler sscc])
    4.22 +fi
    4.23 +
    4.24 +#if [ test -n "${CFLAGS+x}" ] ; then
    4.25 +#    CFLAGS="-O3 -g"
    4.26 +#fi
    4.27 +
    4.28 +# Checks for libraries.
    4.29 +AC_CHECK_LIB([pthread], [pthread_yield])
    4.30 +AC_CHECK_LIB([spe2], [spe_image_open])
    4.31 +AC_CHECK_LIB([sync], [mutex_init])
    4.32 +AC_CHECK_LIB([rt], [clock_gettime])
    4.33 +# Optional features: when one is requested and its library is missing, AC_MSG_ERROR aborts configure.
    4.34 +AC_ARG_ENABLE([sdl2], AS_HELP_STRING([--enable-sdl2], [Enable SDL2 playback]))
    4.35 +if test "$enable_sdl2" = "yes"; then
    4.36 +	AC_CHECK_LIB([SDL2], [SDL_CreateWindow], [], [AC_MSG_ERROR([libSDL2 required for playback.])])
    4.37 +fi
    4.38 +
    4.39 +if test "$enable_sdl2" = "yes"; then
    4.40 +	AC_CHECK_LIB([X11], [XInitThreads], [], [AC_MSG_ERROR([libX11 currently required for SDL2 workaround.])])
    4.41 +fi
    4.42 +
    4.43 +AC_ARG_ENABLE([sdl_ttf], AS_HELP_STRING([--enable-sdl_ttf], [Enable SDL_ttf for overlaying fonts]))
    4.44 +if test "$enable_sdl_ttf" = "yes"; then
    4.45 +    AC_CHECK_LIB([SDL_ttf], [TTF_Init], [], [AC_MSG_ERROR([libSDL_ttf required for font rendering.])])
    4.46 +fi
    4.47 +
    4.48 +
    4.49 +
    4.50 +AC_ARG_ENABLE([opencl], AS_HELP_STRING([--enable-opencl], [Enable GPU decoder]))
    4.51 +if test "$enable_opencl" = "yes"; then
    4.52 +	AC_CHECK_LIB([OpenCL], [clGetPlatformIDs], [], [AC_MSG_ERROR([libOpenCL required for GPU functionality.])])
    4.53 +fi
    4.54 +AM_CONDITIONAL([HAVE_OPENCL], [test "$enable_opencl" = "yes"])
    4.55 +
    4.56 +
    4.57 +# Checks for header files.
    4.58 +AC_HEADER_STDC
    4.59 +AC_CHECK_HEADERS([stdint.h stdlib.h string.h unistd.h])
    4.60 +
    4.61 +# Checks for typedefs, structures, and compiler characteristics.
    4.62 +AC_C_CONST
    4.63 +AC_TYPE_UINT32_T
    4.64 +AC_TYPE_UINT64_T
    4.65 +AC_TYPE_UINT8_T
    4.66 +AC_C_VOLATILE
    4.67 +AC_C_BIGENDIAN
    4.68 +
    4.69 +# Checks for library functions.
    4.70 +AC_CHECK_FUNCS([malloc realloc memalign posix_memalign memmove memset])
    4.71 +
    4.72 +AC_CANONICAL_HOST
    4.73 +AC_CANONICAL_BUILD
    4.74 +
    4.75 +AC_MSG_CHECKING([for architecture])
    4.76 +
    4.77 +AC_DEFINE([ARCH_ARM], [0], [Define to 1 on arm architectures.])
    4.78 +AC_DEFINE([ARCH_X86_32], [0], [Define to 1 on x86 architectures.])
    4.79 +AC_DEFINE([ARCH_X86_64], [0], [Define to 1 on x86_64 architectures.])
    4.80 +AC_DEFINE([ARCH_X86], [ARCH_X86_32 ||ARCH_X86_64], [True on x86])
    4.81 +AC_DEFINE([ARCH_PPC], [0], [Define to 1 on ppc architectures.])
    4.82 +AC_DEFINE([ARCH_PPC64], [0], [Define to 1 on ppc64 architectures.])
    4.83 +AC_DEFINE([ARCH_CELL], [0], [Define to 1 on cell architectures.])
    4.84 +
    4.85 +if test "$enable_optimizations" != "no"; then
    4.86 +	case $build_cpu in
    4.87 +		arm )
    4.88 +			arch="arm"
    4.89 +			AC_MSG_RESULT([arm])
    4.90 +			AC_DEFINE([ARCH_ARM], [1], [Define to 1 on arm architectures.])
    4.91 +			;;
    4.92 +		i686 )
    4.93 +			arch="x86"
    4.94 +			AC_MSG_RESULT([x86])
    4.95 +			AC_DEFINE([ARCH_X86_32], [1], [Define to 1 on x86 architectures.])
    4.96 +			;;
    4.97 +		x86_64 )
    4.98 +			arch="x86_64"
    4.99 +			AC_MSG_RESULT([x86_64])
   4.100 +			AC_DEFINE([ARCH_X86_64], [1], [Define to 1 on x86 architectures.])
   4.101 +			;;
   4.102 +		powerpc64 )
   4.103 +			AC_DEFINE([HAVE_BIGENDIAN], [1], [Define to 1 on bigendian architectures.])
   4.104 +			if grep -E ^cpu /proc/cpuinfo | grep -q Cell ; then
   4.105 +				arch="cell"
   4.106 +				AC_MSG_RESULT([cell])
   4.107 +				AC_DEFINE([ARCH_CELL], [1], [Define to 1 on cell architectures.])
   4.108 +			else
   4.109 +				arch="powerpc64"
   4.110 +				AC_MSG_RESULT([ppc64])
   4.111 +				AC_DEFINE([ARCH_PPC64], [1], [Define to 1 on ppc64 architectures.])
   4.112 +			fi
   4.113 +			;;
   4.114 +		* )
   4.115 +			AC_MSG_RESULT([default (little endian).])
   4.116 +			;;
   4.117 +	esac
   4.118 +fi
   4.119 +# $arch stays unset for unrecognized CPUs and with --disable-optimizations, so it must be quoted here.
   4.120 +AM_CONDITIONAL([HAVE_CELL], [test "$arch" = "cell"])
   4.121 +
   4.122 +# Additional options
   4.123 +AC_ARG_ENABLE([optimizations], AS_HELP_STRING([--disable-optimizations], [Disable all architecture specific optimizations. Compiler optimizations are not disabled.]))
   4.124 +
   4.125 +AC_DEFINE([HAVE_SSE], [0], [Define to 1 to enable sse optimizations.])
   4.126 +AC_DEFINE([HAVE_MMX], [0], [Define to 1 to enable mmx optimizations.])
   4.127 +AC_DEFINE([HAVE_MMX2], [0], [Define to 1 to enable mmx2 optimizations.])
   4.128 +AC_DEFINE([HAVE_SSSE3], [0], [Define to 1 to enable ssse3 optimizations.])
   4.129 +AC_DEFINE([HAVE_ALTIVEC], [0], [Define to 1 to enable altivec optimizations.])
   4.130 +AC_DEFINE([HAVE_NEON], [0], [Define to 1 to enable neon optimizations.])
   4.131 +
   4.132 +AC_ARG_ENABLE([ssse3], AS_HELP_STRING([--enable-ssse3], [Enable ssse3 optimizations]))
   4.133 +if test "$enable_ssse3" = "yes"; then
   4.134 +	AC_DEFINE([HAVE_SSSE3], [1], [Define to 1 to enable ssse3 optimizations.])
   4.135 +	AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.])
   4.136 +	AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.])
   4.137 +	AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.])
   4.138 +	ARCH_SUBDIR=x86
   4.139 +fi
   4.140 +
   4.141 +AC_ARG_ENABLE([sse], AS_HELP_STRING([--enable-sse], [Enable sse optimizations]))
   4.142 +if test "$enable_sse" = "yes"; then
   4.143 +	AC_DEFINE([HAVE_SSE], [1], [Define to 1 to enable sse optimizations.])
   4.144 +	AC_DEFINE([HAVE_MMX], [1], [Define to 1 to enable mmx optimizations.])
   4.145 +	AC_DEFINE([HAVE_MMX2], [1], [Define to 1 to enable mmx2 optimizations.])
   4.146 +	ARCH_SUBDIR=x86
   4.147 +fi
   4.148 +
   4.149 +AC_ARG_ENABLE([altivec], AS_HELP_STRING([--enable-altivec], [Enable altivec optimizations]))
   4.150 +if test "$enable_altivec" = "yes"; then
   4.151 +	AC_DEFINE([HAVE_ALTIVEC], [1], [Define to 1 to enable altivec optimizations.])
   4.152 +	ARCH_SUBDIR="$ARCH_SUBDIR ppc"
   4.153 +	TMPCLAGS=$CFLAGS
   4.154 +	CFLAGS="$CFLAGS -maltivec"
   4.155 +	AC_CHECK_HEADERS(altivec.h)
   4.156 +	CFLAGS=$TMPCLAGS
   4.157 +fi
   4.158 +
   4.159 +AC_ARG_ENABLE([neon], AS_HELP_STRING([--enable-neon], [Enable neon optimizations]))
   4.160 +if test "$enable_neon" = "yes"; then
   4.161 +	AC_DEFINE([HAVE_NEON], [1], [Define to 1 to enable neon optimizations.])
   4.162 +	ARCH_SUBDIR=arm
   4.163 +fi
   4.164 +
   4.165 +AM_CONDITIONAL([HAVE_ARCH_SUBDIR], [test "$ARCH_SUBDIR" != ""])
   4.166 +AC_SUBST([ARCH_SUBDIR])
   4.167 +# NOTE(review): this repeats the HAVE_NEON=0 default after the --enable-neon block; it presumably overrides the 1 set there - confirm whether NEON is meant to be forced off.
   4.168 +AC_DEFINE([HAVE_NEON], [0], [Define to 1 to enable neon optimizations.])
   4.169 +
   4.170 +AC_CONFIG_HEADER([config.h])
   4.171 +
   4.172 +AC_CONFIG_FILES([Makefile libavutil/Makefile libavcodec/Makefile libavcodec/x86/Makefile libavcodec/ppc/Makefile libavcodec/cell/Makefile])
   4.173 +
   4.174 +AC_OUTPUT

     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/ffmpeg_smp/h264dec/h264dec.c	Mon Aug 27 12:09:56 2012 +0200
     5.3 @@ -0,0 +1,288 @@
     5.4 +/*
     5.5 +* H264 decoder main
     5.6 +*/
     5.7 +
     5.8 +#include "config.h"
     5.9 +#include "libavcodec/h264.h"
    5.10 +
    5.11 +#include <string.h>
    5.12 +#include <stdlib.h>
    5.13 +#include <errno.h>
    5.14 +#include <signal.h>
    5.15 +#include <unistd.h>
    5.16 +#include <getopt.h>
    5.17 +#include <fcntl.h>
    5.18 +
    5.19 +#include <sys/types.h>
    5.20 +#include <sys/time.h>
    5.21 +#include <sys/resource.h>
    5.22 +#include <time.h>
    5.23 +
    5.24 +#include <assert.h>
    5.25 +
    5.26 +
    5.27 +static const char program_name[] = "h264dec";
    5.28 +static const int program_birth_year = 2010;
    5.29 +
    5.30 +static const char *file_name;   /* input path given with -i */
    5.31 +static int ifile, ofile;        /* file descriptors returned by open(); remain 0 when unset */
    5.32 +static int no_arch =0;          /* set by -a/--noarch: main() then skips the ARCH_CELL decode paths */
    5.33 +static int parallel = 1;        /* cleared by -s/--sequential */
    5.34 +static int frame_width  = 0;    /* copied from the NAL context after the first frame is parsed */
    5.35 +static int frame_height = 0;    /* see frame_width */
    5.36 +/* Terminate the process with status ret. The #undef presumably lifts a project macro that redefines exit - TODO confirm. */
    5.37 +static void av_exit(int ret)
    5.38 +{
    5.39 +    // TODO: release resources here; nothing is freed before exiting yet
    5.40 +#undef exit
    5.41 +    exit(ret);
    5.42 +}
    5.43 +/* Open filename as the decoder input (exits on failure), read the first frame to learn the picture size, then rewind. */
    5.44 +static void opt_input_file(const char *filename)
    5.45 +{
    5.46 +    /* open the input file; the descriptor is kept in the file-scope ifile */
    5.47 +    ifile = open(filename, O_RDONLY, 0666);
    5.48 +    if (ifile < 0){
    5.49 +        fprintf(stderr, "Failed to open %s\n", filename);
    5.50 +        av_exit(-1);
    5.51 +    }
    5.52 +
    5.53 +    //parse first frame to get resolution (other information available but not used)
    5.54 +    H264Slice slice;
    5.55 +    PictureInfo pi;             // left uninitialized; only its address is stored in slice
    5.56 +    GetBitContext gb = {0,};
    5.57 +    ParserContext *pc;
    5.58 +    NalContext *nc;
    5.59 +
    5.60 +    pc = get_parse_context(ifile);   // NOTE(review): pc and nc are used below without a NULL check
    5.61 +    nc = get_nal_context(0, 0);
    5.62 +
    5.63 +    memset(&slice, 0, sizeof(H264Slice));
    5.64 +    slice.current_picture_info=&pi;
    5.65 +
    5.66 +    av_read_frame_internal(pc, &gb);      // NOTE(review): results of the read and decode calls are ignored
    5.67 +    decode_nal_units(nc, &slice, &gb);
    5.68 +
    5.69 +    frame_width = nc->width;
    5.70 +    frame_height= nc->height;
    5.71 +
    5.72 +    //clean up the temporary parsing state; ifile itself stays open
    5.73 +    av_freep(&gb.raw);
    5.74 +    if (gb.rbsp)
    5.75 +        av_freep(&gb.rbsp);
    5.76 +    free_parse_context(pc);
    5.77 +    free_nal_context(nc);
    5.78 +
    5.79 +    //rewind file; lseek() returns the new offset, so any non-zero result (including -1) means the rewind failed
    5.80 +    int offset;
    5.81 +    if ( (offset=lseek(ifile, 0, SEEK_SET)) ){
    5.82 +        fprintf(stderr, "Rewind input file %s failed at offset %d\n", filename, offset);   // reported only; execution continues
    5.83 +    }
    5.84 +
    5.85 +}
    5.86 +/* Open filename for writing (created/truncated) as the decoder output; a NULL filename leaves ofile at 0. Exits if open() fails. */
    5.87 +static void opt_output_file(const char *filename)
    5.88 +{
    5.89 +    // NOTE(review): "-" is mapped to "pipe:", which open() treats as an ordinary file name, not as stdout
    5.90 +    if (filename && !strcmp(filename, "-"))
    5.91 +        filename = "pipe:";
    5.92 +    ofile = filename ? open(filename, O_CREAT | O_TRUNC | O_WRONLY, 0666) : 0;
    5.93 +    if (ofile < 0){   // same policy as opt_input_file(): report and exit instead of continuing with fd -1
    5.94 +        fprintf(stderr, "Failed to open %s\n", filename);
    5.95 +        av_exit(-1);
    5.96 +    }
    5.97 +}
    5.98 +/* Print a one-line usage summary for this program to stdout. */
    5.99 +static void show_usage(void)
   5.100 +{
   5.101 +    printf("usage: %s [options] -i infile\n", program_name);
   5.102 +    printf("\n");
   5.103 +}
   5.104 +/* Long options. The first seven entries return 0 and are told apart in parse_cmd() by their index in this table, so keep their order. */
   5.105 +static struct option long_options[] = {
   5.106 +    {"static-sched", 0, 0, 0},
   5.107 +    {"static-mbd", 0, 0, 0},
   5.108 +    {"numamap", 0, 0, 0},
   5.109 +    {"no-mbd", 0, 0, 0},
   5.110 +    {"static-3d", 0, 0, 0},
   5.111 +    {"slice-bufs", 1, 0, 0},
   5.112 +    {"smt", 0, 0, 0},
   5.113 +    {"noarch", 0, 0, 'a'},
   5.114 +    {"display", 0, 0, 'd'},
   5.115 +    {"fullscreen", 0, 0, 'f'},
   5.116 +    {"numframes", 1, 0, 'n'},
   5.117 +    {"use-ppe-ed", 1, 0, 'p'},
   5.118 +    {"sequential", 0, 0, 's'},
   5.119 +    {"threads", 1, 0, 't'},
   5.120 +    {"verbose", 2, 0, 'v'},      // 2 = optional_argument: the handler ignores any value, so "--verbose" alone must work
   5.121 +    {"wave-order", 2, 0, 'w'},   // 2 = optional_argument, same reason; matches the argument-less short option -w
   5.122 +    {"smb-size", 1, 0, 'z'},
   5.123 +    {"pipe-bufs", 1, 0, 'e'},
   5.124 +    {0, 0, 0, 0}
   5.125 +};
   5.126 +/* Parsed command-line options, and the getopt_long() parser that fills them in (exits via av_exit() on invalid input). */
   5.127 +static h264_options cli_opts;
   5.128 +static void parse_cmd(int argc, char **argv)
   5.129 +{
   5.130 +    int c;
   5.131 +    int digit_optind = 0;
   5.132 +    int option_index = 0;
   5.133 +    // optarg, optind and optopt are the globals maintained by getopt_long()
   5.134 +    extern char *optarg;
   5.135 +    extern int optind, optopt;
   5.136 +
   5.137 +    cli_opts.statsched =0;
   5.138 +    cli_opts.numamap =0;
   5.139 +    cli_opts.statmbd =0;
   5.140 +    cli_opts.no_mbd= 0;
   5.141 +    cli_opts.numframes = INT_MAX;
   5.142 +    cli_opts.display=0;
   5.143 +    cli_opts.fullscreen=0;
   5.144 +    cli_opts.verbose=0;
   5.145 +    cli_opts.ppe_ed=0;
   5.146 +    cli_opts.profile=0;
   5.147 +    cli_opts.threads = 1;
   5.148 +    cli_opts.smb_size[0] = cli_opts.smb_size[1] = 1;
   5.149 +    cli_opts.wave_order=0;
   5.150 +    cli_opts.static_3d=0;
   5.151 +    cli_opts.pipe_bufs=8;
   5.152 +    cli_opts.slice_bufs=1;
   5.153 +    cli_opts.smt= 0;
   5.154 +    while ((c = getopt_long(argc, argv, "ade:fi:n:o:p:st:vwz:", long_options, &option_index)) != -1 ){
   5.155 +        int this_option_optind = optind ? optind : 1;
   5.156 +
   5.157 +        switch (c){
   5.158 +            case 0:   // long option without a short form: option_index is its position in long_options[]
   5.159 +                if (option_index==0){
   5.160 +                    cli_opts.statsched=1;
   5.161 +                }else if (option_index==1){
   5.162 +                    cli_opts.statmbd= 1;
   5.163 +                }else if (option_index==2){
   5.164 +                    cli_opts.numamap= 1;
   5.165 +                }else if (option_index==3){
   5.166 +                    cli_opts.no_mbd= 1;
   5.167 +                }else if (option_index==4){
   5.168 +                    cli_opts.static_3d= 1;
   5.169 +                }else if (option_index==5){
   5.170 +                    cli_opts.slice_bufs= (unsigned) atoi(optarg);
   5.171 +                }else if (option_index==6){
   5.172 +                    cli_opts.smt= 1;
   5.173 +                }
   5.174 +                break;
   5.175 +            case '0':   // digits are not in the option string, so these three cases are never returned
   5.176 +            case '1':
   5.177 +            case '2':
   5.178 +                if (digit_optind != 0 && digit_optind != this_option_optind)
   5.179 +                    printf("digits occur in two different argv-elements.\n");
   5.180 +                digit_optind = this_option_optind;
   5.181 +                printf("option %c\n", c);
   5.182 +                break;
   5.183 +            case 'a':
   5.184 +                no_arch=1;
   5.185 +                break;
   5.186 +            case 'd':
   5.187 +                cli_opts.display=1;
   5.188 +                break;
   5.189 +            case 'f':
   5.190 +                cli_opts.fullscreen=1;
   5.191 +                break;
   5.192 +            case 'i':   // the input is opened and probed immediately
   5.193 +                file_name = (const char *)optarg;
   5.194 +                opt_input_file(file_name);
   5.195 +                break;
   5.196 +            case 'n':
   5.197 +                cli_opts.numframes = (unsigned) atoi(optarg);
   5.198 +                break;
   5.199 +            case 'o':
   5.200 +                // optarg is passed straight through: copying it into a fixed 1024-byte buffer could overflow
   5.201 +                opt_output_file(optarg);
   5.202 +                break;
   5.203 +            case 'p':   // NOTE(review): the long name is "use-ppe-ed" but this sets profile, not ppe_ed - confirm intent
   5.204 +                cli_opts.profile = (unsigned) atoi(optarg);
   5.205 +                break;
   5.206 +            case 's':
   5.207 +                cli_opts.threads = 0;
   5.208 +                parallel = 0;
   5.209 +                break;
   5.210 +            case 't':
   5.211 +                cli_opts.threads = atoi(optarg);
   5.212 +                if (cli_opts.threads<=0){
   5.213 +                    fprintf(stderr, "Option -%c requires thread numbers > 0\n", c);
   5.214 +                    av_exit(-1);
   5.215 +                }
   5.216 +                break;
   5.217 +            case 'v':
   5.218 +                cli_opts.verbose = 1;
   5.219 +                break;
   5.220 +            case 'w':
   5.221 +                cli_opts.wave_order = 1;
   5.222 +                break;
   5.223 +            case 'z': // only useful in ompss
   5.224 +                if (argc < optind +1){
   5.225 +                    fprintf(stderr, "Option -%c (--smb-size) requires 2 arguments\n", c);
   5.226 +                    av_exit(-1);
   5.227 +                }
   5.228 +                // first dimension is optarg (also right for "-z8" / "--smb-size=8"); the second is the next argv word
   5.229 +                for (int i=0; i<2; i++){
   5.230 +                    cli_opts.smb_size[i] = atoi(i == 0 ? optarg : argv[optind++]);
   5.231 +                    if (!(cli_opts.smb_size[i] > 0)){   // check the element just parsed, not the array itself
   5.232 +                        fprintf(stderr, "Option -%c (--smb-size) requires dimensions > 0\n", c);
   5.233 +                        av_exit(-1);
   5.234 +                    }
   5.235 +                }
   5.236 +                break;
   5.237 +            case 'e':
   5.238 +                cli_opts.pipe_bufs = atoi(optarg);
   5.239 +                break;
   5.240 +            case ':':
   5.241 +                fprintf(stderr, "Option -%c requires an operand\n", optopt);
   5.242 +                av_exit(-1);
   5.243 +                break;
   5.244 +            case '?':
   5.245 +                fprintf(stderr, "Unrecognized option: -%c\n", optopt);
   5.246 +                av_exit(-1);
   5.247 +                break;
   5.248 +        }
   5.249 +    }
   5.250 +
   5.251 +}
   5.252 +/* Entry point: parse the options, build the decoder context, run the decode variant chosen at build/run time, then clean up. */
   5.253 +int main(int argc, char **argv)
   5.254 +{
   5.255 +    /* parse options */
   5.256 +    parse_cmd(argc, argv);
   5.257 +
   5.258 +    if(!ifile ) {   // ifile is still 0 when no -i option opened an input
   5.259 +        show_usage();
   5.260 +        av_exit(1);
   5.261 +    }
   5.262 +
   5.263 +    H264Context *h = get_h264dec_context(file_name, ifile, ofile, frame_width, frame_height, &cli_opts);   // NOTE(review): h is not checked before use
   5.264 +#if OMPSS
   5.265 +    if (h264_decode_ompss( h ) < 0)
   5.266 +        av_exit(-1);
   5.267 +#else
   5.268 +    if (parallel){   // default; cleared by -s
   5.269 +        if (ARCH_CELL && !no_arch){
   5.270 +            if (h264_decode_cell( h ) < 0)
   5.271 +                av_exit(-1);
   5.272 +        }else{
   5.273 +            if (h264_decode_pthread( h ) < 0)
   5.274 +                av_exit(1);   // NOTE(review): failure paths mix exit status -1 and 1
   5.275 +        }
   5.276 +    }else{
   5.277 +        if (ARCH_CELL && !no_arch){
   5.278 +            if (h264_decode_cell_seq( h ) < 0)
   5.279 +                av_exit(1);
   5.280 +        }else{
   5.281 +            if (h264_decode_seq( h ) < 0)
   5.282 +                av_exit(1);
   5.283 +        }
   5.284 +    }
   5.285 +#endif
   5.286 +    free_h264dec_context(h);
   5.287 +    close(ifile);
   5.288 +    close(ofile);   // ofile is 0 when no -o was given, so this then closes descriptor 0
   5.289 +
   5.290 +    return 0;
   5.291 +}

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/aac.h	Mon Aug 27 12:09:56 2012 +0200
     6.3 @@ -0,0 +1,137 @@
     6.4 +/*
     6.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
     6.6 + *
     6.7 + * This file is part of FFmpeg.
     6.8 + *
     6.9 + * FFmpeg is free software; you can redistribute it and/or
    6.10 + * modify it under the terms of the GNU Lesser General Public
    6.11 + * License as published by the Free Software Foundation; either
    6.12 + * version 2.1 of the License, or (at your option) any later version.
    6.13 + *
    6.14 + * FFmpeg is distributed in the hope that it will be useful,
    6.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    6.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    6.17 + * Lesser General Public License for more details.
    6.18 + *
    6.19 + * You should have received a copy of the GNU Lesser General Public
    6.20 + * License along with FFmpeg; if not, write to the Free Software
    6.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    6.22 + */
    6.23 +
    6.24 +#ifndef AVCODEC_ARM_AAC_H
    6.25 +#define AVCODEC_ARM_AAC_H
    6.26 +
    6.27 +#include "config.h"
    6.28 +
    6.29 +#if HAVE_NEON && HAVE_INLINE_ASM
    6.30 +/* VMUL2: store v[idx & 15] * *scale and v[(idx >> 4) & 15] * *scale to dst; returns dst advanced past the two floats. */
    6.31 +#define VMUL2 VMUL2   /* presumably lets the includer detect this override with #ifdef - TODO confirm */
    6.32 +static inline float *VMUL2(float *dst, const float *v, unsigned idx,
    6.33 +                           const float *scale)
    6.34 +{
    6.35 +    unsigned v0, v1;
    6.36 +    __asm__ volatile ("ubfx     %0,  %4,  #0, #4      \n\t"   /* v0 = idx bits 3..0 */
    6.37 +                      "ubfx     %1,  %4,  #4, #4      \n\t"   /* v1 = idx bits 7..4 */
    6.38 +                      "ldr      %0,  [%3, %0, lsl #2] \n\t"   /* v0 = 32-bit word v[v0] */
    6.39 +                      "ldr      %1,  [%3, %1, lsl #2] \n\t"   /* v1 = 32-bit word v[v1] */
    6.40 +                      "vld1.32  {d1[]},   [%5,:32]    \n\t"   /* d1 = *scale in both lanes */
    6.41 +                      "vmov     d0,  %0,  %1          \n\t"   /* d0 = { v0, v1 } */
    6.42 +                      "vmul.f32 d0,  d0,  d1          \n\t"
    6.43 +                      "vst1.32  {d0},     [%2,:64]!   \n\t"   /* store with 64-bit alignment hint; dst is written back */
    6.44 +                      : "=&r"(v0), "=&r"(v1), "+r"(dst)
    6.45 +                      : "r"(v), "r"(idx), "r"(scale)
    6.46 +                      : "d0", "d1");
    6.47 +    return dst;
    6.48 +}
    6.49 +
    6.50 +#define VMUL4 VMUL4 /* self-define; presumably lets generic code detect this version - confirm */
    6.51 +static inline float *VMUL4(float *dst, const float *v, unsigned idx,
    6.52 +                           const float *scale)
    6.53 +{   /* dst[0..3] = v[2-bit fields of idx, low field first], each times *scale; returns dst + 4 */
    6.54 +    unsigned v0, v1, v2, v3;
    6.55 +    __asm__ volatile ("ubfx     %0,  %6,  #0, #2      \n\t" /* v0 = idx[1:0] */
    6.56 +                      "ubfx     %1,  %6,  #2, #2      \n\t" /* v1 = idx[3:2] */
    6.57 +                      "ldr      %0,  [%5, %0, lsl #2] \n\t" /* v0 = raw bits of v[v0] */
    6.58 +                      "ubfx     %2,  %6,  #4, #2      \n\t" /* v2 = idx[5:4] */
    6.59 +                      "ldr      %1,  [%5, %1, lsl #2] \n\t"
    6.60 +                      "ubfx     %3,  %6,  #6, #2      \n\t" /* v3 = idx[7:6] */
    6.61 +                      "ldr      %2,  [%5, %2, lsl #2] \n\t"
    6.62 +                      "vmov     d0,  %0,  %1          \n\t"
    6.63 +                      "ldr      %3,  [%5, %3, lsl #2] \n\t"
    6.64 +                      "vld1.32  {d2[],d3[]},[%7,:32]  \n\t" /* q1 = *scale in all four lanes */
    6.65 +                      "vmov     d1,  %2,  %3          \n\t"
    6.66 +                      "vmul.f32 q0,  q0,  q1          \n\t"
    6.67 +                      "vst1.32  {q0},     [%4,:128]!  \n\t" /* 128-bit aligned store, dst += 16 bytes */
    6.68 +                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
    6.69 +                      : "r"(v), "r"(idx), "r"(scale)
    6.70 +                      : "d0", "d1", "d2", "d3", "memory"); /* asm loads via v/scale and stores via dst */
    6.71 +    return dst;
    6.72 +}
    6.73 +
    6.74 +#define VMUL2S VMUL2S /* self-define; presumably lets generic code detect this version - confirm */
    6.75 +static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
    6.76 +                            unsigned sign, const float *scale)
    6.77 +{   /* like a 2-element table lookup times *scale, with sign bits 1 and 0 flipping elements 0 and 1 */
    6.78 +    unsigned v0, v1, v2, v3;
    6.79 +    __asm__ volatile ("ubfx     %0,  %6,  #0, #4      \n\t" /* v0 = idx[3:0] */
    6.80 +                      "ubfx     %1,  %6,  #4, #4      \n\t" /* v1 = idx[7:4] */
    6.81 +                      "ldr      %0,  [%5, %0, lsl #2] \n\t" /* v0 = raw bits of v[v0] */
    6.82 +                      "lsl      %2,  %8,  #30         \n\t" /* sign bit 1 -> bit 31, bit 0 -> bit 30 */
    6.83 +                      "ldr      %1,  [%5, %1, lsl #2] \n\t"
    6.84 +                      "lsl      %3,  %8,  #31         \n\t" /* v3 = sign bit 0 moved to bit 31 */
    6.85 +                      "vmov     d0,  %0,  %1          \n\t"
    6.86 +                      "bic      %2,  %2,  #1<<30      \n\t" /* v2 = only sign bit 1, now at bit 31 */
    6.87 +                      "vld1.32  {d1[]},   [%7,:32]    \n\t" /* d1 = *scale in both lanes */
    6.88 +                      "vmov     d2,  %2,  %3          \n\t"
    6.89 +                      "veor     d0,  d0,  d2          \n\t" /* toggle the float sign bits */
    6.90 +                      "vmul.f32 d0,  d0,  d1          \n\t"
    6.91 +                      "vst1.32  {d0},     [%4,:64]!   \n\t" /* 64-bit aligned store, dst += 8 bytes */
    6.92 +                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst)
    6.93 +                      : "r"(v), "r"(idx), "r"(scale), "r"(sign)
    6.94 +                      : "d0", "d1", "d2", "memory"); /* asm loads via v/scale and stores via dst */
    6.95 +    return dst;
    6.96 +}
    6.97 +
    6.98 +#define VMUL4S VMUL4S /* self-define; presumably lets generic code detect this version - confirm */
    6.99 +static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
   6.100 +                            unsigned sign, const float *scale)
   6.101 +{   /* dst[0..3] = v[2-bit fields of idx] with sign flips taken from sign, times *scale; returns dst + 4 */
   6.102 +    unsigned v0, v1, v2, v3, nz;
   6.103 +    __asm__ volatile ("vld1.32  {d2[],d3[]},[%9,:32]  \n\t" /* q1 = *scale in all four lanes */
   6.104 +                      "ubfx     %0,  %8,  #0, #2      \n\t" /* v0 = idx[1:0] */
   6.105 +                      "ubfx     %1,  %8,  #2, #2      \n\t" /* v1 = idx[3:2] */
   6.106 +                      "ldr      %0,  [%7, %0, lsl #2] \n\t" /* v0 = raw bits of v[v0] */
   6.107 +                      "ubfx     %2,  %8,  #4, #2      \n\t" /* v2 = idx[5:4] */
   6.108 +                      "ldr      %1,  [%7, %1, lsl #2] \n\t"
   6.109 +                      "ubfx     %3,  %8,  #6, #2      \n\t" /* v3 = idx[7:6] */
   6.110 +                      "ldr      %2,  [%7, %2, lsl #2] \n\t"
   6.111 +                      "vmov     d0,  %0,  %1          \n\t"
   6.112 +                      "ldr      %3,  [%7, %3, lsl #2] \n\t"
   6.113 +                      "lsr      %6,  %8,  #12         \n\t" /* nz = idx >> 12 */
   6.114 +                      "rbit     %6,  %6               \n\t" /* reverse so idx bit 12 sits at bit 31 */
   6.115 +                      "vmov     d1,  %2,  %3          \n\t"
   6.116 +                      "lsls     %6,  %6,  #1          \n\t" /* carry = idx bit 12 */
   6.117 +                      "and      %0,  %5,  #1<<31      \n\t" /* v0 reused: top bit of sign */
   6.118 +                      "lslcs    %5,  %5,  #1          \n\t" /* consume a sign bit only if carry set */
   6.119 +                      "lsls     %6,  %6,  #1          \n\t" /* carry = idx bit 13 */
   6.120 +                      "and      %1,  %5,  #1<<31      \n\t"
   6.121 +                      "lslcs    %5,  %5,  #1          \n\t"
   6.122 +                      "lsls     %6,  %6,  #1          \n\t" /* carry = idx bit 14 */
   6.123 +                      "and      %2,  %5,  #1<<31      \n\t"
   6.124 +                      "lslcs    %5,  %5,  #1          \n\t"
   6.125 +                      "vmov     d4,  %0,  %1          \n\t"
   6.126 +                      "and      %3,  %5,  #1<<31      \n\t"
   6.127 +                      "vmov     d5,  %2,  %3          \n\t"
   6.128 +                      "veor     q0,  q0,  q2          \n\t" /* toggle the float sign bits */
   6.129 +                      "vmul.f32 q0,  q0,  q1          \n\t"
   6.130 +                      "vst1.32  {q0},     [%4,:128]!  \n\t" /* 128-bit aligned store, dst += 16 bytes */
   6.131 +                      : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
   6.132 +                        "+r"(sign), "=r"(nz)
   6.133 +                      : "r"(v), "r"(idx), "r"(scale)
   6.134 +                      : "d0", "d1", "d2", "d3", "d4", "d5", "cc", "memory"); /* lsls/lslcs touch flags; asm accesses memory */
   6.135 +    return dst;
   6.136 +}
   6.137 +
   6.138 +#endif /* HAVE_NEON && HAVE_INLINE_ASM */
   6.139 +
   6.140 +#endif /* AVCODEC_ARM_AAC_H */

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/asm.S	Mon Aug 27 12:09:56 2012 +0200
     7.3 @@ -0,0 +1,72 @@
     7.4 +/*
     7.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
     7.6 + *
     7.7 + * This file is part of FFmpeg.
     7.8 + *
     7.9 + * FFmpeg is free software; you can redistribute it and/or
    7.10 + * modify it under the terms of the GNU Lesser General Public
    7.11 + * License as published by the Free Software Foundation; either
    7.12 + * version 2.1 of the License, or (at your option) any later version.
    7.13 + *
    7.14 + * FFmpeg is distributed in the hope that it will be useful,
    7.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    7.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    7.17 + * Lesser General Public License for more details.
    7.18 + *
    7.19 + * You should have received a copy of the GNU Lesser General Public
    7.20 + * License along with FFmpeg; if not, write to the Free Software
    7.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    7.22 + */
    7.23 +
    7.24 +#include "config.h"
    7.25 +
    7.26 +#ifdef __ELF__ /* ELF prefixes directives that are only wanted for ELF output */
    7.27 +#   define ELF /* expands to nothing: the prefixed directive is assembled */
    7.28 +#else
    7.29 +#   define ELF @ /* expands to the comment character: the prefixed directive is skipped */
    7.30 +#endif
    7.31 +
    7.32 +        .macro require8, val=1 /* emit build attribute 24 (Tag_ABI_align_needed in the ARM ABI addenda) */
    7.33 +ELF     .eabi_attribute 24, \val
    7.34 +        .endm
    7.35 +
    7.36 +        .macro preserve8, val=1 /* emit build attribute 25 (Tag_ABI_align_preserved in the ARM ABI addenda) */
    7.37 +ELF     .eabi_attribute 25, \val
    7.38 +        .endm
    7.39 +
    7.40 +        .macro function name, export=0 /* open a function; also defines a one-shot endfunc macro */
    7.41 +        .macro endfunc /* nested definition: closes the function opened above */
    7.42 +ELF     .size   \name, . - \name
    7.43 +        .endfunc
    7.44 +        .purgem endfunc /* remove endfunc so the next function can define its own */
    7.45 +        .endm
    7.46 +.if \export /* exported functions also get a global label with the EXTERN_ASM prefix */
    7.47 +        .global EXTERN_ASM\name
    7.48 +EXTERN_ASM\name:
    7.49 +.endif
    7.50 +ELF     .type   \name, %function
    7.51 +        .func   \name
    7.52 +\name:
    7.53 +        .endm
    7.54 +
    7.55 +        .macro movrel rd, val /* load the address or constant val into register rd */
    7.56 +#if HAVE_ARMV6T2 && !CONFIG_PIC /* movw/movt pair builds the 32-bit value from two 16-bit halves */
    7.57 +        movw            \rd, #:lower16:\val
    7.58 +        movt            \rd, #:upper16:\val
    7.59 +#else /* otherwise use the ldr = pseudo-instruction */
    7.60 +        ldr             \rd, =\val
    7.61 +#endif
    7.62 +        .endm
    7.63 +
    7.64 +#if HAVE_VFP_ARGS /* VFP / NOVFP prefix lines that apply to only one float-argument convention */
    7.65 +        .eabi_attribute 28, 1 /* build attribute 28 (Tag_ABI_VFP_args in the ARM ABI addenda) */
    7.66 +#   define VFP /* VFP-prefixed lines are assembled */
    7.67 +#   define NOVFP @ /* NOVFP-prefixed lines become comments */
    7.68 +#else
    7.69 +#   define VFP   @ /* VFP-prefixed lines become comments */
    7.70 +#   define NOVFP /* NOVFP-prefixed lines are assembled */
    7.71 +#endif
    7.72 +
    7.73 +#define GLUE(a, b) a ## b /* raw token paste */
    7.74 +#define JOIN(a, b) GLUE(a, b) /* extra level so the arguments are macro-expanded before pasting */
    7.75 +#define X(s) JOIN(EXTERN_ASM, s) /* symbol s with the EXTERN_ASM prefix pasted in front */

     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
     8.3 @@ -0,0 +1,32 @@
     8.4 +/*
     8.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
     8.6 + *
     8.7 + * This file is part of FFmpeg.
     8.8 + *
     8.9 + * FFmpeg is free software; you can redistribute it and/or
    8.10 + * modify it under the terms of the GNU Lesser General Public
    8.11 + * License as published by the Free Software Foundation; either
    8.12 + * version 2.1 of the License, or (at your option) any later version.
    8.13 + *
    8.14 + * FFmpeg is distributed in the hope that it will be useful,
    8.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    8.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    8.17 + * Lesser General Public License for more details.
    8.18 + *
    8.19 + * You should have received a copy of the GNU Lesser General Public
    8.20 + * License along with FFmpeg; if not, write to the Free Software
    8.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    8.22 + */
    8.23 +
    8.24 +#include "config.h"
    8.25 +#include "libavutil/attributes.h"
    8.26 +#include "libavcodec/dcadsp.h"
    8.27 +
    8.28 +void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, /* defined in dcadsp_neon.S */
    8.29 +                         int decifactor, float scale, float bias);
    8.30 +
    8.31 +void av_cold ff_dcadsp_init_arm(DCADSPContext *s) /* install ARM-specific DCA DSP routines into *s */
    8.32 +{
    8.33 +    if (HAVE_NEON) /* config.h constant; when it is 0 the context is left untouched */
    8.34 +        s->lfe_fir = ff_dca_lfe_fir_neon;
    8.35 +}

     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dcadsp_neon.S	Mon Aug 27 12:09:56 2012 +0200
     9.3 @@ -0,0 +1,61 @@
     9.4 +/*
     9.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
     9.6 + *
     9.7 + * This file is part of FFmpeg.
     9.8 + *
     9.9 + * FFmpeg is free software; you can redistribute it and/or
    9.10 + * modify it under the terms of the GNU Lesser General Public
    9.11 + * License as published by the Free Software Foundation; either
    9.12 + * version 2.1 of the License, or (at your option) any later version.
    9.13 + *
    9.14 + * FFmpeg is distributed in the hope that it will be useful,
    9.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    9.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    9.17 + * Lesser General Public License for more details.
    9.18 + *
    9.19 + * You should have received a copy of the GNU Lesser General Public
    9.20 + * License along with FFmpeg; if not, write to the Free Software
    9.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    9.22 + */
    9.23 +
    9.24 +#include "asm.S"
    9.25 +
    9.26 +function ff_dca_lfe_fir_neon, export=1 /* r0 = out, r1 = in, r2 = coefs, r3 = decifactor; scale and bias follow */
    9.27 +        push            {r4-r6,lr}
    9.28 +
    9.29 +        add             r4,  r0,  r3,  lsl #2   @ out2 = out + decifactor floats
    9.30 +        add             r5,  r2,  #256*4-16     @ cf1 = last four of 256 coefficients, read backwards
    9.31 +        sub             r1,  r1,  #12           @ so that each four-float load ends at in[0]
    9.32 +        cmp             r3,  #32
    9.33 +        moveq           r6,  #256/32            @ r6 = taps per output when decifactor is 32
    9.34 +        movne           r6,  #256/64            @ any other decifactor is treated as 64
    9.35 +NOVFP   vldr            d0,  [sp, #16]          @ scale, bias from the stack (above the 16 pushed bytes)
    9.36 +        mov             lr,  #-16               @ post-index step for the backwards reads
    9.37 +1:                                              @ outer loop: once per decifactor, two outputs each
    9.38 +        vmov.f32        q2,  #0.0               @ v0
    9.39 +        vmov.f32        q3,  #0.0               @ v1
    9.40 +        mov             r12, r6
    9.41 +2:                                              @ inner loop: four taps per pass
    9.42 +        vld1.32         {q8},     [r2,:128]!    @ cf0
    9.43 +        vld1.32         {q9},     [r5,:128], lr @ cf1
    9.44 +        vld1.32         {q1},     [r1], lr      @ in
    9.45 +        subs            r12, r12, #4
    9.46 +        vrev64.32       q10, q8                 @ swap the floats inside each half of cf0
    9.47 +        vmla.f32        q3,  q1,  q9            @ v1 += in * cf1
    9.48 +        vmla.f32        d4,  d2,  d21           @ v0 += in * cf0 with the halves crossed over
    9.49 +        vmla.f32        d5,  d3,  d20
    9.50 +        bne             2b
    9.51 +
    9.52 +        add             r1,  r1,  r6,  lsl #2   @ rewind in by the bytes the inner loop consumed
    9.53 +        subs            r3,  r3,  #1
    9.54 +        vadd.f32        d4,  d4,  d5            @ fold v0 to two lanes
    9.55 +        vadd.f32        d6,  d6,  d7            @ fold v1 to two lanes
    9.56 +        vpadd.f32       d4,  d4,  d6            @ d4 = { sum of v0, sum of v1 }
    9.57 +        vdup.32         d5,  d0[1]              @ d5 = bias in both lanes
    9.58 +        vmla.f32        d5,  d4,  d0[0]         @ d5 = bias + sum * scale
    9.59 +        vst1.32         {d5[0]},  [r0,:32]!     @ v0 result goes to out
    9.60 +        vst1.32         {d5[1]},  [r4,:32]!     @ v1 result goes to out2
    9.61 +        bne             1b
    9.62 +
    9.63 +        pop             {r4-r6,pc}
    9.64 +endfunc

    10.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    10.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.S	Mon Aug 27 12:09:56 2012 +0200
    10.3 @@ -0,0 +1,712 @@
    10.4 +@
    10.5 +@ ARMv4 optimized DSP utils
    10.6 +@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
    10.7 +@
    10.8 +@ This file is part of FFmpeg.
    10.9 +@
   10.10 +@ FFmpeg is free software; you can redistribute it and/or
   10.11 +@ modify it under the terms of the GNU Lesser General Public
   10.12 +@ License as published by the Free Software Foundation; either
   10.13 +@ version 2.1 of the License, or (at your option) any later version.
   10.14 +@
   10.15 +@ FFmpeg is distributed in the hope that it will be useful,
   10.16 +@ but WITHOUT ANY WARRANTY; without even the implied warranty of
   10.17 +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   10.18 +@ Lesser General Public License for more details.
   10.19 +@
   10.20 +@ You should have received a copy of the GNU Lesser General Public
   10.21 +@ License along with FFmpeg; if not, write to the Free Software
   10.22 +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   10.23 +@
   10.24 +
   10.25 +#include "config.h"
   10.26 +#include "asm.S"
   10.27 +
   10.28 +        preserve8
   10.29 +
   10.30 +#if !HAVE_PLD /* without pld support, define pld as an empty macro so its uses assemble to nothing */
   10.31 +.macro pld reg
   10.32 +.endm
   10.33 +#endif
   10.34 +
   10.35 +#if HAVE_ARMV5TE
   10.36 +function ff_prefetch_arm, export=1 /* r0 = address, r1 = byte step, r2 = number of prefetches */
   10.37 +        subs            r2,  r2,  #1            @ NOTE(review): r2 == 0 on entry wraps around - confirm callers
   10.38 +        pld             [r0]
   10.39 +        add             r0,  r0,  r1
   10.40 +        bne             ff_prefetch_arm         @ loop until the count reaches zero
   10.41 +        bx              lr
   10.42 +endfunc
   10.43 +#endif
   10.44 +
   10.45 +.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 /* Rd0-3 = Rn0-4 shifted down by shift bytes */
   10.46 +        mov             \Rd0, \Rn0, lsr #(\shift * 8)
   10.47 +        mov             \Rd1, \Rn1, lsr #(\shift * 8)
   10.48 +        mov             \Rd2, \Rn2, lsr #(\shift * 8)
   10.49 +        mov             \Rd3, \Rn3, lsr #(\shift * 8)
   10.50 +        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)     @ fill the top from the next word
   10.51 +        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
   10.52 +        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
   10.53 +        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
   10.54 +.endm
   10.55 +.macro  ALIGN_DWORD shift, R0, R1, R2 /* in place: R0,R1 = R0,R1,R2 shifted down by shift bytes; R2 is only read */
   10.56 +        mov             \R0, \R0, lsr #(\shift * 8)
   10.57 +        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
   10.58 +        mov             \R1, \R1, lsr #(\shift * 8)
   10.59 +        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
   10.60 +.endm
   10.61 +.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 /* as ALIGN_DWORD but into separate registers; sources kept */
   10.62 +        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
   10.63 +        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
   10.64 +        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
   10.65 +        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
   10.66 +.endm
   10.67 +
   10.68 +.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
   10.69 +        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1), per-byte average rounded up
   10.70 +        @ Rmask = 0xFEFEFEFE, keeps the shift from leaking bits between bytes
   10.71 +        @ Rn0 and Rn1 are overwritten; Rm0 and Rm1 are only read
   10.72 +        eor             \Rd0, \Rn0, \Rm0
   10.73 +        eor             \Rd1, \Rn1, \Rm1
   10.74 +        orr             \Rn0, \Rn0, \Rm0
   10.75 +        orr             \Rn1, \Rn1, \Rm1
   10.76 +        and             \Rd0, \Rd0, \Rmask
   10.77 +        and             \Rd1, \Rd1, \Rmask
   10.78 +        sub             \Rd0, \Rn0, \Rd0, lsr #1
   10.79 +        sub             \Rd1, \Rn1, \Rd1, lsr #1
   10.80 +.endm
   10.81 +
   10.82 +.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
   10.83 +        @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1), per-byte average rounded down
   10.84 +        @ Rmask = 0xFEFEFEFE, keeps the shift from leaking bits between bytes
   10.85 +        @ Rn0 and Rn1 are overwritten; Rm0 and Rm1 are only read
   10.86 +        eor             \Rd0, \Rn0, \Rm0
   10.87 +        eor             \Rd1, \Rn1, \Rm1
   10.88 +        and             \Rn0, \Rn0, \Rm0
   10.89 +        and             \Rn1, \Rn1, \Rm1
   10.90 +        and             \Rd0, \Rd0, \Rmask
   10.91 +        and             \Rd1, \Rd1, \Rmask
   10.92 +        add             \Rd0, \Rn0, \Rd0, lsr #1
   10.93 +        add             \Rd1, \Rn1, \Rd1, lsr #1
   10.94 +.endm
   10.95 +
   10.96 +.macro  JMP_ALIGN tmp, reg /* round reg down to a word boundary and branch on its old low two bits */
   10.97 +        ands            \tmp, \reg, #3          @ tmp = byte offset within the word
   10.98 +        bic             \reg, \reg, #3          @ flags from ands are still live here
   10.99 +        beq             1f                      @ offset 0
  10.100 +        subs            \tmp, \tmp, #1
  10.101 +        beq             2f                      @ offset 1
  10.102 +        subs            \tmp, \tmp, #1
  10.103 +        beq             3f                      @ offset 2
  10.104 +        b    4f                                 @ offset 3
  10.105 +.endm
  10.106 +
  10.107 +@ ----------------------------------------------------------------
  10.108 +        .align 5
  10.109 +function ff_put_pixels16_arm, export=1
  10.110 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.111 +        @ block = word aligned, pixels = unaligned; copies 16 bytes per row for h rows
  10.112 +        pld             [r1]
  10.113 +        push            {r4-r11, lr}
  10.114 +        JMP_ALIGN       r5,  r1
  10.115 +1:                                              @ pixels at byte offset 0: plain word copy
  10.116 +        ldm             r1,  {r4-r7}
  10.117 +        add             r1,  r1,  r2
  10.118 +        stm             r0,  {r4-r7}
  10.119 +        pld             [r1]
  10.120 +        subs            r3,  r3,  #1
  10.121 +        add             r0,  r0,  r2
  10.122 +        bne             1b
  10.123 +        pop             {r4-r11, pc}
  10.124 +        .align 5
  10.125 +2:                                              @ offset 1: load five words, realign into four
  10.126 +        ldm             r1,  {r4-r8}
  10.127 +        add             r1,  r1,  r2
  10.128 +        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
  10.129 +        pld             [r1]
  10.130 +        subs            r3,  r3,  #1
  10.131 +        stm             r0,  {r9-r12}
  10.132 +        add             r0,  r0,  r2
  10.133 +        bne             2b
  10.134 +        pop             {r4-r11, pc}
  10.135 +        .align 5
  10.136 +3:                                              @ offset 2
  10.137 +        ldm             r1,  {r4-r8}
  10.138 +        add             r1,  r1,  r2
  10.139 +        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
  10.140 +        pld             [r1]
  10.141 +        subs            r3,  r3,  #1
  10.142 +        stm             r0,  {r9-r12}
  10.143 +        add             r0,  r0,  r2
  10.144 +        bne             3b
  10.145 +        pop             {r4-r11, pc}
  10.146 +        .align 5
  10.147 +4:                                              @ offset 3
  10.148 +        ldm             r1,  {r4-r8}
  10.149 +        add             r1,  r1,  r2
  10.150 +        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
  10.151 +        pld             [r1]
  10.152 +        subs            r3,  r3,  #1
  10.153 +        stm             r0,  {r9-r12}
  10.154 +        add             r0,  r0,  r2
  10.155 +        bne             4b
  10.156 +        pop             {r4-r11,pc}
  10.157 +endfunc
  10.158 +
  10.159 +@ ----------------------------------------------------------------
  10.160 +        .align 5
  10.161 +function ff_put_pixels8_arm, export=1
  10.162 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.163 +        @ block = word aligned, pixels = unaligned; copies 8 bytes per row for h rows
  10.164 +        pld             [r1]
  10.165 +        push            {r4-r5,lr}
  10.166 +        JMP_ALIGN       r5,  r1
  10.167 +1:                                              @ pixels at byte offset 0: plain word copy
  10.168 +        ldm             r1,  {r4-r5}
  10.169 +        add             r1,  r1,  r2
  10.170 +        subs            r3,  r3,  #1
  10.171 +        pld             [r1]
  10.172 +        stm             r0,  {r4-r5}
  10.173 +        add             r0,  r0,  r2
  10.174 +        bne             1b
  10.175 +        pop             {r4-r5,pc}
  10.176 +        .align 5
  10.177 +2:                                              @ offset 1: load three words, realign into two
  10.178 +        ldm             r1,  {r4-r5, r12}
  10.179 +        add             r1,  r1,  r2
  10.180 +        ALIGN_DWORD     1,   r4,  r5,  r12
  10.181 +        pld             [r1]
  10.182 +        subs            r3,  r3,  #1
  10.183 +        stm             r0,  {r4-r5}
  10.184 +        add             r0,  r0,  r2
  10.185 +        bne             2b
  10.186 +        pop             {r4-r5,pc}
  10.187 +        .align 5
  10.188 +3:                                              @ offset 2
  10.189 +        ldm             r1,  {r4-r5, r12}
  10.190 +        add             r1,  r1,  r2
  10.191 +        ALIGN_DWORD     2,   r4,  r5,  r12
  10.192 +        pld             [r1]
  10.193 +        subs            r3,  r3,  #1
  10.194 +        stm             r0,  {r4-r5}
  10.195 +        add             r0,  r0,  r2
  10.196 +        bne             3b
  10.197 +        pop             {r4-r5,pc}
  10.198 +        .align 5
  10.199 +4:                                              @ offset 3
  10.200 +        ldm             r1,  {r4-r5, r12}
  10.201 +        add             r1,  r1,  r2
  10.202 +        ALIGN_DWORD     3,   r4,  r5,  r12
  10.203 +        pld             [r1]
  10.204 +        subs            r3,  r3,  #1
  10.205 +        stm             r0,  {r4-r5}
  10.206 +        add             r0,  r0,  r2
  10.207 +        bne             4b
  10.208 +        pop             {r4-r5,pc}
  10.209 +endfunc
  10.210 +
  10.211 +@ ----------------------------------------------------------------
  10.212 +        .align 5
  10.213 +function ff_put_pixels8_x2_arm, export=1
  10.214 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.215 +        @ block = word aligned, pixels = unaligned; each output byte = rounded avg of pixels[i], pixels[i+1]
  10.216 +        pld             [r1]
  10.217 +        push            {r4-r10,lr}
  10.218 +        ldr             r12, =0xfefefefe        @ Rmask for RND_AVG32
  10.219 +        JMP_ALIGN       r5,  r1
  10.220 +1:                                              @ offset 0: r4,r5 hold pixels[0..7], r6,r7 get pixels[1..8]
  10.221 +        ldm             r1,  {r4-r5, r10}
  10.222 +        add             r1,  r1,  r2
  10.223 +        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
  10.224 +        pld             [r1]
  10.225 +        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
  10.226 +        subs            r3,  r3,  #1
  10.227 +        stm             r0,  {r8-r9}
  10.228 +        add             r0,  r0,  r2
  10.229 +        bne             1b
  10.230 +        pop             {r4-r10,pc}
  10.231 +        .align 5
  10.232 +2:                                              @ offset 1: operands are the loaded words shifted by 1 and by 2 bytes
  10.233 +        ldm             r1,  {r4-r5, r10}
  10.234 +        add             r1,  r1,  r2
  10.235 +        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
  10.236 +        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
  10.237 +        pld             [r1]
  10.238 +        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
  10.239 +        subs            r3,  r3,  #1
  10.240 +        stm             r0,  {r4-r5}
  10.241 +        add             r0,  r0,  r2
  10.242 +        bne             2b
  10.243 +        pop             {r4-r10,pc}
  10.244 +        .align 5
  10.245 +3:                                              @ offset 2: operands are the loaded words shifted by 2 and by 3 bytes
  10.246 +        ldm             r1,  {r4-r5, r10}
  10.247 +        add             r1,  r1,  r2
  10.248 +        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
  10.249 +        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
  10.250 +        pld             [r1]
  10.251 +        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
  10.252 +        subs            r3,  r3,  #1
  10.253 +        stm             r0,  {r4-r5}
  10.254 +        add             r0,  r0,  r2
  10.255 +        bne             3b
  10.256 +        pop             {r4-r10,pc}
  10.257 +        .align 5
  10.258 +4:                                              @ offset 3: second operand is r5,r10 as loaded (a 4-byte shift)
  10.259 +        ldm             r1,  {r4-r5, r10}
  10.260 +        add             r1,  r1,  r2
  10.261 +        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
  10.262 +        pld             [r1]
  10.263 +        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
  10.264 +        subs            r3,  r3,  #1
  10.265 +        stm             r0,  {r8-r9}
  10.266 +        add             r0,  r0,  r2
  10.267 +        bne             4b
  10.268 +        pop             {r4-r10,pc}
  10.269 +endfunc
  10.270 +
  10.271 +        .align 5
  10.272 +function ff_put_no_rnd_pixels8_x2_arm, export=1
  10.273 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.274 +        @ block = word aligned, pixels = unaligned; each output byte = truncated avg of pixels[i], pixels[i+1]
  10.275 +        pld             [r1]
  10.276 +        push            {r4-r10,lr}
  10.277 +        ldr             r12, =0xfefefefe        @ Rmask for NO_RND_AVG32
  10.278 +        JMP_ALIGN       r5,  r1
  10.279 +1:                                              @ offset 0: r4,r5 hold pixels[0..7], r6,r7 get pixels[1..8]
  10.280 +        ldm             r1,  {r4-r5, r10}
  10.281 +        add             r1,  r1,  r2
  10.282 +        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
  10.283 +        pld             [r1]
  10.284 +        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
  10.285 +        subs            r3,  r3,  #1
  10.286 +        stm             r0,  {r8-r9}
  10.287 +        add             r0,  r0,  r2
  10.288 +        bne             1b
  10.289 +        pop             {r4-r10,pc}
  10.290 +        .align 5
  10.291 +2:                                              @ offset 1: operands are the loaded words shifted by 1 and by 2 bytes
  10.292 +        ldm             r1,  {r4-r5, r10}
  10.293 +        add             r1,  r1,  r2
  10.294 +        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
  10.295 +        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
  10.296 +        pld             [r1]
  10.297 +        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
  10.298 +        subs            r3,  r3,  #1
  10.299 +        stm             r0,  {r4-r5}
  10.300 +        add             r0,  r0,  r2
  10.301 +        bne             2b
  10.302 +        pop             {r4-r10,pc}
  10.303 +        .align 5
  10.304 +3:                                              @ offset 2: operands are the loaded words shifted by 2 and by 3 bytes
  10.305 +        ldm             r1,  {r4-r5, r10}
  10.306 +        add             r1,  r1,  r2
  10.307 +        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
  10.308 +        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
  10.309 +        pld             [r1]
  10.310 +        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
  10.311 +        subs            r3,  r3,  #1
  10.312 +        stm             r0,  {r4-r5}
  10.313 +        add             r0,  r0,  r2
  10.314 +        bne             3b
  10.315 +        pop             {r4-r10,pc}
  10.316 +        .align 5
  10.317 +4:                                              @ offset 3: second operand is r5,r10 as loaded (a 4-byte shift)
  10.318 +        ldm             r1,  {r4-r5, r10}
  10.319 +        add             r1,  r1,  r2
  10.320 +        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
  10.321 +        pld             [r1]
  10.322 +        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
  10.323 +        subs            r3,  r3,  #1
  10.324 +        stm             r0,  {r8-r9}
  10.325 +        add             r0,  r0,  r2
  10.326 +        bne             4b
  10.327 +        pop             {r4-r10,pc}
  10.328 +endfunc
  10.329 +
  10.330 +
  10.331 +@ ----------------------------------------------------------------
  10.332 +        .align 5
  10.333 +function ff_put_pixels8_y2_arm, export=1
  10.334 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.335 +        @ block = word aligned, pixels = unaligned; each output row = rounded avg of a source row and the next
  10.336 +        pld             [r1]
  10.337 +        push            {r4-r11,lr}
  10.338 +        mov             r3,  r3,  lsr #1        @ two rows per loop pass; presumably h is even and nonzero - confirm
  10.339 +        ldr             r12, =0xfefefefe        @ Rmask for RND_AVG32
  10.340 +        JMP_ALIGN       r5,  r1
  10.341 +1:                                              @ pixels at byte offset 0
  10.342 +        ldm             r1,  {r4-r5}
  10.343 +        add             r1,  r1,  r2
  10.344 +6:      ldm             r1,  {r6-r7}
  10.345 +        add             r1,  r1,  r2
  10.346 +        pld             [r1]
  10.347 +        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
  10.348 +        ldm             r1,  {r4-r5}            @ this row is reused as the first operand of the next pass
  10.349 +        add             r1,  r1,  r2
  10.350 +        stm             r0,  {r8-r9}
  10.351 +        add             r0,  r0,  r2
  10.352 +        pld             [r1]
  10.353 +        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
  10.354 +        subs            r3,  r3,  #1
  10.355 +        stm             r0,  {r8-r9}
  10.356 +        add             r0,  r0,  r2
  10.357 +        bne             6b
  10.358 +        pop             {r4-r11,pc}
  10.359 +        .align 5
  10.360 +2:                                              @ offset 1: every loaded row is realigned by 1 byte first
  10.361 +        ldm             r1,  {r4-r6}
  10.362 +        add             r1,  r1,  r2
  10.363 +        pld             [r1]
  10.364 +        ALIGN_DWORD     1,   r4,  r5,  r6
  10.365 +6:      ldm             r1,  {r7-r9}
  10.366 +        add             r1,  r1,  r2
  10.367 +        pld             [r1]
  10.368 +        ALIGN_DWORD     1,   r7,  r8,  r9
  10.369 +        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
  10.370 +        stm             r0,  {r10-r11}
  10.371 +        add             r0,  r0,  r2
  10.372 +        ldm             r1,  {r4-r6}
  10.373 +        add             r1,  r1,  r2
  10.374 +        pld             [r1]
  10.375 +        ALIGN_DWORD     1,   r4,  r5,  r6
  10.376 +        subs            r3,  r3,  #1
  10.377 +        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
  10.378 +        stm             r0,  {r10-r11}
  10.379 +        add             r0,  r0,  r2
  10.380 +        bne             6b
  10.381 +        pop             {r4-r11,pc}
  10.382 +        .align 5
  10.383 +3:                                              @ offset 2
  10.384 +        ldm             r1,  {r4-r6}
  10.385 +        add             r1,  r1,  r2
  10.386 +        pld             [r1]
  10.387 +        ALIGN_DWORD     2,   r4,  r5,  r6
  10.388 +6:      ldm             r1,  {r7-r9}
  10.389 +        add             r1,  r1,  r2
  10.390 +        pld             [r1]
  10.391 +        ALIGN_DWORD     2,   r7,  r8,  r9
  10.392 +        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
  10.393 +        stm             r0,  {r10-r11}
  10.394 +        add             r0,  r0,  r2
  10.395 +        ldm             r1,  {r4-r6}
  10.396 +        add             r1,  r1,  r2
  10.397 +        pld             [r1]
  10.398 +        ALIGN_DWORD     2,   r4,  r5,  r6
  10.399 +        subs            r3,  r3,  #1
  10.400 +        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
  10.401 +        stm             r0,  {r10-r11}
  10.402 +        add             r0,  r0,  r2
  10.403 +        bne             6b
  10.404 +        pop             {r4-r11,pc}
  10.405 +        .align 5
  10.406 +4:                                              @ offset 3
  10.407 +        ldm             r1,  {r4-r6}
  10.408 +        add             r1,  r1,  r2
  10.409 +        pld             [r1]
  10.410 +        ALIGN_DWORD     3,   r4,  r5,  r6
  10.411 +6:      ldm             r1,  {r7-r9}
  10.412 +        add             r1,  r1,  r2
  10.413 +        pld             [r1]
  10.414 +        ALIGN_DWORD     3,   r7,  r8,  r9
  10.415 +        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
  10.416 +        stm             r0,  {r10-r11}
  10.417 +        add             r0,  r0,  r2
  10.418 +        ldm             r1,  {r4-r6}
  10.419 +        add             r1,  r1,  r2
  10.420 +        pld             [r1]
  10.421 +        ALIGN_DWORD     3,   r4,  r5,  r6
  10.422 +        subs            r3,  r3,  #1
  10.423 +        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
  10.424 +        stm             r0,  {r10-r11}
  10.425 +        add             r0,  r0,  r2
  10.426 +        bne             6b
  10.427 +        pop             {r4-r11,pc}
  10.428 +endfunc
  10.429 +
  10.430 +        .align 5
  10.431 +function ff_put_no_rnd_pixels8_y2_arm, export=1
  10.432 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.433 +        @ block = word aligned, pixels = unaligned
  10.434 +        pld             [r1]                    @ preload hint for the first source row
  10.435 +        push            {r4-r11,lr}             @ every exit below pops the saved lr straight into pc
  10.436 +        mov             r3,  r3,  lsr #1        @ r3 = h / 2: each loop pass below stores two rows
  10.437 +        ldr             r12, =0xfefefefe        @ constant handed to NO_RND_AVG32 (macro defined elsewhere; presumably its byte mask)
  10.438 +        JMP_ALIGN       r5,  r1                 @ macro defined elsewhere; presumably picks 1: to 4: from the low two bits of r1 -- confirm
  10.439 +1:                                              @ path with plain two-word loads, no ALIGN_DWORD fix-up
  10.440 +        ldm             r1,  {r4-r5}            @ r4:r5 = first source row
  10.441 +        add             r1,  r1,  r2
  10.442 +6:      ldm             r1,  {r6-r7}            @ r6:r7 = next source row
  10.443 +        add             r1,  r1,  r2
  10.444 +        pld             [r1]
  10.445 +        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12  @ r8:r9 = combination of the two rows (presumably the truncating byte average)
  10.446 +        ldm             r1,  {r4-r5}            @ r4:r5 = row after that, reused as first row of the next pass
  10.447 +        add             r1,  r1,  r2
  10.448 +        stm             r0,  {r8-r9}            @ first output row of this pass
  10.449 +        add             r0,  r0,  r2
  10.450 +        pld             [r1]
  10.451 +        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
  10.452 +        subs            r3,  r3,  #1
  10.453 +        stm             r0,  {r8-r9}            @ second output row of this pass
  10.454 +        add             r0,  r0,  r2
  10.455 +        bne             6b                      @ flags are those of the subs; stm and add leave them alone
  10.456 +        pop             {r4-r11,pc}
  10.457 +        .align 5
  10.458 +2:                                              @ path using ALIGN_DWORD 1
  10.459 +        ldm             r1,  {r4-r6}            @ three words, so ALIGN_DWORD has 12 bytes to pick 8 from
  10.460 +        add             r1,  r1,  r2
  10.461 +        pld             [r1]
  10.462 +        ALIGN_DWORD     1,   r4,  r5,  r6       @ macro defined elsewhere; presumably leaves the 8 wanted bytes in r4:r5
  10.463 +6:      ldm             r1,  {r7-r9}
  10.464 +        add             r1,  r1,  r2
  10.465 +        pld             [r1]
  10.466 +        ALIGN_DWORD     1,   r7,  r8,  r9
  10.467 +        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
  10.468 +        stm             r0,  {r10-r11}
  10.469 +        add             r0,  r0,  r2
  10.470 +        ldm             r1,  {r4-r6}
  10.471 +        add             r1,  r1,  r2
  10.472 +        pld             [r1]
  10.473 +        ALIGN_DWORD     1,   r4,  r5,  r6
  10.474 +        subs            r3,  r3,  #1            @ issued before the macro; the bne below relies on NO_RND_AVG32 not touching the flags
  10.475 +        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
  10.476 +        stm             r0,  {r10-r11}
  10.477 +        add             r0,  r0,  r2
  10.478 +        bne             6b
  10.479 +        pop             {r4-r11,pc}
  10.480 +        .align 5
  10.481 +3:                                              @ same structure as path 2, with ALIGN_DWORD 2
  10.482 +        ldm             r1,  {r4-r6}
  10.483 +        add             r1,  r1,  r2
  10.484 +        pld             [r1]
  10.485 +        ALIGN_DWORD     2,   r4,  r5,  r6
  10.486 +6:      ldm             r1,  {r7-r9}
  10.487 +        add             r1,  r1,  r2
  10.488 +        pld             [r1]
  10.489 +        ALIGN_DWORD     2,   r7,  r8,  r9
  10.490 +        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
  10.491 +        stm             r0,  {r10-r11}
  10.492 +        add             r0,  r0,  r2
  10.493 +        ldm             r1,  {r4-r6}
  10.494 +        add             r1,  r1,  r2
  10.495 +        pld             [r1]
  10.496 +        ALIGN_DWORD     2,   r4,  r5,  r6
  10.497 +        subs            r3,  r3,  #1
  10.498 +        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
  10.499 +        stm             r0,  {r10-r11}
  10.500 +        add             r0,  r0,  r2
  10.501 +        bne             6b
  10.502 +        pop             {r4-r11,pc}
  10.503 +        .align 5
  10.504 +4:                                              @ same structure as path 2, with ALIGN_DWORD 3
  10.505 +        ldm             r1,  {r4-r6}
  10.506 +        add             r1,  r1,  r2
  10.507 +        pld             [r1]
  10.508 +        ALIGN_DWORD     3,   r4,  r5,  r6
  10.509 +6:      ldm             r1,  {r7-r9}
  10.510 +        add             r1,  r1,  r2
  10.511 +        pld             [r1]
  10.512 +        ALIGN_DWORD     3,   r7,  r8,  r9
  10.513 +        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
  10.514 +        stm             r0,  {r10-r11}
  10.515 +        add             r0,  r0,  r2
  10.516 +        ldm             r1,  {r4-r6}
  10.517 +        add             r1,  r1,  r2
  10.518 +        pld             [r1]
  10.519 +        ALIGN_DWORD     3,   r4,  r5,  r6
  10.520 +        subs            r3,  r3,  #1
  10.521 +        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
  10.522 +        stm             r0,  {r10-r11}
  10.523 +        add             r0,  r0,  r2
  10.524 +        bne             6b
  10.525 +        pop             {r4-r11,pc}
  10.526 +endfunc
  10.527 +
  10.528 +        .ltorg
  10.529 +
  10.530 +@ ----------------------------------------------------------------
  10.531 +.macro  RND_XY2_IT align, rnd
  10.532 +        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
  10.533 +        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
  10.534 +.if \align == 0
  10.535 +        ldm             r1,  {r6-r8}            @ three source words; the target registers depend on the align argument
  10.536 +.elseif \align == 3
  10.537 +        ldm             r1,  {r5-r7}
  10.538 +.else
  10.539 +        ldm             r1,  {r8-r10}
  10.540 +.endif
  10.541 +        add             r1,  r1,  r2            @ step the source pointer to the next row
  10.542 +        pld             [r1]
  10.543 +.if \align == 0
  10.544 +        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8     @ macro defined elsewhere; afterwards r4:r5 and r6:r7 serve as operands a and b
  10.545 +.elseif \align == 1
  10.546 +        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
  10.547 +        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
  10.548 +.elseif \align == 2
  10.549 +        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
  10.550 +        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
  10.551 +.elseif \align == 3
  10.552 +        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
  10.553 +.endif
  10.554 +        ldr             r14, =0x03030303        @ mask for the low two bits of every byte
  10.555 +        tst             r3,  #1                 @ Z set when the row counter is even; gates the eq-conditional instructions below
  10.556 +        and             r8,  r4,  r14           @ r8..r11 = low two bits of each byte of r4..r7
  10.557 +        and             r9,  r5,  r14
  10.558 +        and             r10, r6,  r14
  10.559 +        and             r11, r7,  r14
  10.560 +        andeq           r14, r14, r14, \rnd #1  @ even counter only: r14 becomes 0x02020202 for rnd=lsl, 0x01010101 for rnd=lsr
  10.561 +        add             r8,  r8,  r10           @ l1 for word 0
  10.562 +        add             r9,  r9,  r11           @ l1 for word 1
  10.563 +        ldr             r12, =0xfcfcfcfc >> 2   @ = 0x3f3f3f3f; a literal load, so the tst flags survive
  10.564 +        addeq           r8,  r8,  r14           @ even counter only: add the rounding term to l1
  10.565 +        addeq           r9,  r9,  r14
  10.566 +        and             r4,  r12, r4,  lsr #2   @ r4..r7 = upper six bits of each byte, moved down by two
  10.567 +        and             r5,  r12, r5,  lsr #2
  10.568 +        and             r6,  r12, r6,  lsr #2
  10.569 +        and             r7,  r12, r7,  lsr #2
  10.570 +        add             r10, r4,  r6            @ h1 for word 0
  10.571 +        add             r11, r5,  r7            @ h1 for word 1
  10.572 +        subs            r3,  r3,  #1            @ count the row; RND_XY2_EXPAND branches on these flags
  10.573 +.endm
  10.574 +
  10.575 +.macro RND_XY2_EXPAND align, rnd
  10.576 +        RND_XY2_IT      \align, \rnd
  10.577 +6:      push            {r8-r11}                @ keep l1 (r8:r9) and h1 (r10:r11) of the previous row across the next RND_XY2_IT
  10.578 +        RND_XY2_IT      \align, \rnd
  10.579 +        pop             {r4-r7}                 @ previous row back: l1 in r4:r5, h1 in r6:r7
  10.580 +        add             r4,  r4,  r8            @ l1 of both rows summed (the rounding term was added on the even-counter row only)
  10.581 +        add             r5,  r5,  r9
  10.582 +        ldr             r14, =0x0f0f0f0f
  10.583 +        add             r6,  r6,  r10           @ h1 of both rows summed
  10.584 +        add             r7,  r7,  r11
  10.585 +        and             r4,  r14, r4,  lsr #2   @ l1 sum / 4 per byte; the mask drops bits shifted in from the neighbouring byte
  10.586 +        and             r5,  r14, r5,  lsr #2
  10.587 +        add             r4,  r4,  r6            @ result bytes = h1 sum + (l1 sum >> 2)
  10.588 +        add             r5,  r5,  r7
  10.589 +        stm             r0,  {r4-r5}            @ one 8-byte output row
  10.590 +        add             r0,  r0,  r2
  10.591 +        bge             6b                      @ flags still come from the subs closing RND_XY2_IT; nothing in between sets them
  10.592 +        pop             {r4-r11,pc}             @ matches the push {r4-r11,lr} done by the function that expands this macro
  10.593 +.endm
  10.594 +
  10.595 +        .align 5
  10.596 +function ff_put_pixels8_xy2_arm, export=1
  10.597 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.598 +        @ block = word aligned, pixels = unaligned
  10.599 +        pld             [r1]
  10.600 +        push            {r4-r11,lr} @ R14 is also called LR; the macros below use r14 as scratch and return by popping pc
  10.601 +        JMP_ALIGN       r5,  r1                 @ macro defined elsewhere; presumably picks 1: to 4: from the low two bits of r1 -- confirm
  10.602 +1:      RND_XY2_EXPAND  0, lsl                  @ rnd=lsl selects the 0x02020202 rounding term in RND_XY2_IT
  10.603 +        .align 5
  10.604 +2:      RND_XY2_EXPAND  1, lsl                  @ each expansion ends in pop {r4-r11,pc}, so paths never fall through
  10.605 +        .align 5
  10.606 +3:      RND_XY2_EXPAND  2, lsl
  10.607 +        .align 5
  10.608 +4:      RND_XY2_EXPAND  3, lsl
  10.609 +endfunc
  10.610 +
  10.611 +        .align 5
  10.612 +function ff_put_no_rnd_pixels8_xy2_arm, export=1
  10.613 +        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  10.614 +        @ block = word aligned, pixels = unaligned
  10.615 +        pld             [r1]
  10.616 +        push            {r4-r11,lr}             @ the macros below use r14 as scratch and return by popping pc
  10.617 +        JMP_ALIGN       r5,  r1                 @ macro defined elsewhere; presumably picks 1: to 4: from the low two bits of r1 -- confirm
  10.618 +1:      RND_XY2_EXPAND  0, lsr                  @ rnd=lsr selects the smaller 0x01010101 rounding term in RND_XY2_IT
  10.619 +        .align 5
  10.620 +2:      RND_XY2_EXPAND  1, lsr                  @ each expansion ends in pop {r4-r11,pc}, so paths never fall through
  10.621 +        .align 5
  10.622 +3:      RND_XY2_EXPAND  2, lsr
  10.623 +        .align 5
  10.624 +4:      RND_XY2_EXPAND  3, lsr
  10.625 +endfunc
  10.626 +
  10.627 +        .align 5
  10.628 +@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
  10.629 +function ff_add_pixels_clamped_arm, export=1    @ dest[x] = clamp(dest[x] + block[x], 0, 255) for 8 rows of 8 bytes
  10.630 +        push            {r4-r10}
  10.631 +        mov             r10, #8                 /* row counter */
  10.632 +1:
  10.633 +        ldr             r4,  [r1]               /* load dest */
  10.634 +        /* block[0] and block[1]*/
  10.635 +        ldrsh           r5,  [r0]
  10.636 +        ldrsh           r7,  [r0, #2]
  10.637 +        and             r6,  r4,  #0xFF
  10.638 +        and             r8,  r4,  #0xFF00
  10.639 +        add             r6,  r5,  r6            /* r6 = block[0] + dest byte 0 */
  10.640 +        add             r8,  r7,  r8,  lsr #8   /* r8 = block[1] + dest byte 1 */
  10.641 +        mvn             r5,  r5                 /* top byte of ~block is 0xFF when block >= 0 and 0x00 when block < 0 */
  10.642 +        mvn             r7,  r7
  10.643 +        tst             r6,  #0x100             /* only bit 8 is tested, so this assumes the sum lies in -256..511 */
  10.644 +        movne           r6,  r5,  lsr #24       /* out of range: 255 for a positive block value, 0 for a negative one */
  10.645 +        tst             r8,  #0x100
  10.646 +        movne           r8,  r7,  lsr #24
  10.647 +        mov             r9,  r6                 /* r9 collects the four clamped bytes of the output word */
  10.648 +        ldrsh           r5,  [r0, #4]           /* moved from [A] */
  10.649 +        orr             r9,  r9,  r8,  lsl #8
  10.650 +        /* block[2] and block[3] */
  10.651 +        /* [A] */
  10.652 +        ldrsh           r7,  [r0, #6]
  10.653 +        and             r6,  r4,  #0xFF0000
  10.654 +        and             r8,  r4,  #0xFF000000
  10.655 +        add             r6,  r5,  r6,  lsr #16
  10.656 +        add             r8,  r7,  r8,  lsr #24
  10.657 +        mvn             r5,  r5
  10.658 +        mvn             r7,  r7
  10.659 +        tst             r6,  #0x100
  10.660 +        movne           r6,  r5,  lsr #24
  10.661 +        tst             r8,  #0x100
  10.662 +        movne           r8,  r7,  lsr #24
  10.663 +        orr             r9,  r9,  r6,  lsl #16
  10.664 +        ldr             r4,  [r1, #4]           /* moved from [B] */
  10.665 +        orr             r9,  r9,  r8,  lsl #24
  10.666 +        /* store dest */
  10.667 +        ldrsh           r5,  [r0, #8]           /* moved from [C] */
  10.668 +        str             r9,  [r1]
  10.669 +
  10.670 +        /* load dest */
  10.671 +        /* [B] */
  10.672 +        /* block[4] and block[5] */
  10.673 +        /* [C] */
  10.674 +        ldrsh           r7,  [r0, #10]
  10.675 +        and             r6,  r4,  #0xFF
  10.676 +        and             r8,  r4,  #0xFF00
  10.677 +        add             r6,  r5,  r6
  10.678 +        add             r8,  r7,  r8,  lsr #8
  10.679 +        mvn             r5,  r5
  10.680 +        mvn             r7,  r7
  10.681 +        tst             r6,  #0x100
  10.682 +        movne           r6,  r5,  lsr #24
  10.683 +        tst             r8,  #0x100
  10.684 +        movne           r8,  r7,  lsr #24
  10.685 +        mov             r9,  r6
  10.686 +        ldrsh           r5,  [r0, #12]          /* moved from [D] */
  10.687 +        orr             r9,  r9,  r8,  lsl #8
  10.688 +        /* block[6] and block[7] */
  10.689 +        /* [D] */
  10.690 +        ldrsh           r7,  [r0, #14]
  10.691 +        and             r6,  r4,  #0xFF0000
  10.692 +        and             r8,  r4,  #0xFF000000
  10.693 +        add             r6,  r5,  r6,  lsr #16
  10.694 +        add             r8,  r7,  r8,  lsr #24
  10.695 +        mvn             r5,  r5
  10.696 +        mvn             r7,  r7
  10.697 +        tst             r6,  #0x100
  10.698 +        movne           r6,  r5,  lsr #24
  10.699 +        tst             r8,  #0x100
  10.700 +        movne           r8,  r7,  lsr #24
  10.701 +        orr             r9,  r9,  r6,  lsl #16
  10.702 +        add             r0,  r0,  #16           /* moved from [E] */
  10.703 +        orr             r9,  r9,  r8,  lsl #24
  10.704 +        subs            r10, r10, #1            /* moved from [F] */
  10.705 +        /* store dest */
  10.706 +        str             r9,  [r1, #4]
  10.707 +
  10.708 +        /* [E] */
  10.709 +        /* [F] */
  10.710 +        add             r1,  r1,  r2            /* next dest row; str and add keep the flags of the subs for the bne */
  10.711 +        bne             1b
  10.712 +
  10.713 +        pop             {r4-r10}
  10.714 +        bx              lr
  10.715 +endfunc

    11.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    11.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_arm.h	Mon Aug 27 12:09:56 2012 +0200
    11.3 @@ -0,0 +1,33 @@
    11.4 +/*
    11.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    11.6 + *
    11.7 + * This file is part of FFmpeg.
    11.8 + *
    11.9 + * FFmpeg is free software; you can redistribute it and/or
   11.10 + * modify it under the terms of the GNU Lesser General Public
   11.11 + * License as published by the Free Software Foundation; either
   11.12 + * version 2.1 of the License, or (at your option) any later version.
   11.13 + *
   11.14 + * FFmpeg is distributed in the hope that it will be useful,
   11.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   11.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   11.17 + * Lesser General Public License for more details.
   11.18 + *
   11.19 + * You should have received a copy of the GNU Lesser General Public
   11.20 + * License along with FFmpeg; if not, write to the Free Software
   11.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   11.22 + */
   11.23 +
   11.24 +#ifndef AVCODEC_ARM_DSPUTIL_H
   11.25 +#define AVCODEC_ARM_DSPUTIL_H
   11.26 +
   11.27 +#include "libavcodec/avcodec.h"
   11.28 +#include "libavcodec/dsputil.h"
   11.29 +
   11.30 +void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx); /* defined outside this view; presumably the ARMv5TE initialiser for c */
   11.31 +void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx);   /* presumably the ARMv6 counterpart -- confirm against its definition */
   11.32 +void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx);     /* presumably the VFP counterpart */
   11.33 +void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);    /* presumably the NEON counterpart */
   11.34 +void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);  /* presumably the iWMMXt counterpart */
   11.35 +
   11.36 +#endif

    12.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    12.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_armv6.S	Mon Aug 27 12:09:56 2012 +0200
    12.3 @@ -0,0 +1,623 @@
    12.4 +/*
    12.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    12.6 + *
    12.7 + * This file is part of FFmpeg.
    12.8 + *
    12.9 + * FFmpeg is free software; you can redistribute it and/or
   12.10 + * modify it under the terms of the GNU Lesser General Public
   12.11 + * License as published by the Free Software Foundation; either
   12.12 + * version 2.1 of the License, or (at your option) any later version.
   12.13 + *
   12.14 + * FFmpeg is distributed in the hope that it will be useful,
   12.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   12.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   12.17 + * Lesser General Public License for more details.
   12.18 + *
   12.19 + * You should have received a copy of the GNU Lesser General Public
   12.20 + * License along with FFmpeg; if not, write to the Free Software
   12.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   12.22 + */
   12.23 +
   12.24 +#include "asm.S"
   12.25 +
   12.26 +        preserve8
   12.27 +
   12.28 +        .text
   12.29 +
   12.30 +.macro  call_2x_pixels  type, subp
   12.31 +function ff_\type\()_pixels16\subp\()_armv6, export=1
   12.32 +        push            {r0-r3, lr}             @ keep the four arguments and the return address across the first call
   12.33 +        bl              ff_\type\()_pixels8\subp\()_armv6
   12.34 +        pop             {r0-r3, lr}
   12.35 +        add             r0,  r0,  #8            @ move both pointer arguments 8 bytes to the right
   12.36 +        add             r1,  r1,  #8
   12.37 +        b               ff_\type\()_pixels8\subp\()_armv6       @ tail call: the 8-wide routine returns to the original caller
   12.38 +endfunc
   12.39 +.endm
   12.40 +
   12.41 +call_2x_pixels          avg                     @ subp left empty: ff_avg_pixels16_armv6 built on ff_avg_pixels8_armv6
   12.42 +call_2x_pixels          put, _x2
   12.43 +call_2x_pixels          put, _y2
   12.44 +call_2x_pixels          put, _x2_no_rnd
   12.45 +call_2x_pixels          put, _y2_no_rnd
   12.46 +
   12.47 +function ff_put_pixels16_armv6, export=1        @ copy r3 rows of 16 bytes from [r1] to [r0]; both pointers step by r2 per row
   12.48 +        push            {r4-r11}
   12.49 +1:
   12.50 +        ldr             r5,  [r1, #4]
   12.51 +        ldr             r6,  [r1, #8]
   12.52 +        ldr             r7,  [r1, #12]
   12.53 +        ldr             r4,  [r1], r2           @ word 0 is loaded last so the post-index can move r1 to the next row
   12.54 +        strd            r6,  r7,  [r0, #8]      @ upper 8 bytes of the row
   12.55 +        ldr             r9,  [r1, #4]
   12.56 +        strd            r4,  r5,  [r0],  r2     @ lower 8 bytes, then advance r0 by one row
   12.57 +        ldr             r10, [r1, #8]
   12.58 +        ldr             r11, [r1, #12]
   12.59 +        ldr             r8,  [r1], r2
   12.60 +        strd            r10, r11, [r0, #8]
   12.61 +        subs            r3,  r3,  #2            @ two rows are handled per pass
   12.62 +        strd            r8,  r9,  [r0],  r2
   12.63 +        bne             1b                      @ ends only when r3 hits exactly zero, so r3 must be a positive even count
   12.64 +
   12.65 +        pop             {r4-r11}
   12.66 +        bx              lr
   12.67 +endfunc
   12.68 +
   12.69 +function ff_put_pixels8_armv6, export=1         @ copy r3 rows of 8 bytes from [r1] to [r0]; both pointers step by r2 per row
   12.70 +        push            {r4-r7}
   12.71 +1:
   12.72 +        ldr             r5,  [r1, #4]
   12.73 +        ldr             r4,  [r1], r2           @ word 0 is loaded last so the post-index can move r1 to the next row
   12.74 +        ldr             r7,  [r1, #4]
   12.75 +        strd            r4,  r5,  [r0],  r2
   12.76 +        ldr             r6,  [r1], r2
   12.77 +        subs            r3,  r3,  #2            @ two rows are handled per pass
   12.78 +        strd            r6,  r7,  [r0],  r2
   12.79 +        bne             1b                      @ ends only when r3 hits exactly zero, so r3 must be a positive even count
   12.80 +
   12.81 +        pop             {r4-r7}
   12.82 +        bx              lr
   12.83 +endfunc
   12.84 +
   12.85 +function ff_put_pixels8_x2_armv6, export=1      @ per row: store the rounded-up average of 8 source bytes and their right-hand neighbours
   12.86 +        push            {r4-r11, lr}            @ lr is saved because r14 is used as scratch below
   12.87 +        mov             r12, #1
   12.88 +        orr             r12, r12, r12, lsl #8
   12.89 +        orr             r12, r12, r12, lsl #16  @ r12 = 0x01010101
   12.90 +1:
   12.91 +        ldr             r4,  [r1]               @ r4:r5 = source bytes 0..7 of the first row
   12.92 +        subs            r3,  r3,  #2            @ two rows per pass; nothing further down changes these flags before the bne
   12.93 +        ldr             r5,  [r1, #4]
   12.94 +        ldr             r7,  [r1, #5]           @ r7 = the word one byte further on (offset 5)
   12.95 +        lsr             r6,  r4,  #8
   12.96 +        ldr             r8,  [r1, r2]!          @ pre-index with writeback: r1 now points at the second row
   12.97 +        orr             r6,  r6,  r5,  lsl #24  @ r6 = r4 shifted down one byte with the low byte of r5 on top (bytes 1..4 assuming little-endian loads)
   12.98 +        ldr             r9,  [r1, #4]
   12.99 +        ldr             r11, [r1, #5]
  12.100 +        lsr             r10, r8,  #8
  12.101 +        add             r1,  r1,  r2
  12.102 +        orr             r10, r10, r9,  lsl #24
  12.103 +        eor             r14, r4,  r6
  12.104 +        uhadd8          r4,  r4,  r6            @ per byte (a + b) >> 1
  12.105 +        eor             r6,  r5,  r7
  12.106 +        uhadd8          r5,  r5,  r7
  12.107 +        and             r14, r14, r12           @ (a ^ b) & 1 is the bit the halving add dropped
  12.108 +        and             r6,  r6,  r12
  12.109 +        uadd8           r4,  r4,  r14           @ adding it back gives (a + b + 1) >> 1
  12.110 +        eor             r14, r8,  r10
  12.111 +        uadd8           r5,  r5,  r6
  12.112 +        eor             r6,  r9,  r11
  12.113 +        uhadd8          r8,  r8,  r10
  12.114 +        and             r14, r14, r12
  12.115 +        uhadd8          r9,  r9,  r11
  12.116 +        and             r6,  r6,  r12
  12.117 +        uadd8           r8,  r8,  r14
  12.118 +        strd            r4,  r5,  [r0],  r2     @ first output row
  12.119 +        uadd8           r9,  r9,  r6
  12.120 +        strd            r8,  r9,  [r0],  r2     @ second output row
  12.121 +        bne             1b
  12.122 +
  12.123 +        pop             {r4-r11, pc}
  12.124 +endfunc
  12.125 +
  12.126 +function ff_put_pixels8_y2_armv6, export=1      @ per row: store the rounded-up average of 8 source bytes and the 8 bytes one row below
  12.127 +        push            {r4-r11}
  12.128 +        mov             r12, #1
  12.129 +        orr             r12, r12, r12, lsl #8
  12.130 +        orr             r12, r12, r12, lsl #16  @ r12 = 0x01010101
  12.131 +        ldr             r4,  [r1]               @ r4:r5 = source row 0
  12.132 +        ldr             r5,  [r1, #4]
  12.133 +        ldr             r6,  [r1, r2]!          @ r6:r7 = source row 1 (r1 is written back)
  12.134 +        ldr             r7,  [r1, #4]
  12.135 +1:
  12.136 +        subs            r3,  r3,  #2            @ two rows per pass; nothing further down changes these flags before the bne
  12.137 +        uhadd8          r8,  r4,  r6            @ per byte (a + b) >> 1
  12.138 +        eor             r10, r4,  r6
  12.139 +        uhadd8          r9,  r5,  r7
  12.140 +        eor             r11, r5,  r7
  12.141 +        and             r10, r10, r12           @ (a ^ b) & 1 is the bit the halving add dropped
  12.142 +        ldr             r4,  [r1, r2]!          @ r4:r5 = the row after r6:r7
  12.143 +        uadd8           r8,  r8,  r10           @ r8:r9 = (a + b + 1) >> 1 for the first output row
  12.144 +        and             r11, r11, r12
  12.145 +        uadd8           r9,  r9,  r11
  12.146 +        ldr             r5,  [r1, #4]
  12.147 +        uhadd8          r10, r4,  r6
  12.148 +        eor             r6,  r4,  r6
  12.149 +        uhadd8          r11, r5,  r7
  12.150 +        and             r6,  r6,  r12
  12.151 +        eor             r7,  r5,  r7
  12.152 +        uadd8           r10, r10, r6            @ r10:r11 = second output row
  12.153 +        and             r7,  r7,  r12
  12.154 +        ldr             r6,  [r1, r2]!          @ NOTE(review): on the final pass this row (and r7 below) is loaded but never used
  12.155 +        uadd8           r11, r11, r7
  12.156 +        strd            r8,  r9,  [r0],  r2
  12.157 +        ldr             r7,  [r1, #4]
  12.158 +        strd            r10, r11, [r0],  r2
  12.159 +        bne             1b
  12.160 +
  12.161 +        pop             {r4-r11}
  12.162 +        bx              lr
  12.163 +endfunc
  12.164 +
  12.165 +function ff_put_pixels8_x2_no_rnd_armv6, export=1       @ as ff_put_pixels8_x2_armv6 but with the truncating average (no +1 correction)
  12.166 +        push            {r4-r9, lr}             @ lr is saved because r14 is used as scratch below
  12.167 +1:
  12.168 +        subs            r3,  r3,  #2            @ two rows per pass; nothing further down changes these flags before the bne
  12.169 +        ldr             r4,  [r1]               @ r4:r5 = source bytes 0..7 of the first row
  12.170 +        ldr             r5,  [r1, #4]
  12.171 +        ldr             r7,  [r1, #5]           @ r7 = the word one byte further on (offset 5)
  12.172 +        ldr             r8,  [r1, r2]!          @ second row; r1 is written back
  12.173 +        ldr             r9,  [r1, #4]
  12.174 +        ldr             r14, [r1, #5]
  12.175 +        add             r1,  r1,  r2
  12.176 +        lsr             r6,  r4,  #8
  12.177 +        orr             r6,  r6,  r5,  lsl #24  @ r6 = r4 shifted down one byte with the low byte of r5 on top
  12.178 +        lsr             r12, r8,  #8
  12.179 +        orr             r12, r12, r9,  lsl #24
  12.180 +        uhadd8          r4,  r4,  r6            @ per byte (a + b) >> 1
  12.181 +        uhadd8          r5,  r5,  r7
  12.182 +        uhadd8          r8,  r8,  r12
  12.183 +        uhadd8          r9,  r9,  r14
  12.184 +        stm             r0,  {r4,r5}            @ first output row
  12.185 +        add             r0,  r0,  r2
  12.186 +        stm             r0,  {r8,r9}            @ second output row
  12.187 +        add             r0,  r0,  r2
  12.188 +        bne             1b
  12.189 +
  12.190 +        pop             {r4-r9, pc}
  12.191 +endfunc
  12.192 +
  12.193 +function ff_put_pixels8_y2_no_rnd_armv6, export=1       @ as ff_put_pixels8_y2_armv6 but with the truncating average (no +1 correction)
  12.194 +        push            {r4-r9, lr}             @ lr is saved because r14 is used as scratch below
  12.195 +        ldr             r4,  [r1]               @ r4:r5 = source row 0
  12.196 +        ldr             r5,  [r1, #4]
  12.197 +        ldr             r6,  [r1, r2]!          @ r6:r7 = source row 1 (r1 is written back)
  12.198 +        ldr             r7,  [r1, #4]
  12.199 +1:
  12.200 +        subs            r3,  r3,  #2            @ two rows per pass; nothing further down changes these flags before the bne
  12.201 +        uhadd8          r8,  r4,  r6            @ per byte (a + b) >> 1
  12.202 +        ldr             r4,  [r1, r2]!          @ r4:r5 = the row after r6:r7
  12.203 +        uhadd8          r9,  r5,  r7
  12.204 +        ldr             r5,  [r1, #4]
  12.205 +        uhadd8          r12, r4,  r6
  12.206 +        ldr             r6,  [r1, r2]!          @ NOTE(review): on the final pass this row (and r7 below) is loaded but never used
  12.207 +        uhadd8          r14, r5,  r7
  12.208 +        ldr             r7,  [r1, #4]
  12.209 +        stm             r0,  {r8,r9}            @ first output row
  12.210 +        add             r0,  r0,  r2
  12.211 +        stm             r0,  {r12,r14}          @ second output row
  12.212 +        add             r0,  r0,  r2
  12.213 +        bne             1b
  12.214 +
  12.215 +        pop             {r4-r9, pc}
  12.216 +endfunc
  12.217 +
  12.218 +function ff_avg_pixels8_armv6, export=1         @ per row: [r0] = rounded-up byte average of the 8 bytes at [r0] and at [r1]
  12.219 +        pld             [r1, r2]
  12.220 +        push            {r4-r10, lr}
  12.221 +        mov             lr,  #1
  12.222 +        orr             lr,  lr,  lr,  lsl #8
  12.223 +        orr             lr,  lr,  lr,  lsl #16  @ lr = 0x01010101
  12.224 +        ldrd            r4,  r5,  [r0]          @ r4:r5 = current destination row
  12.225 +        ldr             r10, [r1, #4]
  12.226 +        ldr             r9,  [r1], r2           @ r9:r10 = source row; r1 moves to the next row
  12.227 +        subs            r3,  r3,  #2            @ the beq inside the loop tests the most recent subs (this one or the one in the loop)
  12.228 +1:
  12.229 +        pld             [r1, r2]
  12.230 +        eor             r8,  r4,  r9
  12.231 +        uhadd8          r4,  r4,  r9            @ per byte (a + b) >> 1
  12.232 +        eor             r12, r5,  r10
  12.233 +        ldrd            r6,  r7,  [r0, r2]      @ r6:r7 = destination row below the current one
  12.234 +        uhadd8          r5,  r5,  r10
  12.235 +        and             r8,  r8,  lr            @ (a ^ b) & 1 is the bit the halving add dropped
  12.236 +        ldr             r10, [r1, #4]
  12.237 +        and             r12, r12, lr
  12.238 +        uadd8           r4,  r4,  r8            @ (a + b + 1) >> 1
  12.239 +        ldr             r9,  [r1], r2
  12.240 +        eor             r8,  r6,  r9
  12.241 +        uadd8           r5,  r5,  r12
  12.242 +        pld             [r1, r2,  lsl #1]
  12.243 +        eor             r12, r7,  r10
  12.244 +        uhadd8          r6,  r6,  r9
  12.245 +        strd            r4,  r5,  [r0], r2      @ store the finished row and step r0 to the row held in r6:r7
  12.246 +        uhadd8          r7,  r7,  r10
  12.247 +        beq             2f                      @ counter reached zero: finish the second row in the tail without loading further rows
  12.248 +        and             r8,  r8,  lr
  12.249 +        ldrd            r4,  r5,  [r0, r2]      @ destination row for the next pass
  12.250 +        uadd8           r6,  r6,  r8
  12.251 +        ldr             r10, [r1, #4]
  12.252 +        and             r12, r12, lr
  12.253 +        subs            r3,  r3,  #2
  12.254 +        uadd8           r7,  r7,  r12
  12.255 +        ldr             r9,  [r1], r2
  12.256 +        strd            r6,  r7,  [r0], r2
  12.257 +        b               1b
  12.258 +2:
  12.259 +        and             r8,  r8,  lr
  12.260 +        and             r12, r12, lr
  12.261 +        uadd8           r6,  r6,  r8
  12.262 +        uadd8           r7,  r7,  r12
  12.263 +        strd            r6,  r7,  [r0], r2
  12.264 +
  12.265 +        pop             {r4-r10, pc}
  12.266 +endfunc
  12.267 +
  12.268 +function ff_add_pixels_clamped_armv6, export=1  @ 8 rows: add eight 16-bit values from [r0] to eight bytes at [r1], saturating to 0..255
  12.269 +        push            {r4-r8,lr}
  12.270 +        mov             r3,  #8                 @ row counter
  12.271 +1:
  12.272 +        ldm             r0!, {r4,r5,r12,lr}     @ four words = eight 16-bit values; r0 advances by 16 bytes
  12.273 +        ldrd            r6,  r7,  [r1]          @ the eight bytes to be updated
  12.274 +        pkhbt           r8,  r4,  r5,  lsl #16  @ r8 = low halfwords of r4 and r5 (values 0 and 2)
  12.275 +        pkhtb           r5,  r5,  r4,  asr #16  @ r5 = high halfwords of r4 and r5 (values 1 and 3)
  12.276 +        pkhbt           r4,  r12, lr,  lsl #16  @ same split for values 4..7
  12.277 +        pkhtb           lr,  lr,  r12, asr #16
  12.278 +        pld             [r1, r2]
  12.279 +        uxtab16         r8,  r8,  r6            @ add bytes 0 and 2 of r6, zero-extended, to the two halfwords
  12.280 +        uxtab16         r5,  r5,  r6,  ror #8   @ add bytes 1 and 3 of r6
  12.281 +        uxtab16         r4,  r4,  r7
  12.282 +        uxtab16         lr,  lr,  r7,  ror #8
  12.283 +        usat16          r8,  #8,  r8            @ saturate each halfword to 0..255
  12.284 +        usat16          r5,  #8,  r5
  12.285 +        usat16          r4,  #8,  r4
  12.286 +        usat16          lr,  #8,  lr
  12.287 +        orr             r6,  r8,  r5,  lsl #8   @ interleave even and odd results back into four bytes
  12.288 +        orr             r7,  r4,  lr,  lsl #8
  12.289 +        subs            r3,  r3,  #1
  12.290 +        strd            r6,  r7,  [r1],  r2     @ write the row back and step r1 by r2
  12.291 +        bgt             1b
  12.292 +        pop             {r4-r8,pc}
  12.293 +endfunc
  12.294 +
  12.295 +function ff_get_pixels_armv6, export=1          @ 8 rows: widen eight bytes from [r1] to eight 16-bit values stored at [r0]
  12.296 +        pld             [r1, r2]
  12.297 +        push            {r4-r8, lr}
  12.298 +        mov             lr,  #8                 @ row counter
  12.299 +1:
  12.300 +        ldrd            r4,  r5,  [r1],  r2     @ eight source bytes; r1 steps by r2
  12.301 +        subs            lr,  lr,  #1            @ nothing further down changes these flags before the bgt
  12.302 +        uxtb16          r6,  r4                 @ bytes 0 and 2 of r4 as halfwords (byte 0 = bits 7:0)
  12.303 +        uxtb16          r4,  r4,  ror #8        @ bytes 1 and 3 of r4 as halfwords
  12.304 +        uxtb16          r12, r5
  12.305 +        uxtb16          r8,  r5,  ror #8
  12.306 +        pld             [r1, r2]
  12.307 +        pkhbt           r5,  r6,  r4,  lsl #16  @ r5 = byte 0 | byte 1 << 16
  12.308 +        pkhtb           r6,  r4,  r6,  asr #16  @ r6 = byte 2 | byte 3 << 16
  12.309 +        pkhbt           r7,  r12, r8,  lsl #16
  12.310 +        pkhtb           r12, r8,  r12, asr #16
  12.311 +        stm             r0!, {r5,r6,r7,r12}     @ 16 output bytes; r0 advances
  12.312 +        bgt             1b
  12.313 +
  12.314 +        pop             {r4-r8, pc}
  12.315 +endfunc
  12.316 +
  12.317 +function ff_diff_pixels_armv6, export=1         @ 8 rows: store the eight 16-bit differences ([r1] byte - [r2] byte) at [r0]
  12.318 +        pld             [r1, r3]
  12.319 +        pld             [r2, r3]
  12.320 +        push            {r4-r9, lr}
  12.321 +        mov             lr,  #8                 @ row counter
  12.322 +1:
  12.323 +        ldrd            r4,  r5,  [r1],  r3     @ eight bytes of the first input; pointer steps by r3
  12.324 +        ldrd            r6,  r7,  [r2],  r3     @ eight bytes of the second input
  12.325 +        uxtb16          r8,  r4                 @ first input, bytes 0 and 2 as halfwords
  12.326 +        uxtb16          r4,  r4,  ror #8        @ first input, bytes 1 and 3
  12.327 +        uxtb16          r9,  r6                 @ second input, bytes 0 and 2
  12.328 +        uxtb16          r6,  r6,  ror #8        @ second input, bytes 1 and 3
  12.329 +        pld             [r1, r3]
  12.330 +        ssub16          r9,  r8,  r9            @ differences 0 and 2
  12.331 +        ssub16          r6,  r4,  r6            @ differences 1 and 3
  12.332 +        uxtb16          r8,  r5
  12.333 +        uxtb16          r5,  r5,  ror #8
  12.334 +        pld             [r2, r3]
  12.335 +        pkhbt           r4,  r9,  r6,  lsl #16  @ r4 = difference 0 | difference 1 << 16
  12.336 +        pkhtb           r6,  r6,  r9,  asr #16  @ r6 = difference 2 | difference 3 << 16
  12.337 +        uxtb16          r9,  r7
  12.338 +        uxtb16          r7,  r7,  ror #8
  12.339 +        ssub16          r9,  r8,  r9            @ differences 4 and 6
  12.340 +        ssub16          r5,  r5,  r7            @ differences 5 and 7
  12.341 +        subs            lr,  lr,  #1
  12.342 +        pkhbt           r8,  r9,  r5,  lsl #16
  12.343 +        pkhtb           r9,  r5,  r9,  asr #16
  12.344 +        stm             r0!, {r4,r6,r8,r9}      @ 16 output bytes; r0 advances
  12.345 +        bgt             1b
  12.346 +
  12.347 +        pop             {r4-r9, pc}
  12.348 +endfunc
  12.349 +
  12.350 +function ff_pix_abs16_armv6, export=1           @ returns in r0 the sum of absolute byte differences over 16-byte rows of [r1] and [r2]
  12.351 +        ldr             r0,  [sp]               @ row count comes from the first stack slot (read before the push); incoming r0 is discarded
  12.352 +        push            {r4-r9, lr}
  12.353 +        mov             r12, #0                 @ two separate accumulators: r12 and lr
  12.354 +        mov             lr,  #0
  12.355 +        ldm             r1,  {r4-r7}            @ 16 bytes of the first input
  12.356 +        ldr             r8,  [r2]
  12.357 +1:
  12.358 +        ldr             r9,  [r2, #4]
  12.359 +        pld             [r1, r3]
  12.360 +        usada8          r12, r4,  r8,  r12      @ r12 += sum of |a - b| over the four bytes
  12.361 +        ldr             r8,  [r2, #8]
  12.362 +        pld             [r2, r3]
  12.363 +        usada8          lr,  r5,  r9,  lr
  12.364 +        ldr             r9,  [r2, #12]
  12.365 +        usada8          r12, r6,  r8,  r12
  12.366 +        subs            r0,  r0,  #1
  12.367 +        usada8          lr,  r7,  r9,  lr
  12.368 +        beq             2f                      @ last row done: skip loading another row
  12.369 +        add             r1,  r1,  r3            @ both inputs step by r3 per row
  12.370 +        ldm             r1,  {r4-r7}
  12.371 +        add             r2,  r2,  r3
  12.372 +        ldr             r8,  [r2]
  12.373 +        b               1b
  12.374 +2:
  12.375 +        add             r0,  r12, lr            @ combine the two accumulators into the return value
  12.376 +        pop             {r4-r9, pc}
  12.377 +endfunc
  12.378 +
  12.379 +function ff_pix_abs16_x2_armv6, export=1  /* SAD of the 16-wide block at r1 against the block at r2 averaged (rounding up) with its one-byte-right neighbour; rows are r3 bytes apart */
  12.380 +        ldr             r12, [sp]              /* r12 = row count from the first stack word (h in the C prototype) */
  12.381 +        push            {r4-r11, lr}
  12.382 +        mov             r0,  #0                /* r0 accumulates the result */
  12.383 +        mov             lr,  #1
  12.384 +        orr             lr,  lr,  lr,  lsl #8
  12.385 +        orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101, mask selecting bit 0 of every byte */
  12.386 +1:
  12.387 +        ldr             r8,  [r2]
  12.388 +        ldr             r9,  [r2, #4]
  12.389 +        lsr             r10, r8,  #8
  12.390 +        ldr             r4,  [r1]
  12.391 +        lsr             r6,  r9,  #8
  12.392 +        orr             r10, r10, r9,  lsl #24  /* r10 = r2 bytes 1..4, i.e. each byte's right neighbour (assuming little-endian loads) */
  12.393 +        ldr             r5,  [r2, #8]
  12.394 +        eor             r11, r8,  r10
  12.395 +        uhadd8          r7,  r8,  r10          /* per-byte (a+b)>>1, truncating */
  12.396 +        orr             r6,  r6,  r5,  lsl #24
  12.397 +        and             r11, r11, lr           /* (a^b)&1 is the bit lost by the truncation */
  12.398 +        uadd8           r7,  r7,  r11          /* r7 = (a+b+1)>>1 per byte */
  12.399 +        ldr             r8,  [r1, #4]
  12.400 +        usada8          r0,  r4,  r7,  r0      /* accumulate |r1 word - averaged r2 word| */
  12.401 +        eor             r7,  r9,  r6           /* same sequence for bytes 4..7 */
  12.402 +        lsr             r10, r5,  #8
  12.403 +        and             r7,  r7,  lr
  12.404 +        uhadd8          r4,  r9,  r6
  12.405 +        ldr             r6,  [r2, #12]
  12.406 +        uadd8           r4,  r4,  r7
  12.407 +        pld             [r1, r3]
  12.408 +        orr             r10, r10, r6,  lsl #24
  12.409 +        usada8          r0,  r8,  r4,  r0
  12.410 +        ldr             r4,  [r1, #8]
  12.411 +        eor             r11, r5,  r10          /* bytes 8..11 */
  12.412 +        ldrb            r7,  [r2, #16]         /* 17th byte of the r2 row: right neighbour of the last pixel */
  12.413 +        and             r11, r11, lr
  12.414 +        uhadd8          r8,  r5,  r10
  12.415 +        ldr             r5,  [r1, #12]
  12.416 +        uadd8           r8,  r8,  r11
  12.417 +        pld             [r2, r3]
  12.418 +        lsr             r10, r6,  #8
  12.419 +        usada8          r0,  r4,  r8,  r0
  12.420 +        orr             r10, r10, r7,  lsl #24
  12.421 +        subs            r12,  r12,  #1         /* one row consumed; flags are tested by the bgt at the end of the loop */
  12.422 +        eor             r11, r6,  r10          /* bytes 12..15 */
  12.423 +        add             r1,  r1,  r3           /* advance to the next r1 row */
  12.424 +        uhadd8          r9,  r6,  r10
  12.425 +        and             r11, r11, lr
  12.426 +        uadd8           r9,  r9,  r11
  12.427 +        add             r2,  r2,  r3           /* advance to the next r2 row */
  12.428 +        usada8          r0,  r5,  r9,  r0
  12.429 +        bgt             1b
  12.430 +
  12.431 +        pop             {r4-r11, pc}           /* result is returned in r0 */
  12.432 +endfunc
  12.433 +
  12.434 +.macro  usad_y2         p0,  p1,  p2,  p3,  n0,  n1,  n2,  n3  /* one 16-byte row: r0 += SAD of the r1 row against the rounded-up average of p0-p3 and the row at r2; expects lr = per-byte bit-0 mask */
  12.435 +        ldr             \n0, [r2]              /* \n0 = word 0 of the current r2 row (kept on exit) */
  12.436 +        eor             \n1, \p0, \n0
  12.437 +        uhadd8          \p0, \p0, \n0          /* per-byte (a+b)>>1, truncating; \p0 is overwritten */
  12.438 +        and             \n1, \n1, lr           /* (a^b)&1 is the rounding bit */
  12.439 +        ldr             \n2, [r1]              /* \n2 temporarily holds word 0 of the r1 row */
  12.440 +        uadd8           \p0, \p0, \n1          /* \p0 = (a+b+1)>>1 per byte */
  12.441 +        ldr             \n1, [r2, #4]          /* \n1 = word 1 of the current r2 row (kept on exit) */
  12.442 +        usada8          r0,  \p0, \n2, r0
  12.443 +        pld             [r1,  r3]
  12.444 +        eor             \n3, \p1, \n1
  12.445 +        uhadd8          \p1, \p1, \n1
  12.446 +        and             \n3, \n3, lr
  12.447 +        ldr             \p0, [r1, #4]          /* \p0-\p1 are reused as scratch from here on */
  12.448 +        uadd8           \p1, \p1, \n3
  12.449 +        ldr             \n2, [r2, #8]          /* \n2 = word 2 of the current r2 row (kept on exit) */
  12.450 +        usada8          r0,  \p1, \p0, r0
  12.451 +        pld             [r2,  r3]
  12.452 +        eor             \p0, \p2, \n2
  12.453 +        uhadd8          \p2, \p2, \n2
  12.454 +        and             \p0, \p0, lr
  12.455 +        ldr             \p1, [r1, #8]
  12.456 +        uadd8           \p2, \p2, \p0
  12.457 +        ldr             \n3, [r2, #12]         /* \n3 = word 3 of the current r2 row (kept on exit) */
  12.458 +        usada8          r0,  \p2, \p1, r0
  12.459 +        eor             \p1, \p3, \n3
  12.460 +        uhadd8          \p3, \p3, \n3
  12.461 +        and             \p1, \p1, lr
  12.462 +        ldr             \p0,  [r1, #12]
  12.463 +        uadd8           \p3, \p3, \p1
  12.464 +        add             r1,  r1,  r3           /* step r1 to the next row */
  12.465 +        usada8          r0,  \p3, \p0,  r0
  12.466 +        add             r2,  r2,  r3           /* step r2 to the next row; \n0-\n3 now hold the row just read */
  12.467 +.endm
  12.468 +
  12.469 +function ff_pix_abs16_y2_armv6, export=1  /* SAD of the 16-wide block at r1 against the block at r2 averaged (rounding up) with the row below it; rows are r3 bytes apart */
  12.470 +        pld             [r1]
  12.471 +        pld             [r2]
  12.472 +        ldr             r12, [sp]              /* r12 = row count from the first stack word (h in the C prototype) */
  12.473 +        push            {r4-r11, lr}
  12.474 +        mov             r0,  #0                /* r0 accumulates the result */
  12.475 +        mov             lr,  #1
  12.476 +        orr             lr,  lr,  lr,  lsl #8
  12.477 +        orr             lr,  lr,  lr,  lsl #16  /* lr = 0x01010101, the rounding-bit mask used by usad_y2 */
  12.478 +        ldr             r4,  [r2]              /* r4-r7 = first r2 row, the initial "previous row" */
  12.479 +        ldr             r5,  [r2, #4]
  12.480 +        ldr             r6,  [r2, #8]
  12.481 +        ldr             r7,  [r2, #12]
  12.482 +        add             r2,  r2,  r3           /* r2 now points at the second row */
  12.483 +1:
  12.484 +        usad_y2         r4,  r5,  r6,  r7,  r8,  r9,  r10, r11  /* previous row in r4-r7, row just read is left in r8-r11 */
  12.485 +        subs            r12, r12, #2           /* two rows are handled per pass; flags are tested by the bgt below */
  12.486 +        usad_y2         r8,  r9,  r10, r11, r4,  r5,  r6,  r7   /* register roles swapped, so r4-r7 are ready for the next pass */
  12.487 +        bgt             1b
  12.488 +
  12.489 +        pop             {r4-r11, pc}           /* result is returned in r0 */
  12.490 +endfunc
  12.491 +
  12.492 +function ff_pix_abs8_armv6, export=1  /* sum of absolute differences of two 8-byte-wide blocks at r1 and r2; rows are r3 bytes apart */
  12.493 +        pld             [r2, r3]
  12.494 +        ldr             r12, [sp]              /* r12 = row count from the first stack word (h in the C prototype) */
  12.495 +        push            {r4-r9, lr}
  12.496 +        mov             r0,  #0                /* r0 and lr are two independent partial sums */
  12.497 +        mov             lr,  #0
  12.498 +        ldrd            r4,  r5,  [r1], r3     /* r4:r5 = first r1 row; r1 is post-incremented by the stride */
  12.499 +1:
  12.500 +        subs            r12, r12, #2           /* two rows per pass; flags are tested by the beq below */
  12.501 +        ldr             r7,  [r2, #4]
  12.502 +        ldr             r6,  [r2], r3          /* r6:r7 = r2 row; r2 is post-incremented by the stride */
  12.503 +        ldrd            r8,  r9,  [r1], r3     /* r8:r9 = second r1 row of this pass */
  12.504 +        usada8          r0,  r4,  r6,  r0      /* first row of the pass */
  12.505 +        pld             [r2, r3]
  12.506 +        usada8          lr,  r5,  r7,  lr
  12.507 +        ldr             r7,  [r2, #4]
  12.508 +        ldr             r6,  [r2], r3
  12.509 +        beq             2f                     /* the only loop exit: taken when the count reaches exactly zero */
  12.510 +        ldrd            r4,  r5,  [r1], r3     /* more rows follow: fetch the next r1 row early */
  12.511 +        usada8          r0,  r8,  r6,  r0      /* second row of the pass */
  12.512 +        pld             [r2, r3]
  12.513 +        usada8          lr,  r9,  r7,  lr
  12.514 +        b               1b
  12.515 +2:
  12.516 +        usada8          r0,  r8,  r6,  r0      /* second row of the final pass */
  12.517 +        usada8          lr,  r9,  r7,  lr
  12.518 +        add             r0,  r0,  lr           /* return value = sum of the two partial sums */
  12.519 +        pop             {r4-r9, pc}
  12.520 +endfunc
  12.521 +
  12.522 +function ff_sse16_armv6, export=1  /* sum of squared byte differences of two 16-byte-wide blocks at r1 and r2; rows are r3 bytes apart */
  12.523 +        ldr             r12, [sp]              /* r12 = row count from the first stack word (h in the C prototype) */
  12.524 +        push            {r4-r9, lr}
  12.525 +        mov             r0,  #0                /* r0 accumulates the result */
  12.526 +1:
  12.527 +        ldrd            r4,  r5,  [r1]         /* bytes 0..7 of the r1 row */
  12.528 +        ldr             r8,  [r2]
  12.529 +        uxtb16          lr,  r4                /* bytes 0 and 2 zero-extended to two halfwords */
  12.530 +        uxtb16          r4,  r4,  ror #8       /* bytes 1 and 3 zero-extended to two halfwords */
  12.531 +        uxtb16          r9,  r8
  12.532 +        uxtb16          r8,  r8,  ror #8
  12.533 +        ldr             r7,  [r2, #4]
  12.534 +        usub16          lr,  lr,  r9           /* per-halfword differences */
  12.535 +        usub16          r4,  r4,  r8
  12.536 +        smlad           r0,  lr,  lr,  r0      /* r0 += lo*lo + hi*hi, i.e. two squared differences */
  12.537 +        uxtb16          r6,  r5                /* same steps for bytes 4..7 */
  12.538 +        uxtb16          lr,  r5,  ror #8
  12.539 +        uxtb16          r8,  r7
  12.540 +        uxtb16          r9,  r7,  ror #8
  12.541 +        smlad           r0,  r4,  r4,  r0
  12.542 +        ldrd            r4,  r5,  [r1, #8]     /* bytes 8..15 of the r1 row */
  12.543 +        usub16          r6,  r6,  r8
  12.544 +        usub16          r8,  lr,  r9
  12.545 +        ldr             r7,  [r2, #8]
  12.546 +        smlad           r0,  r6,  r6,  r0
  12.547 +        uxtb16          lr,  r4                /* bytes 8..11 */
  12.548 +        uxtb16          r4,  r4,  ror #8
  12.549 +        uxtb16          r9,  r7
  12.550 +        uxtb16          r7,  r7, ror #8
  12.551 +        smlad           r0,  r8,  r8,  r0
  12.552 +        ldr             r8,  [r2, #12]
  12.553 +        usub16          lr,  lr,  r9
  12.554 +        usub16          r4,  r4,  r7
  12.555 +        smlad           r0,  lr,  lr,  r0
  12.556 +        uxtb16          r6,  r5                /* bytes 12..15 */
  12.557 +        uxtb16          r5,  r5,  ror #8
  12.558 +        uxtb16          r9,  r8
  12.559 +        uxtb16          r8,  r8,  ror #8
  12.560 +        smlad           r0,  r4,  r4,  r0
  12.561 +        usub16          r6,  r6,  r9
  12.562 +        usub16          r5,  r5,  r8
  12.563 +        smlad           r0,  r6,  r6,  r0
  12.564 +        add             r1,  r1,  r3           /* step both pointers to the next row */
  12.565 +        add             r2,  r2,  r3
  12.566 +        subs            r12, r12, #1           /* one row consumed; flags are tested by the bgt below */
  12.567 +        smlad           r0,  r5,  r5,  r0
  12.568 +        bgt             1b
  12.569 +
  12.570 +        pop             {r4-r9, pc}            /* result is returned in r0 */
  12.571 +endfunc
  12.572 +
  12.573 +function ff_pix_norm1_armv6, export=1  /* sum of the squares of all bytes in a 16x16 block at r0 (pix); rows are r1 bytes apart (line_size) */
  12.574 +        push            {r4-r6, lr}
  12.575 +        mov             r12, #16               /* fixed count of 16 rows */
  12.576 +        mov             lr,  #0                /* lr accumulates the result */
  12.577 +1:
  12.578 +        ldm             r0,  {r2-r5}           /* r2-r5 = the 16 bytes of the current row */
  12.579 +        uxtb16          r6,  r2                /* bytes 0 and 2 zero-extended to two halfwords */
  12.580 +        uxtb16          r2,  r2,  ror #8       /* bytes 1 and 3 zero-extended to two halfwords */
  12.581 +        smlad           lr,  r6,  r6,  lr      /* lr += lo*lo + hi*hi */
  12.582 +        uxtb16          r6,  r3
  12.583 +        smlad           lr,  r2,  r2,  lr
  12.584 +        uxtb16          r3,  r3,  ror #8
  12.585 +        smlad           lr,  r6,  r6,  lr
  12.586 +        uxtb16          r6,  r4
  12.587 +        smlad           lr,  r3,  r3,  lr
  12.588 +        uxtb16          r4,  r4,  ror #8
  12.589 +        smlad           lr,  r6,  r6,  lr
  12.590 +        uxtb16          r6,  r5
  12.591 +        smlad           lr,  r4,  r4,  lr
  12.592 +        uxtb16          r5,  r5,  ror #8
  12.593 +        smlad           lr,  r6,  r6,  lr
  12.594 +        subs            r12, r12, #1           /* one row consumed; flags are tested by the bgt below */
  12.595 +        add             r0,  r0,  r1           /* step to the next row */
  12.596 +        smlad           lr,  r5,  r5,  lr
  12.597 +        bgt             1b
  12.598 +
  12.599 +        mov             r0,  lr                /* move the sum into the return register */
  12.600 +        pop             {r4-r6, pc}
  12.601 +endfunc
  12.602 +
  12.603 +function ff_pix_sum_armv6, export=1  /* sum of all bytes in a 16x16 block at r0 (pix); rows are r1 bytes apart (line_size) */
  12.604 +        push            {r4-r7, lr}
  12.605 +        mov             r12, #16               /* fixed count of 16 rows */
  12.606 +        mov             r2,  #0                /* r2 and r3 are two independent partial sums */
  12.607 +        mov             r3,  #0
  12.608 +        mov             lr,  #0                /* lr stays zero: usada8 against zero adds up the four bytes of a word */
  12.609 +        ldr             r4,  [r0]
  12.610 +1:
  12.611 +        subs            r12, r12, #1           /* one row consumed; flags are tested by the beq and bgt below */
  12.612 +        ldr             r5,  [r0, #4]
  12.613 +        usada8          r2,  r4,  lr,  r2
  12.614 +        ldr             r6,  [r0, #8]
  12.615 +        usada8          r3,  r5,  lr,  r3
  12.616 +        ldr             r7,  [r0, #12]
  12.617 +        usada8          r2,  r6,  lr,  r2
  12.618 +        beq             2f                     /* last row: do not load past the block */
  12.619 +        ldr             r4,  [r0, r1]!         /* first word of the next row; r0 is advanced by the stride (writeback) */
  12.620 +        usada8          r3,  r7,  lr,  r3
  12.621 +        bgt             1b                     /* count is non-zero here because the beq above was not taken */
  12.622 +2:
  12.623 +        usada8          r3,  r7,  lr,  r3      /* last word of the final row */
  12.624 +        add             r0,  r2,  r3           /* return value = sum of the two partial sums */
  12.625 +        pop             {r4-r7, pc}
  12.626 +endfunc

    13.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    13.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
    13.3 @@ -0,0 +1,112 @@
    13.4 +/*
    13.5 + * ARM optimized DSP utils
    13.6 + * Copyright (c) 2001 Lionel Ulmer
    13.7 + *
    13.8 + * This file is part of FFmpeg.
    13.9 + *
   13.10 + * FFmpeg is free software; you can redistribute it and/or
   13.11 + * modify it under the terms of the GNU Lesser General Public
   13.12 + * License as published by the Free Software Foundation; either
   13.13 + * version 2.1 of the License, or (at your option) any later version.
   13.14 + *
   13.15 + * FFmpeg is distributed in the hope that it will be useful,
   13.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   13.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13.18 + * Lesser General Public License for more details.
   13.19 + *
   13.20 + * You should have received a copy of the GNU Lesser General Public
   13.21 + * License along with FFmpeg; if not, write to the Free Software
   13.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   13.23 + */
   13.24 +
   13.25 +#include "libavcodec/dsputil.h"
   13.26 +#include "dsputil_arm.h"
   13.27 +
   13.28 +void ff_j_rev_dct_arm(DCTELEM *data);
   13.29 +void ff_simple_idct_arm(DCTELEM *data);
   13.30 +
   13.31 +/* XXX: local hack: dsputil_init_arm() stores the context's clamped-pixel routines here for the IDCT wrappers below */
   13.32 +static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
   13.33 +static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
   13.34 +
   13.35 +void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.36 +void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.37 +void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.38 +void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.39 +
   13.40 +void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.41 +void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.42 +void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.43 +
   13.44 +void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
   13.45 +
   13.46 +CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,        8) /* NOTE(review): CALL_2X_PIXELS is defined outside this file; presumably each line defines the 16-wide function named first by calling the 8-wide one twice, 8 bytes apart - confirm */
   13.47 +CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,        8)
   13.48 +CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,       8)
   13.49 +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm, 8)
   13.50 +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm, 8)
   13.51 +CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
   13.52 +
   13.53 +void ff_add_pixels_clamped_arm(const DCTELEM *block, uint8_t *dest,
   13.54 +                               int line_size);
   13.55 +
   13.56 +/* XXX: these wrapper functions should be removed as soon as all IDCTs
   13.57 +   are converted */
   13.58 +static void j_rev_dct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) /* transform block in place with ff_j_rev_dct_arm, then store it to dest; not referenced by dsputil_init_arm() in this file */
   13.59 +{
   13.60 +    ff_j_rev_dct_arm (block);
   13.61 +    ff_put_pixels_clamped(block, dest, line_size); /* function pointer filled in by dsputil_init_arm() */
   13.62 +}
   13.63 +static void j_rev_dct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) /* transform block in place with ff_j_rev_dct_arm, then add it onto dest; not referenced by dsputil_init_arm() in this file */
   13.64 +{
   13.65 +    ff_j_rev_dct_arm (block);
   13.66 +    ff_add_pixels_clamped(block, dest, line_size); /* function pointer filled in by dsputil_init_arm() */
   13.67 +}
   13.68 +static void simple_idct_arm_put(uint8_t *dest, int line_size, DCTELEM *block) /* transform block in place with ff_simple_idct_arm, then store it to dest; installed as c->idct_put by dsputil_init_arm() */
   13.69 +{
   13.70 +    ff_simple_idct_arm (block);
   13.71 +    ff_put_pixels_clamped(block, dest, line_size); /* function pointer filled in by dsputil_init_arm() */
   13.72 +}
   13.73 +static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block) /* transform block in place with ff_simple_idct_arm, then add it onto dest; installed as c->idct_add by dsputil_init_arm() */
   13.74 +{
   13.75 +    ff_simple_idct_arm (block);
   13.76 +    ff_add_pixels_clamped(block, dest, line_size); /* pointer captured before dsputil_init_arm() replaces c->add_pixels_clamped */
   13.77 +}
   13.78 +
   13.79 +int mm_support(void) /* returns the product of the build-time HAVE_IWMMXT setting and the FF_MM_IWMMXT flag */
   13.80 +{
   13.81 +    return HAVE_IWMMXT * FF_MM_IWMMXT; /* evaluates to 0 whenever HAVE_IWMMXT is 0 */
   13.82 +}
   13.83 +
   13.84 +void dsputil_init_arm(DSPContext* c) /* install the ARM IDCT and pixel-copy routines into c, then let the NEON init run when HAVE_NEON is set */
   13.85 +{
   13.86 +    ff_put_pixels_clamped = c->put_pixels_clamped; /* remember the routines currently in c for the IDCT wrappers */
   13.87 +    ff_add_pixels_clamped = c->add_pixels_clamped; /* saved before c->add_pixels_clamped is replaced below */
   13.88 +  
   13.89 +    c->idct_put              = simple_idct_arm_put; /* the simple IDCT is installed unconditionally here */
   13.90 +    c->idct_add              = simple_idct_arm_add;
   13.91 +    c->idct                  = ff_simple_idct_arm;
   13.92 +    c->idct_permutation_type = FF_NO_IDCT_PERM;
   13.93 +
   13.94 +    c->add_pixels_clamped = ff_add_pixels_clamped_arm;
   13.95 +
   13.96 +    c->put_pixels_tab[0][0] = ff_put_pixels16_arm; /* row [0] gets the 16-wide routines, row [1] the 8-wide ones */
   13.97 +    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
   13.98 +    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
   13.99 +    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
  13.100 +    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
  13.101 +    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
  13.102 +    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
  13.103 +    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
  13.104 +
  13.105 +    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; /* index 0 reuses the same function as put_pixels_tab[0][0] */
  13.106 +    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
  13.107 +    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
  13.108 +    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
  13.109 +    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
  13.110 +    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
  13.111 +    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
  13.112 +    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
  13.113 +
  13.114 +    if (HAVE_NEON)    ff_dsputil_init_neon(c); /* runs last, so it can replace any of the entries set above */
  13.115 +}

    14.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    14.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv5te.c	Mon Aug 27 12:09:56 2012 +0200
    14.3 @@ -0,0 +1,41 @@
    14.4 +/*
    14.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    14.6 + *
    14.7 + * This file is part of FFmpeg.
    14.8 + *
    14.9 + * FFmpeg is free software; you can redistribute it and/or
   14.10 + * modify it under the terms of the GNU Lesser General Public
   14.11 + * License as published by the Free Software Foundation; either
   14.12 + * version 2.1 of the License, or (at your option) any later version.
   14.13 + *
   14.14 + * FFmpeg is distributed in the hope that it will be useful,
   14.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   14.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   14.17 + * Lesser General Public License for more details.
   14.18 + *
   14.19 + * You should have received a copy of the GNU Lesser General Public
   14.20 + * License along with FFmpeg; if not, write to the Free Software
   14.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   14.22 + */
   14.23 +
   14.24 +#include "libavcodec/dsputil.h"
   14.25 +#include "dsputil_arm.h"
   14.26 +
   14.27 +void ff_simple_idct_armv5te(DCTELEM *data);
   14.28 +void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
   14.29 +void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, DCTELEM *data);
   14.30 +
   14.31 +void ff_prefetch_arm(void *mem, int stride, int h);
   14.32 +
   14.33 +void av_cold ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx) /* install the ARMv5TE simple IDCT (when selected by avctx) and the ARM prefetch routine into c */
   14.34 +{
   14.35 +    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO ||                /* IDCT is only replaced when lowres is 0 ... */
   14.36 +                           avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {      /* ... and the algorithm is automatic or explicitly this one */
   14.37 +        c->idct_put              = ff_simple_idct_put_armv5te;
   14.38 +        c->idct_add              = ff_simple_idct_add_armv5te;
   14.39 +        c->idct                  = ff_simple_idct_armv5te;
   14.40 +        c->idct_permutation_type = FF_NO_IDCT_PERM;
   14.41 +    }
   14.42 +
   14.43 +    c->prefetch = ff_prefetch_arm; /* set regardless of the IDCT selection */
   14.44 +}

    15.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    15.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_armv6.c	Mon Aug 27 12:09:56 2012 +0200
    15.3 @@ -0,0 +1,121 @@
    15.4 +/*
    15.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    15.6 + *
    15.7 + * This file is part of FFmpeg.
    15.8 + *
    15.9 + * FFmpeg is free software; you can redistribute it and/or
   15.10 + * modify it under the terms of the GNU Lesser General Public
   15.11 + * License as published by the Free Software Foundation; either
   15.12 + * version 2.1 of the License, or (at your option) any later version.
   15.13 + *
   15.14 + * FFmpeg is distributed in the hope that it will be useful,
   15.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   15.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   15.17 + * Lesser General Public License for more details.
   15.18 + *
   15.19 + * You should have received a copy of the GNU Lesser General Public
   15.20 + * License along with FFmpeg; if not, write to the Free Software
   15.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   15.22 + */
   15.23 +
   15.24 +#include <stdint.h>
   15.25 +
   15.26 +#include "libavcodec/avcodec.h"
   15.27 +#include "libavcodec/dsputil.h"
   15.28 +#include "dsputil_arm.h"
   15.29 +
   15.30 +void ff_simple_idct_armv6(DCTELEM *data);
   15.31 +void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);
   15.32 +void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);
   15.33 +
   15.34 +void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
   15.35 +void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, int, int);
   15.36 +void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, int, int);
   15.37 +
   15.38 +void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
   15.39 +void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
   15.40 +
   15.41 +void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, int, int);
   15.42 +
   15.43 +void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
   15.44 +void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, int, int);
   15.45 +void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, int, int);
   15.46 +
   15.47 +void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
   15.48 +void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, int, int);
   15.49 +
   15.50 +void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, int, int);
   15.51 +
   15.52 +void ff_add_pixels_clamped_armv6(const DCTELEM *block,
   15.53 +                                 uint8_t *restrict pixels,
   15.54 +                                 int line_size);
   15.55 +
   15.56 +void ff_get_pixels_armv6(DCTELEM *block, const uint8_t *pixels, int stride);
   15.57 +void ff_diff_pixels_armv6(DCTELEM *block, const uint8_t *s1,
   15.58 +                          const uint8_t *s2, int stride);
   15.59 +
   15.60 +int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
   15.61 +                       int line_size, int h);
   15.62 +int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
   15.63 +                          int line_size, int h);
   15.64 +int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
   15.65 +                          int line_size, int h);
   15.66 +
   15.67 +int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
   15.68 +                       int line_size, int h);
   15.69 +
   15.70 +int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
   15.71 +                   int line_size, int h);
   15.72 +
   15.73 +int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
   15.74 +int ff_pix_sum_armv6(uint8_t *pix, int line_size);
   15.75 +
   15.76 +void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx) /* install the ARMv6 IDCT (when selected by avctx), pixel copy/average and comparison routines into c */
   15.77 +{
   15.78 +    if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO ||                /* IDCT is only replaced when lowres is 0 ... */
   15.79 +                           avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) {        /* ... and the algorithm is automatic or explicitly this one */
   15.80 +        c->idct_put              = ff_simple_idct_put_armv6;
   15.81 +        c->idct_add              = ff_simple_idct_add_armv6;
   15.82 +        c->idct                  = ff_simple_idct_armv6;
   15.83 +        c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
   15.84 +    }
   15.85 +
   15.86 +    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; /* row [0] gets the 16-wide routines, row [1] the 8-wide ones */
   15.87 +    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
   15.88 +    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
   15.89 +/*     c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
   15.90 +    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
   15.91 +    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
   15.92 +    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
   15.93 +/*     c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
   15.94 +
   15.95 +    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; /* index 0 reuses the same function as put_pixels_tab[0][0] */
   15.96 +    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
   15.97 +    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
   15.98 +/*     c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
   15.99 +    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
  15.100 +    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
  15.101 +    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
  15.102 +/*     c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
  15.103 +
  15.104 +    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; /* only index 0 of each avg row is set here */
  15.105 +    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
  15.106 +
  15.107 +    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
  15.108 +    c->get_pixels = ff_get_pixels_armv6;
  15.109 +    c->diff_pixels = ff_diff_pixels_armv6;
  15.110 +
  15.111 +    c->pix_abs[0][0] = ff_pix_abs16_armv6;
  15.112 +    c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
  15.113 +    c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
  15.114 +
  15.115 +    c->pix_abs[1][0] = ff_pix_abs8_armv6;
  15.116 +
  15.117 +    c->sad[0] = ff_pix_abs16_armv6; /* sad[] shares the functions installed in pix_abs[0][0] and pix_abs[1][0] */
  15.118 +    c->sad[1] = ff_pix_abs8_armv6;
  15.119 +
  15.120 +    c->sse[0] = ff_sse16_armv6;
  15.121 +
  15.122 +    c->pix_norm1 = ff_pix_norm1_armv6;
  15.123 +    c->pix_sum   = ff_pix_sum_armv6;
  15.124 +}

    16.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    16.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_neon.c	Mon Aug 27 12:09:56 2012 +0200
    16.3 @@ -0,0 +1,308 @@
    16.4 +/*
    16.5 + * ARM NEON optimised DSP functions
    16.6 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
    16.7 + *
    16.8 + * This file is part of FFmpeg.
    16.9 + *
   16.10 + * FFmpeg is free software; you can redistribute it and/or
   16.11 + * modify it under the terms of the GNU Lesser General Public
   16.12 + * License as published by the Free Software Foundation; either
   16.13 + * version 2.1 of the License, or (at your option) any later version.
   16.14 + *
   16.15 + * FFmpeg is distributed in the hope that it will be useful,
   16.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   16.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   16.18 + * Lesser General Public License for more details.
   16.19 + *
   16.20 + * You should have received a copy of the GNU Lesser General Public
   16.21 + * License along with FFmpeg; if not, write to the Free Software
   16.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   16.23 + */
   16.24 +
   16.25 +#include <stdint.h>
   16.26 +
   16.27 +#include "libavcodec/avcodec.h"
   16.28 +#include "libavcodec/dsputil.h"
   16.29 +#include "dsputil_arm.h"
   16.30 +
   16.31 +void ff_simple_idct_neon(DCTELEM *data);
   16.32 +void ff_simple_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
   16.33 +void ff_simple_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
   16.34 +
   16.35 +void ff_vp3_idct_neon(DCTELEM *data);
   16.36 +void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, DCTELEM *data);
   16.37 +void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, DCTELEM *data);
   16.38 +void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const DCTELEM *data);
   16.39 +
   16.40 +void ff_put_pixels16_neon(uint8_t *, const uint8_t *, int, int);
   16.41 +void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, int, int);
   16.42 +void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, int, int);
   16.43 +void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, int, int);
   16.44 +void ff_put_pixels8_neon(uint8_t *, const uint8_t *, int, int);
   16.45 +void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, int, int);
   16.46 +void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, int, int);
   16.47 +void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, int, int);
   16.48 +void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
   16.49 +void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
   16.50 +void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
   16.51 +void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
   16.52 +void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
   16.53 +void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, int, int);
   16.54 +
   16.55 +void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, int, int);
   16.56 +void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, int, int);
   16.57 +
   16.58 +void ff_add_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
   16.59 +void ff_put_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
   16.60 +void ff_put_signed_pixels_clamped_neon(const DCTELEM *, uint8_t *, int);
   16.61 +
   16.62 +void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
   16.63 +void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
   16.64 +void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
   16.65 +void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
   16.66 +void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
   16.67 +void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
   16.68 +void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
   16.69 +void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
   16.70 +void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
   16.71 +void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
   16.72 +void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
   16.73 +void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
   16.74 +void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
   16.75 +void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
   16.76 +void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
   16.77 +void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
   16.78 +
   16.79 +void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
   16.80 +void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
   16.81 +void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
   16.82 +void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
   16.83 +void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
   16.84 +void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
   16.85 +void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
   16.86 +void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
   16.87 +void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
   16.88 +void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
   16.89 +void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
   16.90 +void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
   16.91 +void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
   16.92 +void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
   16.93 +void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
   16.94 +void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
   16.95 +
   16.96 +void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, int);
   16.97 +void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, int);
   16.98 +void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, int);
   16.99 +void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, int);
  16.100 +void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, int);
  16.101 +void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, int);
  16.102 +void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, int);
  16.103 +void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, int);
  16.104 +void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, int);
  16.105 +void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, int);
  16.106 +void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, int);
  16.107 +void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, int);
  16.108 +void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, int);
  16.109 +void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, int);
  16.110 +void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, int);
  16.111 +void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, int);
  16.112 +
  16.113 +void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, int);
  16.114 +void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, int);
  16.115 +void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, int);
  16.116 +void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, int);
  16.117 +void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, int);
  16.118 +void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, int);
  16.119 +void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, int);
  16.120 +void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, int);
  16.121 +void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, int);
  16.122 +void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, int);
  16.123 +void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, int);
  16.124 +void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, int);
  16.125 +void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, int);
  16.126 +void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, int);
  16.127 +void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, int);
  16.128 +void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, int);
  16.129 +
  16.130 +void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
  16.131 +void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
  16.132 +void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
  16.133 +
  16.134 +void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
  16.135 +void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
  16.136 +void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
  16.137 +
  16.138 +void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
  16.139 +void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
  16.140 +
  16.141 +void ff_vector_fmul_neon(float *dst, const float *src, int len);
  16.142 +void ff_vector_fmul_window_neon(float *dst, const float *src0,
  16.143 +                                const float *src1, const float *win,
  16.144 +                                float add_bias, int len);
  16.145 +void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
  16.146 +                                int len);
  16.147 +void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src,
  16.148 +                                     const float **vp, float mul, int len);
  16.149 +void ff_vector_fmul_sv_scalar_4_neon(float *dst, const float *src,
  16.150 +                                     const float **vp, float mul, int len);
  16.151 +void ff_sv_fmul_scalar_2_neon(float *dst, const float **vp, float mul,
  16.152 +                              int len);
  16.153 +void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
  16.154 +                              int len);
  16.155 +void ff_butterflies_float_neon(float *v1, float *v2, int len);
  16.156 +float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
  16.157 +void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
  16.158 +                                        float mul, int len);
  16.159 +void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
  16.160 +                                 const float *src1, int len);
  16.161 +void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
  16.162 +                             const float *src2, int len);
  16.163 +
  16.164 +void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
  16.165 +                          int len);
  16.166 +void ff_float_to_int16_neon(int16_t *, const float *, long);
  16.167 +void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
  16.168 +
  16.169 +void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
  16.170 +
  16.171 +int32_t ff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
  16.172 +                                    int shift);
  16.173 +int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, int16_t *v2,
  16.174 +                                             int16_t *v3, int len, int mul);
  16.175 +/* Point the DSPContext function-pointer slots at their NEON implementations. */
  16.176 +void ff_dsputil_init_neon(DSPContext *c)
  16.177 +{
  16.178 +    /* IDCT entry points, together with the coefficient permutation
  16.179 +     * type that is set alongside them. */
  16.180 +    c->idct_put              = ff_simple_idct_put_neon;
  16.181 +    c->idct_add              = ff_simple_idct_add_neon;
  16.182 +    c->idct                  = ff_simple_idct_neon;
  16.183 +    c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
  16.184 +
  16.185 +    /* Pixel tables: row [0] takes the pixels16 routines, row [1] the
  16.186 +     * pixels8 ones; columns are plain, x2, y2, xy2. */
  16.187 +    c->put_pixels_tab[0][0]        = ff_put_pixels16_neon;
  16.188 +    c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_neon;
  16.189 +    c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_neon;
  16.190 +    c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_neon;
  16.191 +    c->put_pixels_tab[1][0]        = ff_put_pixels8_neon;
  16.192 +    c->put_pixels_tab[1][1]        = ff_put_pixels8_x2_neon;
  16.193 +    c->put_pixels_tab[1][2]        = ff_put_pixels8_y2_neon;
  16.194 +    c->put_pixels_tab[1][3]        = ff_put_pixels8_xy2_neon;
  16.195 +
  16.196 +    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; /* plain copy shared with put_pixels_tab */
  16.197 +    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
  16.198 +    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
  16.199 +    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
  16.200 +    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;  /* plain copy shared with put_pixels_tab */
  16.201 +    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
  16.202 +    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
  16.203 +    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
  16.204 +
  16.205 +    c->avg_pixels_tab[0][0]        = ff_avg_pixels16_neon; /* only column 0 is set for avg */
  16.206 +    c->avg_pixels_tab[1][0]        = ff_avg_pixels8_neon;
  16.207 +
  16.208 +    c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
  16.209 +    c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
  16.210 +    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
  16.211 +
  16.212 +    /* H.264 chroma tables: entries 0, 1, 2 take the mc8, mc4, mc2 routines. */
  16.213 +    c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
  16.214 +    c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
  16.215 +    c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
  16.216 +
  16.217 +    c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
  16.218 +    c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
  16.219 +    c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
  16.220 +    /* H.264 qpel tables: row [0] = qpel16, row [1] = qpel8; column = X + 4*Y for routine mcXY. */
  16.221 +    c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
  16.222 +    c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
  16.223 +    c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
  16.224 +    c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
  16.225 +    c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
  16.226 +    c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
  16.227 +    c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
  16.228 +    c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
  16.229 +    c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
  16.230 +    c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
  16.231 +    c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
  16.232 +    c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
  16.233 +    c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
  16.234 +    c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
  16.235 +    c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
  16.236 +    c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
  16.237 +
  16.238 +    c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
  16.239 +    c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
  16.240 +    c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
  16.241 +    c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
  16.242 +    c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
  16.243 +    c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
  16.244 +    c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
  16.245 +    c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
  16.246 +    c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
  16.247 +    c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
  16.248 +    c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
  16.249 +    c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
  16.250 +    c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
  16.251 +    c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
  16.252 +    c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
  16.253 +    c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
  16.254 +
  16.255 +    c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
  16.256 +    c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
  16.257 +    c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
  16.258 +    c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
  16.259 +    c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
  16.260 +    c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
  16.261 +    c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
  16.262 +    c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
  16.263 +    c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
  16.264 +    c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
  16.265 +    c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
  16.266 +    c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
  16.267 +    c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
  16.268 +    c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
  16.269 +    c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
  16.270 +    c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
  16.271 +
  16.272 +    c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
  16.273 +    c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
  16.274 +    c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
  16.275 +    c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
  16.276 +    c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
  16.277 +    c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
  16.278 +    c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
  16.279 +    c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
  16.280 +    c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
  16.281 +    c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
  16.282 +    c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
  16.283 +    c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
  16.284 +    c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
  16.285 +    c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
  16.286 +    c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
  16.287 +    c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
  16.288 +
  16.289 +    c->vector_fmul                = ff_vector_fmul_neon;
  16.290 +    c->vector_fmul_window         = ff_vector_fmul_window_neon;
  16.291 +    c->vector_fmul_scalar         = ff_vector_fmul_scalar_neon;
  16.292 +    c->butterflies_float          = ff_butterflies_float_neon;
  16.293 +    c->scalarproduct_float        = ff_scalarproduct_float_neon;
  16.294 +    c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
  16.295 +    c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
  16.296 +    c->vector_fmul_add            = ff_vector_fmul_add_neon;
  16.297 +    c->vector_clipf               = ff_vector_clipf_neon;
  16.298 +
  16.299 +    c->vector_fmul_sv_scalar[0] = ff_vector_fmul_sv_scalar_2_neon;
  16.300 +    c->vector_fmul_sv_scalar[1] = ff_vector_fmul_sv_scalar_4_neon;
  16.301 +
  16.302 +    c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
  16.303 +    c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
  16.304 +
  16.305 +    /* Float to int16 conversion and int16 dot products. */
  16.306 +    c->float_to_int16            = ff_float_to_int16_neon;
  16.307 +    c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
  16.308 +
  16.309 +    c->scalarproduct_int16          = ff_scalarproduct_int16_neon;
  16.310 +    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
  16.311 +}

    17.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    17.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_init_vfp.c	Mon Aug 27 12:09:56 2012 +0200
    17.3 @@ -0,0 +1,36 @@
    17.4 +/*
    17.5 + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
    17.6 + *
    17.7 + * This file is part of FFmpeg.
    17.8 + *
    17.9 + * FFmpeg is free software; you can redistribute it and/or
   17.10 + * modify it under the terms of the GNU Lesser General Public
   17.11 + * License as published by the Free Software Foundation; either
   17.12 + * version 2.1 of the License, or (at your option) any later version.
   17.13 + *
   17.14 + * FFmpeg is distributed in the hope that it will be useful,
   17.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   17.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   17.17 + * Lesser General Public License for more details.
   17.18 + *
   17.19 + * You should have received a copy of the GNU Lesser General Public
   17.20 + * License along with FFmpeg; if not, write to the Free Software
   17.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   17.22 + */
   17.23 +
   17.24 +#include "libavcodec/dsputil.h"
   17.25 +#include "dsputil_arm.h"
   17.26 +
   17.27 +void ff_vector_fmul_vfp(float *dst, const float *src, int len);
   17.28 +void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
   17.29 +                                const float *src1, int len);
   17.30 +void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
   17.31 +/* Hook the VFP float routines into c; avctx is accepted but not read here. */
   17.32 +void ff_dsputil_init_vfp(DSPContext *c, AVCodecContext *avctx)
   17.33 +{
   17.34 +    c->vector_fmul         = ff_vector_fmul_vfp;
   17.35 +    c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
   17.36 +#if HAVE_ARMV6
   17.37 +    c->float_to_int16      = ff_float_to_int16_vfp; /* installed only when HAVE_ARMV6 is non-zero */
   17.38 +#endif
   17.39 +}

    18.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    18.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt.c	Mon Aug 27 12:09:56 2012 +0200
    18.3 @@ -0,0 +1,205 @@
    18.4 +/*
    18.5 + * iWMMXt optimized DSP utils
    18.6 + * Copyright (c) 2004 AGAWA Koji
    18.7 + *
    18.8 + * This file is part of FFmpeg.
    18.9 + *
   18.10 + * FFmpeg is free software; you can redistribute it and/or
   18.11 + * modify it under the terms of the GNU Lesser General Public
   18.12 + * License as published by the Free Software Foundation; either
   18.13 + * version 2.1 of the License, or (at your option) any later version.
   18.14 + *
   18.15 + * FFmpeg is distributed in the hope that it will be useful,
   18.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   18.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   18.18 + * Lesser General Public License for more details.
   18.19 + *
   18.20 + * You should have received a copy of the GNU Lesser General Public
   18.21 + * License along with FFmpeg; if not, write to the Free Software
   18.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   18.23 + */
   18.24 +
   18.25 +#include "libavcodec/dsputil.h"
   18.26 +
   18.27 +#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
   18.28 +#define SET_RND(regd)  __asm__ volatile ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
   18.29 +#define WAVG2B "wavg2b"
   18.30 +#include "dsputil_iwmmxt_rnd_template.c"
   18.31 +#undef DEF
   18.32 +#undef SET_RND
   18.33 +#undef WAVG2B
   18.34 +/* The template is included twice: above it yields the *_no_rnd_*_iwmmxt names (WAVG2B = "wavg2b"), below the plain names (WAVG2B = "wavg2br"). */
   18.35 +#define DEF(x, y) x ## _ ## y ##_iwmmxt
   18.36 +#define SET_RND(regd)  __asm__ volatile ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
   18.37 +#define WAVG2B "wavg2br"
   18.38 +#include "dsputil_iwmmxt_rnd_template.c"
   18.39 +#undef DEF
   18.40 +#undef SET_RND
   18.41 +#undef WAVG2B
   18.42 +
   18.43 +// Shared body of the two pixels8_y2 functions below: every stored row is AVG of a source row and the row after it. Still needs scheduling.
   18.44 +#define OP(AVG)                                         \
   18.45 +    __asm__ volatile (                                      \
   18.46 +        /* alignment: low 3 address bits go to wcgr1 */ \
   18.47 +        "and r12, %[pixels], #7 \n\t"                   \
   18.48 +        "bic %[pixels], %[pixels], #7 \n\t"             \
   18.49 +        "tmcr wcgr1, r12 \n\t"                          \
   18.50 +        /* prime wr4 with the first source row */       \
   18.51 +        "wldrd wr0, [%[pixels]] \n\t"                   \
   18.52 +        "wldrd wr1, [%[pixels], #8] \n\t"               \
   18.53 +        "add %[pixels], %[pixels], %[line_size] \n\t"   \
   18.54 +        "walignr1 wr4, wr0, wr1 \n\t"                   \
   18.55 +                                                        \
   18.56 +        "1: \n\t"                                       \
   18.57 +        /* next row -> wr5; store AVG(wr4, wr5) */      \
   18.58 +        "wldrd wr2, [%[pixels]] \n\t"                   \
   18.59 +        "wldrd wr3, [%[pixels], #8] \n\t"               \
   18.60 +        "add %[pixels], %[pixels], %[line_size] \n\t"   \
   18.61 +        "pld [%[pixels]] \n\t"                          \
   18.62 +        "walignr1 wr5, wr2, wr3 \n\t"                   \
   18.63 +        AVG " wr6, wr4, wr5 \n\t"                       \
   18.64 +        "wstrd wr6, [%[block]] \n\t"                    \
   18.65 +        "add %[block], %[block], %[line_size] \n\t"     \
   18.66 +        /* following row -> wr4; wr5 is reused */       \
   18.67 +        "wldrd wr0, [%[pixels]] \n\t"                   \
   18.68 +        "wldrd wr1, [%[pixels], #8] \n\t"               \
   18.69 +        "add %[pixels], %[pixels], %[line_size] \n\t"   \
   18.70 +        "walignr1 wr4, wr0, wr1 \n\t"                   \
   18.71 +        "pld [%[pixels]] \n\t"                          \
   18.72 +        AVG " wr6, wr4, wr5 \n\t"                       \
   18.73 +        "wstrd wr6, [%[block]] \n\t"                    \
   18.74 +        "add %[block], %[block], %[line_size] \n\t"     \
   18.75 +        /* two rows per pass; loops until h hits 0 */   \
   18.76 +        "subs %[h], %[h], #2 \n\t"                      \
   18.77 +        "bne 1b \n\t"                                   \
   18.78 +        : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h)  \
   18.79 +        : [line_size]"r"(line_size) \
   18.80 +        : "cc", "memory", "r12"); /* "cc": subs rewrites the condition flags */
   18.81 +void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
   18.82 +{
   18.83 +    OP("wavg2br");
   18.84 +}
   18.85 +void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
   18.86 +{
   18.87 +    OP("wavg2b");
   18.88 +}
   18.89 +#undef OP
   18.90 +/* Add coefficients from block to 8 rows of 8 pixels and store the result back, saturated to bytes. Each loop pass consumes 32 bytes of block for one pair of rows. */
   18.91 +void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
   18.92 +{
   18.93 +    uint8_t *pixels2 = pixels + line_size;
   18.94 +    /* pixels walks rows 0, 2, 4, 6 and pixels2 rows 1, 3, 5, 7; both step by 2 * line_size */
   18.95 +    __asm__ volatile (
   18.96 +        "mov            r12, #4                 \n\t" /* four passes, two rows each */
   18.97 +        "1:                                     \n\t"
   18.98 +        "pld            [%[pixels], %[line_size2]]              \n\t"
   18.99 +        "pld            [%[pixels2], %[line_size2]]             \n\t"
  18.100 +        "wldrd          wr4, [%[pixels]]        \n\t"
  18.101 +        "wldrd          wr5, [%[pixels2]]       \n\t"
  18.102 +        "pld            [%[block], #32]         \n\t"
  18.103 +        "wunpckelub     wr6, wr4                \n\t" /* widen the pixel bytes to halfwords */
  18.104 +        "wldrd          wr0, [%[block]]         \n\t"
  18.105 +        "wunpckehub     wr7, wr4                \n\t"
  18.106 +        "wldrd          wr1, [%[block], #8]     \n\t"
  18.107 +        "wunpckelub     wr8, wr5                \n\t"
  18.108 +        "wldrd          wr2, [%[block], #16]    \n\t"
  18.109 +        "wunpckehub     wr9, wr5                \n\t"
  18.110 +        "wldrd          wr3, [%[block], #24]    \n\t"
  18.111 +        "add            %[block], %[block], #32 \n\t"
  18.112 +        "waddhss        wr10, wr0, wr6          \n\t" /* coefficient + pixel, signed-saturating halfword add */
  18.113 +        "waddhss        wr11, wr1, wr7          \n\t"
  18.114 +        "waddhss        wr12, wr2, wr8          \n\t"
  18.115 +        "waddhss        wr13, wr3, wr9          \n\t"
  18.116 +        "wpackhus       wr14, wr10, wr11        \n\t" /* pack back to bytes with unsigned saturation */
  18.117 +        "wpackhus       wr15, wr12, wr13        \n\t"
  18.118 +        "wstrd          wr14, [%[pixels]]       \n\t"
  18.119 +        "add            %[pixels], %[pixels], %[line_size2]     \n\t"
  18.120 +        "subs           r12, r12, #1            \n\t"
  18.121 +        "wstrd          wr15, [%[pixels2]]      \n\t"
  18.122 +        "add            %[pixels2], %[pixels2], %[line_size2]   \n\t"
  18.123 +        "bne            1b                      \n\t"
  18.124 +        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2)
  18.125 +        : [line_size2]"r"(line_size << 1)
  18.126 +        : "cc", "memory", "r12");
  18.127 +}
  18.128 +/* Zero 24 * 32 = 768 bytes starting at blocks (presumably six 64-coefficient blocks of 16-bit DCTELEM - confirm against dsputil.h). */
  18.129 +static void clear_blocks_iwmmxt(DCTELEM *blocks)
  18.130 +{
  18.131 +    __asm__ volatile(
  18.132 +                "wzero wr0                      \n\t"
  18.133 +                "mov r1, #(128 * 6 / 32)        \n\t" /* loop count: 32 bytes are cleared per pass */
  18.134 +                "1:                             \n\t"
  18.135 +                "wstrd wr0, [%0]                \n\t"
  18.136 +                "wstrd wr0, [%0, #8]            \n\t"
  18.137 +                "wstrd wr0, [%0, #16]           \n\t"
  18.138 +                "wstrd wr0, [%0, #24]           \n\t"
  18.139 +                "subs r1, r1, #1                \n\t"
  18.140 +                "add %0, %0, #32                \n\t"
  18.141 +                "bne 1b                         \n\t"
  18.142 +                : "+r"(blocks)
  18.143 +                :
  18.144 +                : "cc", "memory", "r1" /* subs rewrites the flags; wstrd writes through blocks */
  18.145 +        );
  18.146 +}
  18.147 +/* Placeholder with the pixel-op signature; it ignores every argument. */
  18.148 +static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  18.149 +{
  18.150 +    /* intentionally empty */
  18.151 +}
  18.152 +
  18.153 +/* Detecting iWMMXt at run time is not simple, so the flag starts out set:
  18.154 + * if this file is compiled in, its functions are installed unless
  18.155 + * ff_dsputil_init_iwmmxt() is given a dsp_mask that clears the bit. */
  18.156 +int mm_flags = FF_MM_IWMMXT; /* multimedia extension flags */
  18.157 +/* Fold avctx->dsp_mask into mm_flags, then install the iWMMXt routines into c if FF_MM_IWMMXT is still set. */
  18.158 +void ff_dsputil_init_iwmmxt(DSPContext *c, AVCodecContext *avctx)
  18.159 +{
  18.160 +    /* With FF_MM_FORCE the low 16 mask bits are switched on in mm_flags;
  18.161 +     * any other non-zero mask switches those bits off. */
  18.162 +    if (avctx->dsp_mask & FF_MM_FORCE)
  18.163 +        mm_flags |= avctx->dsp_mask & 0xffff;
  18.164 +    else if (avctx->dsp_mask)
  18.165 +        mm_flags &= ~(avctx->dsp_mask & 0xffff);
  18.166 +
  18.167 +    if (!(mm_flags & FF_MM_IWMMXT))
  18.168 +        return;
  18.169 +
  18.170 +    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;
  18.171 +    c->clear_blocks       = clear_blocks_iwmmxt;
  18.172 +
  18.173 +    c->put_pixels_tab[0][0]        = put_pixels16_iwmmxt;
  18.174 +    c->put_pixels_tab[0][1]        = put_pixels16_x2_iwmmxt;
  18.175 +    c->put_pixels_tab[0][2]        = put_pixels16_y2_iwmmxt;
  18.176 +    c->put_pixels_tab[0][3]        = put_pixels16_xy2_iwmmxt;
  18.177 +    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; /* plain copy shared with put_pixels_tab */
  18.178 +    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
  18.179 +    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
  18.180 +    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
  18.181 +
  18.182 +    c->put_pixels_tab[1][0]        = put_pixels8_iwmmxt;
  18.183 +    c->put_pixels_tab[1][1]        = put_pixels8_x2_iwmmxt;
  18.184 +    c->put_pixels_tab[1][2]        = put_pixels8_y2_iwmmxt;
  18.185 +    c->put_pixels_tab[1][3]        = put_pixels8_xy2_iwmmxt;
  18.186 +    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;  /* plain copy shared with put_pixels_tab */
  18.187 +    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
  18.188 +    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
  18.189 +    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;
  18.190 +
  18.191 +    c->avg_pixels_tab[0][0]        = avg_pixels16_iwmmxt;
  18.192 +    c->avg_pixels_tab[0][1]        = avg_pixels16_x2_iwmmxt;
  18.193 +    c->avg_pixels_tab[0][2]        = avg_pixels16_y2_iwmmxt;
  18.194 +    c->avg_pixels_tab[0][3]        = avg_pixels16_xy2_iwmmxt;
  18.195 +    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
  18.196 +    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
  18.197 +    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
  18.198 +    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
  18.199 +
  18.200 +    c->avg_pixels_tab[1][0]        = avg_pixels8_iwmmxt;
  18.201 +    c->avg_pixels_tab[1][1]        = avg_pixels8_x2_iwmmxt;
  18.202 +    c->avg_pixels_tab[1][2]        = avg_pixels8_y2_iwmmxt;
  18.203 +    c->avg_pixels_tab[1][3]        = avg_pixels8_xy2_iwmmxt;
  18.204 +    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
  18.205 +    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
  18.206 +    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
  18.207 +    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
  18.208 +}

    19.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    19.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_iwmmxt_rnd_template.c	Mon Aug 27 12:09:56 2012 +0200
    19.3 @@ -0,0 +1,1114 @@
    19.4 +/*
    19.5 + * iWMMXt optimized DSP utils
    19.6 + * copyright (c) 2004 AGAWA Koji
    19.7 + *
    19.8 + * This file is part of FFmpeg.
    19.9 + *
   19.10 + * FFmpeg is free software; you can redistribute it and/or
   19.11 + * modify it under the terms of the GNU Lesser General Public
   19.12 + * License as published by the Free Software Foundation; either
   19.13 + * version 2.1 of the License, or (at your option) any later version.
   19.14 + *
   19.15 + * FFmpeg is distributed in the hope that it will be useful,
   19.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   19.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   19.18 + * Lesser General Public License for more details.
   19.19 + *
   19.20 + * You should have received a copy of the GNU Lesser General Public
   19.21 + * License along with FFmpeg; if not, write to the Free Software
   19.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   19.23 + */
   19.24 +/* Copy h rows of 8 bytes from pixels (any alignment) to block, two rows per loop pass; the loop only ends when h reaches exactly 0, so h is presumably even and positive. */
   19.25 +void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
   19.26 +{
   19.27 +    int stride = line_size; /* writable copy: the asm doubles this operand */
   19.28 +    __asm__ volatile (
   19.29 +        "and r12, %[pixels], #7 \n\t"
   19.30 +        "bic %[pixels], %[pixels], #7 \n\t"
   19.31 +        "tmcr wcgr1, r12 \n\t" /* byte offset used by walignr1 */
   19.32 +        "add r4, %[pixels], %[line_size] \n\t" /* r4: source of the second row */
   19.33 +        "add r5, %[block], %[line_size] \n\t" /* r5: destination of the second row */
   19.34 +        "mov %[line_size], %[line_size], lsl #1 \n\t"
   19.35 +        "1: \n\t"
   19.36 +        "wldrd wr0, [%[pixels]] \n\t"
   19.37 +        "subs %[h], %[h], #2 \n\t"
   19.38 +        "wldrd wr1, [%[pixels], #8] \n\t"
   19.39 +        "add %[pixels], %[pixels], %[line_size] \n\t"
   19.40 +        "wldrd wr3, [r4] \n\t"
   19.41 +        "pld [%[pixels]] \n\t"
   19.42 +        "pld [%[pixels], #32] \n\t"
   19.43 +        "wldrd wr4, [r4, #8] \n\t"
   19.44 +        "add r4, r4, %[line_size] \n\t"
   19.45 +        "walignr1 wr8, wr0, wr1 \n\t"
   19.46 +        "pld [r4] \n\t"
   19.47 +        "pld [r4, #32] \n\t"
   19.48 +        "walignr1 wr10, wr3, wr4 \n\t"
   19.49 +        "wstrd wr8, [%[block]] \n\t"
   19.50 +        "add %[block], %[block], %[line_size] \n\t"
   19.51 +        "wstrd wr10, [r5] \n\t"
   19.52 +        "add r5, r5, %[line_size] \n\t"
   19.53 +        "bne 1b \n\t"
   19.54 +        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
   19.55 +        :
   19.56 +        : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
   19.57 +}
   19.58 +/* Average h rows of 8 bytes from pixels (any alignment) into block using the WAVG2B instruction, two rows per loop pass; h is presumably even and positive. */
   19.59 +void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
   19.60 +{
   19.61 +    int stride = line_size; /* writable copy: the asm doubles this operand */
   19.62 +    __asm__ volatile (
   19.63 +        "and r12, %[pixels], #7 \n\t"
   19.64 +        "bic %[pixels], %[pixels], #7 \n\t"
   19.65 +        "tmcr wcgr1, r12 \n\t" /* byte offset used by walignr1 */
   19.66 +        "add r4, %[pixels], %[line_size] \n\t" /* r4: source of the second row */
   19.67 +        "add r5, %[block], %[line_size] \n\t" /* r5: destination of the second row */
   19.68 +        "mov %[line_size], %[line_size], lsl #1 \n\t"
   19.69 +        "1: \n\t"
   19.70 +        "wldrd wr0, [%[pixels]] \n\t"
   19.71 +        "subs %[h], %[h], #2 \n\t"
   19.72 +        "wldrd wr1, [%[pixels], #8] \n\t"
   19.73 +        "add %[pixels], %[pixels], %[line_size] \n\t"
   19.74 +        "wldrd wr3, [r4] \n\t"
   19.75 +        "pld [%[pixels]] \n\t"
   19.76 +        "pld [%[pixels], #32] \n\t"
   19.77 +        "wldrd wr4, [r4, #8] \n\t"
   19.78 +        "add r4, r4, %[line_size] \n\t"
   19.79 +        "walignr1 wr8, wr0, wr1 \n\t"
   19.80 +        "wldrd wr0, [%[block]] \n\t" /* current destination rows are read back ... */
   19.81 +        "wldrd wr2, [r5] \n\t"
   19.82 +        "pld [r4] \n\t"
   19.83 +        "pld [r4, #32] \n\t"
   19.84 +        "walignr1 wr10, wr3, wr4 \n\t"
   19.85 +        WAVG2B" wr8, wr8, wr0 \n\t" /* ... and averaged with the aligned source rows */
   19.86 +        WAVG2B" wr10, wr10, wr2 \n\t"
   19.87 +        "wstrd wr8, [%[block]] \n\t"
   19.88 +        "add %[block], %[block], %[line_size] \n\t"
   19.89 +        "wstrd wr10, [r5] \n\t"
   19.90 +        "pld [%[block]] \n\t"
   19.91 +        "pld [%[block], #32] \n\t"
   19.92 +        "add r5, r5, %[line_size] \n\t"
   19.93 +        "pld [r5] \n\t"
   19.94 +        "pld [r5, #32] \n\t"
   19.95 +        "bne 1b \n\t"
   19.96 +        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
   19.97 +        :
   19.98 +        : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
   19.99 +}
  19.100 +/* Copy h rows of 16 bytes from pixels (any alignment) to block, two rows per loop pass; h is presumably even and positive. */
  19.101 +void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  19.102 +{
  19.103 +    int stride = line_size; /* writable copy: the asm doubles this operand */
  19.104 +    __asm__ volatile (
  19.105 +        "and r12, %[pixels], #7 \n\t"
  19.106 +        "bic %[pixels], %[pixels], #7 \n\t"
  19.107 +        "tmcr wcgr1, r12 \n\t" /* byte offset used by walignr1 */
  19.108 +        "add r4, %[pixels], %[line_size] \n\t" /* r4: source of the second row */
  19.109 +        "add r5, %[block], %[line_size] \n\t" /* r5: destination of the second row */
  19.110 +        "mov %[line_size], %[line_size], lsl #1 \n\t"
  19.111 +        "1: \n\t"
  19.112 +        "wldrd wr0, [%[pixels]] \n\t"
  19.113 +        "wldrd wr1, [%[pixels], #8] \n\t"
  19.114 +        "subs %[h], %[h], #2 \n\t"
  19.115 +        "wldrd wr2, [%[pixels], #16] \n\t" /* third doubleword supplies the tail of an unaligned row */
  19.116 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.117 +        "wldrd wr3, [r4] \n\t"
  19.118 +        "pld [%[pixels]] \n\t"
  19.119 +        "pld [%[pixels], #32] \n\t"
  19.120 +        "walignr1 wr8, wr0, wr1 \n\t"
  19.121 +        "wldrd wr4, [r4, #8] \n\t"
  19.122 +        "walignr1 wr9, wr1, wr2 \n\t"
  19.123 +        "wldrd wr5, [r4, #16] \n\t"
  19.124 +        "add r4, r4, %[line_size] \n\t"
  19.125 +        "pld [r4] \n\t"
  19.126 +        "pld [r4, #32] \n\t"
  19.127 +        "walignr1 wr10, wr3, wr4 \n\t"
  19.128 +        "wstrd wr8, [%[block]] \n\t"
  19.129 +        "walignr1 wr11, wr4, wr5 \n\t"
  19.130 +        "wstrd wr9, [%[block], #8] \n\t"
  19.131 +        "add %[block], %[block], %[line_size] \n\t"
  19.132 +        "wstrd wr10, [r5] \n\t"
  19.133 +        "wstrd wr11, [r5, #8] \n\t"
  19.134 +        "add r5, r5, %[line_size] \n\t"
  19.135 +        "bne 1b \n\t"
  19.136 +        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  19.137 +        :
  19.138 +        : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
  19.139 +}
  19.140 +
  19.141 +void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // combine h rows of 16 source bytes with the bytes already in block via WAVG2B
  19.142 +{ // h must be a positive even number: two rows per pass, loop exits only when h hits exactly 0
  19.143 +    int stride = line_size; // writable copy: the asm doubles it in place
  19.144 +    __asm__ volatile (
  19.145 +        "pld [%[pixels]]                \n\t" /* preload hints for the first source and destination rows */
  19.146 +        "pld [%[pixels], #32]           \n\t"
  19.147 +        "pld [%[block]]                 \n\t"
  19.148 +        "pld [%[block], #32]            \n\t"
  19.149 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.150 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.151 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = extraction offset for walignr1 */
  19.152 +        "add r4, %[pixels], %[line_size]\n\t" /* r4 = source pointer for the second row of each pair */
  19.153 +        "add r5, %[block], %[line_size] \n\t" /* r5 = destination pointer for the second row of each pair */
  19.154 +        "mov %[line_size], %[line_size], lsl #1 \n\t" /* step two rows per pass */
  19.155 +        "1:                             \n\t"
  19.156 +        "wldrd wr0, [%[pixels]]         \n\t" /* three aligned doublewords cover the 16 wanted bytes */
  19.157 +        "wldrd wr1, [%[pixels], #8]     \n\t"
  19.158 +        "subs %[h], %[h], #2            \n\t" /* sets the flags tested by the final bne */
  19.159 +        "wldrd wr2, [%[pixels], #16]    \n\t"
  19.160 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.161 +        "wldrd wr3, [r4]                \n\t"
  19.162 +        "pld [%[pixels]]                \n\t"
  19.163 +        "pld [%[pixels], #32]           \n\t"
  19.164 +        "walignr1 wr8, wr0, wr1         \n\t" /* wr8 = source row 0, bytes 0..7 */
  19.165 +        "wldrd wr4, [r4, #8]            \n\t"
  19.166 +        "walignr1 wr9, wr1, wr2         \n\t" /* wr9 = source row 0, bytes 8..15 */
  19.167 +        "wldrd wr5, [r4, #16]           \n\t"
  19.168 +        "add r4, r4, %[line_size]       \n\t"
  19.169 +        "wldrd wr0, [%[block]]          \n\t" /* wr0..wr3 are reused for the current destination bytes */
  19.170 +        "pld [r4]                       \n\t"
  19.171 +        "wldrd wr1, [%[block], #8]      \n\t"
  19.172 +        "pld [r4, #32]                  \n\t"
  19.173 +        "wldrd wr2, [r5]                \n\t"
  19.174 +        "walignr1 wr10, wr3, wr4        \n\t" /* must run before wr3 is reloaded on the next line */
  19.175 +        "wldrd wr3, [r5, #8]            \n\t"
  19.176 +        WAVG2B" wr8, wr8, wr0           \n\t" /* WAVG2B comes from outside this block; presumably the byte-average opcode - confirm */
  19.177 +        WAVG2B" wr9, wr9, wr1           \n\t"
  19.178 +        WAVG2B" wr10, wr10, wr2         \n\t"
  19.179 +        "wstrd wr8, [%[block]]          \n\t"
  19.180 +        "walignr1 wr11, wr4, wr5        \n\t" /* wr11 = source row 1, bytes 8..15 */
  19.181 +        WAVG2B" wr11, wr11, wr3         \n\t"
  19.182 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.183 +        "add %[block], %[block], %[line_size] \n\t"
  19.184 +        "wstrd wr10, [r5]               \n\t"
  19.185 +        "pld [%[block]]                 \n\t"
  19.186 +        "pld [%[block], #32]            \n\t"
  19.187 +        "wstrd wr11, [r5, #8]           \n\t"
  19.188 +        "add r5, r5, %[line_size]       \n\t"
  19.189 +        "pld [r5]                       \n\t"
  19.190 +        "pld [r5, #32]                  \n\t"
  19.191 +        "bne 1b \n\t" /* loop until the subs above produced zero */
  19.192 +        : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  19.193 +        :
  19.194 +        : "cc", "memory", "r4", "r5", "r12"); // "cc" is required because subs rewrites the condition flags
  19.195 +}
  19.196 +
  19.197 +void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // write WAVG2B(pixels[x], pixels[x+1]) for h rows of 8 bytes into block
  19.198 +{ // h must be a positive even number: two rows per pass, loop exits only when h hits exactly 0
  19.199 +    int stride = line_size; // writable copy: the asm doubles it in place
  19.200 +    // wr0 / wr2: 8 bytes starting at x for the first / second row of the pair
  19.201 +    // wr4 / wr6: 8 bytes starting at x+1 for the first / second row of the pair
  19.202 +    SET_RND(wr15); // original note: =2 for rnd and =1 for no_rnd; NOTE(review): wr15 is never referenced by the asm below - confirm this is needed
  19.203 +    __asm__ volatile(
  19.204 +        "pld [%[pixels]]                \n\t"
  19.205 +        "pld [%[pixels], #32]           \n\t"
  19.206 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.207 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.208 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x, used by walignr1 */
  19.209 +        "add r12, r12, #1               \n\t"
  19.210 +        "add r4, %[pixels], %[line_size]\n\t" /* r4 = source pointer for the second row of each pair */
  19.211 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1, used by walignr2 */
  19.212 +        "add r5, %[block], %[line_size] \n\t" /* r5 = destination pointer for the second row of each pair */
  19.213 +        "mov %[line_size], %[line_size], lsl #1 \n\t" /* step two rows per pass */
  19.214 +
  19.215 +        "1:                             \n\t"
  19.216 +        "wldrd wr10, [%[pixels]]        \n\t"
  19.217 +        "cmp r12, #8                    \n\t" /* redone every pass because the subs below overwrites the flags */
  19.218 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.219 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.220 +        "wldrd wr13, [r4]               \n\t"
  19.221 +        "pld [%[pixels]]                \n\t"
  19.222 +        "wldrd wr14, [r4, #8]           \n\t"
  19.223 +        "pld [%[pixels], #32]           \n\t"
  19.224 +        "add r4, r4, %[line_size]       \n\t"
  19.225 +        "walignr1 wr0, wr10, wr11       \n\t" /* bytes at x, first row */
  19.226 +        "pld [r4]                       \n\t"
  19.227 +        "pld [r4, #32]                  \n\t"
  19.228 +        "walignr1 wr2, wr13, wr14       \n\t" /* bytes at x, second row */
  19.229 +        "wmoveq wr4, wr11               \n\t" /* offset+1 == 8: x+1 is exactly the second doubleword */
  19.230 +        "wmoveq wr6, wr14               \n\t"
  19.231 +        "walignr2ne wr4, wr10, wr11     \n\t" /* otherwise extract at offset+1 */
  19.232 +        "walignr2ne wr6, wr13, wr14     \n\t"
  19.233 +        WAVG2B" wr0, wr0, wr4           \n\t" /* WAVG2B comes from outside this block; presumably the byte-average opcode - confirm */
  19.234 +        WAVG2B" wr2, wr2, wr6           \n\t"
  19.235 +        "wstrd wr0, [%[block]]          \n\t"
  19.236 +        "subs %[h], %[h], #2            \n\t" /* sets the flags tested by bne */
  19.237 +        "wstrd wr2, [r5]                \n\t"
  19.238 +        "add %[block], %[block], %[line_size]   \n\t"
  19.239 +        "add r5, r5, %[line_size]       \n\t"
  19.240 +        "bne 1b                         \n\t"
  19.241 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.242 +        :
  19.243 +        : "cc", "r4", "r5", "r12", "memory"); // "cc" is required because cmp and subs rewrite the condition flags
  19.244 +}
  19.245 +
  19.246 +void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // write WAVG2B(pixels[x], pixels[x+1]) for h rows of 16 bytes into block
  19.247 +{ // h must be a positive even number: two rows per pass, loop exits only when h hits exactly 0
  19.248 +    int stride = line_size; // writable copy: the asm doubles it in place
  19.249 +    // wr0 wr1 / wr2 wr3: 16 bytes starting at x for the first / second row of the pair
  19.250 +    // wr4 wr5 / wr6 wr7: 16 bytes starting at x+1 for the first / second row of the pair
  19.251 +    SET_RND(wr15); // original note: =2 for rnd and =1 for no_rnd; NOTE(review): wr15 is overwritten by a wldrd in the loop - confirm this is needed
  19.252 +    __asm__ volatile(
  19.253 +        "pld [%[pixels]]                \n\t"
  19.254 +        "pld [%[pixels], #32]           \n\t"
  19.255 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.256 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.257 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x, used by walignr1 */
  19.258 +        "add r12, r12, #1               \n\t"
  19.259 +        "add r4, %[pixels], %[line_size]\n\t" /* r4 = source pointer for the second row of each pair */
  19.260 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1, used by walignr2 */
  19.261 +        "add r5, %[block], %[line_size] \n\t" /* r5 = destination pointer for the second row of each pair */
  19.262 +        "mov %[line_size], %[line_size], lsl #1 \n\t" /* step two rows per pass */
  19.263 +
  19.264 +        "1:                             \n\t"
  19.265 +        "wldrd wr10, [%[pixels]]        \n\t"
  19.266 +        "cmp r12, #8                    \n\t" /* redone every pass because the subs below overwrites the flags */
  19.267 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.268 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.269 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.270 +        "wldrd wr13, [r4]               \n\t"
  19.271 +        "pld [%[pixels]]                \n\t"
  19.272 +        "wldrd wr14, [r4, #8]           \n\t"
  19.273 +        "pld [%[pixels], #32]           \n\t"
  19.274 +        "wldrd wr15, [r4, #16]          \n\t" /* wr15 now holds source data, not the SET_RND value */
  19.275 +        "add r4, r4, %[line_size]       \n\t"
  19.276 +        "walignr1 wr0, wr10, wr11       \n\t" /* bytes at x, first row */
  19.277 +        "pld [r4]                       \n\t"
  19.278 +        "pld [r4, #32]                  \n\t"
  19.279 +        "walignr1 wr1, wr11, wr12       \n\t"
  19.280 +        "walignr1 wr2, wr13, wr14       \n\t" /* bytes at x, second row */
  19.281 +        "walignr1 wr3, wr14, wr15       \n\t"
  19.282 +        "wmoveq wr4, wr11               \n\t" /* offset+1 == 8: x+1 starts exactly at the next doubleword */
  19.283 +        "wmoveq wr5, wr12               \n\t"
  19.284 +        "wmoveq wr6, wr14               \n\t"
  19.285 +        "wmoveq wr7, wr15               \n\t"
  19.286 +        "walignr2ne wr4, wr10, wr11     \n\t" /* otherwise extract at offset+1 */
  19.287 +        "walignr2ne wr5, wr11, wr12     \n\t"
  19.288 +        "walignr2ne wr6, wr13, wr14     \n\t"
  19.289 +        "walignr2ne wr7, wr14, wr15     \n\t"
  19.290 +        WAVG2B" wr0, wr0, wr4           \n\t" /* WAVG2B comes from outside this block; presumably the byte-average opcode - confirm */
  19.291 +        WAVG2B" wr1, wr1, wr5           \n\t"
  19.292 +        "wstrd wr0, [%[block]]          \n\t"
  19.293 +        WAVG2B" wr2, wr2, wr6           \n\t"
  19.294 +        "wstrd wr1, [%[block], #8]      \n\t"
  19.295 +        WAVG2B" wr3, wr3, wr7           \n\t"
  19.296 +        "add %[block], %[block], %[line_size]   \n\t"
  19.297 +        "wstrd wr2, [r5]                \n\t"
  19.298 +        "subs %[h], %[h], #2            \n\t" /* sets the flags tested by bne */
  19.299 +        "wstrd wr3, [r5, #8]            \n\t"
  19.300 +        "add r5, r5, %[line_size]       \n\t"
  19.301 +        "bne 1b                         \n\t"
  19.302 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.303 +        :
  19.304 +        : "cc", "r4", "r5", "r12", "memory"); // "cc" is required because cmp and subs rewrite the condition flags
  19.305 +}
  19.306 +
  19.307 +void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // block = WAVG2B(WAVG2B(pixels[x], pixels[x+1]), block) for h rows of 8 bytes
  19.308 +{ // h must be a positive even number: two rows per pass, loop exits only when h hits exactly 0
  19.309 +    int stride = line_size; // writable copy: the asm doubles it in place
  19.310 +    // wr0 / wr2: 8 bytes starting at x for the first / second row of the pair
  19.311 +    // wr4 / wr6: 8 bytes starting at x+1 for the first / second row of the pair
  19.312 +    SET_RND(wr15); // original note: =2 for rnd and =1 for no_rnd; NOTE(review): wr15 is never referenced by the asm below - confirm this is needed
  19.313 +    __asm__ volatile(
  19.314 +        "pld [%[pixels]]                \n\t"
  19.315 +        "pld [%[pixels], #32]           \n\t"
  19.316 +        "pld [%[block]]                 \n\t"
  19.317 +        "pld [%[block], #32]            \n\t"
  19.318 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.319 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.320 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x, used by walignr1 */
  19.321 +        "add r12, r12, #1               \n\t"
  19.322 +        "add r4, %[pixels], %[line_size]\n\t" /* r4 = source pointer for the second row of each pair */
  19.323 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1, used by walignr2 */
  19.324 +        "add r5, %[block], %[line_size] \n\t" /* r5 = destination pointer for the second row of each pair */
  19.325 +        "mov %[line_size], %[line_size], lsl #1 \n\t" /* step two rows per pass */
  19.326 +        "pld [r5]                       \n\t"
  19.327 +        "pld [r5, #32]                  \n\t"
  19.328 +
  19.329 +        "1:                             \n\t"
  19.330 +        "wldrd wr10, [%[pixels]]        \n\t"
  19.331 +        "cmp r12, #8                    \n\t" /* redone every pass because the subs below overwrites the flags */
  19.332 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.333 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.334 +        "wldrd wr13, [r4]               \n\t"
  19.335 +        "pld [%[pixels]]                \n\t"
  19.336 +        "wldrd wr14, [r4, #8]           \n\t"
  19.337 +        "pld [%[pixels], #32]           \n\t"
  19.338 +        "add r4, r4, %[line_size]       \n\t"
  19.339 +        "walignr1 wr0, wr10, wr11       \n\t" /* bytes at x, first row */
  19.340 +        "pld [r4]                       \n\t"
  19.341 +        "pld [r4, #32]                  \n\t"
  19.342 +        "walignr1 wr2, wr13, wr14       \n\t" /* bytes at x, second row */
  19.343 +        "wmoveq wr4, wr11               \n\t" /* offset+1 == 8: x+1 is exactly the second doubleword */
  19.344 +        "wmoveq wr6, wr14               \n\t"
  19.345 +        "walignr2ne wr4, wr10, wr11     \n\t" /* otherwise extract at offset+1 */
  19.346 +        "wldrd wr10, [%[block]]         \n\t" /* wr10 is free again: reuse it for the destination bytes, first row */
  19.347 +        "walignr2ne wr6, wr13, wr14     \n\t"
  19.348 +        "wldrd wr12, [r5]               \n\t" /* destination bytes, second row */
  19.349 +        WAVG2B" wr0, wr0, wr4           \n\t" /* WAVG2B comes from outside this block; presumably the byte-average opcode - confirm */
  19.350 +        WAVG2B" wr2, wr2, wr6           \n\t"
  19.351 +        WAVG2B" wr0, wr0, wr10          \n\t" /* second averaging step: blend with the existing destination */
  19.352 +        WAVG2B" wr2, wr2, wr12          \n\t"
  19.353 +        "wstrd wr0, [%[block]]          \n\t"
  19.354 +        "subs %[h], %[h], #2            \n\t" /* sets the flags tested by bne */
  19.355 +        "wstrd wr2, [r5]                \n\t"
  19.356 +        "add %[block], %[block], %[line_size]   \n\t"
  19.357 +        "add r5, r5, %[line_size]       \n\t"
  19.358 +        "pld [%[block]]                 \n\t"
  19.359 +        "pld [%[block], #32]            \n\t"
  19.360 +        "pld [r5]                       \n\t"
  19.361 +        "pld [r5, #32]                  \n\t"
  19.362 +        "bne 1b                         \n\t"
  19.363 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.364 +        :
  19.365 +        : "cc", "r4", "r5", "r12", "memory"); // "cc" is required because cmp and subs rewrite the condition flags
  19.366 +}
  19.367 +
  19.368 +void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // block = WAVG2B(WAVG2B(pixels[x], pixels[x+1]), block) for h rows of 16 bytes
  19.369 +{ // h must be a positive even number: two rows per pass, loop exits only when h hits exactly 0
  19.370 +    int stride = line_size; // writable copy: the asm doubles it in place
  19.371 +    // wr0 wr1 / wr2 wr3: 16 bytes starting at x for the first / second row of the pair
  19.372 +    // wr4 wr5 / wr6 wr7: 16 bytes starting at x+1 for the first / second row of the pair
  19.373 +    SET_RND(wr15); // original note: =2 for rnd and =1 for no_rnd; NOTE(review): wr15 is overwritten by a wldrd in the loop - confirm this is needed
  19.374 +    __asm__ volatile(
  19.375 +        "pld [%[pixels]]                \n\t"
  19.376 +        "pld [%[pixels], #32]           \n\t"
  19.377 +        "pld [%[block]]                 \n\t"
  19.378 +        "pld [%[block], #32]            \n\t"
  19.379 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.380 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.381 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x, used by walignr1 */
  19.382 +        "add r12, r12, #1               \n\t"
  19.383 +        "add r4, %[pixels], %[line_size]\n\t" /* r4 = source pointer for the second row of each pair */
  19.384 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1, used by walignr2 */
  19.385 +        "add r5, %[block], %[line_size] \n\t" /* r5 = destination pointer for the second row of each pair */
  19.386 +        "mov %[line_size], %[line_size], lsl #1 \n\t" /* step two rows per pass */
  19.387 +        "pld [r5]                       \n\t"
  19.388 +        "pld [r5, #32]                  \n\t"
  19.389 +
  19.390 +        "1:                             \n\t"
  19.391 +        "wldrd wr10, [%[pixels]]        \n\t"
  19.392 +        "cmp r12, #8                    \n\t" /* redone every pass because the subs below overwrites the flags */
  19.393 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.394 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.395 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.396 +        "wldrd wr13, [r4]               \n\t"
  19.397 +        "pld [%[pixels]]                \n\t"
  19.398 +        "wldrd wr14, [r4, #8]           \n\t"
  19.399 +        "pld [%[pixels], #32]           \n\t"
  19.400 +        "wldrd wr15, [r4, #16]          \n\t" /* wr15 now holds source data, not the SET_RND value */
  19.401 +        "add r4, r4, %[line_size]       \n\t"
  19.402 +        "walignr1 wr0, wr10, wr11       \n\t" /* bytes at x, first row */
  19.403 +        "pld [r4]                       \n\t"
  19.404 +        "pld [r4, #32]                  \n\t"
  19.405 +        "walignr1 wr1, wr11, wr12       \n\t"
  19.406 +        "walignr1 wr2, wr13, wr14       \n\t" /* bytes at x, second row */
  19.407 +        "walignr1 wr3, wr14, wr15       \n\t"
  19.408 +        "wmoveq wr4, wr11               \n\t" /* offset+1 == 8: x+1 starts exactly at the next doubleword */
  19.409 +        "wmoveq wr5, wr12               \n\t"
  19.410 +        "wmoveq wr6, wr14               \n\t"
  19.411 +        "wmoveq wr7, wr15               \n\t"
  19.412 +        "walignr2ne wr4, wr10, wr11     \n\t" /* otherwise extract at offset+1 */
  19.413 +        "walignr2ne wr5, wr11, wr12     \n\t"
  19.414 +        "walignr2ne wr6, wr13, wr14     \n\t"
  19.415 +        "walignr2ne wr7, wr14, wr15     \n\t"
  19.416 +        "wldrd wr10, [%[block]]         \n\t" /* wr10..wr13 are free again: reuse them for the destination bytes */
  19.417 +        WAVG2B" wr0, wr0, wr4           \n\t" /* WAVG2B comes from outside this block; presumably the byte-average opcode - confirm */
  19.418 +        "wldrd wr11, [%[block], #8]     \n\t"
  19.419 +        WAVG2B" wr1, wr1, wr5           \n\t"
  19.420 +        "wldrd wr12, [r5]               \n\t"
  19.421 +        WAVG2B" wr2, wr2, wr6           \n\t"
  19.422 +        "wldrd wr13, [r5, #8]           \n\t"
  19.423 +        WAVG2B" wr3, wr3, wr7           \n\t"
  19.424 +        WAVG2B" wr0, wr0, wr10          \n\t" /* second averaging step: blend with the existing destination */
  19.425 +        WAVG2B" wr1, wr1, wr11          \n\t"
  19.426 +        WAVG2B" wr2, wr2, wr12          \n\t"
  19.427 +        WAVG2B" wr3, wr3, wr13          \n\t"
  19.428 +        "wstrd wr0, [%[block]]          \n\t"
  19.429 +        "subs %[h], %[h], #2            \n\t" /* sets the flags tested by bne */
  19.430 +        "wstrd wr1, [%[block], #8]      \n\t"
  19.431 +        "add %[block], %[block], %[line_size]   \n\t"
  19.432 +        "wstrd wr2, [r5]                \n\t"
  19.433 +        "pld [%[block]]                 \n\t"
  19.434 +        "wstrd wr3, [r5, #8]            \n\t"
  19.435 +        "add r5, r5, %[line_size]       \n\t"
  19.436 +        "pld [%[block], #32]            \n\t"
  19.437 +        "pld [r5]                       \n\t"
  19.438 +        "pld [r5, #32]                  \n\t"
  19.439 +        "bne 1b                         \n\t"
  19.440 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.441 +        :
  19.442 +        : "cc", "r4", "r5", "r12", "memory"); // "cc" is required because cmp and subs rewrite the condition flags
  19.443 +}
  19.444 +
  19.445 +void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // block = WAVG2B(WAVG2B(row n, row n+1), block) for h rows of 8 bytes; reads h+1 source rows
  19.446 +{ // h must be a positive even number: the loop body is unrolled twice and exits only when h hits exactly 0
  19.447 +    int stride = line_size; // passed as an in/out operand; this asm never modifies it
  19.448 +    // wr0: realigned source row kept from the previous step
  19.449 +    // wr4: the other realigned source row; the two halves of the loop alternate which one is newest
  19.450 +    __asm__ volatile(
  19.451 +        "pld            [%[pixels]]                             \n\t"
  19.452 +        "pld            [%[pixels], #32]                        \n\t"
  19.453 +        "and            r12, %[pixels], #7                      \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.454 +        "tmcr           wcgr1, r12                              \n\t" /* wcgr1 = extraction offset for walignr1 */
  19.455 +        "bic            %[pixels], %[pixels], #7                \n\t" /* round the source pointer down to an 8-byte boundary */
  19.456 +
  19.457 +        "wldrd          wr10, [%[pixels]]                       \n\t" /* prologue: fetch the first source row before the loop */
  19.458 +        "wldrd          wr11, [%[pixels], #8]                   \n\t"
  19.459 +        "pld            [%[block]]                              \n\t"
  19.460 +        "add            %[pixels], %[pixels], %[line_size]      \n\t"
  19.461 +        "walignr1       wr0, wr10, wr11                         \n\t" /* wr0 = first source row, realigned */
  19.462 +        "pld            [%[pixels]]                             \n\t"
  19.463 +        "pld            [%[pixels], #32]                        \n\t"
  19.464 +
  19.465 +      "1:                                                       \n\t"
  19.466 +        "wldrd          wr10, [%[pixels]]                       \n\t" /* first half: next source row goes to wr4 */
  19.467 +        "wldrd          wr11, [%[pixels], #8]                   \n\t"
  19.468 +        "add            %[pixels], %[pixels], %[line_size]      \n\t"
  19.469 +        "pld            [%[pixels]]                             \n\t"
  19.470 +        "pld            [%[pixels], #32]                        \n\t"
  19.471 +        "walignr1       wr4, wr10, wr11                         \n\t"
  19.472 +        "wldrd          wr10, [%[block]]                        \n\t" /* wr10 reused for the current destination bytes */
  19.473 +         WAVG2B"        wr8, wr0, wr4                           \n\t" /* vertical step; WAVG2B comes from outside this block, presumably the byte-average opcode - confirm */
  19.474 +         WAVG2B"        wr8, wr8, wr10                          \n\t" /* blend with the existing destination */
  19.475 +        "wstrd          wr8, [%[block]]                         \n\t"
  19.476 +        "add            %[block], %[block], %[line_size]        \n\t"
  19.477 +
  19.478 +        "wldrd          wr10, [%[pixels]]                       \n\t" /* second half: next source row goes to wr0, wr4 is now the older row */
  19.479 +        "wldrd          wr11, [%[pixels], #8]                   \n\t"
  19.480 +        "pld            [%[block]]                              \n\t"
  19.481 +        "add            %[pixels], %[pixels], %[line_size]      \n\t"
  19.482 +        "pld            [%[pixels]]                             \n\t"
  19.483 +        "pld            [%[pixels], #32]                        \n\t"
  19.484 +        "walignr1       wr0, wr10, wr11                         \n\t"
  19.485 +        "wldrd          wr10, [%[block]]                        \n\t"
  19.486 +         WAVG2B"        wr8, wr0, wr4                           \n\t"
  19.487 +         WAVG2B"        wr8, wr8, wr10                          \n\t"
  19.488 +        "wstrd          wr8, [%[block]]                         \n\t"
  19.489 +        "add            %[block], %[block], %[line_size]        \n\t"
  19.490 +
  19.491 +        "subs           %[h], %[h], #2                          \n\t" /* two output rows done; sets the flags tested by bne */
  19.492 +        "pld            [%[block]]                              \n\t"
  19.493 +        "bne            1b                                      \n\t"
  19.494 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.495 +        :
  19.496 +        : "cc", "memory", "r12"); // flags (subs), memory (block writes) and the r12 scratch register
  19.497 +}
  19.498 +
  19.499 +void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // block = WAVG2B(row n, row n+1) for h rows of 16 bytes; reads h+1 source rows
  19.500 +{ // h must be a positive even number: the loop body is unrolled twice and exits only when h hits exactly 0
  19.501 +    int stride = line_size; // passed as an in/out operand; this asm never modifies it
  19.502 +    // wr0 wr1: realigned source row kept from the previous step
  19.503 +    // wr4 wr5: the other realigned source row; the two halves of the loop alternate which one is newest
  19.504 +    __asm__ volatile(
  19.505 +        "pld [%[pixels]]                \n\t"
  19.506 +        "pld [%[pixels], #32]           \n\t"
  19.507 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.508 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = extraction offset for walignr1 */
  19.509 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.510 +
  19.511 +        "wldrd wr10, [%[pixels]]        \n\t" /* prologue: fetch the first source row before the loop */
  19.512 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.513 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.514 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.515 +        "pld [%[pixels]]                \n\t"
  19.516 +        "pld [%[pixels], #32]           \n\t"
  19.517 +        "walignr1 wr0, wr10, wr11       \n\t" /* wr0 wr1 = first source row, realigned */
  19.518 +        "walignr1 wr1, wr11, wr12       \n\t"
  19.519 +
  19.520 +        "1:                             \n\t"
  19.521 +        "wldrd wr10, [%[pixels]]        \n\t" /* first half: next source row goes to wr4 wr5 */
  19.522 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.523 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.524 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.525 +        "pld [%[pixels]]                \n\t"
  19.526 +        "pld [%[pixels], #32]           \n\t"
  19.527 +        "walignr1 wr4, wr10, wr11       \n\t"
  19.528 +        "walignr1 wr5, wr11, wr12       \n\t"
  19.529 +        WAVG2B" wr8, wr0, wr4           \n\t" /* WAVG2B comes from outside this block; presumably the byte-average opcode - confirm */
  19.530 +        WAVG2B" wr9, wr1, wr5           \n\t"
  19.531 +        "wstrd wr8, [%[block]]          \n\t"
  19.532 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.533 +        "add %[block], %[block], %[line_size]   \n\t"
  19.534 +
  19.535 +        "wldrd wr10, [%[pixels]]        \n\t" /* second half: next source row goes to wr0 wr1, wr4 wr5 is now the older row */
  19.536 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.537 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.538 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.539 +        "pld [%[pixels]]                \n\t"
  19.540 +        "pld [%[pixels], #32]           \n\t"
  19.541 +        "walignr1 wr0, wr10, wr11       \n\t"
  19.542 +        "walignr1 wr1, wr11, wr12       \n\t"
  19.543 +        WAVG2B" wr8, wr0, wr4           \n\t"
  19.544 +        WAVG2B" wr9, wr1, wr5           \n\t"
  19.545 +        "wstrd wr8, [%[block]]          \n\t"
  19.546 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.547 +        "add %[block], %[block], %[line_size]   \n\t"
  19.548 +
  19.549 +        "subs %[h], %[h], #2            \n\t" /* two output rows done; sets the flags tested by bne */
  19.550 +        "bne 1b                         \n\t"
  19.551 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.552 +        :
  19.553 +        : "cc", "memory", "r12"); // "cc" added because subs rewrites the flags; r4/r5 dropped because this asm never touches them
  19.554 +}
  19.555 +
  19.556 +void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // block = WAVG2B(WAVG2B(row n, row n+1), block) for h rows of 16 bytes; reads h+1 source rows
  19.557 +{ // h must be a positive even number: the loop body is unrolled twice and exits only when h hits exactly 0
  19.558 +    int stride = line_size; // passed as an in/out operand; this asm never modifies it
  19.559 +    // wr0 wr1: realigned source row kept from the previous step
  19.560 +    // wr4 wr5: the other realigned source row; the two halves of the loop alternate which one is newest
  19.561 +    __asm__ volatile(
  19.562 +        "pld [%[pixels]]                \n\t"
  19.563 +        "pld [%[pixels], #32]           \n\t"
  19.564 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.565 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = extraction offset for walignr1 */
  19.566 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.567 +
  19.568 +        "wldrd wr10, [%[pixels]]        \n\t" /* prologue: fetch the first source row before the loop */
  19.569 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.570 +        "pld [%[block]]                 \n\t"
  19.571 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.572 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.573 +        "pld [%[pixels]]                \n\t"
  19.574 +        "pld [%[pixels], #32]           \n\t"
  19.575 +        "walignr1 wr0, wr10, wr11       \n\t" /* wr0 wr1 = first source row, realigned */
  19.576 +        "walignr1 wr1, wr11, wr12       \n\t"
  19.577 +
  19.578 +        "1:                             \n\t"
  19.579 +        "wldrd wr10, [%[pixels]]        \n\t" /* first half: next source row goes to wr4 wr5 */
  19.580 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.581 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.582 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.583 +        "pld [%[pixels]]                \n\t"
  19.584 +        "pld [%[pixels], #32]           \n\t"
  19.585 +        "walignr1 wr4, wr10, wr11       \n\t"
  19.586 +        "walignr1 wr5, wr11, wr12       \n\t"
  19.587 +        "wldrd wr10, [%[block]]         \n\t" /* wr10 wr11 reused for the current destination bytes */
  19.588 +        "wldrd wr11, [%[block], #8]     \n\t"
  19.589 +        WAVG2B" wr8, wr0, wr4           \n\t" /* vertical step; WAVG2B comes from outside this block, presumably the byte-average opcode - confirm */
  19.590 +        WAVG2B" wr9, wr1, wr5           \n\t"
  19.591 +        WAVG2B" wr8, wr8, wr10          \n\t" /* blend with the existing destination */
  19.592 +        WAVG2B" wr9, wr9, wr11          \n\t"
  19.593 +        "wstrd wr8, [%[block]]          \n\t"
  19.594 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.595 +        "add %[block], %[block], %[line_size]   \n\t"
  19.596 +
  19.597 +        "wldrd wr10, [%[pixels]]        \n\t" /* second half: next source row goes to wr0 wr1, wr4 wr5 is now the older row */
  19.598 +        "wldrd wr11, [%[pixels], #8]    \n\t"
  19.599 +        "pld [%[block]]                 \n\t"
  19.600 +        "wldrd wr12, [%[pixels], #16]   \n\t"
  19.601 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.602 +        "pld [%[pixels]]                \n\t"
  19.603 +        "pld [%[pixels], #32]           \n\t"
  19.604 +        "walignr1 wr0, wr10, wr11       \n\t"
  19.605 +        "walignr1 wr1, wr11, wr12       \n\t"
  19.606 +        "wldrd wr10, [%[block]]         \n\t"
  19.607 +        "wldrd wr11, [%[block], #8]     \n\t"
  19.608 +        WAVG2B" wr8, wr0, wr4           \n\t"
  19.609 +        WAVG2B" wr9, wr1, wr5           \n\t"
  19.610 +        WAVG2B" wr8, wr8, wr10          \n\t"
  19.611 +        WAVG2B" wr9, wr9, wr11          \n\t"
  19.612 +        "wstrd wr8, [%[block]]          \n\t"
  19.613 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.614 +        "add %[block], %[block], %[line_size]   \n\t"
  19.615 +
  19.616 +        "subs %[h], %[h], #2            \n\t" /* two output rows done; sets the flags tested by bne */
  19.617 +        "pld [%[block]]                 \n\t"
  19.618 +        "bne 1b                         \n\t"
  19.619 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
  19.620 +        :
  19.621 +        : "cc", "memory", "r12"); // "cc" added because subs rewrites the flags; r4/r5 dropped because this asm never touches them
  19.622 +}
  19.623 +
  19.624 +void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) // block = (p[x] + p[x+1] + below[x] + below[x+1] + wr15) >> 2 for h rows of 8 bytes; reads h+1 source rows
  19.625 +{ // h must be a positive even number: the loop body is unrolled twice and exits only when h hits exactly 0
  19.626 +    // wr0 wr1: 16-bit sums p[x]+p[x+1] of one source row (low / high four pixels)
  19.627 +    // wr4 wr5: the same sums for the neighbouring source row
  19.628 +    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; added to every halfword sum before the shift
  19.629 +    __asm__ volatile(
  19.630 +        "pld [%[pixels]]                \n\t"
  19.631 +        "mov r12, #2                    \n\t"
  19.632 +        "pld [%[pixels], #32]           \n\t"
  19.633 +        "tmcr wcgr0, r12                \n\t" /* for shift value */
  19.634 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.635 +        "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to an 8-byte boundary */
  19.636 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x, used by walignr1 */
  19.637 +
  19.638 +        // prologue: compute the horizontal sums of the first row into wr0 wr1
  19.639 +        // (wr4 wr5 are filled by the first half of the loop)
  19.640 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.641 +        "add r12, r12, #1               \n\t"
  19.642 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.643 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1, used by walignr2 */
  19.644 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.645 +        "cmp r12, #8                    \n\t" /* eq when x+1 starts exactly at the second doubleword */
  19.646 +        "pld [%[pixels]]                \n\t"
  19.647 +        "pld [%[pixels], #32]           \n\t"
  19.648 +        "walignr1 wr2, wr12, wr13       \n\t" /* wr2 = bytes at x */
  19.649 +        "wmoveq wr10, wr13              \n\t" /* wr10 = bytes at x+1 (whole second doubleword, or extracted) */
  19.650 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.651 +        "wunpckelub wr0, wr2            \n\t" /* widen bytes to halfwords: low half, then high half */
  19.652 +        "wunpckehub wr1, wr2            \n\t"
  19.653 +        "wunpckelub wr8, wr10           \n\t"
  19.654 +        "wunpckehub wr9, wr10           \n\t"
  19.655 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0 wr1 = p[x] + p[x+1] */
  19.656 +        "waddhus wr1, wr1, wr9          \n\t"
  19.657 +
  19.658 +        "1:                             \n\t"
  19.659 +        // first half: new row sums go to wr4 wr5,
  19.660 +        // wr0 wr1 still hold the row above
  19.661 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.662 +        "cmp r12, #8                    \n\t" /* redone every pass because the subs below overwrites the flags; covers both halves */
  19.663 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.664 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.665 +        "walignr1 wr6, wr12, wr13       \n\t"
  19.666 +        "pld [%[pixels]]                \n\t"
  19.667 +        "pld [%[pixels], #32]           \n\t"
  19.668 +        "wmoveq wr10, wr13              \n\t"
  19.669 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.670 +        "wunpckelub wr4, wr6            \n\t"
  19.671 +        "wunpckehub wr5, wr6            \n\t"
  19.672 +        "wunpckelub wr8, wr10           \n\t"
  19.673 +        "wunpckehub wr9, wr10           \n\t"
  19.674 +        "waddhus wr4, wr4, wr8          \n\t" /* wr4 wr5 = horizontal sums of the new row */
  19.675 +        "waddhus wr5, wr5, wr9          \n\t"
  19.676 +        "waddhus wr8, wr0, wr4          \n\t" /* add the two rows' sums */
  19.677 +        "waddhus wr9, wr1, wr5          \n\t"
  19.678 +        "waddhus wr8, wr8, wr15         \n\t" /* add the rounding term */
  19.679 +        "waddhus wr9, wr9, wr15         \n\t"
  19.680 +        "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
  19.681 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
  19.682 +        "wpackhus wr8, wr8, wr9         \n\t" /* narrow the eight halfwords back to bytes */
  19.683 +        "wstrd wr8, [%[block]]          \n\t"
  19.684 +        "add %[block], %[block], %[line_size]   \n\t"
  19.685 +
  19.686 +        // second half: new row sums go to wr0 wr1,
  19.687 +        // wr4 wr5 now hold the row above
  19.688 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.689 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.690 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.691 +        "walignr1 wr2, wr12, wr13       \n\t"
  19.692 +        "pld [%[pixels]]                \n\t"
  19.693 +        "pld [%[pixels], #32]           \n\t"
  19.694 +        "wmoveq wr10, wr13              \n\t" /* still relies on the cmp at the top of the loop */
  19.695 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.696 +        "wunpckelub wr0, wr2            \n\t"
  19.697 +        "wunpckehub wr1, wr2            \n\t"
  19.698 +        "wunpckelub wr8, wr10           \n\t"
  19.699 +        "wunpckehub wr9, wr10           \n\t"
  19.700 +        "waddhus wr0, wr0, wr8          \n\t"
  19.701 +        "waddhus wr1, wr1, wr9          \n\t"
  19.702 +        "waddhus wr8, wr0, wr4          \n\t"
  19.703 +        "waddhus wr9, wr1, wr5          \n\t"
  19.704 +        "waddhus wr8, wr8, wr15         \n\t"
  19.705 +        "waddhus wr9, wr9, wr15         \n\t"
  19.706 +        "wsrlhg wr8, wr8, wcgr0         \n\t"
  19.707 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
  19.708 +        "wpackhus wr8, wr8, wr9         \n\t"
  19.709 +        "subs %[h], %[h], #2            \n\t" /* two output rows done; sets the flags tested by bne */
  19.710 +        "wstrd wr8, [%[block]]          \n\t"
  19.711 +        "add %[block], %[block], %[line_size]   \n\t"
  19.712 +        "bne 1b                         \n\t"
  19.713 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  19.714 +        : [line_size]"r"(line_size)
  19.715 +        : "cc", "r12", "memory"); // "cc" is required because cmp and subs rewrite the condition flags
  19.716 +}
  19.717 +
  19.718 +void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  19.719 +{
  19.720 +    // Stores h rows of 16 bytes to block; each byte is (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2 for consecutive source rows p, q.
  19.721 +    // [wr0 wr1 wr2 wr3] and [wr4 wr5 wr6 wr7] take turns holding the previous and the current row's p[x] + p[x+1] halfword sums.
  19.722 +    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
  19.723 +    __asm__ volatile(
  19.724 +        "pld [%[pixels]]                \n\t"
  19.725 +        "mov r12, #2                    \n\t"
  19.726 +        "pld [%[pixels], #32]           \n\t"
  19.727 +        "tmcr wcgr0, r12                \n\t" /* for shift value */
  19.728 +        /* alignment */
  19.729 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.730 +        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to an 8-byte boundary */
  19.731 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, used by walignr1 to extract p[x] */
  19.732 +        "add r12, r12, #1               \n\t"
  19.733 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, used by walignr2 to extract p[x+1] */
  19.734 +
  19.735 +        // [wr0 wr1 wr2 wr3] <= *
  19.736 +        // [wr4 wr5 wr6 wr7]
  19.737 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.738 +        "cmp r12, #8                    \n\t" /* eq when offset + 1 == 8: p[x+1] is then the next aligned word unchanged */
  19.739 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.740 +        "wldrd wr14, [%[pixels], #16]   \n\t"
  19.741 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.742 +        "pld [%[pixels]]                \n\t"
  19.743 +        "walignr1 wr2, wr12, wr13       \n\t" /* wr2, wr3 = the 16 bytes starting at the original pixels address */
  19.744 +        "pld [%[pixels], #32]           \n\t"
  19.745 +        "walignr1 wr3, wr13, wr14       \n\t"
  19.746 +        "wmoveq wr10, wr13              \n\t" /* eq case: plain copies replace walignr2 */
  19.747 +        "wmoveq wr11, wr14              \n\t"
  19.748 +        "walignr2ne wr10, wr12, wr13    \n\t" /* wr10, wr11 = the same row one byte to the right */
  19.749 +        "walignr2ne wr11, wr13, wr14    \n\t"
  19.750 +        "wunpckelub wr0, wr2            \n\t" /* widen bytes to halfwords: wr0-wr3 = p[x], wr8-wr11 = p[x+1] */
  19.751 +        "wunpckehub wr1, wr2            \n\t"
  19.752 +        "wunpckelub wr2, wr3            \n\t"
  19.753 +        "wunpckehub wr3, wr3            \n\t"
  19.754 +        "wunpckelub wr8, wr10           \n\t"
  19.755 +        "wunpckehub wr9, wr10           \n\t"
  19.756 +        "wunpckelub wr10, wr11          \n\t"
  19.757 +        "wunpckehub wr11, wr11          \n\t"
  19.758 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0-wr3 = p[x] + p[x+1] for the first row */
  19.759 +        "waddhus wr1, wr1, wr9          \n\t"
  19.760 +        "waddhus wr2, wr2, wr10         \n\t"
  19.761 +        "waddhus wr3, wr3, wr11         \n\t"
  19.762 +
  19.763 +        "1:                             \n\t"
  19.764 +        // [wr0 wr1 wr2 wr3]
  19.765 +        // [wr4 wr5 wr6 wr7] <= *
  19.766 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.767 +        "cmp r12, #8                    \n\t" /* repeated every iteration: subs at the loop tail overwrites the flags */
  19.768 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.769 +        "wldrd wr14, [%[pixels], #16]   \n\t"
  19.770 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.771 +        "walignr1 wr6, wr12, wr13       \n\t"
  19.772 +        "pld [%[pixels]]                \n\t"
  19.773 +        "pld [%[pixels], #32]           \n\t"
  19.774 +        "walignr1 wr7, wr13, wr14       \n\t"
  19.775 +        "wmoveq wr10, wr13              \n\t"
  19.776 +        "wmoveq wr11, wr14              \n\t"
  19.777 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.778 +        "walignr2ne wr11, wr13, wr14    \n\t"
  19.779 +        "wunpckelub wr4, wr6            \n\t"
  19.780 +        "wunpckehub wr5, wr6            \n\t"
  19.781 +        "wunpckelub wr6, wr7            \n\t"
  19.782 +        "wunpckehub wr7, wr7            \n\t"
  19.783 +        "wunpckelub wr8, wr10           \n\t"
  19.784 +        "wunpckehub wr9, wr10           \n\t"
  19.785 +        "wunpckelub wr10, wr11          \n\t"
  19.786 +        "wunpckehub wr11, wr11          \n\t"
  19.787 +        "waddhus wr4, wr4, wr8          \n\t" /* wr4-wr7 = horizontal sums of the row just loaded */
  19.788 +        "waddhus wr5, wr5, wr9          \n\t"
  19.789 +        "waddhus wr6, wr6, wr10         \n\t"
  19.790 +        "waddhus wr7, wr7, wr11         \n\t"
  19.791 +        "waddhus wr8, wr0, wr4          \n\t" /* add the sums of the two rows */
  19.792 +        "waddhus wr9, wr1, wr5          \n\t"
  19.793 +        "waddhus wr10, wr2, wr6         \n\t"
  19.794 +        "waddhus wr11, wr3, wr7         \n\t"
  19.795 +        "waddhus wr8, wr8, wr15         \n\t" /* add the constant placed in wr15 by SET_RND */
  19.796 +        "waddhus wr9, wr9, wr15         \n\t"
  19.797 +        "waddhus wr10, wr10, wr15       \n\t"
  19.798 +        "waddhus wr11, wr11, wr15       \n\t"
  19.799 +        "wsrlhg wr8, wr8, wcgr0         \n\t" /* shift each halfword right by wcgr0 (2) */
  19.800 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
  19.801 +        "wsrlhg wr10, wr10, wcgr0       \n\t"
  19.802 +        "wsrlhg wr11, wr11, wcgr0       \n\t"
  19.803 +        "wpackhus wr8, wr8, wr9         \n\t" /* pack halfwords back to bytes */
  19.804 +        "wpackhus wr9, wr10, wr11       \n\t"
  19.805 +        "wstrd wr8, [%[block]]          \n\t" /* first output row of this iteration */
  19.806 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.807 +        "add %[block], %[block], %[line_size]   \n\t"
  19.808 +
  19.809 +        // [wr0 wr1 wr2 wr3] <= *
  19.810 +        // [wr4 wr5 wr6 wr7]
  19.811 +        "wldrd wr12, [%[pixels]]        \n\t" /* no new cmp here: nothing since the cmp at label 1 has changed the flags */
  19.812 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.813 +        "wldrd wr14, [%[pixels], #16]   \n\t"
  19.814 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.815 +        "walignr1 wr2, wr12, wr13       \n\t"
  19.816 +        "pld [%[pixels]]                \n\t"
  19.817 +        "pld [%[pixels], #32]           \n\t"
  19.818 +        "walignr1 wr3, wr13, wr14       \n\t"
  19.819 +        "wmoveq wr10, wr13              \n\t"
  19.820 +        "wmoveq wr11, wr14              \n\t"
  19.821 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.822 +        "walignr2ne wr11, wr13, wr14    \n\t"
  19.823 +        "wunpckelub wr0, wr2            \n\t"
  19.824 +        "wunpckehub wr1, wr2            \n\t"
  19.825 +        "wunpckelub wr2, wr3            \n\t"
  19.826 +        "wunpckehub wr3, wr3            \n\t"
  19.827 +        "wunpckelub wr8, wr10           \n\t"
  19.828 +        "wunpckehub wr9, wr10           \n\t"
  19.829 +        "wunpckelub wr10, wr11          \n\t"
  19.830 +        "wunpckehub wr11, wr11          \n\t"
  19.831 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0-wr3 now hold the newest row; wr4-wr7 became the previous one */
  19.832 +        "waddhus wr1, wr1, wr9          \n\t"
  19.833 +        "waddhus wr2, wr2, wr10         \n\t"
  19.834 +        "waddhus wr3, wr3, wr11         \n\t"
  19.835 +        "waddhus wr8, wr0, wr4          \n\t"
  19.836 +        "waddhus wr9, wr1, wr5          \n\t"
  19.837 +        "waddhus wr10, wr2, wr6         \n\t"
  19.838 +        "waddhus wr11, wr3, wr7         \n\t"
  19.839 +        "waddhus wr8, wr8, wr15         \n\t"
  19.840 +        "waddhus wr9, wr9, wr15         \n\t"
  19.841 +        "waddhus wr10, wr10, wr15       \n\t"
  19.842 +        "waddhus wr11, wr11, wr15       \n\t"
  19.843 +        "wsrlhg wr8, wr8, wcgr0         \n\t"
  19.844 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
  19.845 +        "wsrlhg wr10, wr10, wcgr0       \n\t"
  19.846 +        "wsrlhg wr11, wr11, wcgr0       \n\t"
  19.847 +        "wpackhus wr8, wr8, wr9         \n\t"
  19.848 +        "wpackhus wr9, wr10, wr11       \n\t"
  19.849 +        "wstrd wr8, [%[block]]          \n\t" /* second output row of this iteration */
  19.850 +        "wstrd wr9, [%[block], #8]      \n\t"
  19.851 +        "add %[block], %[block], %[line_size]   \n\t"
  19.852 +
  19.853 +        "subs %[h], %[h], #2            \n\t" /* two rows are produced per iteration */
  19.854 +        "bne 1b                         \n\t" /* leaves only when h reaches exactly 0, so h must be a positive even count */
  19.855 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  19.856 +        : [line_size]"r"(line_size)
  19.857 +        : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
  19.858 +}
  19.859 +
  19.860 +void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  19.861 +{
  19.862 +    // For h rows of 8 bytes: forms (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2 from consecutive source rows p, q, then merges it into block with WAVG2B.
  19.863 +    // [wr0 wr1] and [wr4 wr5] take turns holding the previous and the current row's p[x] + p[x+1] halfword sums (wr3, wr7 are not used here).
  19.864 +    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
  19.865 +    __asm__ volatile(
  19.866 +        "pld [%[block]]                 \n\t" /* block is read back below, so it is prefetched as well */
  19.867 +        "pld [%[block], #32]            \n\t"
  19.868 +        "pld [%[pixels]]                \n\t"
  19.869 +        "mov r12, #2                    \n\t"
  19.870 +        "pld [%[pixels], #32]           \n\t"
  19.871 +        "tmcr wcgr0, r12                \n\t" /* for shift value */
  19.872 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.873 +        "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to an 8-byte boundary */
  19.874 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, used by walignr1 to extract p[x] */
  19.875 +
  19.876 +        // [wr0 wr1 wr2 wr3] <= *
  19.877 +        // [wr4 wr5 wr6 wr7]
  19.878 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.879 +        "add r12, r12, #1               \n\t"
  19.880 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.881 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, used by walignr2 to extract p[x+1] */
  19.882 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.883 +        "cmp r12, #8                    \n\t" /* eq when offset + 1 == 8: p[x+1] is then the next aligned word unchanged */
  19.884 +        "pld [%[pixels]]                \n\t"
  19.885 +        "pld [%[pixels], #32]           \n\t"
  19.886 +        "walignr1 wr2, wr12, wr13       \n\t" /* wr2 = the 8 bytes starting at the original pixels address */
  19.887 +        "wmoveq wr10, wr13              \n\t" /* eq case: a plain copy replaces walignr2 */
  19.888 +        "walignr2ne wr10, wr12, wr13    \n\t" /* wr10 = the same row one byte to the right */
  19.889 +        "wunpckelub wr0, wr2            \n\t" /* widen bytes to halfwords */
  19.890 +        "wunpckehub wr1, wr2            \n\t"
  19.891 +        "wunpckelub wr8, wr10           \n\t"
  19.892 +        "wunpckehub wr9, wr10           \n\t"
  19.893 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0, wr1 = p[x] + p[x+1] for the first row */
  19.894 +        "waddhus wr1, wr1, wr9          \n\t"
  19.895 +
  19.896 +        "1:                             \n\t"
  19.897 +        // [wr0 wr1 wr2 wr3]
  19.898 +        // [wr4 wr5 wr6 wr7] <= *
  19.899 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.900 +        "cmp r12, #8                    \n\t" /* repeated every iteration: subs further down overwrites the flags */
  19.901 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.902 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.903 +        "walignr1 wr6, wr12, wr13       \n\t"
  19.904 +        "pld [%[pixels]]                \n\t"
  19.905 +        "pld [%[pixels], #32]           \n\t"
  19.906 +        "wmoveq wr10, wr13              \n\t"
  19.907 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.908 +        "wunpckelub wr4, wr6            \n\t"
  19.909 +        "wunpckehub wr5, wr6            \n\t"
  19.910 +        "wunpckelub wr8, wr10           \n\t"
  19.911 +        "wunpckehub wr9, wr10           \n\t"
  19.912 +        "waddhus wr4, wr4, wr8          \n\t" /* wr4, wr5 = horizontal sums of the row just loaded */
  19.913 +        "waddhus wr5, wr5, wr9          \n\t"
  19.914 +        "waddhus wr8, wr0, wr4          \n\t" /* add the sums of the two rows */
  19.915 +        "waddhus wr9, wr1, wr5          \n\t"
  19.916 +        "waddhus wr8, wr8, wr15         \n\t" /* add the constant placed in wr15 by SET_RND */
  19.917 +        "waddhus wr9, wr9, wr15         \n\t"
  19.918 +        "wldrd wr12, [%[block]]         \n\t" /* bytes currently stored at the destination */
  19.919 +        "wsrlhg wr8, wr8, wcgr0         \n\t" /* shift each halfword right by wcgr0 (2) */
  19.920 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
  19.921 +        "wpackhus wr8, wr8, wr9         \n\t" /* pack halfwords back to bytes */
  19.922 +        WAVG2B" wr8, wr8, wr12          \n\t" /* WAVG2B is defined outside this view; presumably a byte-wise average with the destination - confirm */
  19.923 +        "wstrd wr8, [%[block]]          \n\t" /* first output row of this iteration */
  19.924 +        "add %[block], %[block], %[line_size]   \n\t"
  19.925 +        "wldrd wr12, [%[pixels]]        \n\t" /* first load for the next source row, issued ahead of the prefetches */
  19.926 +        "pld [%[block]]                 \n\t"
  19.927 +        "pld [%[block], #32]            \n\t"
  19.928 +
  19.929 +        // [wr0 wr1 wr2 wr3] <= *
  19.930 +        // [wr4 wr5 wr6 wr7]
  19.931 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.932 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.933 +        "walignr1 wr2, wr12, wr13       \n\t"
  19.934 +        "pld [%[pixels]]                \n\t"
  19.935 +        "pld [%[pixels], #32]           \n\t"
  19.936 +        "wmoveq wr10, wr13              \n\t" /* still uses the flags from the cmp at label 1; subs comes later */
  19.937 +        "walignr2ne wr10, wr12, wr13    \n\t"
  19.938 +        "wunpckelub wr0, wr2            \n\t"
  19.939 +        "wunpckehub wr1, wr2            \n\t"
  19.940 +        "wunpckelub wr8, wr10           \n\t"
  19.941 +        "wunpckehub wr9, wr10           \n\t"
  19.942 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0, wr1 now hold the newest row; wr4, wr5 became the previous one */
  19.943 +        "waddhus wr1, wr1, wr9          \n\t"
  19.944 +        "waddhus wr8, wr0, wr4          \n\t"
  19.945 +        "waddhus wr9, wr1, wr5          \n\t"
  19.946 +        "waddhus wr8, wr8, wr15         \n\t"
  19.947 +        "waddhus wr9, wr9, wr15         \n\t"
  19.948 +        "wldrd wr12, [%[block]]         \n\t"
  19.949 +        "wsrlhg wr8, wr8, wcgr0         \n\t"
  19.950 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
  19.951 +        "wpackhus wr8, wr8, wr9         \n\t"
  19.952 +        "subs %[h], %[h], #2            \n\t" /* two rows are produced per iteration */
  19.953 +        WAVG2B" wr8, wr8, wr12          \n\t"
  19.954 +        "wstrd wr8, [%[block]]          \n\t" /* second output row of this iteration */
  19.955 +        "add %[block], %[block], %[line_size]   \n\t"
  19.956 +        "pld [%[block]]                 \n\t"
  19.957 +        "pld [%[block], #32]            \n\t"
  19.958 +        "bne 1b                         \n\t" /* leaves only when h reaches exactly 0, so h must be a positive even count */
  19.959 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
  19.960 +        : [line_size]"r"(line_size)
  19.961 +        : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
  19.962 +}
  19.963 +
  19.964 +void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  19.965 +{
  19.966 +    // For h rows of 16 bytes: forms (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2 from consecutive source rows p, q, then merges it into block with WAVG2B.
  19.967 +    // [wr0 wr1 wr2 wr3] and [wr4 wr5 wr6 wr7] take turns holding the previous and the current row's p[x] + p[x+1] halfword sums.
  19.968 +    SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
  19.969 +    __asm__ volatile(
  19.970 +        "pld [%[block]]                 \n\t" /* block is read back below, so it is prefetched as well */
  19.971 +        "pld [%[block], #32]            \n\t"
  19.972 +        "pld [%[pixels]]                \n\t"
  19.973 +        "mov r12, #2                    \n\t"
  19.974 +        "pld [%[pixels], #32]           \n\t"
  19.975 +        "tmcr wcgr0, r12                \n\t" /* for shift value */
  19.976 +        /* alignment */
  19.977 +        "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
  19.978 +        "bic %[pixels], %[pixels], #7           \n\t" /* round pixels down to an 8-byte boundary */
  19.979 +        "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, used by walignr1 to extract p[x] */
  19.980 +        "add r12, r12, #1               \n\t"
  19.981 +        "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, used by walignr2 to extract p[x+1] */
  19.982 +
  19.983 +        // [wr0 wr1 wr2 wr3] <= *
  19.984 +        // [wr4 wr5 wr6 wr7]
  19.985 +        "wldrd wr12, [%[pixels]]        \n\t"
  19.986 +        "cmp r12, #8                    \n\t" /* eq when offset + 1 == 8: p[x+1] is then the next aligned word unchanged */
  19.987 +        "wldrd wr13, [%[pixels], #8]    \n\t"
  19.988 +        "wldrd wr14, [%[pixels], #16]   \n\t"
  19.989 +        "add %[pixels], %[pixels], %[line_size] \n\t"
  19.990 +        "pld [%[pixels]]                \n\t"
  19.991 +        "walignr1 wr2, wr12, wr13       \n\t" /* wr2, wr3 = the 16 bytes starting at the original pixels address */
  19.992 +        "pld [%[pixels], #32]           \n\t"
  19.993 +        "walignr1 wr3, wr13, wr14       \n\t"
  19.994 +        "wmoveq wr10, wr13              \n\t" /* eq case: plain copies replace walignr2 */
  19.995 +        "wmoveq wr11, wr14              \n\t"
  19.996 +        "walignr2ne wr10, wr12, wr13    \n\t" /* wr10, wr11 = the same row one byte to the right */
  19.997 +        "walignr2ne wr11, wr13, wr14    \n\t"
  19.998 +        "wunpckelub wr0, wr2            \n\t" /* widen bytes to halfwords: wr0-wr3 = p[x], wr8-wr11 = p[x+1] */
  19.999 +        "wunpckehub wr1, wr2            \n\t"
 19.1000 +        "wunpckelub wr2, wr3            \n\t"
 19.1001 +        "wunpckehub wr3, wr3            \n\t"
 19.1002 +        "wunpckelub wr8, wr10           \n\t"
 19.1003 +        "wunpckehub wr9, wr10           \n\t"
 19.1004 +        "wunpckelub wr10, wr11          \n\t"
 19.1005 +        "wunpckehub wr11, wr11          \n\t"
 19.1006 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0-wr3 = p[x] + p[x+1] for the first row */
 19.1007 +        "waddhus wr1, wr1, wr9          \n\t"
 19.1008 +        "waddhus wr2, wr2, wr10         \n\t"
 19.1009 +        "waddhus wr3, wr3, wr11         \n\t"
 19.1010 +
 19.1011 +        "1:                             \n\t"
 19.1012 +        // [wr0 wr1 wr2 wr3]
 19.1013 +        // [wr4 wr5 wr6 wr7] <= *
 19.1014 +        "wldrd wr12, [%[pixels]]        \n\t"
 19.1015 +        "cmp r12, #8                    \n\t" /* repeated every iteration: subs at the loop tail overwrites the flags */
 19.1016 +        "wldrd wr13, [%[pixels], #8]    \n\t"
 19.1017 +        "wldrd wr14, [%[pixels], #16]   \n\t"
 19.1018 +        "add %[pixels], %[pixels], %[line_size] \n\t"
 19.1019 +        "walignr1 wr6, wr12, wr13       \n\t"
 19.1020 +        "pld [%[pixels]]                \n\t"
 19.1021 +        "pld [%[pixels], #32]           \n\t"
 19.1022 +        "walignr1 wr7, wr13, wr14       \n\t"
 19.1023 +        "wmoveq wr10, wr13              \n\t"
 19.1024 +        "wmoveq wr11, wr14              \n\t"
 19.1025 +        "walignr2ne wr10, wr12, wr13    \n\t"
 19.1026 +        "walignr2ne wr11, wr13, wr14    \n\t"
 19.1027 +        "wunpckelub wr4, wr6            \n\t"
 19.1028 +        "wunpckehub wr5, wr6            \n\t"
 19.1029 +        "wunpckelub wr6, wr7            \n\t"
 19.1030 +        "wunpckehub wr7, wr7            \n\t"
 19.1031 +        "wunpckelub wr8, wr10           \n\t"
 19.1032 +        "wunpckehub wr9, wr10           \n\t"
 19.1033 +        "wunpckelub wr10, wr11          \n\t"
 19.1034 +        "wunpckehub wr11, wr11          \n\t"
 19.1035 +        "waddhus wr4, wr4, wr8          \n\t" /* wr4-wr7 = horizontal sums of the row just loaded */
 19.1036 +        "waddhus wr5, wr5, wr9          \n\t"
 19.1037 +        "waddhus wr6, wr6, wr10         \n\t"
 19.1038 +        "waddhus wr7, wr7, wr11         \n\t"
 19.1039 +        "waddhus wr8, wr0, wr4          \n\t" /* add the sums of the two rows */
 19.1040 +        "waddhus wr9, wr1, wr5          \n\t"
 19.1041 +        "waddhus wr10, wr2, wr6         \n\t"
 19.1042 +        "waddhus wr11, wr3, wr7         \n\t"
 19.1043 +        "waddhus wr8, wr8, wr15         \n\t" /* add the constant placed in wr15 by SET_RND */
 19.1044 +        "waddhus wr9, wr9, wr15         \n\t"
 19.1045 +        "waddhus wr10, wr10, wr15       \n\t"
 19.1046 +        "waddhus wr11, wr11, wr15       \n\t"
 19.1047 +        "wsrlhg wr8, wr8, wcgr0         \n\t" /* shift each halfword right by wcgr0 (2) */
 19.1048 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
 19.1049 +        "wldrd wr12, [%[block]]         \n\t" /* bytes currently stored at the destination */
 19.1050 +        "wldrd wr13, [%[block], #8]     \n\t"
 19.1051 +        "wsrlhg wr10, wr10, wcgr0       \n\t"
 19.1052 +        "wsrlhg wr11, wr11, wcgr0       \n\t"
 19.1053 +        "wpackhus wr8, wr8, wr9         \n\t" /* pack halfwords back to bytes */
 19.1054 +        "wpackhus wr9, wr10, wr11       \n\t"
 19.1055 +        WAVG2B" wr8, wr8, wr12          \n\t" /* WAVG2B is defined outside this view; presumably a byte-wise average with the destination - confirm */
 19.1056 +        WAVG2B" wr9, wr9, wr13          \n\t"
 19.1057 +        "wstrd wr8, [%[block]]          \n\t" /* first output row of this iteration */
 19.1058 +        "wstrd wr9, [%[block], #8]      \n\t"
 19.1059 +        "add %[block], %[block], %[line_size]   \n\t"
 19.1060 +
 19.1061 +        // [wr0 wr1 wr2 wr3] <= *
 19.1062 +        // [wr4 wr5 wr6 wr7]
 19.1063 +        "wldrd wr12, [%[pixels]]        \n\t" /* no new cmp here: nothing since the cmp at label 1 has changed the flags */
 19.1064 +        "pld [%[block]]                 \n\t"
 19.1065 +        "wldrd wr13, [%[pixels], #8]    \n\t"
 19.1066 +        "pld [%[block], #32]            \n\t"
 19.1067 +        "wldrd wr14, [%[pixels], #16]   \n\t"
 19.1068 +        "add %[pixels], %[pixels], %[line_size] \n\t"
 19.1069 +        "walignr1 wr2, wr12, wr13       \n\t"
 19.1070 +        "pld [%[pixels]]                \n\t"
 19.1071 +        "pld [%[pixels], #32]           \n\t"
 19.1072 +        "walignr1 wr3, wr13, wr14       \n\t"
 19.1073 +        "wmoveq wr10, wr13              \n\t"
 19.1074 +        "wmoveq wr11, wr14              \n\t"
 19.1075 +        "walignr2ne wr10, wr12, wr13    \n\t"
 19.1076 +        "walignr2ne wr11, wr13, wr14    \n\t"
 19.1077 +        "wunpckelub wr0, wr2            \n\t"
 19.1078 +        "wunpckehub wr1, wr2            \n\t"
 19.1079 +        "wunpckelub wr2, wr3            \n\t"
 19.1080 +        "wunpckehub wr3, wr3            \n\t"
 19.1081 +        "wunpckelub wr8, wr10           \n\t"
 19.1082 +        "wunpckehub wr9, wr10           \n\t"
 19.1083 +        "wunpckelub wr10, wr11          \n\t"
 19.1084 +        "wunpckehub wr11, wr11          \n\t"
 19.1085 +        "waddhus wr0, wr0, wr8          \n\t" /* wr0-wr3 now hold the newest row; wr4-wr7 became the previous one */
 19.1086 +        "waddhus wr1, wr1, wr9          \n\t"
 19.1087 +        "waddhus wr2, wr2, wr10         \n\t"
 19.1088 +        "waddhus wr3, wr3, wr11         \n\t"
 19.1089 +        "waddhus wr8, wr0, wr4          \n\t"
 19.1090 +        "waddhus wr9, wr1, wr5          \n\t"
 19.1091 +        "waddhus wr10, wr2, wr6         \n\t"
 19.1092 +        "waddhus wr11, wr3, wr7         \n\t"
 19.1093 +        "waddhus wr8, wr8, wr15         \n\t"
 19.1094 +        "waddhus wr9, wr9, wr15         \n\t"
 19.1095 +        "waddhus wr10, wr10, wr15       \n\t"
 19.1096 +        "waddhus wr11, wr11, wr15       \n\t"
 19.1097 +        "wsrlhg wr8, wr8, wcgr0         \n\t"
 19.1098 +        "wsrlhg wr9, wr9, wcgr0         \n\t"
 19.1099 +        "wldrd wr12, [%[block]]         \n\t"
 19.1100 +        "wldrd wr13, [%[block], #8]     \n\t"
 19.1101 +        "wsrlhg wr10, wr10, wcgr0       \n\t"
 19.1102 +        "wsrlhg wr11, wr11, wcgr0       \n\t"
 19.1103 +        "wpackhus wr8, wr8, wr9         \n\t"
 19.1104 +        "wpackhus wr9, wr10, wr11       \n\t"
 19.1105 +        WAVG2B" wr8, wr8, wr12          \n\t"
 19.1106 +        WAVG2B" wr9, wr9, wr13          \n\t"
 19.1107 +        "wstrd wr8, [%[block]]          \n\t" /* second output row of this iteration */
 19.1108 +        "wstrd wr9, [%[block], #8]      \n\t"
 19.1109 +        "add %[block], %[block], %[line_size]   \n\t"
 19.1110 +        "subs %[h], %[h], #2            \n\t" /* two rows are produced per iteration */
 19.1111 +        "pld [%[block]]                 \n\t"
 19.1112 +        "pld [%[block], #32]            \n\t"
 19.1113 +        "bne 1b                         \n\t" /* leaves only when h reaches exactly 0, so h must be a positive even count */
 19.1114 +        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 19.1115 +        : [line_size]"r"(line_size)
 19.1116 +        : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
 19.1117 +}

    20.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    20.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_neon.S	Mon Aug 27 12:09:56 2012 +0200
    20.3 @@ -0,0 +1,1146 @@
    20.4 +/*
    20.5 + * ARM NEON optimised DSP functions
    20.6 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
    20.7 + *
    20.8 + * This file is part of FFmpeg.
    20.9 + *
   20.10 + * FFmpeg is free software; you can redistribute it and/or
   20.11 + * modify it under the terms of the GNU Lesser General Public
   20.12 + * License as published by the Free Software Foundation; either
   20.13 + * version 2.1 of the License, or (at your option) any later version.
   20.14 + *
   20.15 + * FFmpeg is distributed in the hope that it will be useful,
   20.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   20.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   20.18 + * Lesser General Public License for more details.
   20.19 + *
   20.20 + * You should have received a copy of the GNU Lesser General Public
   20.21 + * License along with FFmpeg; if not, write to the Free Software
   20.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   20.23 + */
   20.24 +
   20.25 +#include "config.h"
   20.26 +#include "asm.S"
   20.27 +
   20.28 +        preserve8
   20.29 +        .text
   20.30 +
   20.31 +        .macro pixels16 avg=0 /* Body for a 16-byte-wide row copy: r1 = source (read), r0 = destination (written), r2 = stride added to both, r3 = row count. With avg=1 each row is averaged with the bytes already at the destination. */
   20.32 +.if \avg
   20.33 +        mov             ip,  r0 /* ip: separate read cursor over the destination; r0 remains the write cursor */
   20.34 +.endif
   20.35 +1:      vld1.64         {d0, d1},  [r1], r2 /* four source rows per pass go into q0-q3 */
   20.36 +        vld1.64         {d2, d3},  [r1], r2
   20.37 +        vld1.64         {d4, d5},  [r1], r2
   20.38 +        pld             [r1, r2, lsl #2] /* prefetch source rows needed by later passes */
   20.39 +        vld1.64         {d6, d7},  [r1], r2
   20.40 +        pld             [r1]
   20.41 +        pld             [r1, r2]
   20.42 +        pld             [r1, r2, lsl #1]
   20.43 +.if \avg
   20.44 +        vld1.64         {d16,d17}, [ip,:128], r2 /* existing destination row; :128 declares the address 16-byte aligned */
   20.45 +        vrhadd.u8       q0,  q0,  q8 /* rounding halving add per byte: (a + b + 1) >> 1 */
   20.46 +        vld1.64         {d18,d19}, [ip,:128], r2
   20.47 +        vrhadd.u8       q1,  q1,  q9
   20.48 +        vld1.64         {d20,d21}, [ip,:128], r2
   20.49 +        vrhadd.u8       q2,  q2,  q10
   20.50 +        vld1.64         {d22,d23}, [ip,:128], r2
   20.51 +        vrhadd.u8       q3,  q3,  q11
   20.52 +.endif
   20.53 +        subs            r3,  r3,  #4 /* four rows handled per pass */
   20.54 +        vst1.64         {d0, d1},  [r0,:128], r2
   20.55 +        vst1.64         {d2, d3},  [r0,:128], r2
   20.56 +        vst1.64         {d4, d5},  [r0,:128], r2
   20.57 +        vst1.64         {d6, d7},  [r0,:128], r2
   20.58 +        bne             1b /* loops until r3 is exactly 0, so r3 must start as a positive multiple of 4 */
   20.59 +        bx              lr
   20.60 +        .endm
   20.61 +
   20.62 +        .macro pixels16_x2 vhadd=vrhadd.u8 /* Body for 16-wide horizontal averaging: dst[x] = \vhadd(src[x], src[x+1]). r1 = source, r0 = destination, r2 = stride, r3 = row count. The default op rounds; callers may pass another op. */
   20.63 +1:      vld1.64         {d0-d2},   [r1], r2 /* 24 bytes are loaded per row; the vext below uses only the first 17 */
   20.64 +        vld1.64         {d4-d6},   [r1], r2
   20.65 +        pld             [r1]
   20.66 +        pld             [r1, r2]
   20.67 +        subs            r3,  r3,  #2 /* two rows handled per pass */
   20.68 +        vext.8          q1,  q0,  q1,  #1 /* q1 = the row shifted by one byte, i.e. src[x+1] */
   20.69 +        \vhadd          q0,  q0,  q1
   20.70 +        vext.8          q3,  q2,  q3,  #1
   20.71 +        \vhadd          q2,  q2,  q3
   20.72 +        vst1.64         {d0, d1},  [r0,:128], r2 /* :128 declares the destination 16-byte aligned */
   20.73 +        vst1.64         {d4, d5},  [r0,:128], r2
   20.74 +        bne             1b /* loops until r3 is exactly 0, so r3 must start positive and even */
   20.75 +        bx              lr
   20.76 +        .endm
   20.77 +
   20.78 +        .macro pixels16_y2 vhadd=vrhadd.u8 /* Body for 16-wide vertical averaging: dst row n = \vhadd(src row n, src row n+1). r1 = source, r0 = destination, r2 = stride, r3 = row count. */
   20.79 +        vld1.64         {d0, d1},  [r1], r2 /* the first two source rows are loaded before the loop */
   20.80 +        vld1.64         {d2, d3},  [r1], r2
   20.81 +1:      subs            r3,  r3,  #2 /* two rows handled per pass */
   20.82 +        \vhadd          q2,  q0,  q1
   20.83 +        vld1.64         {d0, d1},  [r1], r2 /* q0 is replaced by the next row while q1 is kept for the second average */
   20.84 +        \vhadd          q3,  q0,  q1
   20.85 +        vld1.64         {d2, d3},  [r1], r2 /* NOTE(review): on the last pass this row is loaded but never used (r3 + 2 rows are read, r3 + 1 needed) - confirm it is always readable */
   20.86 +        pld             [r1]
   20.87 +        pld             [r1, r2]
   20.88 +        vst1.64         {d4, d5},  [r0,:128], r2 /* :128 declares the destination 16-byte aligned */
   20.89 +        vst1.64         {d6, d7},  [r0,:128], r2
   20.90 +        bne             1b /* flags still come from the subs above; loops until r3 is exactly 0 */
   20.91 +        bx              lr
   20.92 +        .endm
   20.93 +
   20.94 +        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0 /* Body for 16-wide 2x2 averaging: each output byte is the narrowed (s[x] + s[x+1] + t[x] + t[x+1]) >> 2 of two consecutive rows s, t. \vshrn is the narrowing shift; with no_rnd=1 a bias of 1 is added first. r1 = source, r0 = destination, r2 = stride, r3 = row count. */
   20.95 +        vld1.64         {d0-d2},   [r1], r2 /* the first two source rows are loaded before the loop */
   20.96 +        vld1.64         {d4-d6},   [r1], r2
   20.97 +.if \no_rnd
   20.98 +        vmov.i16        q13, #1 /* bias added to every 16-bit sum before the shift */
   20.99 +.endif
  20.100 +        pld             [r1]
  20.101 +        pld             [r1, r2]
  20.102 +        vext.8          q1,  q0,  q1,  #1 /* rows shifted by one byte, i.e. s[x+1] */
  20.103 +        vext.8          q3,  q2,  q3,  #1
  20.104 +        vaddl.u8        q8,  d0,  d2 /* widening adds: q8/q10 = 16-bit horizontal sums of row 0 (low/high 8 columns) */
  20.105 +        vaddl.u8        q10, d1,  d3
  20.106 +        vaddl.u8        q9,  d4,  d6 /* q9/q11 = the same for row 1 */
  20.107 +        vaddl.u8        q11, d5,  d7
  20.108 +1:      subs            r3,  r3,  #2 /* two rows handled per pass */
  20.109 +        vld1.64         {d0-d2},   [r1], r2
  20.110 +        vadd.u16        q12, q8,  q9 /* vertical add of the two rows' sums, low 8 columns */
  20.111 +        pld             [r1]
  20.112 +.if \no_rnd
  20.113 +        vadd.u16        q12, q12, q13
  20.114 +.endif
  20.115 +        vext.8          q15, q0,  q1,  #1
  20.116 +        vadd.u16        q1 , q10, q11 /* same for the high 8 columns */
  20.117 +        \vshrn          d28, q12, #2 /* shift right by 2 and narrow back to bytes */
  20.118 +.if \no_rnd
  20.119 +        vadd.u16        q1,  q1,  q13
  20.120 +.endif
  20.121 +        \vshrn          d29, q1,  #2
  20.122 +        vaddl.u8        q8,  d0,  d30 /* q8/q10 are refilled with the sums of the row just loaded */
  20.123 +        vld1.64         {d2-d4},   [r1], r2 /* NOTE(review): on the last pass this row feeds no stored output (r3 + 2 rows are read, r3 + 1 needed) - confirm it is always readable */
  20.124 +        vaddl.u8        q10, d1,  d31
  20.125 +        vst1.64         {d28,d29}, [r0,:128], r2 /* first output row of this pass; :128 declares 16-byte alignment */
  20.126 +        vadd.u16        q12, q8,  q9
  20.127 +        pld             [r1, r2]
  20.128 +.if \no_rnd
  20.129 +        vadd.u16        q12, q12, q13
  20.130 +.endif
  20.131 +        vext.8          q2,  q1,  q2,  #1
  20.132 +        vadd.u16        q0,  q10, q11
  20.133 +        \vshrn          d30, q12, #2
  20.134 +.if \no_rnd
  20.135 +        vadd.u16        q0,  q0,  q13
  20.136 +.endif
  20.137 +        \vshrn          d31, q0,  #2
  20.138 +        vaddl.u8        q9,  d2,  d4 /* q9/q11 are refilled for the next pass */
  20.139 +        vaddl.u8        q11, d3,  d5
  20.140 +        vst1.64         {d30,d31}, [r0,:128], r2 /* second output row of this pass */
  20.141 +        bgt             1b /* flags still come from the subs at label 1 */
  20.142 +        bx              lr
  20.143 +        .endm
  20.144 +
  20.145 +        .macro pixels8 avg=0 /* Body for an 8-byte-wide row copy: r1 = source (read), r0 = destination (written), r2 = stride added to both, r3 = row count. With avg=1 each row is averaged with the bytes already at the destination. */
  20.146 +1:      vld1.64         {d0}, [r1], r2 /* four source rows per pass go into d0-d3 */
  20.147 +        vld1.64         {d1}, [r1], r2
  20.148 +        vld1.64         {d2}, [r1], r2
  20.149 +        pld             [r1, r2, lsl #2] /* prefetch source rows needed by later passes */
  20.150 +        vld1.64         {d3}, [r1], r2
  20.151 +        pld             [r1]
  20.152 +        pld             [r1, r2]
  20.153 +        pld             [r1, r2, lsl #1]
  20.154 +.if \avg
  20.155 +        vld1.64         {d4}, [r0,:64], r2 /* existing destination row; :64 declares the address 8-byte aligned */
  20.156 +        vrhadd.u8       d0,  d0,  d4 /* rounding halving add per byte: (a + b + 1) >> 1 */
  20.157 +        vld1.64         {d5}, [r0,:64], r2
  20.158 +        vrhadd.u8       d1,  d1,  d5
  20.159 +        vld1.64         {d6}, [r0,:64], r2
  20.160 +        vrhadd.u8       d2,  d2,  d6
  20.161 +        vld1.64         {d7}, [r0,:64], r2
  20.162 +        vrhadd.u8       d3,  d3,  d7
  20.163 +        sub             r0,  r0,  r2,  lsl #2 /* rewind r0 by the four rows just read so the stores below overwrite them */
  20.164 +.endif
  20.165 +        subs            r3,  r3,  #4 /* four rows handled per pass */
  20.166 +        vst1.64         {d0}, [r0,:64], r2
  20.167 +        vst1.64         {d1}, [r0,:64], r2
  20.168 +        vst1.64         {d2}, [r0,:64], r2
  20.169 +        vst1.64         {d3}, [r0,:64], r2
  20.170 +        bne             1b /* loops until r3 is exactly 0, so r3 must start as a positive multiple of 4 */
  20.171 +        bx              lr
  20.172 +        .endm
  20.173 +
  20.174 +        .macro pixels8_x2 vhadd=vrhadd.u8 /* Body for 8-wide horizontal averaging: dst[x] = \vhadd(src[x], src[x+1]). r1 = source, r0 = destination, r2 = stride, r3 = row count. */
  20.175 +1:      vld1.64         {d0, d1},  [r1], r2 /* 16 bytes are loaded per row; the vext below uses only the first 9 */
  20.176 +        vext.8          d1,  d0,  d1,  #1 /* d1 = the row shifted by one byte, i.e. src[x+1] */
  20.177 +        vld1.64         {d2, d3},  [r1], r2
  20.178 +        vext.8          d3,  d2,  d3,  #1
  20.179 +        pld             [r1]
  20.180 +        pld             [r1, r2]
  20.181 +        subs            r3,  r3,  #2 /* two rows handled per pass */
  20.182 +        vswp            d1,  d2 /* regroup so q0 = both rows and q1 = both shifted rows: one q-wide op then covers two rows */
  20.183 +        \vhadd          q0,  q0,  q1
  20.184 +        vst1.64         {d0},      [r0,:64], r2 /* :64 declares the destination 8-byte aligned */
  20.185 +        vst1.64         {d1},      [r0,:64], r2
  20.186 +        bne             1b /* loops until r3 is exactly 0, so r3 must start positive and even */
  20.187 +        bx              lr
  20.188 +        .endm
  20.189 +
  20.190 +        .macro pixels8_y2 vhadd=vrhadd.u8 /* Body for 8-wide vertical averaging: dst row n = \vhadd(src row n, src row n+1). r1 = source, r0 = destination, r2 = stride, r3 = row count. */
  20.191 +        vld1.64         {d0},      [r1], r2 /* the first two source rows are loaded before the loop */
  20.192 +        vld1.64         {d1},      [r1], r2
  20.193 +1:      subs            r3,  r3,  #2 /* two rows handled per pass */
  20.194 +        \vhadd          d4,  d0,  d1
  20.195 +        vld1.64         {d0},      [r1], r2 /* d0 is replaced by the next row while d1 is kept for the second average */
  20.196 +        \vhadd          d5,  d0,  d1
  20.197 +        vld1.64         {d1},      [r1], r2 /* NOTE(review): on the last pass this row is loaded but never used (r3 + 2 rows are read, r3 + 1 needed) - confirm it is always readable */
  20.198 +        pld             [r1]
  20.199 +        pld             [r1, r2]
  20.200 +        vst1.64         {d4},      [r0,:64], r2 /* :64 declares the destination 8-byte aligned */
  20.201 +        vst1.64         {d5},      [r0,:64], r2
  20.202 +        bne             1b /* flags still come from the subs above; loops until r3 is exactly 0 */
  20.203 +        bx              lr
  20.204 +        .endm
  20.205 +
  20.206 +        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0 /* Body for 8-wide 2x2 averaging: each output byte is the narrowed (s[x] + s[x+1] + t[x] + t[x+1]) >> 2 of two consecutive rows s, t. \vshrn is the narrowing shift; with no_rnd=1 a bias of 1 is added first. r1 = source, r0 = destination, r2 = stride, r3 = row count. */
  20.207 +        vld1.64         {d0, d1},  [r1], r2 /* the first two source rows are loaded before the loop */
  20.208 +        vld1.64         {d2, d3},  [r1], r2
  20.209 +.if \no_rnd
  20.210 +        vmov.i16        q11, #1 /* bias added to every 16-bit sum before the shift */
  20.211 +.endif
  20.212 +        pld             [r1]
  20.213 +        pld             [r1, r2]
  20.214 +        vext.8          d4,  d0,  d1,  #1 /* rows shifted by one byte, i.e. s[x+1] */
  20.215 +        vext.8          d6,  d2,  d3,  #1
  20.216 +        vaddl.u8        q8,  d0,  d4 /* widening adds: q8 = 16-bit horizontal sums of row 0, q9 = of row 1 */
  20.217 +        vaddl.u8        q9,  d2,  d6
  20.218 +1:      subs            r3,  r3,  #2 /* two rows handled per pass */
  20.219 +        vld1.64         {d0, d1},  [r1], r2
  20.220 +        pld             [r1]
  20.221 +        vadd.u16        q10, q8,  q9 /* vertical add of the two rows' sums */
  20.222 +        vext.8          d4,  d0,  d1,  #1
  20.223 +.if \no_rnd
  20.224 +        vadd.u16        q10, q10, q11
  20.225 +.endif
  20.226 +        vaddl.u8        q8,  d0,  d4 /* q8 is refilled with the sums of the row just loaded */
  20.227 +        \vshrn          d5,  q10, #2 /* shift right by 2 and narrow back to bytes */
  20.228 +        vld1.64         {d2, d3},  [r1], r2 /* NOTE(review): on the last pass this row feeds no stored output (r3 + 2 rows are read, r3 + 1 needed) - confirm it is always readable */
  20.229 +        vadd.u16        q10, q8,  q9
  20.230 +        pld             [r1, r2]
  20.231 +.if \no_rnd
  20.232 +        vadd.u16        q10, q10, q11
  20.233 +.endif
  20.234 +        vst1.64         {d5},      [r0,:64], r2 /* first output row of this pass; :64 declares 8-byte alignment */
  20.235 +        \vshrn          d7,  q10, #2
  20.236 +        vext.8          d6,  d2,  d3,  #1
  20.237 +        vaddl.u8        q9,  d2,  d6 /* q9 is refilled for the next pass */
  20.238 +        vst1.64         {d7},      [r0,:64], r2 /* second output row of this pass */
  20.239 +        bgt             1b /* flags still come from the subs at label 1 */
  20.240 +        bx              lr
  20.241 +        .endm
  20.242 +
  20.243 +        .macro pixfunc pfx name suf rnd_op args:vararg  @ emits one exported function named ff_<pfx><name><suf>_neon
  20.244 +function ff_\pfx\name\suf\()_neon, export=1
  20.245 +        \name \rnd_op \args  @ the function body is the macro called <name>, invoked with rnd_op and any extra args
  20.246 +endfunc
  20.247 +        .endm
  20.248 +
  20.249 +        .macro pixfunc2 pfx name args:vararg  @ emits two functions from one body macro
  20.250 +        pixfunc \pfx \name  @ first: no suffix, body macro used with its default parameters
  20.251 +        pixfunc \pfx \name \args  @ second: args supply the name suffix and the body macro parameters
  20.252 +        .endm
  20.253 +
  20.254 +function ff_put_h264_qpel16_mc00_neon, export=1  @ entry point that only presets the row count; this body contains no return instruction
  20.255 +        mov             r3,  #16  @ r3 = 16 rows
  20.256 +endfunc  @ NOTE(review): execution presumably falls through into ff_put_pixels16_neon emitted just below - confirm endfunc emits no instructions
  20.257 +
  20.258 +        pixfunc  put_ pixels16  @ defines ff_put_pixels16_neon (pixels16 macro is defined outside this view)
  20.259 +        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8  @ defines ff_put_pixels16_x2_neon and ff_put_pixels16_x2_no_rnd_neon
  20.260 +        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8  @ defines ff_put_pixels16_y2_neon and ff_put_pixels16_y2_no_rnd_neon
  20.261 +        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1  @ defines ff_put_pixels16_xy2_neon and ff_put_pixels16_xy2_no_rnd_neon
  20.262 +
  20.263 +function ff_avg_h264_qpel16_mc00_neon, export=1  @ entry point that only presets the row count; this body contains no return instruction
  20.264 +        mov             r3,  #16  @ r3 = 16 rows
  20.265 +endfunc  @ NOTE(review): execution presumably falls through into ff_avg_pixels16_neon emitted just below - confirm endfunc emits no instructions
  20.266 +
  20.267 +        pixfunc  avg_ pixels16,, 1  @ defines ff_avg_pixels16_neon: empty suffix, and 1 is handed to the pixels16 macro (defined outside this view; presumably selects averaging - confirm)
  20.268 +
  20.269 +function ff_put_h264_qpel8_mc00_neon, export=1  @ entry point that only presets the row count; this body contains no return instruction
  20.270 +        mov             r3,  #8  @ r3 = 8 rows
  20.271 +endfunc  @ NOTE(review): execution presumably falls through into ff_put_pixels8_neon emitted just below - confirm endfunc emits no instructions
  20.272 +
  20.273 +        pixfunc  put_ pixels8  @ defines ff_put_pixels8_neon (pixels8 macro is defined outside this view)
  20.274 +        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8  @ defines ff_put_pixels8_x2_neon and ff_put_pixels8_x2_no_rnd_neon
  20.275 +        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8  @ defines ff_put_pixels8_y2_neon and ff_put_pixels8_y2_no_rnd_neon
  20.276 +        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1  @ defines ff_put_pixels8_xy2_neon and ff_put_pixels8_xy2_no_rnd_neon (truncating shift, no_rnd=1)
  20.277 +
  20.278 +function ff_avg_h264_qpel8_mc00_neon, export=1  @ entry point that only presets the row count; this body contains no return instruction
  20.279 +        mov             r3,  #8  @ r3 = 8 rows
  20.280 +endfunc  @ NOTE(review): execution presumably falls through into ff_avg_pixels8_neon emitted just below - confirm endfunc emits no instructions
  20.281 +
  20.282 +        pixfunc  avg_ pixels8,, 1  @ defines ff_avg_pixels8_neon: empty suffix, and 1 is handed to the pixels8 macro (defined outside this view; presumably selects averaging - confirm)
  20.283 +
  20.284 +function ff_put_pixels_clamped_neon, export=1  @ writes an 8x8 block of 16-bit values as bytes saturated to 0..255; r0 = 16-bit input (128-bit aligned), r1 = byte output, r2 = output line stride
  20.285 +        vld1.64         {d16-d19}, [r0,:128]!  @ input rows 0-1 (16 values)
  20.286 +        vqmovun.s16     d0, q8  @ signed 16-bit -> unsigned 8-bit with saturation
  20.287 +        vld1.64         {d20-d23}, [r0,:128]!  @ input rows 2-3
  20.288 +        vqmovun.s16     d1, q9
  20.289 +        vld1.64         {d24-d27}, [r0,:128]!  @ input rows 4-5
  20.290 +        vqmovun.s16     d2, q10
  20.291 +        vld1.64         {d28-d31}, [r0,:128]!  @ input rows 6-7
  20.292 +        vqmovun.s16     d3, q11
  20.293 +        vst1.64         {d0},      [r1,:64], r2  @ stores are interleaved with the remaining conversions; one 8-byte row each
  20.294 +        vqmovun.s16     d4, q12
  20.295 +        vst1.64         {d1},      [r1,:64], r2
  20.296 +        vqmovun.s16     d5, q13
  20.297 +        vst1.64         {d2},      [r1,:64], r2
  20.298 +        vqmovun.s16     d6, q14
  20.299 +        vst1.64         {d3},      [r1,:64], r2
  20.300 +        vqmovun.s16     d7, q15
  20.301 +        vst1.64         {d4},      [r1,:64], r2
  20.302 +        vst1.64         {d5},      [r1,:64], r2
  20.303 +        vst1.64         {d6},      [r1,:64], r2
  20.304 +        vst1.64         {d7},      [r1,:64], r2
  20.305 +        bx              lr
  20.306 +endfunc
  20.307 +
  20.308 +function ff_put_signed_pixels_clamped_neon, export=1  @ writes an 8x8 block of 16-bit values as bytes: saturate to -128..127, then add 128; r0 = 16-bit input, r1 = byte output, r2 = output line stride
  20.309 +        vmov.u8         d31, #128  @ d31 = 128 in every byte: offset from signed to unsigned range
  20.310 +        vld1.64         {d16-d17}, [r0,:128]!  @ input row 0
  20.311 +        vqmovn.s16      d0, q8  @ signed 16-bit -> signed 8-bit with saturation
  20.312 +        vld1.64         {d18-d19}, [r0,:128]!  @ input row 1
  20.313 +        vqmovn.s16      d1, q9
  20.314 +        vld1.64         {d16-d17}, [r0,:128]!  @ input row 2 (q8 is reused)
  20.315 +        vqmovn.s16      d2, q8
  20.316 +        vld1.64         {d18-d19}, [r0,:128]!  @ input row 3 (q9 is reused; narrowed further down)
  20.317 +        vadd.u8         d0, d0, d31  @ byte-wise add of 128 (wraps modulo 256)
  20.318 +        vld1.64         {d20-d21}, [r0,:128]!  @ input row 4
  20.319 +        vadd.u8         d1, d1, d31
  20.320 +        vld1.64         {d22-d23}, [r0,:128]!  @ input row 5
  20.321 +        vadd.u8         d2, d2, d31
  20.322 +        vst1.64         {d0},      [r1,:64], r2  @ one 8-byte output row per store
  20.323 +        vqmovn.s16      d3, q9
  20.324 +        vst1.64         {d1},      [r1,:64], r2
  20.325 +        vqmovn.s16      d4, q10
  20.326 +        vst1.64         {d2},      [r1,:64], r2
  20.327 +        vqmovn.s16      d5, q11
  20.328 +        vld1.64         {d24-d25}, [r0,:128]!  @ input row 6
  20.329 +        vadd.u8         d3, d3, d31
  20.330 +        vld1.64         {d26-d27}, [r0,:128]!  @ input row 7
  20.331 +        vadd.u8         d4, d4, d31
  20.332 +        vadd.u8         d5, d5, d31
  20.333 +        vst1.64         {d3},      [r1,:64], r2
  20.334 +        vqmovn.s16      d6, q12
  20.335 +        vst1.64         {d4},      [r1,:64], r2
  20.336 +        vqmovn.s16      d7, q13
  20.337 +        vst1.64         {d5},      [r1,:64], r2
  20.338 +        vadd.u8         d6, d6, d31
  20.339 +        vadd.u8         d7, d7, d31
  20.340 +        vst1.64         {d6},      [r1,:64], r2
  20.341 +        vst1.64         {d7},      [r1,:64], r2
  20.342 +        bx              lr
  20.343 +endfunc
  20.344 +
  20.345 +function ff_add_pixels_clamped_neon, export=1  @ adds an 8x8 block of 16-bit values to the bytes already in the output and stores the sums saturated to 0..255; r0 = 16-bit input, r1 = byte buffer updated in place, r2 = line stride
  20.346 +        mov             r3, r1  @ r3 = store cursor; r1 keeps reading the existing pixels ahead of it
  20.347 +        vld1.64         {d16},   [r1,:64], r2  @ existing pixel row 0
  20.348 +        vld1.64         {d0-d1}, [r0,:128]!  @ 16-bit input row 0
  20.349 +        vaddw.u8        q0, q0, d16  @ widen the pixels and add them to the 16-bit values
  20.350 +        vld1.64         {d17},   [r1,:64], r2
  20.351 +        vld1.64         {d2-d3}, [r0,:128]!
  20.352 +        vqmovun.s16     d0, q0  @ saturate the sums to unsigned bytes
  20.353 +        vld1.64         {d18},   [r1,:64], r2
  20.354 +        vaddw.u8        q1, q1, d17
  20.355 +        vld1.64         {d4-d5}, [r0,:128]!
  20.356 +        vaddw.u8        q2, q2, d18
  20.357 +        vst1.64         {d0},    [r3,:64], r2  @ write result row 0 back over the pixels read earlier
  20.358 +        vqmovun.s16     d2, q1
  20.359 +        vld1.64         {d19},   [r1,:64], r2
  20.360 +        vld1.64         {d6-d7}, [r0,:128]!
  20.361 +        vaddw.u8        q3, q3, d19
  20.362 +        vqmovun.s16     d4, q2
  20.363 +        vst1.64         {d2},    [r3,:64], r2
  20.364 +        vld1.64         {d16},   [r1,:64], r2  @ rows 4-7 reuse the same registers as rows 0-3
  20.365 +        vqmovun.s16     d6, q3
  20.366 +        vld1.64         {d0-d1}, [r0,:128]!
  20.367 +        vaddw.u8        q0, q0, d16
  20.368 +        vst1.64         {d4},    [r3,:64], r2
  20.369 +        vld1.64         {d17},   [r1,:64], r2
  20.370 +        vld1.64         {d2-d3}, [r0,:128]!
  20.371 +        vaddw.u8        q1, q1, d17
  20.372 +        vst1.64         {d6},    [r3,:64], r2
  20.373 +        vqmovun.s16     d0, q0
  20.374 +        vld1.64         {d18},   [r1,:64], r2
  20.375 +        vld1.64         {d4-d5}, [r0,:128]!
  20.376 +        vaddw.u8        q2, q2, d18
  20.377 +        vst1.64         {d0},    [r3,:64], r2
  20.378 +        vqmovun.s16     d2, q1
  20.379 +        vld1.64         {d19},   [r1,:64], r2
  20.380 +        vqmovun.s16     d4, q2
  20.381 +        vld1.64         {d6-d7}, [r0,:128]!
  20.382 +        vaddw.u8        q3, q3, d19
  20.383 +        vst1.64         {d2},    [r3,:64], r2
  20.384 +        vqmovun.s16     d6, q3
  20.385 +        vst1.64         {d4},    [r3,:64], r2
  20.386 +        vst1.64         {d6},    [r3,:64], r2
  20.387 +        bx              lr
  20.388 +endfunc
  20.389 +
  20.390 +function ff_float_to_int16_neon, export=1  @ converts floats to int16; r0 = int16 output, r1 = float input, r2 = sample count. The control flow only covers counts that are multiples of 8, minimum 8
  20.391 +        subs            r2,  r2,  #8  @ the first 8 samples are loaded and converted right here
  20.392 +        vld1.64         {d0-d1},  [r1,:128]!
  20.393 +        vcvt.s32.f32    q8,  q0,  #16  @ float -> signed fixed point with 16 fraction bits (NOTE(review): saturating per the NEON spec - confirm)
  20.394 +        vld1.64         {d2-d3},  [r1,:128]!
  20.395 +        vcvt.s32.f32    q9,  q1,  #16
  20.396 +        beq             3f  @ count was exactly 8: only narrow and store
  20.397 +        bics            ip,  r2,  #15  @ ip = remaining count rounded down to a multiple of 16
  20.398 +        beq             2f  @ fewer than 16 remain: go load the final 8
  20.399 +1:      subs            ip,  ip,  #16  @ main loop: stores 16 samples per pass and pre-converts 8 for the next one
  20.400 +        vshrn.s32       d4,  q8,  #16  @ keep the high halfword of each value: the int16 result
  20.401 +        vld1.64         {d0-d1},  [r1,:128]!
  20.402 +        vcvt.s32.f32    q0,  q0,  #16
  20.403 +        vshrn.s32       d5,  q9,  #16
  20.404 +        vld1.64         {d2-d3},  [r1,:128]!
  20.405 +        vcvt.s32.f32    q1,  q1,  #16
  20.406 +        vshrn.s32       d6,  q0,  #16
  20.407 +        vst1.64         {d4-d5},  [r0,:128]!
  20.408 +        vshrn.s32       d7,  q1,  #16
  20.409 +        vld1.64         {d16-d17},[r1,:128]!  @ load ahead: these 8 are stored by the next pass or by the tail code
  20.410 +        vcvt.s32.f32    q8,  q8,  #16
  20.411 +        vld1.64         {d18-d19},[r1,:128]!
  20.412 +        vcvt.s32.f32    q9,  q9,  #16
  20.413 +        vst1.64         {d6-d7},  [r0,:128]!
  20.414 +        bne             1b  @ flags are still those of the subs at label 1
  20.415 +        ands            r2,  r2,  #15  @ samples not yet loaded (0 or 8 for valid counts)
  20.416 +        beq             3f  @ nothing more to load: flush the 8 pending in q8/q9
  20.417 +2:      vld1.64         {d0-d1},  [r1,:128]!  @ tail: 8 pending in q8/q9 plus 8 more loaded here
  20.418 +        vshrn.s32       d4,  q8,  #16
  20.419 +        vcvt.s32.f32    q0,  q0,  #16
  20.420 +        vld1.64         {d2-d3},  [r1,:128]!
  20.421 +        vshrn.s32       d5,  q9,  #16
  20.422 +        vcvt.s32.f32    q1,  q1,  #16
  20.423 +        vshrn.s32       d6,  q0,  #16
  20.424 +        vst1.64         {d4-d5},  [r0,:128]!
  20.425 +        vshrn.s32       d7,  q1,  #16
  20.426 +        vst1.64         {d6-d7},  [r0,:128]!
  20.427 +        bx              lr
  20.428 +3:      vshrn.s32       d4,  q8,  #16  @ tail: only the 8 pending samples are left
  20.429 +        vshrn.s32       d5,  q9,  #16
  20.430 +        vst1.64         {d4-d5},  [r0,:128]!
  20.431 +        bx              lr
  20.432 +endfunc
  20.433 +
  20.434 +function ff_float_to_int16_interleave_neon, export=1  @ converts per-channel float buffers to interleaved int16; r0 = int16 output, r1 = array of per-channel input pointers, r2 = samples per channel, r3 = channel count. Sample counts are handled in groups of 8
  20.435 +        cmp             r3, #2
  20.436 +        ldrlt           r1, [r1]  @ fewer than 2 channels: fetch the only channel pointer ...
  20.437 +        blt             ff_float_to_int16_neon  @ ... and tail-branch to the plain converter
  20.438 +        bne             4f  @ more than 2 channels: generic path at the next label 4
  20.439 +
  20.440 +        ldr             r3, [r1]  @ exactly 2 channels: r3 = channel 0 data
  20.441 +        ldr             r1, [r1, #4]  @ r1 = channel 1 data
  20.442 +
  20.443 +        subs            r2,  r2,  #8  @ first 8 samples of each channel are loaded and converted right here
  20.444 +        vld1.64         {d0-d1},  [r3,:128]!
  20.445 +        vcvt.s32.f32    q8,  q0,  #16  @ float -> signed fixed point with 16 fraction bits; the int16 result is the high halfword
  20.446 +        vld1.64         {d2-d3},  [r3,:128]!
  20.447 +        vcvt.s32.f32    q9,  q1,  #16
  20.448 +        vld1.64         {d20-d21},[r1,:128]!
  20.449 +        vcvt.s32.f32    q10, q10, #16
  20.450 +        vld1.64         {d22-d23},[r1,:128]!
  20.451 +        vcvt.s32.f32    q11, q11, #16
  20.452 +        beq             3f  @ count was exactly 8
  20.453 +        bics            ip,  r2,  #15  @ ip = remaining count rounded down to a multiple of 16
  20.454 +        beq             2f
  20.455 +1:      subs            ip,  ip,  #16  @ main 2-channel loop: 16 samples per channel per pass
  20.456 +        vld1.64         {d0-d1},  [r3,:128]!
  20.457 +        vcvt.s32.f32    q0,  q0,  #16
  20.458 +        vsri.32         q10, q8,  #16  @ each 32-bit lane becomes: channel 1 sample in the high half, channel 0 sample in the low half
  20.459 +        vld1.64         {d2-d3},  [r3,:128]!
  20.460 +        vcvt.s32.f32    q1,  q1,  #16
  20.461 +        vld1.64         {d24-d25},[r1,:128]!
  20.462 +        vcvt.s32.f32    q12, q12, #16
  20.463 +        vld1.64         {d26-d27},[r1,:128]!
  20.464 +        vsri.32         q11, q9,  #16
  20.465 +        vst1.64         {d20-d21},[r0,:128]!
  20.466 +        vcvt.s32.f32    q13, q13, #16
  20.467 +        vst1.64         {d22-d23},[r0,:128]!
  20.468 +        vsri.32         q12, q0,  #16
  20.469 +        vld1.64         {d16-d17},[r3,:128]!  @ load ahead for the next pass or the tail code
  20.470 +        vsri.32         q13, q1,  #16
  20.471 +        vst1.64         {d24-d25},[r0,:128]!
  20.472 +        vcvt.s32.f32    q8,  q8,  #16
  20.473 +        vld1.64         {d18-d19},[r3,:128]!
  20.474 +        vcvt.s32.f32    q9,  q9,  #16
  20.475 +        vld1.64         {d20-d21},[r1,:128]!
  20.476 +        vcvt.s32.f32    q10, q10, #16
  20.477 +        vld1.64         {d22-d23},[r1,:128]!
  20.478 +        vcvt.s32.f32    q11, q11, #16
  20.479 +        vst1.64         {d26-d27},[r0,:128]!
  20.480 +        bne             1b  @ flags are still those of the subs at label 1
  20.481 +        ands            r2,  r2,  #15  @ samples not yet loaded (0 or 8 for valid counts)
  20.482 +        beq             3f
  20.483 +2:      vsri.32         q10, q8,  #16  @ tail: 8 pending samples per channel plus 8 more loaded here
  20.484 +        vld1.64         {d0-d1},  [r3,:128]!
  20.485 +        vcvt.s32.f32    q0,  q0,  #16
  20.486 +        vld1.64         {d2-d3},  [r3,:128]!
  20.487 +        vcvt.s32.f32    q1,  q1,  #16
  20.488 +        vld1.64         {d24-d25},[r1,:128]!
  20.489 +        vcvt.s32.f32    q12, q12, #16
  20.490 +        vsri.32         q11, q9,  #16
  20.491 +        vld1.64         {d26-d27},[r1,:128]!
  20.492 +        vcvt.s32.f32    q13, q13, #16
  20.493 +        vst1.64         {d20-d21},[r0,:128]!
  20.494 +        vsri.32         q12, q0,  #16
  20.495 +        vst1.64         {d22-d23},[r0,:128]!
  20.496 +        vsri.32         q13, q1,  #16
  20.497 +        vst1.64         {d24-d27},[r0,:128]!
  20.498 +        bx              lr
  20.499 +3:      vsri.32         q10, q8,  #16  @ tail: only the 8 pending samples per channel are left
  20.500 +        vsri.32         q11, q9,  #16
  20.501 +        vst1.64         {d20-d23},[r0,:128]!
  20.502 +        bx              lr
  20.503 +
  20.504 +4:      push            {r4-r8,lr}  @ generic path: channels are consumed in groups of 4, then 2, then 1
  20.505 +        cmp             r3,  #4
  20.506 +        lsl             ip,  r3,  #1  @ ip = channel count * 2 = byte distance between consecutive samples of one channel in the output
  20.507 +        blt             4f  @ fewer than 4 channels: skip to the 2-channel stage
  20.508 +
  20.509 +        @ 4 channels
  20.510 +5:      ldmia           r1!, {r4-r7}  @ take the next four channel pointers
  20.511 +        mov             lr,  r2  @ lr = samples left for this channel group
  20.512 +        mov             r8,  r0  @ r8 = write cursor; r0 stays at the base of this channel group
  20.513 +        vld1.64         {d16-d17},[r4,:128]!
  20.514 +        vcvt.s32.f32    q8,  q8,  #16
  20.515 +        vld1.64         {d18-d19},[r5,:128]!
  20.516 +        vcvt.s32.f32    q9,  q9,  #16
  20.517 +        vld1.64         {d20-d21},[r6,:128]!
  20.518 +        vcvt.s32.f32    q10, q10, #16
  20.519 +        vld1.64         {d22-d23},[r7,:128]!
  20.520 +        vcvt.s32.f32    q11, q11, #16
  20.521 +6:      subs            lr,  lr,  #8  @ 8 samples per channel per pass; these flags are consumed by the beq further down
  20.522 +        vld1.64         {d0-d1},  [r4,:128]!
  20.523 +        vcvt.s32.f32    q0,  q0,  #16
  20.524 +        vsri.32         q9,  q8,  #16  @ pack first and second channel of the group into one 32-bit lane per sample
  20.525 +        vld1.64         {d2-d3},  [r5,:128]!
  20.526 +        vcvt.s32.f32    q1,  q1,  #16
  20.527 +        vsri.32         q11, q10, #16  @ pack third and fourth channel likewise
  20.528 +        vld1.64         {d4-d5},  [r6,:128]!
  20.529 +        vcvt.s32.f32    q2,  q2,  #16
  20.530 +        vzip.32         d18, d22  @ after the zips each d register holds all four channels of one sample
  20.531 +        vld1.64         {d6-d7},  [r7,:128]!
  20.532 +        vcvt.s32.f32    q3,  q3,  #16
  20.533 +        vzip.32         d19, d23
  20.534 +        vst1.64         {d18},    [r8], ip  @ store 8 bytes (one sample of four channels), then step to the next sample
  20.535 +        vsri.32         q1,  q0,  #16
  20.536 +        vst1.64         {d22},    [r8], ip
  20.537 +        vsri.32         q3,  q2,  #16
  20.538 +        vst1.64         {d19},    [r8], ip
  20.539 +        vzip.32         d2,  d6
  20.540 +        vst1.64         {d23},    [r8], ip
  20.541 +        vzip.32         d3,  d7
  20.542 +        beq             7f  @ lr reached 0: the last four samples are stored at label 7
  20.543 +        vld1.64         {d16-d17},[r4,:128]!
  20.544 +        vcvt.s32.f32    q8,  q8,  #16
  20.545 +        vst1.64         {d2},     [r8], ip
  20.546 +        vld1.64         {d18-d19},[r5,:128]!
  20.547 +        vcvt.s32.f32    q9,  q9,  #16
  20.548 +        vst1.64         {d6},     [r8], ip
  20.549 +        vld1.64         {d20-d21},[r6,:128]!
  20.550 +        vcvt.s32.f32    q10, q10, #16
  20.551 +        vst1.64         {d3},     [r8], ip
  20.552 +        vld1.64         {d22-d23},[r7,:128]!
  20.553 +        vcvt.s32.f32    q11, q11, #16
  20.554 +        vst1.64         {d7},     [r8], ip
  20.555 +        b               6b
  20.556 +7:      vst1.64         {d2},     [r8], ip
  20.557 +        vst1.64         {d6},     [r8], ip
  20.558 +        vst1.64         {d3},     [r8], ip
  20.559 +        vst1.64         {d7},     [r8], ip
  20.560 +        subs            r3,  r3,  #4  @ four channels done
  20.561 +        popeq           {r4-r8,pc}  @ no channels left: return
  20.562 +        cmp             r3,  #4
  20.563 +        add             r0,  r0,  #8  @ move the output base past the four int16 channels just written (add leaves the flags alone)
  20.564 +        bge             5b  @ at least four more channels: another group
  20.565 +
  20.566 +        @ 2 channels
  20.567 +4:      cmp             r3,  #2
  20.568 +        blt             4f  @ only one channel left: go to the 1-channel stage
  20.569 +        ldmia           r1!, {r4-r5}  @ take the next two channel pointers
  20.570 +        mov             lr,  r2
  20.571 +        mov             r8,  r0
  20.572 +        tst             lr,  #8  @ is bit 3 of the count set; the result is consumed by the beq after the loads
  20.573 +        vld1.64         {d16-d17},[r4,:128]!
  20.574 +        vcvt.s32.f32    q8,  q8,  #16
  20.575 +        vld1.64         {d18-d19},[r5,:128]!
  20.576 +        vcvt.s32.f32    q9,  q9,  #16
  20.577 +        vld1.64         {d20-d21},[r4,:128]!
  20.578 +        vcvt.s32.f32    q10, q10, #16
  20.579 +        vld1.64         {d22-d23},[r5,:128]!
  20.580 +        vcvt.s32.f32    q11, q11, #16
  20.581 +        beq             6f  @ bit 3 clear: straight to the 16-samples-per-pass loop
  20.582 +        subs            lr,  lr,  #8  @ otherwise handle one group of 8 samples first
  20.583 +        beq             7f  @ these 8 are all there is: store them at label 7
  20.584 +        vsri.32         d18, d16, #16  @ each 32-bit lane: second channel in the high half, first channel in the low half
  20.585 +        vsri.32         d19, d17, #16
  20.586 +        vld1.64         {d16-d17},[r4,:128]!
  20.587 +        vcvt.s32.f32    q8,  q8,  #16
  20.588 +        vst1.32         {d18[0]}, [r8], ip  @ store 4 bytes (one sample of two channels), then step to the next sample
  20.589 +        vsri.32         d22, d20, #16
  20.590 +        vst1.32         {d18[1]}, [r8], ip
  20.591 +        vsri.32         d23, d21, #16
  20.592 +        vst1.32         {d19[0]}, [r8], ip
  20.593 +        vst1.32         {d19[1]}, [r8], ip
  20.594 +        vld1.64         {d18-d19},[r5,:128]!
  20.595 +        vcvt.s32.f32    q9,  q9,  #16
  20.596 +        vst1.32         {d22[0]}, [r8], ip
  20.597 +        vst1.32         {d22[1]}, [r8], ip
  20.598 +        vld1.64         {d20-d21},[r4,:128]!
  20.599 +        vcvt.s32.f32    q10, q10, #16
  20.600 +        vst1.32         {d23[0]}, [r8], ip
  20.601 +        vst1.32         {d23[1]}, [r8], ip
  20.602 +        vld1.64         {d22-d23},[r5,:128]!
  20.603 +        vcvt.s32.f32    q11, q11, #16
  20.604 +6:      subs            lr,  lr,  #16  @ main 2-channel loop: 16 samples per channel per pass
  20.605 +        vld1.64         {d0-d1},  [r4,:128]!
  20.606 +        vcvt.s32.f32    q0,  q0,  #16
  20.607 +        vsri.32         d18, d16, #16
  20.608 +        vld1.64         {d2-d3},  [r5,:128]!
  20.609 +        vcvt.s32.f32    q1,  q1,  #16
  20.610 +        vsri.32         d19, d17, #16
  20.611 +        vld1.64         {d4-d5},  [r4,:128]!
  20.612 +        vcvt.s32.f32    q2,  q2,  #16
  20.613 +        vld1.64         {d6-d7},  [r5,:128]!
  20.614 +        vcvt.s32.f32    q3,  q3,  #16
  20.615 +        vst1.32         {d18[0]}, [r8], ip
  20.616 +        vsri.32         d22, d20, #16
  20.617 +        vst1.32         {d18[1]}, [r8], ip
  20.618 +        vsri.32         d23, d21, #16
  20.619 +        vst1.32         {d19[0]}, [r8], ip
  20.620 +        vsri.32         d2,  d0,  #16
  20.621 +        vst1.32         {d19[1]}, [r8], ip
  20.622 +        vsri.32         d3,  d1,  #16
  20.623 +        vst1.32         {d22[0]}, [r8], ip
  20.624 +        vsri.32         d6,  d4,  #16
  20.625 +        vst1.32         {d22[1]}, [r8], ip
  20.626 +        vsri.32         d7,  d5,  #16
  20.627 +        vst1.32         {d23[0]}, [r8], ip
  20.628 +        vst1.32         {d23[1]}, [r8], ip
  20.629 +        beq             6f  @ lr reached 0: flush the second 8 samples at the NEXT label 6 (numeric labels are reused)
  20.630 +        vld1.64         {d16-d17},[r4,:128]!
  20.631 +        vcvt.s32.f32    q8,  q8,  #16
  20.632 +        vst1.32         {d2[0]},  [r8], ip
  20.633 +        vst1.32         {d2[1]},  [r8], ip
  20.634 +        vld1.64         {d18-d19},[r5,:128]!
  20.635 +        vcvt.s32.f32    q9,  q9,  #16
  20.636 +        vst1.32         {d3[0]},  [r8], ip
  20.637 +        vst1.32         {d3[1]},  [r8], ip
  20.638 +        vld1.64         {d20-d21},[r4,:128]!
  20.639 +        vcvt.s32.f32    q10, q10, #16
  20.640 +        vst1.32         {d6[0]},  [r8], ip
  20.641 +        vst1.32         {d6[1]},  [r8], ip
  20.642 +        vld1.64         {d22-d23},[r5,:128]!
  20.643 +        vcvt.s32.f32    q11, q11, #16
  20.644 +        vst1.32         {d7[0]},  [r8], ip
  20.645 +        vst1.32         {d7[1]},  [r8], ip
  20.646 +        bgt             6b  @ back to the PREVIOUS label 6 while samples remain
  20.647 +6:      vst1.32         {d2[0]},  [r8], ip  @ flush of the final 8 samples held in d2, d3, d6, d7
  20.648 +        vst1.32         {d2[1]},  [r8], ip
  20.649 +        vst1.32         {d3[0]},  [r8], ip
  20.650 +        vst1.32         {d3[1]},  [r8], ip
  20.651 +        vst1.32         {d6[0]},  [r8], ip
  20.652 +        vst1.32         {d6[1]},  [r8], ip
  20.653 +        vst1.32         {d7[0]},  [r8], ip
  20.654 +        vst1.32         {d7[1]},  [r8], ip
  20.655 +        b               8f
  20.656 +7:      vsri.32         d18, d16, #16  @ count was exactly 8: pack and store the only group
  20.657 +        vsri.32         d19, d17, #16
  20.658 +        vst1.32         {d18[0]}, [r8], ip
  20.659 +        vsri.32         d22, d20, #16
  20.660 +        vst1.32         {d18[1]}, [r8], ip
  20.661 +        vsri.32         d23, d21, #16
  20.662 +        vst1.32         {d19[0]}, [r8], ip
  20.663 +        vst1.32         {d19[1]}, [r8], ip
  20.664 +        vst1.32         {d22[0]}, [r8], ip
  20.665 +        vst1.32         {d22[1]}, [r8], ip
  20.666 +        vst1.32         {d23[0]}, [r8], ip
  20.667 +        vst1.32         {d23[1]}, [r8], ip
  20.668 +8:      subs            r3,  r3,  #2  @ two channels done
  20.669 +        add             r0,  r0,  #4  @ move the output base past the two int16 channels just written
  20.670 +        popeq           {r4-r8,pc}  @ no channels left: return; otherwise fall into the 1-channel stage
  20.671 +
  20.672 +        @ 1 channel
  20.673 +4:      ldr             r4,  [r1],#4  @ take the last channel pointer
  20.674 +        tst             r2,  #8  @ is bit 3 of the count set; the result is consumed by the bne after the loads
  20.675 +        mov             lr,  r2
  20.676 +        mov             r5,  r0  @ r5 = write cursor
  20.677 +        vld1.64         {d0-d1},  [r4,:128]!
  20.678 +        vcvt.s32.f32    q0,  q0,  #16
  20.679 +        vld1.64         {d2-d3},  [r4,:128]!
  20.680 +        vcvt.s32.f32    q1,  q1,  #16
  20.681 +        bne             8f  @ bit 3 set: store one group of 8 first at label 8
  20.682 +6:      subs            lr,  lr,  #16  @ main 1-channel loop: 16 samples per pass
  20.683 +        vld1.64         {d4-d5},  [r4,:128]!
  20.684 +        vcvt.s32.f32    q2,  q2,  #16
  20.685 +        vld1.64         {d6-d7},  [r4,:128]!
  20.686 +        vcvt.s32.f32    q3,  q3,  #16
  20.687 +        vst1.16         {d0[1]},  [r5,:16], ip  @ odd 16-bit lanes are the high halves of the converted values, i.e. the int16 samples
  20.688 +        vst1.16         {d0[3]},  [r5,:16], ip
  20.689 +        vst1.16         {d1[1]},  [r5,:16], ip
  20.690 +        vst1.16         {d1[3]},  [r5,:16], ip
  20.691 +        vst1.16         {d2[1]},  [r5,:16], ip
  20.692 +        vst1.16         {d2[3]},  [r5,:16], ip
  20.693 +        vst1.16         {d3[1]},  [r5,:16], ip
  20.694 +        vst1.16         {d3[3]},  [r5,:16], ip
  20.695 +        beq             7f  @ lr reached 0: skip the load-ahead
  20.696 +        vld1.64         {d0-d1},  [r4,:128]!
  20.697 +        vcvt.s32.f32    q0,  q0,  #16
  20.698 +        vld1.64         {d2-d3},  [r4,:128]!
  20.699 +        vcvt.s32.f32    q1,  q1,  #16
  20.700 +7:      vst1.16         {d4[1]},  [r5,:16], ip
  20.701 +        vst1.16         {d4[3]},  [r5,:16], ip
  20.702 +        vst1.16         {d5[1]},  [r5,:16], ip
  20.703 +        vst1.16         {d5[3]},  [r5,:16], ip
  20.704 +        vst1.16         {d6[1]},  [r5,:16], ip
  20.705 +        vst1.16         {d6[3]},  [r5,:16], ip
  20.706 +        vst1.16         {d7[1]},  [r5,:16], ip
  20.707 +        vst1.16         {d7[3]},  [r5,:16], ip
  20.708 +        bgt             6b  @ flags are still those of the subs at label 6
  20.709 +        pop             {r4-r8,pc}
  20.710 +8:      subs            lr,  lr,  #8  @ leading group of 8 samples
  20.711 +        vst1.16         {d0[1]},  [r5,:16], ip
  20.712 +        vst1.16         {d0[3]},  [r5,:16], ip
  20.713 +        vst1.16         {d1[1]},  [r5,:16], ip
  20.714 +        vst1.16         {d1[3]},  [r5,:16], ip
  20.715 +        vst1.16         {d2[1]},  [r5,:16], ip
  20.716 +        vst1.16         {d2[3]},  [r5,:16], ip
  20.717 +        vst1.16         {d3[1]},  [r5,:16], ip
  20.718 +        vst1.16         {d3[3]},  [r5,:16], ip
  20.719 +        popeq           {r4-r8,pc}  @ count was exactly 8: done
  20.720 +        vld1.64         {d0-d1},  [r4,:128]!  @ otherwise preload 8 more and join the main loop
  20.721 +        vcvt.s32.f32    q0,  q0,  #16
  20.722 +        vld1.64         {d2-d3},  [r4,:128]!
  20.723 +        vcvt.s32.f32    q1,  q1,  #16
  20.724 +        b               6b
  20.725 +endfunc
  20.726 +
  20.727 +function ff_vector_fmul_neon, export=1  @ element-wise float multiply in place: the r0 buffer is multiplied by the r1 buffer and written back over r0; r2 = element count. The control flow only covers counts that are multiples of 8, minimum 8
  20.728 +        mov             r3,  r0  @ r3 = store cursor; r0 keeps loading ahead of it
  20.729 +        subs            r2,  r2,  #8  @ the first 8 products are computed right here
  20.730 +        vld1.64         {d0-d3},  [r0,:128]!
  20.731 +        vld1.64         {d4-d7},  [r1,:128]!
  20.732 +        vmul.f32        q8,  q0,  q2
  20.733 +        vmul.f32        q9,  q1,  q3
  20.734 +        beq             3f  @ count was exactly 8: just store
  20.735 +        bics            ip,  r2,  #15  @ ip = remaining count rounded down to a multiple of 16
  20.736 +        beq             2f
  20.737 +1:      subs            ip,  ip,  #16  @ main loop: stores 16 products per pass and precomputes 8 for the next one
  20.738 +        vld1.64         {d0-d1},  [r0,:128]!
  20.739 +        vld1.64         {d4-d5},  [r1,:128]!
  20.740 +        vmul.f32        q10, q0,  q2
  20.741 +        vld1.64         {d2-d3},  [r0,:128]!
  20.742 +        vld1.64         {d6-d7},  [r1,:128]!
  20.743 +        vmul.f32        q11, q1,  q3
  20.744 +        vst1.64         {d16-d19},[r3,:128]!  @ store the 8 products pending from before
  20.745 +        vld1.64         {d0-d1},  [r0,:128]!
  20.746 +        vld1.64         {d4-d5},  [r1,:128]!
  20.747 +        vmul.f32        q8,  q0,  q2
  20.748 +        vld1.64         {d2-d3},  [r0,:128]!
  20.749 +        vld1.64         {d6-d7},  [r1,:128]!
  20.750 +        vmul.f32        q9,  q1,  q3
  20.751 +        vst1.64         {d20-d23},[r3,:128]!
  20.752 +        bne             1b  @ flags are still those of the subs at label 1
  20.753 +        ands            r2,  r2,  #15  @ elements not yet loaded (0 or 8 for valid counts)
  20.754 +        beq             3f
  20.755 +2:      vld1.64         {d0-d1},  [r0,:128]!  @ tail: 8 pending products plus 8 more computed here
  20.756 +        vld1.64         {d4-d5},  [r1,:128]!
  20.757 +        vst1.64         {d16-d17},[r3,:128]!  @ q8 is stored before being overwritten by the next multiply
  20.758 +        vmul.f32        q8,  q0,  q2
  20.759 +        vld1.64         {d2-d3},  [r0,:128]!
  20.760 +        vld1.64         {d6-d7},  [r1,:128]!
  20.761 +        vst1.64         {d18-d19},[r3,:128]!
  20.762 +        vmul.f32        q9,  q1,  q3
  20.763 +3:      vst1.64         {d16-d19},[r3,:128]!  @ store the final 8 products
  20.764 +        bx              lr
  20.765 +endfunc
  20.766 +
  20.767 +function ff_vector_fmul_window_neon, export=1  @ windowed multiply-accumulate that fills the output from both ends towards the middle; r0 = output, r1 = first input (read forwards), r2 = second input (read backwards), r3 = coefficient table (presumably the window - confirm), plus a float bias and a length
  20.768 +VFP     vdup.32         q8,  d0[0]  @ NOTE(review): VFP/NOVFP prefixes are defined outside this view; presumably they select lines by float argument convention. Here the bias comes from d0[0]
  20.769 +NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]  @ alternative: bias is the first stack word, replicated into q8
  20.770 +        push            {r4,r5,lr}  @ 12 bytes pushed, so the stack offsets below are shifted by 12
  20.771 +VFP     ldr             lr,  [sp, #12]  @ length = first stack argument
  20.772 +NOVFP   ldr             lr,  [sp, #16]  @ length = second stack argument (the first one holds the bias)
  20.773 +        sub             r2,  r2,  #8
  20.774 +        sub             r5,  lr,  #2
  20.775 +        add             r2,  r2,  r5, lsl #2  @ r2 = second input + 4*length - 16: four floats before offset 4*length
  20.776 +        add             r4,  r3,  r5, lsl #3  @ r4 = table + 8*length - 16: four floats before offset 8*length
  20.777 +        add             ip,  r0,  r5, lsl #3  @ ip = output + 8*length - 16: cursor for the backward-written half
  20.778 +        mov             r5,  #-16  @ step of minus four floats for the backward cursors
  20.779 +        vld1.64         {d0,d1},  [r1,:128]!
  20.780 +        vld1.64         {d2,d3},  [r2,:128], r5
  20.781 +        vld1.64         {d4,d5},  [r3,:128]!
  20.782 +        vld1.64         {d6,d7},  [r4,:128], r5
  20.783 +1:      subs            lr,  lr,  #4  @ four elements from each end per pass; flags are consumed by the beq below
  20.784 +        vmov            q11, q8  @ both accumulators start at the bias
  20.785 +        vmla.f32        d22, d0,  d4
  20.786 +        vmov            q10, q8
  20.787 +        vmla.f32        d23, d1,  d5
  20.788 +        vrev64.32       q3,  q3  @ reverse element order within each half of the backward-read table data
  20.789 +        vmla.f32        d20, d0,  d7
  20.790 +        vrev64.32       q1,  q1  @ likewise for the backward-read input
  20.791 +        vmla.f32        d21, d1,  d6
  20.792 +        beq             2f  @ last pass: finish without loading ahead
  20.793 +        vmla.f32        d22, d3,  d7
  20.794 +        vld1.64         {d0,d1},  [r1,:128]!
  20.795 +        vmla.f32        d23, d2,  d6
  20.796 +        vld1.64         {d18,d19},[r2,:128], r5  @ next backward input goes to q9 because q1 is still in use
  20.797 +        vmls.f32        d20, d3,  d4
  20.798 +        vld1.64         {d24,d25},[r3,:128]!  @ next forward table data goes to q12 because q2 is still in use
  20.799 +        vmls.f32        d21, d2,  d5
  20.800 +        vld1.64         {d6,d7},  [r4,:128], r5
  20.801 +        vmov            q1,  q9
  20.802 +        vrev64.32       q11, q11  @ vrev64 plus vswp fully reverses the four floats of q11
  20.803 +        vmov            q2,  q12
  20.804 +        vswp            d22, d23
  20.805 +        vst1.64         {d20,d21},[r0,:128]!  @ forward half of the output
  20.806 +        vst1.64         {d22,d23},[ip,:128], r5  @ backward half of the output
  20.807 +        b               1b
  20.808 +2:      vmla.f32        d22, d3,  d7
  20.809 +        vmla.f32        d23, d2,  d6
  20.810 +        vmls.f32        d20, d3,  d4
  20.811 +        vmls.f32        d21, d2,  d5
  20.812 +        vrev64.32       q11, q11
  20.813 +        vswp            d22, d23
  20.814 +        vst1.64         {d20,d21},[r0,:128]!
  20.815 +        vst1.64         {d22,d23},[ip,:128], r5
  20.816 +        pop             {r4,r5,pc}
  20.817 +endfunc
  20.818 +
  20.819 +#if CONFIG_VORBIS_DECODER
  20.820 +function ff_vorbis_inverse_coupling_neon, export=1  @ in-place update of two float arrays, four elements at a time; r0 = first array (presumably magnitudes), r1 = second array (presumably angles), r2 = element count
  20.821 +        vmov.i32        q10, #1<<31  @ q10 = float sign-bit mask in every lane
  20.822 +        subs            r2,  r2,  #4
  20.823 +        mov             r3,  r0  @ r3 and r12 are the store cursors; r0 and r1 load ahead of them
  20.824 +        mov             r12, r1
  20.825 +        beq             3f  @ exactly four elements: single unpipelined group at label 3
  20.826 +
  20.827 +        vld1.32         {d24-d25},[r1,:128]!
  20.828 +        vld1.32         {d22-d23},[r0,:128]!
  20.829 +        vcle.s32        q8,  q12, #0  @ integer compare on the float bits: all-ones lanes where the r1 value is <= 0
  20.830 +        vand            q9,  q11, q10  @ q9 = sign bits of the r0 values
  20.831 +        veor            q12, q12, q9  @ flip the sign of the r1 value wherever the r0 value is negative
  20.832 +        vand            q2,  q12, q8  @ q2 = adjusted value where the compare was true, else 0
  20.833 +        vbic            q3,  q12, q8  @ q3 = adjusted value where the compare was false, else 0
  20.834 +        vadd.f32        q12, q11, q2  @ new first-array value
  20.835 +        vsub.f32        q11, q11, q3  @ new second-array value
  20.836 +1:      vld1.32         {d2-d3},  [r1,:128]!  @ loop alternates two register sets so stores overlap the next computation
  20.837 +        vld1.32         {d0-d1},  [r0,:128]!
  20.838 +        vcle.s32        q8,  q1,  #0
  20.839 +        vand            q9,  q0,  q10
  20.840 +        veor            q1,  q1,  q9
  20.841 +        vst1.32         {d24-d25},[r3, :128]!  @ store the previous group: q12 to the first array, q11 to the second
  20.842 +        vst1.32         {d22-d23},[r12,:128]!
  20.843 +        vand            q2,  q1,  q8
  20.844 +        vbic            q3,  q1,  q8
  20.845 +        vadd.f32        q1,  q0,  q2
  20.846 +        vsub.f32        q0,  q0,  q3
  20.847 +        subs            r2,  r2,  #8  @ eight elements accounted for per full pass
  20.848 +        ble             2f
  20.849 +        vld1.32         {d24-d25},[r1,:128]!
  20.850 +        vld1.32         {d22-d23},[r0,:128]!
  20.851 +        vcle.s32        q8,  q12, #0
  20.852 +        vand            q9,  q11, q10
  20.853 +        veor            q12, q12, q9
  20.854 +        vst1.32         {d2-d3},  [r3, :128]!
  20.855 +        vst1.32         {d0-d1},  [r12,:128]!
  20.856 +        vand            q2,  q12, q8
  20.857 +        vbic            q3,  q12, q8
  20.858 +        vadd.f32        q12, q11, q2
  20.859 +        vsub.f32        q11, q11, q3
  20.860 +        b               1b
  20.861 +
  20.862 +2:      vst1.32         {d2-d3},  [r3, :128]!  @ flush the group computed last
  20.863 +        vst1.32         {d0-d1},  [r12,:128]!
  20.864 +        bxlt            lr  @ flags from the subs above: negative means done, zero means one group of four is still left
  20.865 +
  20.866 +3:      vld1.32         {d2-d3},  [r1,:128]  @ final group: loaded without writeback so r0 and r1 can also serve as store addresses
  20.867 +        vld1.32         {d0-d1},  [r0,:128]
  20.868 +        vcle.s32        q8,  q1,  #0
  20.869 +        vand            q9,  q0,  q10
  20.870 +        veor            q1,  q1,  q9
  20.871 +        vand            q2,  q1,  q8
  20.872 +        vbic            q3,  q1,  q8
  20.873 +        vadd.f32        q1,  q0,  q2
  20.874 +        vsub.f32        q0,  q0,  q3
  20.875 +        vst1.32         {d2-d3},  [r0,:128]!
  20.876 +        vst1.32         {d0-d1},  [r1,:128]!
  20.877 +        bx              lr
  20.878 +endfunc
  20.879 +#endif
  20.880 +
  20.881 +function ff_vector_fmul_scalar_neon, export=1  @ multiplies a float array by one scalar; r0 = output, r1 = input; scalar and length arrive in different registers depending on the VFP/NOVFP variant
  20.882 +VFP     len .req r2  @ NOTE(review): VFP/NOVFP prefixes are defined outside this view; presumably they select lines by float argument convention. Here the length is in r2
  20.883 +NOVFP   len .req r3  @ alternative: length is in r3 because r2 carries the scalar
  20.884 +VFP     vdup.32         q8,  d0[0]  @ q8 = scalar replicated to all four lanes
  20.885 +NOVFP   vdup.32         q8,  r2
  20.886 +        bics            r12, len, #15  @ r12 = length rounded down to a multiple of 16
  20.887 +        beq             3f  @ fewer than 16 elements: only the 4-at-a-time loop runs
  20.888 +        vld1.32         {q0},[r1,:128]!
  20.889 +        vld1.32         {q1},[r1,:128]!
  20.890 +1:      vmul.f32        q0,  q0,  q8  @ main loop: 16 elements per pass
  20.891 +        vld1.32         {q2},[r1,:128]!
  20.892 +        vmul.f32        q1,  q1,  q8
  20.893 +        vld1.32         {q3},[r1,:128]!
  20.894 +        vmul.f32        q2,  q2,  q8
  20.895 +        vst1.32         {q0},[r0,:128]!
  20.896 +        vmul.f32        q3,  q3,  q8
  20.897 +        vst1.32         {q1},[r0,:128]!
  20.898 +        subs            r12, r12, #16
  20.899 +        beq             2f  @ last full pass: store without loading ahead
  20.900 +        vld1.32         {q0},[r1,:128]!
  20.901 +        vst1.32         {q2},[r0,:128]!
  20.902 +        vld1.32         {q1},[r1,:128]!
  20.903 +        vst1.32         {q3},[r0,:128]!
  20.904 +        b               1b
  20.905 +2:      vst1.32         {q2},[r0,:128]!
  20.906 +        vst1.32         {q3},[r0,:128]!
  20.907 +        ands            len, len, #15  @ leftover elements after the 16-wide passes
  20.908 +        bxeq            lr
  20.909 +3:      vld1.32         {q0},[r1,:128]!  @ remainder loop: 4 elements per pass
  20.910 +        vmul.f32        q0,  q0,  q8
  20.911 +        vst1.32         {q0},[r0,:128]!
  20.912 +        subs            len, len, #4
  20.913 +        bgt             3b
  20.914 +        bx              lr
  20.915 +        .unreq          len
  20.916 +endfunc
  20.917 +
  20.918 +function ff_vector_fmul_sv_scalar_2_neon, export=1  @ output = input * scalar * small vectors fetched through a pointer table, two floats per table entry; r0 = output, r1 = input, r2 = table of pointers to float pairs, r3 = length after the prologue
  20.919 +VFP     vdup.32         d16, d0[0]  @ NOTE(review): VFP/NOVFP prefixes are defined outside this view; presumably they select lines by float argument convention. Here the scalar comes from d0[0]
  20.920 +NOVFP   vdup.32         d16, r3  @ alternative: scalar arrives in r3 ...
  20.921 +NOVFP   ldr             r3,  [sp]  @ ... and the length is then taken from the first stack word
  20.922 +        vld1.32         {d0},[r1,:64]!
  20.923 +        vld1.32         {d1},[r1,:64]!
  20.924 +1:      subs            r3,  r3,  #4  @ four elements per pass; flags are consumed by the beq below
  20.925 +        vmul.f32        d4,  d0,  d16  @ input * scalar
  20.926 +        vmul.f32        d5,  d1,  d16
  20.927 +        ldr             r12, [r2], #4  @ next pointer from the table
  20.928 +        vld1.32         {d2},[r12,:64]  @ the pair of floats it points to
  20.929 +        ldr             r12, [r2], #4
  20.930 +        vld1.32         {d3},[r12,:64]
  20.931 +        vmul.f32        d4,  d4,  d2  @ times the fetched pair
  20.932 +        vmul.f32        d5,  d5,  d3
  20.933 +        beq             2f  @ last pass: store without loading ahead
  20.934 +        vld1.32         {d0},[r1,:64]!
  20.935 +        vld1.32         {d1},[r1,:64]!
  20.936 +        vst1.32         {d4},[r0,:64]!
  20.937 +        vst1.32         {d5},[r0,:64]!
  20.938 +        b               1b
  20.939 +2:      vst1.32         {d4},[r0,:64]!
  20.940 +        vst1.32         {d5},[r0,:64]!
  20.941 +        bx              lr
  20.942 +endfunc
  20.943 +
  20.944 +function ff_vector_fmul_sv_scalar_4_neon, export=1 /* dst = src * scalar * sv: r0 = output, r1 = float input, r2 = array of pointers, each giving 4 floats */
  20.945 +VFP     vdup.32         q10, d0[0] /* broadcast the scalar to all four lanes of q10 */
  20.946 +NOVFP   vdup.32         q10, r3
  20.947 +NOVFP   ldr             r3,  [sp] /* in this variant the element count comes from the stack */
  20.948 +        push            {lr} /* lr is reused as the main-loop counter, so save the return address */
  20.949 +        bics            lr,  r3,  #7 /* lr = count with the low 3 bits cleared (groups of 8) */
  20.950 +        beq             3f /* no full group of 8: use only the 4-element loop */
  20.951 +        vld1.32         {q0},[r1,:128]!
  20.952 +        vld1.32         {q2},[r1,:128]!
  20.953 +1:      ldr             r12, [r2], #4 /* main loop: 8 elements per iteration, two pointers consumed from r2 */
  20.954 +        vld1.32         {q1},[r12,:128]
  20.955 +        ldr             r12, [r2], #4
  20.956 +        vld1.32         {q3},[r12,:128]
  20.957 +        vmul.f32        q8,  q0,  q10
  20.958 +        vmul.f32        q8,  q8,  q1
  20.959 +        vmul.f32        q9,  q2,  q10
  20.960 +        vmul.f32        q9,  q9,  q3
  20.961 +        subs            lr,  lr,  #8
  20.962 +        beq             2f
  20.963 +        vld1.32         {q0},[r1,:128]!
  20.964 +        vld1.32         {q2},[r1,:128]!
  20.965 +        vst1.32         {q8},[r0,:128]!
  20.966 +        vst1.32         {q9},[r0,:128]!
  20.967 +        b               1b
  20.968 +2:      vst1.32         {q8},[r0,:128]! /* drain: store the last 8 results */
  20.969 +        vst1.32         {q9},[r0,:128]!
  20.970 +        ands            r3,  r3,  #7 /* elements left over after the groups of 8 */
  20.971 +        popeq           {pc} /* nothing left: return */
  20.972 +3:      vld1.32         {q0},[r1,:128]! /* tail loop: 4 elements and one pointer per iteration */
  20.973 +        ldr             r12, [r2], #4
  20.974 +        vld1.32         {q1},[r12,:128]
  20.975 +        vmul.f32        q0,  q0,  q10
  20.976 +        vmul.f32        q0,  q0,  q1
  20.977 +        vst1.32         {q0},[r0,:128]!
  20.978 +        subs            r3,  r3,  #4
  20.979 +        bgt             3b
  20.980 +        pop             {pc}
  20.981 +endfunc
  20.982 +
  20.983 +function ff_sv_fmul_scalar_2_neon, export=1 /* dst = sv * scalar: r0 = output, r1 = array of pointers, each giving 2 floats */
  20.984 +VFP     len .req r2
  20.985 +NOVFP   len .req r3
  20.986 +VFP     vdup.32         q8,  d0[0] /* broadcast the scalar to all four lanes of q8 */
  20.987 +NOVFP   vdup.32         q8,  r2
  20.988 +        ldr             r12, [r1], #4 /* preload: two pointers -> 2 floats each -> q0 (d0,d1) */
  20.989 +        vld1.32         {d0},[r12,:64]
  20.990 +        ldr             r12, [r1], #4
  20.991 +        vld1.32         {d1},[r12,:64]
  20.992 +1:      vmul.f32        q1,  q0,  q8 /* 4 elements per iteration */
  20.993 +        subs            len, len, #4
  20.994 +        beq             2f /* count reached exactly zero: store the last result and return */
  20.995 +        ldr             r12, [r1], #4
  20.996 +        vld1.32         {d0},[r12,:64]
  20.997 +        ldr             r12, [r1], #4
  20.998 +        vld1.32         {d1},[r12,:64]
  20.999 +        vst1.32         {q1},[r0,:128]!
 20.1000 +        b               1b
 20.1001 +2:      vst1.32         {q1},[r0,:128]!
 20.1002 +        bx              lr
 20.1003 +        .unreq          len
 20.1004 +endfunc
 20.1005 +
 20.1006 +function ff_sv_fmul_scalar_4_neon, export=1 /* dst = sv * scalar: r0 = output, r1 = array of pointers, each giving 4 floats */
 20.1007 +VFP     len .req r2
 20.1008 +NOVFP   len .req r3
 20.1009 +VFP     vdup.32         q8,  d0[0] /* broadcast the scalar to all four lanes of q8 */
 20.1010 +NOVFP   vdup.32         q8,  r2
 20.1011 +1:      ldr             r12, [r1], #4 /* fetch the next pointer; 4 elements per iteration */
 20.1012 +        vld1.32         {q0},[r12,:128]
 20.1013 +        vmul.f32        q0,  q0,  q8
 20.1014 +        vst1.32         {q0},[r0,:128]!
 20.1015 +        subs            len, len, #4
 20.1016 +        bgt             1b /* loop while elements remain */
 20.1017 +        bx              lr
 20.1018 +        .unreq          len
 20.1019 +endfunc
 20.1020 +
 20.1021 +function ff_butterflies_float_neon, export=1 /* in place: a = [r0], b = [r1]; writes a+b back to [r0] and a-b back to [r1]; r2 = element count */
 20.1022 +1:      vld1.32         {q0},[r0,:128] /* loads do not advance the pointers; the stores below do */
 20.1023 +        vld1.32         {q1},[r1,:128]
 20.1024 +        vsub.f32        q2,  q0,  q1
 20.1025 +        vadd.f32        q1,  q0,  q1
 20.1026 +        vst1.32         {q2},[r1,:128]! /* difference goes to the second buffer */
 20.1027 +        vst1.32         {q1},[r0,:128]! /* sum goes to the first buffer */
 20.1028 +        subs            r2,  r2,  #4 /* 4 elements per iteration */
 20.1029 +        bgt             1b
 20.1030 +        bx              lr
 20.1031 +endfunc
 20.1032 +
 20.1033 +function ff_scalarproduct_float_neon, export=1 /* sum over i of [r0][i] * [r1][i]; r2 = element count */
 20.1034 +        vmov.f32        q2,  #0.0 /* q2 holds four partial sums */
 20.1035 +1:      vld1.32         {q0},[r0,:128]!
 20.1036 +        vld1.32         {q1},[r1,:128]!
 20.1037 +        vmla.f32        q2,  q0,  q1 /* q2 += q0 * q1, lane by lane */
 20.1038 +        subs            r2,  r2,  #4 /* 4 elements per iteration */
 20.1039 +        bgt             1b
 20.1040 +        vadd.f32        d0,  d4,  d5 /* fold the four partial sums down to two ... */
 20.1041 +        vpadd.f32       d0,  d0,  d0 /* ... and then to one (left in both lanes of d0) */
 20.1042 +NOVFP   vmov.32         r0,  d0[0] /* this variant also copies the result into r0 */
 20.1043 +        bx              lr
 20.1044 +endfunc
 20.1045 +
 20.1046 +function ff_int32_to_float_fmul_scalar_neon, export=1 /* dst[i] = (float)src[i] * scalar: r0 = float output, r1 = signed 32-bit integer input */
 20.1047 +VFP     vdup.32         q0,  d0[0] /* broadcast the scalar to all four lanes of q0 */
 20.1048 +VFP     len     .req    r2
 20.1049 +NOVFP   vdup.32         q0,  r2
 20.1050 +NOVFP   len     .req    r3
 20.1051 +
 20.1052 +        vld1.32         {q1},[r1,:128]! /* preload and convert the first 8 integers */
 20.1053 +        vcvt.f32.s32    q3,  q1
 20.1054 +        vld1.32         {q2},[r1,:128]!
 20.1055 +        vcvt.f32.s32    q8,  q2
 20.1056 +1:      subs            len, len, #8 /* 8 elements per iteration; the flags are consumed by the beq below */
 20.1057 +        pld             [r1, #16]
 20.1058 +        vmul.f32        q9,  q3,  q0
 20.1059 +        vmul.f32        q10, q8,  q0
 20.1060 +        beq             2f /* the loop exits only when the count reaches exactly zero */
 20.1061 +        vld1.32         {q1},[r1,:128]!
 20.1062 +        vcvt.f32.s32    q3,  q1
 20.1063 +        vld1.32         {q2},[r1,:128]!
 20.1064 +        vcvt.f32.s32    q8,  q2
 20.1065 +        vst1.32         {q9}, [r0,:128]!
 20.1066 +        vst1.32         {q10},[r0,:128]!
 20.1067 +        b               1b
 20.1068 +2:      vst1.32         {q9}, [r0,:128]! /* store the last 8 results */
 20.1069 +        vst1.32         {q10},[r0,:128]!
 20.1070 +        bx              lr
 20.1071 +        .unreq  len
 20.1072 +endfunc
 20.1073 +
 20.1074 +function ff_vector_fmul_reverse_neon, export=1 /* dst[i] = src0[i] * src1[len-1-i]: r0 = dst, r1 = src0, r2 = src1, r3 = len */
 20.1075 +        add             r2,  r2,  r3,  lsl #2 /* r2 = one past the end of src1 (len * 4 bytes) */
 20.1076 +        sub             r2,  r2,  #32 /* ... then back up to its last 8 floats */
 20.1077 +        mov             r12, #-32 /* post-index step: src1 is walked backwards 32 bytes at a time */
 20.1078 +        vld1.32         {q0-q1},  [r1,:128]!
 20.1079 +        vld1.32         {q2-q3},  [r2,:128], r12
 20.1080 +1:      pld             [r1, #32]
 20.1081 +        vrev64.32       q3,  q3 /* swap the two floats inside each d register ... */
 20.1082 +        vmul.f32        d16, d0,  d7 /* ... and pair d registers in opposite order, reversing the 8 src1 values */
 20.1083 +        vmul.f32        d17, d1,  d6
 20.1084 +        pld             [r2, #-32]
 20.1085 +        vrev64.32       q2,  q2
 20.1086 +        vmul.f32        d18, d2,  d5
 20.1087 +        vmul.f32        d19, d3,  d4
 20.1088 +        subs            r3,  r3,  #8 /* 8 elements per iteration */
 20.1089 +        beq             2f /* the loop exits only when the count reaches exactly zero */
 20.1090 +        vld1.32         {q0-q1},  [r1,:128]!
 20.1091 +        vld1.32         {q2-q3},  [r2,:128], r12
 20.1092 +        vst1.32         {q8-q9},  [r0,:128]!
 20.1093 +        b               1b
 20.1094 +2:      vst1.32         {q8-q9},  [r0,:128]!
 20.1095 +        bx              lr
 20.1096 +endfunc
 20.1097 +
 20.1098 +function ff_vector_fmul_add_neon, export=1 /* dst[i] = [r1][i] * [r2][i] + [r3][i]; r0 = dst */
 20.1099 +        ldr             r12, [sp] /* element count is read from the stack into r12 */
 20.1100 +        vld1.32         {q0-q1},  [r1,:128]! /* preload the first 8 elements of all three inputs */
 20.1101 +        vld1.32         {q8-q9},  [r2,:128]!
 20.1102 +        vld1.32         {q2-q3},  [r3,:128]!
 20.1103 +        vmul.f32        q10, q0,  q8
 20.1104 +        vmul.f32        q11, q1,  q9
 20.1105 +1:      vadd.f32        q12, q2,  q10 /* add the third input to the products computed for this group */
 20.1106 +        vadd.f32        q13, q3,  q11
 20.1107 +        pld             [r1, #16]
 20.1108 +        pld             [r2, #16]
 20.1109 +        pld             [r3, #16]
 20.1110 +        subs            r12, r12, #8 /* 8 elements per iteration */
 20.1111 +        beq             2f /* the loop exits only when the count reaches exactly zero */
 20.1112 +        vld1.32         {q0},     [r1,:128]! /* load and multiply the next group before storing the current one */
 20.1113 +        vld1.32         {q8},     [r2,:128]!
 20.1114 +        vmul.f32        q10, q0,  q8
 20.1115 +        vld1.32         {q1},     [r1,:128]!
 20.1116 +        vld1.32         {q9},     [r2,:128]!
 20.1117 +        vmul.f32        q11, q1,  q9
 20.1118 +        vld1.32         {q2-q3},  [r3,:128]!
 20.1119 +        vst1.32         {q12-q13},[r0,:128]!
 20.1120 +        b               1b
 20.1121 +2:      vst1.32         {q12-q13},[r0,:128]!
 20.1122 +        bx              lr
 20.1123 +endfunc
 20.1124 +
 20.1125 +function ff_vector_clipf_neon, export=1 /* dst[i] = max(min(src[i], q1), q0): r0 = dst, r1 = src */
 20.1126 +VFP     vdup.32         q1,  d0[1] /* q1 = upper bound (applied with vmin), broadcast to all lanes */
 20.1127 +VFP     vdup.32         q0,  d0[0] /* q0 = lower bound (applied with vmax), broadcast to all lanes */
 20.1128 +NOVFP   vdup.32         q0,  r2
 20.1129 +NOVFP   vdup.32         q1,  r3
 20.1130 +NOVFP   ldr             r2,  [sp] /* in this variant the element count comes from the stack; r2 is the loop counter either way */
 20.1131 +        vld1.f32        {q2},[r1,:128]! /* preload the first 8 inputs and apply the upper bound */
 20.1132 +        vmin.f32        q10, q2,  q1
 20.1133 +        vld1.f32        {q3},[r1,:128]!
 20.1134 +        vmin.f32        q11, q3,  q1
 20.1135 +1:      vmax.f32        q8,  q10, q0 /* apply the lower bound to the current group */
 20.1136 +        vmax.f32        q9,  q11, q0
 20.1137 +        subs            r2,  r2,  #8 /* 8 elements per iteration */
 20.1138 +        beq             2f /* the loop exits only when the count reaches exactly zero */
 20.1139 +        vld1.f32        {q2},[r1,:128]!
 20.1140 +        vmin.f32        q10, q2,  q1
 20.1141 +        vld1.f32        {q3},[r1,:128]!
 20.1142 +        vmin.f32        q11, q3,  q1
 20.1143 +        vst1.f32        {q8},[r0,:128]!
 20.1144 +        vst1.f32        {q9},[r0,:128]!
 20.1145 +        b               1b
 20.1146 +2:      vst1.f32        {q8},[r0,:128]!
 20.1147 +        vst1.f32        {q9},[r0,:128]!
 20.1148 +        bx              lr
 20.1149 +endfunc

    21.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    21.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/dsputil_vfp.S	Mon Aug 27 12:09:56 2012 +0200
    21.3 @@ -0,0 +1,189 @@
    21.4 +/*
    21.5 + * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
    21.6 + *
    21.7 + * This file is part of FFmpeg.
    21.8 + *
    21.9 + * FFmpeg is free software; you can redistribute it and/or
   21.10 + * modify it under the terms of the GNU Lesser General Public
   21.11 + * License as published by the Free Software Foundation; either
   21.12 + * version 2.1 of the License, or (at your option) any later version.
   21.13 + *
   21.14 + * FFmpeg is distributed in the hope that it will be useful,
   21.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   21.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   21.17 + * Lesser General Public License for more details.
   21.18 + *
   21.19 + * You should have received a copy of the GNU Lesser General Public
   21.20 + * License along with FFmpeg; if not, write to the Free Software
   21.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   21.22 + */
   21.23 +
   21.24 +#include "config.h"
   21.25 +#include "asm.S"
   21.26 +
   21.27 +        .syntax unified
   21.28 +/*
    21.29 + * VFP is a floating point coprocessor used in some ARM cores. VFP11 has 1 cycle
    21.30 + * throughput for almost all the instructions (except for double precision
    21.31 + * arithmetic), but rather high latency. Latency is 4 cycles for loads and 8 cycles
    21.32 + * for arithmetic operations. Scheduling code to avoid pipeline stalls is very
    21.33 + * important for performance. One more interesting feature is that VFP has
    21.34 + * independent load/store and arithmetic pipelines, so it is possible to make
    21.35 + * them work simultaneously and get more than 1 operation per cycle. Load/store
    21.36 + * pipeline can process 2 single precision floating point values per cycle and
    21.37 + * supports bulk loads and stores for large sets of registers. Arithmetic operations
    21.38 + * can be done on vectors, which makes it possible to keep the arithmetic pipeline busy,
    21.39 + * while the processor may issue and execute other instructions. Detailed
    21.40 + * optimization manuals can be found at http://www.arm.com
   21.41 + */
   21.42 +
   21.43 +/**
   21.44 + * ARM VFP optimized implementation of 'vector_fmul_c' function.
    21.45 + * Assume that len is a positive number and is a multiple of 8
   21.46 + */
   21.47 +@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
    21.48 +function ff_vector_fmul_vfp, export=1 /* in place: dst[i] = dst[i] * src[i]; r0 = dst, r1 = src, r2 = len */
    21.49 +        vpush           {d8-d15} /* save d8-d15 (s16-s31), which are overwritten below and restored before returning */
    21.50 +        mov             r3,  r0 /* r3 = read pointer into dst; r0 stays the write pointer */
    21.51 +        fmrx            r12, fpscr
    21.52 +        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
    21.53 +        fmxr            fpscr, r12
    21.54 +
    21.55 +        vldmia          r3!, {s0-s3} /* preload the first 8 elements of both operands */
    21.56 +        vldmia          r1!, {s8-s11}
    21.57 +        vldmia          r3!, {s4-s7}
    21.58 +        vldmia          r1!, {s12-s15}
    21.59 +        vmul.f32        s8,  s0,  s8 /* with vector size 4 set above, presumably multiplies s8-s11 by s0-s3 in one instruction -- confirm */
    21.60 +1:
    21.61 +        subs            r2,  r2,  #16 /* up to 16 elements per iteration; ge = a second group of 8 exists, gt = more follow after it */
    21.62 +        vmul.f32        s12, s4,  s12
    21.63 +        vldmiage        r3!, {s16-s19}
    21.64 +        vldmiage        r1!, {s24-s27}
    21.65 +        vldmiage        r3!, {s20-s23}
    21.66 +        vldmiage        r1!, {s28-s31}
    21.67 +        vmulge.f32      s24, s16, s24
    21.68 +        vstmia          r0!, {s8-s11} /* store the first group of 8 results */
    21.69 +        vstmia          r0!, {s12-s15}
    21.70 +        vmulge.f32      s28, s20, s28
    21.71 +        vldmiagt        r3!, {s0-s3} /* preload the next iteration only if elements remain */
    21.72 +        vldmiagt        r1!, {s8-s11}
    21.73 +        vldmiagt        r3!, {s4-s7}
    21.74 +        vldmiagt        r1!, {s12-s15}
    21.75 +        vmulge.f32      s8,  s0,  s8
    21.76 +        vstmiage        r0!, {s24-s27} /* store the second group of 8 results, if it was processed */
    21.77 +        vstmiage        r0!, {s28-s31}
    21.78 +        bgt             1b
    21.79 +
    21.80 +        bic             r12, r12, #(7 << 16) /* set vector size back to 1 */
    21.81 +        fmxr            fpscr, r12
    21.82 +        vpop            {d8-d15}
    21.83 +        bx              lr
    21.84 +endfunc
   21.85 +
   21.86 +/**
   21.87 + * ARM VFP optimized implementation of 'vector_fmul_reverse_c' function.
    21.88 + * Assume that len is a positive number and is a multiple of 8
   21.89 + */
   21.90 +@ void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
   21.91 +@                                 const float *src1, int len)
    21.92 +function ff_vector_fmul_reverse_vfp, export=1 /* dst[i] = src0[i] * src1[len-1-i]; r0 = dst, r1 = src0, r2 = src1, r3 = len */
    21.93 +        vpush           {d8-d15} /* save d8-d15 (s16-s31), which are overwritten below and restored before returning */
    21.94 +        add             r2,  r2,  r3, lsl #2 /* r2 = one past the end of src1; it is read backwards with vldmdb */
    21.95 +        vldmdb          r2!, {s0-s3} /* preload: last 8 floats of src1 and first 8 floats of src0 */
    21.96 +        vldmia          r1!, {s8-s11}
    21.97 +        vldmdb          r2!, {s4-s7}
    21.98 +        vldmia          r1!, {s12-s15}
    21.99 +        vmul.f32        s8,  s3,  s8 /* register pairs run in opposite directions (s3..s0 against s8..s11), which reverses src1 */
   21.100 +        vmul.f32        s9,  s2,  s9
   21.101 +        vmul.f32        s10, s1,  s10
   21.102 +        vmul.f32        s11, s0,  s11
   21.103 +1:
   21.104 +        subs            r3,  r3,  #16 /* up to 16 elements per iteration; ge = a second group of 8 exists, gt = more follow after it */
   21.105 +        vldmdbge        r2!, {s16-s19}
   21.106 +        vmul.f32        s12, s7,  s12
   21.107 +        vldmiage        r1!, {s24-s27}
   21.108 +        vmul.f32        s13, s6,  s13
   21.109 +        vldmdbge        r2!, {s20-s23}
   21.110 +        vmul.f32        s14, s5,  s14
   21.111 +        vldmiage        r1!, {s28-s31}
   21.112 +        vmul.f32        s15, s4,  s15
   21.113 +        vmulge.f32      s24, s19, s24
   21.114 +        vldmdbgt        r2!, {s0-s3}
   21.115 +        vmulge.f32      s25, s18, s25
   21.116 +        vstmia          r0!, {s8-s13} /* first group of 8 results is stored in two pieces: s8-s13 here, s14-s15 below */
   21.117 +        vmulge.f32      s26, s17, s26
   21.118 +        vldmiagt        r1!, {s8-s11}
   21.119 +        vmulge.f32      s27, s16, s27
   21.120 +        vmulge.f32      s28, s23, s28
   21.121 +        vldmdbgt        r2!, {s4-s7}
   21.122 +        vmulge.f32      s29, s22, s29
   21.123 +        vstmia          r0!, {s14-s15}
   21.124 +        vmulge.f32      s30, s21, s30
   21.125 +        vmulge.f32      s31, s20, s31
   21.126 +        vmulge.f32      s8,  s3,  s8
   21.127 +        vldmiagt        r1!, {s12-s15}
   21.128 +        vmulge.f32      s9,  s2,  s9
   21.129 +        vmulge.f32      s10, s1,  s10
   21.130 +        vstmiage        r0!, {s24-s27} /* store the second group of 8 results, if it was processed */
   21.131 +        vmulge.f32      s11, s0,  s11
   21.132 +        vstmiage        r0!, {s28-s31}
   21.133 +        bgt             1b
   21.134 +
   21.135 +        vpop            {d8-d15}
   21.136 +        bx              lr
   21.137 +endfunc
  21.138 +
  21.139 +#if HAVE_ARMV6
  21.140 +/**
  21.141 + * ARM VFP optimized float to int16 conversion.
   21.142 + * Assume that len is a positive number and is a multiple of 8, that the
   21.143 + * destination buffer is at least 4-byte aligned (8-byte alignment is better
   21.144 + * for performance), and that the byte order is little-endian
  21.145 + */
  21.146 +@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
   21.147 +function ff_float_to_int16_vfp, export=1 /* converts floats at [r1] to saturated 16-bit integers at [r0]; r2 = len, 8 elements per iteration */
   21.148 +        push            {r4-r8,lr} /* r4-r8 and lr are used as scratch registers below */
   21.149 +        vpush           {d8-d11} /* save d8-d11 (s16-s23), used as the load buffer */
   21.150 +        vldmia          r1!, {s16-s23} /* preload and convert the first 8 floats */
   21.151 +        vcvt.s32.f32    s0,  s16
   21.152 +        vcvt.s32.f32    s1,  s17
   21.153 +        vcvt.s32.f32    s2,  s18
   21.154 +        vcvt.s32.f32    s3,  s19
   21.155 +        vcvt.s32.f32    s4,  s20
   21.156 +        vcvt.s32.f32    s5,  s21
   21.157 +        vcvt.s32.f32    s6,  s22
   21.158 +        vcvt.s32.f32    s7,  s23
   21.159 +1:
   21.160 +        subs            r2,  r2,  #8 /* flags from here drive the gt-conditional instructions and the final bgt */
   21.161 +        vmov            r3,  r4,  s0, s1 /* move the 8 converted integers into core registers */
   21.162 +        vmov            r5,  r6,  s2, s3
   21.163 +        vmov            r7,  r8,  s4, s5
   21.164 +        vmov            ip,  lr,  s6, s7
   21.165 +        vldmiagt        r1!, {s16-s23} /* load the next 8 floats only if elements remain */
   21.166 +        ssat            r4,  #16, r4 /* saturate each value to the signed 16-bit range */
   21.167 +        ssat            r3,  #16, r3
   21.168 +        ssat            r6,  #16, r6
   21.169 +        ssat            r5,  #16, r5
   21.170 +        pkhbt           r3,  r3,  r4, lsl #16 /* pack two 16-bit values per word: first in the low half, second in the high half */
   21.171 +        pkhbt           r4,  r5,  r6, lsl #16
   21.172 +        vcvtgt.s32.f32  s0,  s16 /* convert the next 8 floats while the current ones are packed */
   21.173 +        vcvtgt.s32.f32  s1,  s17
   21.174 +        vcvtgt.s32.f32  s2,  s18
   21.175 +        vcvtgt.s32.f32  s3,  s19
   21.176 +        vcvtgt.s32.f32  s4,  s20
   21.177 +        vcvtgt.s32.f32  s5,  s21
   21.178 +        vcvtgt.s32.f32  s6,  s22
   21.179 +        vcvtgt.s32.f32  s7,  s23
   21.180 +        ssat            r8,  #16, r8
   21.181 +        ssat            r7,  #16, r7
   21.182 +        ssat            lr,  #16, lr
   21.183 +        ssat            ip,  #16, ip
   21.184 +        pkhbt           r5,  r7,  r8, lsl #16
   21.185 +        pkhbt           r6,  ip,  lr, lsl #16
   21.186 +        stmia           r0!, {r3-r6} /* store 4 words = 8 packed 16-bit results */
   21.187 +        bgt             1b
   21.188 +
   21.189 +        vpop            {d8-d11}
   21.190 +        pop             {r4-r8,pc}
   21.191 +endfunc
  21.192 +#endif

    22.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    22.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/fft_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
    22.3 @@ -0,0 +1,65 @@
    22.4 +/*
    22.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    22.6 + *
    22.7 + * This file is part of FFmpeg.
    22.8 + *
    22.9 + * FFmpeg is free software; you can redistribute it and/or
   22.10 + * modify it under the terms of the GNU Lesser General Public
   22.11 + * License as published by the Free Software Foundation; either
   22.12 + * version 2.1 of the License, or (at your option) any later version.
   22.13 + *
   22.14 + * FFmpeg is distributed in the hope that it will be useful,
   22.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   22.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   22.17 + * Lesser General Public License for more details.
   22.18 + *
   22.19 + * You should have received a copy of the GNU Lesser General Public
   22.20 + * License along with FFmpeg; if not, write to the Free Software
   22.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   22.22 + */
   22.23 +
   22.24 +#include "libavcodec/fft.h"
   22.25 +#include "libavcodec/synth_filter.h"
   22.26 +
    22.27 +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); /* prototypes for the NEON routines installed by the init functions below; none of them is defined in this file */
    22.28 +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
    22.29 +
    22.30 +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
    22.31 +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
    22.32 +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
    22.33 +
    22.34 +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
    22.35 +
    22.36 +void ff_synth_filter_float_neon(FFTContext *imdct,
    22.37 +                                float *synth_buf_ptr, int *synth_buf_offset,
    22.38 +                                float synth_buf2[32], const float window[512],
    22.39 +                                float out[32], const float in[32],
    22.40 +                                float scale, float bias);
   22.41 +
    22.42 +av_cold void ff_fft_init_arm(FFTContext *s) /* points the FFT/MDCT callbacks of *s at the NEON routines when HAVE_NEON is nonzero; otherwise *s is left untouched */
    22.43 +{
    22.44 +    if (HAVE_NEON) { /* HAVE_NEON is not defined in this file; presumably a build-time configuration constant -- confirm */
    22.45 +        s->fft_permute  = ff_fft_permute_neon;
    22.46 +        s->fft_calc     = ff_fft_calc_neon;
    22.47 +        s->imdct_calc   = ff_imdct_calc_neon;
    22.48 +        s->imdct_half   = ff_imdct_half_neon;
    22.49 +        s->mdct_calc    = ff_mdct_calc_neon;
    22.50 +        s->permutation  = FF_MDCT_PERM_INTERLEAVE; /* presumably the data layout the NEON MDCT routines expect -- confirm against fft.h */
    22.51 +    }
    22.52 +}
   22.53 +
   22.54 +#if CONFIG_RDFT
    22.55 +av_cold void ff_rdft_init_arm(RDFTContext *s) /* installs the NEON RDFT routine into *s when HAVE_NEON is nonzero; otherwise *s is left untouched */
    22.56 +{
    22.57 +    if (HAVE_NEON)
    22.58 +        s->rdft_calc    = ff_rdft_calc_neon;
    22.59 +}
   22.60 +#endif
   22.61 +
   22.62 +#if CONFIG_DCA_DECODER
    22.63 +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) /* installs the NEON synthesis filter into *s when HAVE_NEON is nonzero; otherwise *s is left untouched */
    22.64 +{
    22.65 +    if (HAVE_NEON)
    22.66 +        s->synth_filter_float = ff_synth_filter_float_neon;
    22.67 +}
   22.68 +#endif

    23.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    23.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/fft_neon.S	Mon Aug 27 12:09:56 2012 +0200
    23.3 @@ -0,0 +1,371 @@
    23.4 +/*
    23.5 + * ARM NEON optimised FFT
    23.6 + *
    23.7 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    23.8 + * Copyright (c) 2009 Naotoshi Nojiri
    23.9 + *
   23.10 + * This file is part of FFmpeg.
   23.11 + *
   23.12 + * FFmpeg is free software; you can redistribute it and/or
   23.13 + * modify it under the terms of the GNU Lesser General Public
   23.14 + * License as published by the Free Software Foundation; either
   23.15 + * version 2.1 of the License, or (at your option) any later version.
   23.16 + *
   23.17 + * FFmpeg is distributed in the hope that it will be useful,
   23.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   23.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   23.20 + * Lesser General Public License for more details.
   23.21 + *
   23.22 + * You should have received a copy of the GNU Lesser General Public
   23.23 + * License along with FFmpeg; if not, write to the Free Software
   23.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   23.25 + */
   23.26 +
   23.27 +#include "asm.S"
   23.28 +
   23.29 +#define M_SQRT1_2 0.70710678118654752440
   23.30 +
   23.31 +        .text
   23.32 +
    23.33 +function fft4_neon                              @ 4-point transform, in place on the 8 floats at r0
    23.34 +        vld1.32         {d0-d3}, [r0,:128]      @ d0..d3 = (r0,i0) (r1,i1) (r2,i2) (r3,i3)
    23.35 +
    23.36 +        vext.32         q8,  q1,  q1,  #1       @ i2,r3 d17=i3,r2
    23.37 +        vsub.f32        d6,  d0,  d1            @ r0-r1,i0-i1
    23.38 +        vsub.f32        d7,  d16, d17           @ r3-r2,i2-i3
    23.39 +        vadd.f32        d4,  d0,  d1            @ r0+r1,i0+i1
    23.40 +        vadd.f32        d5,  d2,  d3            @ i2+i3,r2+r3
    23.41 +        vadd.f32        d1,  d6,  d7            @ combine the partial sums and differences into the four outputs
    23.42 +        vsub.f32        d3,  d6,  d7
    23.43 +        vadd.f32        d0,  d4,  d5
    23.44 +        vsub.f32        d2,  d4,  d5
    23.45 +
    23.46 +        vst1.32         {d0-d3}, [r0,:128]      @ write the result back over the input
    23.47 +
    23.48 +        bx              lr
    23.49 +endfunc
   23.50 +
    23.51 +function fft8_neon                              @ 8-point transform, in place on the 16 floats at r0
    23.52 +        mov             r1,  r0                 @ r1 walks the input; r0 keeps the base address for the final store
    23.53 +        vld1.32         {d0-d3},   [r1,:128]!   @ first 8 floats
    23.54 +        vld1.32         {d16-d19}, [r1,:128]    @ second 8 floats; r1 is left pointing at them
    23.55 +
    23.56 +        movw            r2,  #0x04f3            @ sqrt(1/2)
    23.57 +        movt            r2,  #0x3f35
    23.58 +        eor             r3,  r2,  #1<<31        @ r3 = r2 with the sign bit flipped, i.e. -sqrt(1/2)
    23.59 +        vdup.32         d31, r2                 @ d31 = {w, w}
    23.60 +
    23.61 +        vext.32         q11, q1,  q1,  #1       @ i2,r3,i3,r2
    23.62 +        vadd.f32        d4,  d16, d17           @ r4+r5,i4+i5
    23.63 +        vmov            d28, r3,  r2            @ d28 = {-w, w}
    23.64 +        vadd.f32        d5,  d18, d19           @ r6+r7,i6+i7
    23.65 +        vsub.f32        d17, d16, d17           @ r4-r5,i4-i5
    23.66 +        vsub.f32        d19, d18, d19           @ r6-r7,i6-i7
    23.67 +        vrev64.32       d29, d28                @ d29 = {w, -w}
    23.68 +        vadd.f32        d20, d0,  d1            @ r0+r1,i0+i1
    23.69 +        vadd.f32        d21, d2,  d3            @ r2+r3,i2+i3
    23.70 +        vmul.f32        d26, d17, d28           @ -a2r*w,a2i*w
    23.71 +        vext.32         q3,  q2,  q2,  #1
    23.72 +        vmul.f32        d27, d19, d29           @ a3r*w,-a3i*w
    23.73 +        vsub.f32        d23, d22, d23           @ i2-i3,r3-r2
    23.74 +        vsub.f32        d22, d0,  d1            @ r0-r1,i0-i1
    23.75 +        vmul.f32        d24, d17, d31           @ a2r*w,a2i*w
    23.76 +        vmul.f32        d25, d19, d31           @ a3r*w,a3i*w
    23.77 +        vadd.f32        d0,  d20, d21
    23.78 +        vsub.f32        d2,  d20, d21
    23.79 +        vadd.f32        d1,  d22, d23
    23.80 +        vrev64.32       q13, q13
    23.81 +        vsub.f32        d3,  d22, d23
    23.82 +        vsub.f32        d6,  d6,  d7
    23.83 +        vadd.f32        d24, d24, d26           @ a2r+a2i,a2i-a2r   t1,t2
    23.84 +        vadd.f32        d25, d25, d27           @ a3r-a3i,a3i+a3r   t5,t6
    23.85 +        vadd.f32        d7,  d4,  d5
    23.86 +        vsub.f32        d18, d2,  d6
    23.87 +        vext.32         q13, q12, q12, #1
    23.88 +        vadd.f32        d2,  d2,  d6
    23.89 +        vsub.f32        d16, d0,  d7
    23.90 +        vadd.f32        d5,  d25, d24
    23.91 +        vsub.f32        d4,  d26, d27
    23.92 +        vadd.f32        d0,  d0,  d7
    23.93 +        vsub.f32        d17, d1,  d5
    23.94 +        vsub.f32        d19, d3,  d4
    23.95 +        vadd.f32        d3,  d3,  d4
    23.96 +        vadd.f32        d1,  d1,  d5
    23.97 +
    23.98 +        vst1.32         {d16-d19}, [r1,:128]    @ second 8 floats of the result
    23.99 +        vst1.32         {d0-d3},   [r0,:128]    @ first 8 floats of the result
   23.100 +
   23.101 +        bx              lr
   23.102 +endfunc
  23.103 +
   23.104 +function fft16_neon                             @ 16-point transform, in place on the 32 floats at r0
   23.105 +        movrel          r1, mppm                @ address of the mppm table (defined outside this chunk)
   23.106 +        vld1.32         {d16-d19}, [r0,:128]!   @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
   23.107 +        pld             [r0, #32]
   23.108 +        vld1.32         {d2-d3}, [r1,:128]
   23.109 +        vext.32         q13, q9,  q9,  #1
   23.110 +        vld1.32         {d22-d25}, [r0,:128]!   @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
   23.111 +        vadd.f32        d4,  d16, d17
   23.112 +        vsub.f32        d5,  d16, d17
   23.113 +        vadd.f32        d18, d18, d19
   23.114 +        vsub.f32        d19, d26, d27
   23.115 +
   23.116 +        vadd.f32        d20, d22, d23
   23.117 +        vsub.f32        d22, d22, d23
   23.118 +        vsub.f32        d23, d24, d25
   23.119 +        vadd.f32        q8,  q2,  q9            @ {r0,i0,r1,i1}
   23.120 +        vadd.f32        d21, d24, d25
   23.121 +        vmul.f32        d24, d22, d2
   23.122 +        vsub.f32        q9,  q2,  q9            @ {r2,i2,r3,i3}
   23.123 +        vmul.f32        d25, d23, d3
   23.124 +        vuzp.32         d16, d17                @ {r0,r1,i0,i1}
   23.125 +        vmul.f32        q1,  q11, d2[1]
   23.126 +        vuzp.32         d18, d19                @ {r2,r3,i2,i3}
   23.127 +        vrev64.32       q12, q12
   23.128 +        vadd.f32        q11, q12, q1            @ {t1a,t2a,t5,t6}
   23.129 +        vld1.32         {d24-d27}, [r0,:128]!   @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
   23.130 +        vzip.32         q10, q11
   23.131 +        vld1.32         {d28-d31}, [r0,:128]    @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
   23.132 +        vadd.f32        d0,  d22, d20
   23.133 +        vadd.f32        d1,  d21, d23
   23.134 +        vsub.f32        d2,  d21, d23
   23.135 +        vsub.f32        d3,  d22, d20
   23.136 +        sub             r0,  r0,  #96           @ rewind r0 to the start: three post-incremented 32-byte loads above
   23.137 +        vext.32         q13, q13, q13, #1
   23.138 +        vsub.f32        q10, q8,  q0            @ {r4,r5,i4,i5}
   23.139 +        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
   23.140 +        vext.32         q15, q15, q15, #1
   23.141 +        vsub.f32        q11, q9,  q1            @ {r6,r7,i6,i7}
   23.142 +        vswp            d25, d26                @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
   23.143 +        vadd.f32        q9,  q9,  q1            @ {r2,r3,i2,i3}
   23.144 +        vswp            d29, d30                @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
   23.145 +        vadd.f32        q0,  q12, q13           @ {t1,t2,t5,t6}
   23.146 +        vadd.f32        q1,  q14, q15           @ {t1a,t2a,t5a,t6a}
   23.147 +        movrel          r2,  X(ff_cos_16)       @ address of the ff_cos_16 table (defined outside this chunk)
   23.148 +        vsub.f32        q13, q12, q13           @ {t3,t4,t7,t8}
   23.149 +        vrev64.32       d1,  d1
   23.150 +        vsub.f32        q15, q14, q15           @ {t3a,t4a,t7a,t8a}
   23.151 +        vrev64.32       d3,  d3
   23.152 +        movrel          r3,  pmmp               @ address of the pmmp table (defined outside this chunk)
   23.153 +        vswp            d1,  d26                @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
   23.154 +        vswp            d3,  d30                @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
   23.155 +        vadd.f32        q12, q0,  q13           @ {r8,i8,r9,i9}
   23.156 +        vadd.f32        q14, q1,  q15           @ {r12,i12,r13,i13}
   23.157 +        vld1.32         {d4-d5},  [r2,:64]
   23.158 +        vsub.f32        q13, q0,  q13           @ {r10,i10,r11,i11}
   23.159 +        vsub.f32        q15, q1,  q15           @ {r14,i14,r15,i15}
   23.160 +        vswp            d25, d28                @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
   23.161 +        vld1.32         {d6-d7},  [r3,:128]
   23.162 +        vrev64.32       q1,  q14
   23.163 +        vmul.f32        q14, q14, d4[1]
   23.164 +        vmul.f32        q1,  q1,  q3
   23.165 +        vmla.f32        q14, q1,  d5[1]         @ {t1a,t2a,t5a,t6a}
   23.166 +        vswp            d27, d30                @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
   23.167 +        vzip.32         q12, q14
   23.168 +        vadd.f32        d0,  d28, d24
   23.169 +        vadd.f32        d1,  d25, d29
   23.170 +        vsub.f32        d2,  d25, d29
   23.171 +        vsub.f32        d3,  d28, d24
   23.172 +        vsub.f32        q12, q8,  q0            @ {r8,r9,i8,i9}
   23.173 +        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
   23.174 +        vsub.f32        q14, q10, q1            @ {r12,r13,i12,i13}
   23.175 +        mov             r1,  #32                @ byte step between the 16-byte stores below
   23.176 +        vadd.f32        q10, q10, q1            @ {r4,r5,i4,i5}
   23.177 +        vrev64.32       q0,  q13
   23.178 +        vmul.f32        q13, q13, d5[0]
   23.179 +        vrev64.32       q1,  q15
   23.180 +        vmul.f32        q15, q15, d5[1]
   23.181 +        vst2.32         {d16-d17},[r0,:128], r1 @ interleaving store: {r,r,i,i} goes out as r,i,r,i at offsets 0, 32, 64, 96
   23.182 +        vmul.f32        q0,  q0,  q3
   23.183 +        vst2.32         {d20-d21},[r0,:128], r1
   23.184 +        vmul.f32        q1,  q1,  q3
   23.185 +        vmla.f32        q13, q0,  d5[0]         @ {t1,t2,t5,t6}
   23.186 +        vmla.f32        q15, q1,  d4[1]         @ {t1a,t2a,t5a,t6a}
   23.187 +        vst2.32         {d24-d25},[r0,:128], r1
   23.188 +        vst2.32         {d28-d29},[r0,:128]
   23.189 +        vzip.32         q13, q15
   23.190 +        sub             r0, r0, #80             @ r0 was at offset 96; move to offset 16 for the second set of stores
   23.191 +        vadd.f32        d0,  d30, d26
   23.192 +        vadd.f32        d1,  d27, d31
   23.193 +        vsub.f32        d2,  d27, d31
   23.194 +        vsub.f32        d3,  d30, d26
   23.195 +        vsub.f32        q13, q9,  q0            @ {r10,r11,i10,i11}
   23.196 +        vadd.f32        q9,  q9,  q0            @ {r2,r3,i2,i3}
   23.197 +        vsub.f32        q15, q11, q1            @ {r14,r15,i14,i15}
   23.198 +        vadd.f32        q11, q11, q1            @ {r6,r7,i6,i7}
   23.199 +        vst2.32         {d18-d19},[r0,:128], r1 @ remaining outputs at offsets 16, 48, 80, 112
   23.200 +        vst2.32         {d22-d23},[r0,:128], r1
   23.201 +        vst2.32         {d26-d27},[r0,:128], r1
   23.202 +        vst2.32         {d30-d31},[r0,:128]
   23.203 +        bx              lr
   23.204 +endfunc
  23.205 +
  23.206 +function fft_pass_neon
  23.207 +        push            {r4-r6,lr}              @ args: r0 = z, r1 = wre, r2 = n
  23.208 +        mov             r6,  r2                 @ n
  23.209 +        lsl             r5,  r2,  #3            @ 2 * n * sizeof FFTSample
  23.210 +        lsl             r4,  r2,  #4            @ 2 * n * sizeof FFTComplex
  23.211 +        lsl             r2,  r2,  #5            @ 4 * n * sizeof FFTComplex
  23.212 +        add             r3,  r2,  r4            @ 6 * n * sizeof FFTComplex
  23.213 +        add             r4,  r4,  r0            @ &z[o1]
  23.214 +        add             r2,  r2,  r0            @ &z[o2]
  23.215 +        add             r3,  r3,  r0            @ &z[o3]
  23.216 +        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
  23.217 +        movrel          r12, pmmp               @ address of the {+1,-1,-1,+1} sign vector
  23.218 +        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
  23.219 +        add             r5,  r5,  r1            @ wim
  23.220 +        vld1.32         {d6-d7},  [r12,:128]    @ pmmp
  23.221 +        vswp            d21, d22                @ q10{z[o2],z[o3]} q11{z[o2+1],z[o3+1]}
  23.222 +        vld1.32         {d4},     [r1,:64]!     @ {wre[0],wre[1]}
  23.223 +        sub             r5,  r5,  #4            @ wim--
  23.224 +        vrev64.32       q1,  q11                @ swap re/im of each complex in q11
  23.225 +        vmul.f32        q11, q11, d4[1]         @ first pass scales q11 only; q10 is used as loaded
  23.226 +        vmul.f32        q1,  q1,  q3            @ apply the pmmp signs
  23.227 +        vld1.32         {d5[0]},  [r5,:32]      @ d5[0] = wim[-1]
  23.228 +        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
  23.229 +        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
  23.230 +        sub             r6, r6, #1              @ n--
  23.231 +        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
  23.232 +        vzip.32         q10, q11                @ interleave the 32-bit lanes of q10 and q11
  23.233 +        vadd.f32        d0,  d22, d20
  23.234 +        vadd.f32        d1,  d21, d23
  23.235 +        vsub.f32        d2,  d21, d23
  23.236 +        vsub.f32        d3,  d22, d20
  23.237 +        vsub.f32        q10, q8,  q0
  23.238 +        vadd.f32        q8,  q8,  q0
  23.239 +        vsub.f32        q11, q9,  q1
  23.240 +        vadd.f32        q9,  q9,  q1
  23.241 +        vst2.32         {d20-d21},[r2,:128]!    @ {z[o2],z[o2+1]}
  23.242 +        vst2.32         {d16-d17},[r0,:128]!    @ {z[0],z[1]}
  23.243 +        vst2.32         {d22-d23},[r3,:128]!    @ {z[o3],z[o3+1]}
  23.244 +        vst2.32         {d18-d19},[r4,:128]!    @ {z[o1],z[o1+1]}
  23.245 +        sub             r5,  r5,  #8            @ wim -= 2
  23.246 +1:                                              @ main loop: two complex values per sub-block per pass
  23.247 +        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
  23.248 +        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
  23.249 +        vswp            d21, d22                @ q10{z[o2],z[o3]} q11{z[o2+1],z[o3+1]}
  23.250 +        vld1.32         {d4}, [r1]!             @ {wre[0],wre[1]}
  23.251 +        vrev64.32       q0,  q10                @ swap re/im of each complex in q10
  23.252 +        vmul.f32        q10, q10, d4[0]
  23.253 +        vrev64.32       q1,  q11                @ swap re/im of each complex in q11
  23.254 +        vmul.f32        q11, q11, d4[1]
  23.255 +        vld1.32         {d5}, [r5]              @ {wim[-1],wim[0]}
  23.256 +        vmul.f32        q0,  q0,  q3            @ apply the pmmp signs
  23.257 +        sub             r5,  r5,  #8            @ wim -= 2
  23.258 +        vmul.f32        q1,  q1,  q3            @ apply the pmmp signs
  23.259 +        vmla.f32        q10, q0,  d5[1]         @ {t1,t2,t5,t6}
  23.260 +        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
  23.261 +        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
  23.262 +        subs            r6,  r6,  #1            @ n--
  23.263 +        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
  23.264 +        vzip.32         q10, q11                @ interleave the 32-bit lanes of q10 and q11
  23.265 +        vadd.f32        d0,  d22, d20
  23.266 +        vadd.f32        d1,  d21, d23
  23.267 +        vsub.f32        d2,  d21, d23
  23.268 +        vsub.f32        d3,  d22, d20
  23.269 +        vsub.f32        q10, q8,  q0
  23.270 +        vadd.f32        q8,  q8,  q0
  23.271 +        vsub.f32        q11, q9,  q1
  23.272 +        vadd.f32        q9,  q9,  q1
  23.273 +        vst2.32         {d20-d21}, [r2,:128]!   @ {z[o2],z[o2+1]}
  23.274 +        vst2.32         {d16-d17}, [r0,:128]!   @ {z[0],z[1]}
  23.275 +        vst2.32         {d22-d23}, [r3,:128]!   @ {z[o3],z[o3+1]}
  23.276 +        vst2.32         {d18-d19}, [r4,:128]!   @ {z[o1],z[o1+1]}
  23.277 +        bne             1b                      @ flags come from the subs on r6 above
  23.278 +
  23.279 +        pop             {r4-r6,pc}              @ restore r4-r6 and return
  23.280 +endfunc
  23.281 +
  23.282 +.macro  def_fft n, n2, n4
  23.283 +        .align 6
  23.284 +function fft\n\()_neon
  23.285 +        push            {r4, lr}                @ r0 = z; built from one n2-point and two n4-point calls
  23.286 +        mov             r4,  r0                 @ keep z across the calls
  23.287 +        bl              fft\n2\()_neon          @ n2-point routine on z[0] onwards
  23.288 +        add             r0,  r4,  #\n4*2*8      @ &z[2*n4], 8 bytes per FFTComplex
  23.289 +        bl              fft\n4\()_neon
  23.290 +        add             r0,  r4,  #\n4*3*8      @ &z[3*n4]
  23.291 +        bl              fft\n4\()_neon
  23.292 +        mov             r0,  r4                 @ z again, for fft_pass_neon
  23.293 +        pop             {r4, lr}                @ restore before the tail call
  23.294 +        movrel          r1,  X(ff_cos_\n)       @ r1 = ff_cos table of this size (wre in fft_pass_neon)
  23.295 +        mov             r2,  #\n4/2             @ r2 = n argument of fft_pass_neon
  23.296 +        b               fft_pass_neon           @ tail call: fft_pass_neon returns to our caller
  23.297 +endfunc
  23.298 +.endm
  23.299 +
  23.300 +        def_fft    32,    16,     8
  23.301 +        def_fft    64,    32,    16
  23.302 +        def_fft   128,    64,    32
  23.303 +        def_fft   256,   128,    64
  23.304 +        def_fft   512,   256,   128
  23.305 +        def_fft  1024,   512,   256
  23.306 +        def_fft  2048,  1024,   512
  23.307 +        def_fft  4096,  2048,  1024
  23.308 +        def_fft  8192,  4096,  2048
  23.309 +        def_fft 16384,  8192,  4096
  23.310 +        def_fft 32768, 16384,  8192
  23.311 +        def_fft 65536, 32768, 16384
  23.312 +
  23.313 +function ff_fft_calc_neon, export=1
  23.314 +        ldr             r2,  [r0]               @ nbits, same field ff_fft_permute_neon reads (r0 = context)
  23.315 +        sub             r2,  r2,  #2            @ fft_tab_neon starts at fft4_neon, i.e. nbits == 2
  23.316 +        movrel          r3,  fft_tab_neon
  23.317 +        ldr             r3,  [r3, r2, lsl #2]   @ r3 = fft_tab_neon[nbits - 2]
  23.318 +        mov             r0,  r1                 @ second argument becomes r0 for the selected routine
  23.319 +        bx              r3                      @ tail call: the routine returns to our caller
  23.320 +endfunc
  23.321 +
  23.322 +function ff_fft_permute_neon, export=1
  23.323 +        push            {r4,lr}                 @ r0 = context, r1 = z
  23.324 +        mov             r12, #1
  23.325 +        ldr             r2,  [r0]       @ nbits
  23.326 +        ldr             r3,  [r0, #20]  @ tmp_buf
  23.327 +        ldr             r0,  [r0, #8]   @ revtab
  23.328 +        lsl             r12, r12, r2            @ r12 = 1 << nbits, the element count
  23.329 +        mov             r2,  r12                @ second copy of the count for the copy-back loop
  23.330 +1:                                              @ scatter z into tmp_buf, two elements per pass
  23.331 +        vld1.32         {d0-d1}, [r1,:128]!     @ next two 8-byte elements of z
  23.332 +        ldr             r4,  [r0], #4           @ two 16-bit revtab entries in one word
  23.333 +        uxth            lr,  r4                 @ low halfword: index used for d0
  23.334 +        uxth            r4,  r4,  ror #16       @ high halfword: index used for d1
  23.335 +        add             lr,  r3,  lr,  lsl #3   @ tmp_buf + 8 * index
  23.336 +        add             r4,  r3,  r4,  lsl #3   @ tmp_buf + 8 * index
  23.337 +        vst1.32         {d0}, [lr,:64]
  23.338 +        vst1.32         {d1}, [r4,:64]
  23.339 +        subs            r12, r12, #2
  23.340 +        bgt             1b
  23.341 +
  23.342 +        sub             r1,  r1,  r2,  lsl #3   @ rewind r1 to the start of z
  23.343 +1:                                              @ copy tmp_buf back over z, four elements per pass
  23.344 +        vld1.32         {d0-d3}, [r3,:128]!
  23.345 +        vst1.32         {d0-d3}, [r1,:128]!
  23.346 +        subs            r2,  r2,  #4
  23.347 +        bgt             1b
  23.348 +
  23.349 +        pop             {r4,pc}                 @ restore r4 and return
  23.350 +endfunc
  23.351 +
  23.352 +        .section .rodata
  23.353 +        .align 4
  23.354 +fft_tab_neon:                                   @ entry i serves nbits == i + 2 (see ff_fft_calc_neon)
  23.355 +        .word fft4_neon
  23.356 +        .word fft8_neon
  23.357 +        .word fft16_neon
  23.358 +        .word fft32_neon
  23.359 +        .word fft64_neon
  23.360 +        .word fft128_neon
  23.361 +        .word fft256_neon
  23.362 +        .word fft512_neon
  23.363 +        .word fft1024_neon
  23.364 +        .word fft2048_neon
  23.365 +        .word fft4096_neon
  23.366 +        .word fft8192_neon
  23.367 +        .word fft16384_neon
  23.368 +        .word fft32768_neon
  23.369 +        .word fft65536_neon
  23.370 +        .size fft_tab_neon, . - fft_tab_neon
  23.371 +
  23.372 +        .align 4                                @ 16-byte alignment, required by the :128 loads of pmmp
  23.373 +pmmp:   .float  +1.0, -1.0, -1.0, +1.0          @ sign vector loaded into q3 by the FFT routines
  23.374 +mppm:   .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2    @ opposite sign pattern, scaled by the M_SQRT1_2 constant

    24.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    24.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
    24.3 @@ -0,0 +1,126 @@
    24.4 +/*
    24.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
    24.6 + *
    24.7 + * This file is part of FFmpeg.
    24.8 + *
    24.9 + * FFmpeg is free software; you can redistribute it and/or
   24.10 + * modify it under the terms of the GNU Lesser General Public
   24.11 + * License as published by the Free Software Foundation; either
   24.12 + * version 2.1 of the License, or (at your option) any later version.
   24.13 + *
   24.14 + * FFmpeg is distributed in the hope that it will be useful,
   24.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   24.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   24.17 + * Lesser General Public License for more details.
   24.18 + *
   24.19 + * You should have received a copy of the GNU Lesser General Public
   24.20 + * License along with FFmpeg; if not, write to the Free Software
   24.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   24.22 + */
   24.23 +
   24.24 +#include <stdint.h>
   24.25 +
   24.26 +#include "libavcodec/dsputil.h"
   24.27 +#include "libavcodec/h264dsp.h"
   24.28 +/* NEON deblocking filters, installed as c->h264_{v,h}_loop_filter_{luma,chroma}. */
   24.29 +void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
   24.30 +                                     int beta, int8_t *tc0);
   24.31 +void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
   24.32 +                                     int beta, int8_t *tc0);
   24.33 +void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
   24.34 +                                       int beta, int8_t *tc0);
   24.35 +void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
   24.36 +                                       int beta, int8_t *tc0);
   24.37 +/* Kernels for c->weight_h264_pixels_tab[0..7], block sizes 16x16 down to 4x2. */
   24.38 +void ff_weight_h264_pixels_16x16_neon(uint8_t *dst, int stride, int log2_den,
   24.39 +                                      int weight, int offset);
   24.40 +void ff_weight_h264_pixels_16x8_neon(uint8_t *dst, int stride, int log2_den,
   24.41 +                                     int weight, int offset);
   24.42 +void ff_weight_h264_pixels_8x16_neon(uint8_t *dst, int stride, int log2_den,
   24.43 +                                     int weight, int offset);
   24.44 +void ff_weight_h264_pixels_8x8_neon(uint8_t *dst, int stride, int log2_den,
   24.45 +                                    int weight, int offset);
   24.46 +void ff_weight_h264_pixels_8x4_neon(uint8_t *dst, int stride, int log2_den,
   24.47 +                                    int weight, int offset);
   24.48 +void ff_weight_h264_pixels_4x8_neon(uint8_t *dst, int stride, int log2_den,
   24.49 +                                    int weight, int offset);
   24.50 +void ff_weight_h264_pixels_4x4_neon(uint8_t *dst, int stride, int log2_den,
   24.51 +                                    int weight, int offset);
   24.52 +void ff_weight_h264_pixels_4x2_neon(uint8_t *dst, int stride, int log2_den,
   24.53 +                                    int weight, int offset);
   24.54 +/* Kernels for c->biweight_h264_pixels_tab[0..7], same block-size order. */
   24.55 +void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
   24.56 +                                        int log2_den, int weightd, int weights,
   24.57 +                                        int offset);
   24.58 +void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
   24.59 +                                       int log2_den, int weightd, int weights,
   24.60 +                                       int offset);
   24.61 +void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
   24.62 +                                       int log2_den, int weightd, int weights,
   24.63 +                                       int offset);
   24.64 +void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
   24.65 +                                      int log2_den, int weightd, int weights,
   24.66 +                                      int offset);
   24.67 +void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
   24.68 +                                      int log2_den, int weightd, int weights,
   24.69 +                                      int offset);
   24.70 +void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
   24.71 +                                      int log2_den, int weightd, int weights,
   24.72 +                                      int offset);
   24.73 +void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
   24.74 +                                      int log2_den, int weightd, int weights,
   24.75 +                                      int offset);
   24.76 +void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
   24.77 +                                      int log2_den, int weightd, int weights,
   24.78 +                                      int offset);
   24.79 +/* IDCT-and-add kernels, installed as c->h264_idct_*. */
   24.80 +void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
   24.81 +void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
   24.82 +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
   24.83 +                             DCTELEM *block, int stride,
   24.84 +                             const uint8_t nnzc[6*8]);
   24.85 +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
   24.86 +                                  DCTELEM *block, int stride,
   24.87 +                                  const uint8_t nnzc[6*8]);
   24.88 +void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
   24.89 +                            DCTELEM *block, int stride,
   24.90 +                            const uint8_t nnzc[6*8]);
   24.91 +
   24.92 +#if HAVE_NEON
   24.93 +/* Point each H264DSPContext hook that has a NEON kernel at that kernel. */
   24.94 +static void ff_h264dsp_init_neon(H264DSPContext *c)
   24.95 +{
   24.96 +    c->h264_idct_add8       = ff_h264_idct_add8_neon;
   24.97 +    c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
   24.98 +    c->h264_idct_add16      = ff_h264_idct_add16_neon;
   24.99 +    c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
  24.100 +    c->h264_idct_add        = ff_h264_idct_add_neon;
  24.101 +    /* weight and biweight tables share one index order: 16x16 .. 4x2 */
  24.102 +    c->weight_h264_pixels_tab[0]   = ff_weight_h264_pixels_16x16_neon;
  24.103 +    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
  24.104 +    c->weight_h264_pixels_tab[1]   = ff_weight_h264_pixels_16x8_neon;
  24.105 +    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
  24.106 +    c->weight_h264_pixels_tab[2]   = ff_weight_h264_pixels_8x16_neon;
  24.107 +    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
  24.108 +    c->weight_h264_pixels_tab[3]   = ff_weight_h264_pixels_8x8_neon;
  24.109 +    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
  24.110 +    c->weight_h264_pixels_tab[4]   = ff_weight_h264_pixels_8x4_neon;
  24.111 +    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
  24.112 +    c->weight_h264_pixels_tab[5]   = ff_weight_h264_pixels_4x8_neon;
  24.113 +    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
  24.114 +    c->weight_h264_pixels_tab[6]   = ff_weight_h264_pixels_4x4_neon;
  24.115 +    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
  24.116 +    c->weight_h264_pixels_tab[7]   = ff_weight_h264_pixels_4x2_neon;
  24.117 +    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
  24.118 +
  24.119 +    c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
  24.120 +    c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
  24.121 +    c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
  24.122 +    c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
  24.123 +}
  24.124 +#endif
  24.125 +
  24.126 +void ff_h264dsp_init_arm(H264DSPContext *c)
  24.127 +{
  24.128 +    if (HAVE_NEON) ff_h264dsp_init_neon(c);
  24.129 +}

    25.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    25.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264dsp_neon.S	Mon Aug 27 12:09:56 2012 +0200
    25.3 @@ -0,0 +1,1883 @@
    25.4 +/*
    25.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
    25.6 + *
    25.7 + * This file is part of FFmpeg.
    25.8 + *
    25.9 + * FFmpeg is free software; you can redistribute it and/or
   25.10 + * modify it under the terms of the GNU Lesser General Public
   25.11 + * License as published by the Free Software Foundation; either
   25.12 + * version 2.1 of the License, or (at your option) any later version.
   25.13 + *
   25.14 + * FFmpeg is distributed in the hope that it will be useful,
   25.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   25.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   25.17 + * Lesser General Public License for more details.
   25.18 + *
   25.19 + * You should have received a copy of the GNU Lesser General Public
   25.20 + * License along with FFmpeg; if not, write to the Free Software
   25.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   25.22 + */
   25.23 +
   25.24 +#include "asm.S"
   25.25 +
   25.26 +        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
   25.27 +        vtrn.32         \r0, \r4                @ stage 1: 32-bit elements, operands i and i+4
   25.28 +        vtrn.32         \r1, \r5
   25.29 +        vtrn.32         \r2, \r6
   25.30 +        vtrn.32         \r3, \r7
   25.31 +        vtrn.16         \r0, \r2                @ stage 2: 16-bit elements, operands i and i+2
   25.32 +        vtrn.16         \r1, \r3
   25.33 +        vtrn.16         \r4, \r6
   25.34 +        vtrn.16         \r5, \r7
   25.35 +        vtrn.8          \r0, \r1                @ stage 3: bytes, adjacent operands
   25.36 +        vtrn.8          \r2, \r3
   25.37 +        vtrn.8          \r4, \r5
   25.38 +        vtrn.8          \r6, \r7
   25.39 +        .endm
   25.40 +
   25.41 +        .macro transpose_4x4 r0 r1 r2 r3
   25.42 +        vtrn.16         \r0, \r2                @ stage 1: 16-bit elements, operands i and i+2
   25.43 +        vtrn.16         \r1, \r3
   25.44 +        vtrn.8          \r0, \r1                @ stage 2: bytes, adjacent operands
   25.45 +        vtrn.8          \r2, \r3
   25.46 +        .endm
   25.47 +
   25.48 +        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
   25.49 +        vswp            \r0, \r4                @ exchange operands 0-3 with operands 4-7, pairwise
   25.50 +        vswp            \r1, \r5
   25.51 +        vswp            \r2, \r6
   25.52 +        vswp            \r3, \r7
   25.53 +        .endm
   25.54 +
   25.55 +        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
   25.56 +        vtrn.32         \r0, \r2                @ stage 1: 32-bit elements, operands i and i+2
   25.57 +        vtrn.32         \r1, \r3
   25.58 +        vtrn.32         \r4, \r6                @ same for the second group of four operands
   25.59 +        vtrn.32         \r5, \r7
   25.60 +        vtrn.16         \r0, \r1                @ stage 2: 16-bit elements, adjacent operands
   25.61 +        vtrn.16         \r2, \r3
   25.62 +        vtrn.16         \r4, \r5
   25.63 +        vtrn.16         \r6, \r7
   25.64 +        .endm
   25.65 +
   25.66 +/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
   25.67 +        .macro  h264_chroma_mc8 type
   25.68 +function ff_\type\()_h264_chroma_mc8_neon, export=1
   25.69 +        push            {r4-r7, lr}             @ r0 = dst, r1 = src, r2 = stride, r3 = h
   25.70 +        ldrd            r4,  [sp, #20]          @ r4 = x, r5 = y (stack args above the 5 saved regs)
   25.71 +.ifc \type,avg
   25.72 +        mov             lr,  r0                 @ avg only: lr reads the existing dst rows
   25.73 +.endif
   25.74 +        pld             [r1]
   25.75 +        pld             [r1, r2]
   25.76 +
   25.77 +        muls            r7,  r4,  r5            @ r7 = x*y; Z set when x*y == 0
   25.78 +        rsb             r6,  r7,  r5,  lsl #3   @ r6 = 8*y - x*y
   25.79 +        rsb             ip,  r7,  r4,  lsl #3   @ ip = 8*x - x*y
   25.80 +        sub             r4,  r7,  r4,  lsl #3   @ r4 = x*y - 8*x
   25.81 +        sub             r4,  r4,  r5,  lsl #3   @ r4 = x*y - 8*x - 8*y
   25.82 +        add             r4,  r4,  #64           @ r4 = (8-x)*(8-y)
   25.83 +
   25.84 +        beq             2f                      @ x*y == 0 (flags from muls): 1-D paths below
   25.85 +
   25.86 +        add             r5,  r1,  r2            @ r5 = src + stride
   25.87 +
   25.88 +        vdup.8          d0,  r4                 @ d0..d3 = the four weights, one per register
   25.89 +        lsl             r4,  r2,  #1            @ r4 = 2*stride, step for both row pointers
   25.90 +        vdup.8          d1,  ip
   25.91 +        vld1.64         {d4, d5}, [r1], r4
   25.92 +        vdup.8          d2,  r6
   25.93 +        vld1.64         {d6, d7}, [r5], r4
   25.94 +        vdup.8          d3,  r7
   25.95 +
   25.96 +        vext.8          d5,  d4,  d5,  #1       @ d5 = same row starting one byte later
   25.97 +        vext.8          d7,  d6,  d7,  #1       @ d7 = same row starting one byte later
   25.98 +
   25.99 +1:      pld             [r5]                    @ 2-D loop: two output rows per pass
  25.100 +        vmull.u8        q8,  d4,  d0
  25.101 +        vmlal.u8        q8,  d5,  d1
  25.102 +        vld1.64         {d4, d5}, [r1], r4
  25.103 +        vmlal.u8        q8,  d6,  d2
  25.104 +        vext.8          d5,  d4,  d5,  #1
  25.105 +        vmlal.u8        q8,  d7,  d3
  25.106 +        vmull.u8        q9,  d6,  d0
  25.107 +        subs            r3,  r3,  #2            @ h -= 2; sets the flags tested by bgt below
  25.108 +        vmlal.u8        q9,  d7,  d1
  25.109 +        vmlal.u8        q9,  d4,  d2
  25.110 +        vmlal.u8        q9,  d5,  d3
  25.111 +        vrshrn.u16      d16, q8,  #6            @ (sum + 32) >> 6, narrowed to bytes
  25.112 +        vld1.64         {d6, d7}, [r5], r4
  25.113 +        pld             [r1]
  25.114 +        vrshrn.u16      d17, q9,  #6            @ (sum + 32) >> 6, narrowed to bytes
  25.115 +.ifc \type,avg
  25.116 +        vld1.64         {d20}, [lr,:64], r2
  25.117 +        vld1.64         {d21}, [lr,:64], r2
  25.118 +        vrhadd.u8       q8,  q8,  q10           @ rounding average with the existing dst rows
  25.119 +.endif
  25.120 +        vext.8          d7,  d6,  d7,  #1
  25.121 +        vst1.64         {d16}, [r0,:64], r2
  25.122 +        vst1.64         {d17}, [r0,:64], r2
  25.123 +        bgt             1b
  25.124 +
  25.125 +        pop             {r4-r7, pc}
  25.126 +
  25.127 +2:      tst             r6,  r6                 @ x*y == 0 here, so r6 = 8*y: Z set when y == 0
  25.128 +        add             ip,  ip,  r6            @ ip = 8*x + 8*y, weight of the second tap
  25.129 +        vdup.8          d0,  r4
  25.130 +        vdup.8          d1,  ip
  25.131 +
  25.132 +        beq             4f                      @ y == 0: filter along the row only
  25.133 +
  25.134 +        add             r5,  r1,  r2            @ r5 = src + stride
  25.135 +        lsl             r4,  r2,  #1            @ r4 = 2*stride
  25.136 +        vld1.64         {d4}, [r1], r4
  25.137 +        vld1.64         {d6}, [r5], r4
  25.138 +
  25.139 +3:      pld             [r5]                    @ x == 0 loop: blend each row with the next one
  25.140 +        vmull.u8        q8,  d4,  d0
  25.141 +        vmlal.u8        q8,  d6,  d1
  25.142 +        vld1.64         {d4}, [r1], r4
  25.143 +        vmull.u8        q9,  d6,  d0
  25.144 +        vmlal.u8        q9,  d4,  d1
  25.145 +        vld1.64         {d6}, [r5], r4
  25.146 +        vrshrn.u16      d16, q8,  #6
  25.147 +        vrshrn.u16      d17, q9,  #6
  25.148 +.ifc \type,avg
  25.149 +        vld1.64         {d20}, [lr,:64], r2
  25.150 +        vld1.64         {d21}, [lr,:64], r2
  25.151 +        vrhadd.u8       q8,  q8,  q10           @ rounding average with the existing dst rows
  25.152 +.endif
  25.153 +        subs            r3,  r3,  #2
  25.154 +        pld             [r1]
  25.155 +        vst1.64         {d16}, [r0,:64], r2
  25.156 +        vst1.64         {d17}, [r0,:64], r2
  25.157 +        bgt             3b
  25.158 +
  25.159 +        pop             {r4-r7, pc}
  25.160 +
  25.161 +4:      vld1.64         {d4, d5}, [r1], r2      @ y == 0 path: rows are read one at a time
  25.162 +        vld1.64         {d6, d7}, [r1], r2
  25.163 +        vext.8          d5,  d4,  d5,  #1       @ d5 = same row starting one byte later
  25.164 +        vext.8          d7,  d6,  d7,  #1       @ d7 = same row starting one byte later
  25.165 +
  25.166 +5:      pld             [r1]                    @ y == 0 loop: blend each byte with its right neighbour
  25.167 +        subs            r3,  r3,  #2
  25.168 +        vmull.u8        q8,  d4,  d0
  25.169 +        vmlal.u8        q8,  d5,  d1
  25.170 +        vld1.64         {d4, d5}, [r1], r2
  25.171 +        vmull.u8        q9,  d6,  d0
  25.172 +        vmlal.u8        q9,  d7,  d1
  25.173 +        pld             [r1]
  25.174 +        vext.8          d5,  d4,  d5,  #1
  25.175 +        vrshrn.u16      d16, q8,  #6
  25.176 +        vrshrn.u16      d17, q9,  #6
  25.177 +.ifc \type,avg
  25.178 +        vld1.64         {d20}, [lr,:64], r2
  25.179 +        vld1.64         {d21}, [lr,:64], r2
  25.180 +        vrhadd.u8       q8,  q8,  q10           @ rounding average with the existing dst rows
  25.181 +.endif
  25.182 +        vld1.64         {d6, d7}, [r1], r2
  25.183 +        vext.8          d7,  d6,  d7,  #1
  25.184 +        vst1.64         {d16}, [r0,:64], r2
  25.185 +        vst1.64         {d17}, [r0,:64], r2
  25.186 +        bgt             5b
  25.187 +
  25.188 +        pop             {r4-r7, pc}
  25.189 +endfunc
  25.190 +        .endm
  25.191 +
  25.192 +/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
  25.193 +        .macro  h264_chroma_mc4 type
  25.194 +function ff_\type\()_h264_chroma_mc4_neon, export=1
  25.195 +        push            {r4-r7, lr}             @ r0 = dst, r1 = src, r2 = stride, r3 = h
  25.196 +        ldrd            r4,  [sp, #20]          @ r4 = x, r5 = y (stack args above the 5 saved regs)
  25.197 +.ifc \type,avg
  25.198 +        mov             lr,  r0                 @ avg only: lr reads the existing dst rows
  25.199 +.endif
  25.200 +        pld             [r1]
  25.201 +        pld             [r1, r2]
  25.202 +
  25.203 +        muls            r7,  r4,  r5            @ r7 = x*y; Z set when x*y == 0
  25.204 +        rsb             r6,  r7,  r5,  lsl #3   @ r6 = 8*y - x*y
  25.205 +        rsb             ip,  r7,  r4,  lsl #3   @ ip = 8*x - x*y
  25.206 +        sub             r4,  r7,  r4,  lsl #3   @ r4 = x*y - 8*x
  25.207 +        sub             r4,  r4,  r5,  lsl #3   @ r4 = x*y - 8*x - 8*y
  25.208 +        add             r4,  r4,  #64           @ r4 = (8-x)*(8-y)
  25.209 +
  25.210 +        beq             2f                      @ x*y == 0 (flags from muls): 1-D paths below
  25.211 +
  25.212 +        add             r5,  r1,  r2            @ r5 = src + stride
  25.213 +
  25.214 +        vdup.8          d0,  r4
  25.215 +        lsl             r4,  r2,  #1            @ r4 = 2*stride, step for both row pointers
  25.216 +        vdup.8          d1,  ip
  25.217 +        vld1.64         {d4},     [r1], r4
  25.218 +        vdup.8          d2,  r6
  25.219 +        vld1.64         {d6},     [r5], r4
  25.220 +        vdup.8          d3,  r7
  25.221 +
  25.222 +        vext.8          d5,  d4,  d5,  #1       @ d5 = row starting one byte later
  25.223 +        vext.8          d7,  d6,  d7,  #1       @ d7 = row starting one byte later
  25.224 +        vtrn.32         d4,  d5                 @ d4 = {row[0..3], row[1..4]}
  25.225 +        vtrn.32         d6,  d7                 @ d6 = {row[0..3], row[1..4]} of the next row
  25.226 +
  25.227 +        vtrn.32         d0,  d1                 @ d0 = {(8-x)*(8-y) x4, (8*x - x*y) x4}
  25.228 +        vtrn.32         d2,  d3                 @ d2 = {(8*y - x*y) x4, x*y x4}
  25.229 +
  25.230 +1:      pld             [r5]                    @ 2-D loop: two 4-byte output rows per pass
  25.231 +        vmull.u8        q8,  d4,  d0
  25.232 +        vmlal.u8        q8,  d6,  d2
  25.233 +        vld1.64         {d4},     [r1], r4
  25.234 +        vext.8          d5,  d4,  d5,  #1
  25.235 +        vtrn.32         d4,  d5
  25.236 +        vmull.u8        q9,  d6,  d0
  25.237 +        vmlal.u8        q9,  d4,  d2
  25.238 +        vld1.64         {d6},     [r5], r4
  25.239 +        vadd.i16        d16, d16, d17           @ add the two 4-lane halves: all four taps, first row
  25.240 +        vadd.i16        d17, d18, d19           @ same for the second row
  25.241 +        vrshrn.u16      d16, q8,  #6            @ (sum + 32) >> 6; d16 = both rows, 4 bytes each
  25.242 +        subs            r3,  r3,  #2            @ h -= 2; sets the flags tested by bgt below
  25.243 +        pld             [r1]
  25.244 +.ifc \type,avg
  25.245 +        vld1.32         {d20[0]}, [lr,:32], r2
  25.246 +        vld1.32         {d20[1]}, [lr,:32], r2
  25.247 +        vrhadd.u8       d16, d16, d20           @ rounding average with the existing dst rows
  25.248 +.endif
  25.249 +        vext.8          d7,  d6,  d7,  #1
  25.250 +        vtrn.32         d6,  d7
  25.251 +        vst1.32         {d16[0]}, [r0,:32], r2
  25.252 +        vst1.32         {d16[1]}, [r0,:32], r2
  25.253 +        bgt             1b
  25.254 +
  25.255 +        pop             {r4-r7, pc}
  25.256 +
  25.257 +2:      tst             r6,  r6                 @ x*y == 0 here, so r6 = 8*y: Z set when y == 0
  25.258 +        add             ip,  ip,  r6            @ ip = 8*x + 8*y, weight of the second tap
  25.259 +        vdup.8          d0,  r4
  25.260 +        vdup.8          d1,  ip
  25.261 +        vtrn.32         d0,  d1                 @ d0 = {(8-x)*(8-y) x4, (8*x + 8*y) x4}
  25.262 +
  25.263 +        beq             4f                      @ y == 0: filter along the row only
  25.264 +
  25.265 +        vext.32         d1,  d0,  d1,  #1       @ d1 = the two halves of d0 in swapped order
  25.266 +        add             r5,  r1,  r2            @ r5 = src + stride
  25.267 +        lsl             r4,  r2,  #1            @ r4 = 2*stride
  25.268 +        vld1.32         {d4[0]},  [r1], r4      @ d4 = {row from r1, row from r5}
  25.269 +        vld1.32         {d4[1]},  [r5], r4
  25.270 +
  25.271 +3:      pld             [r5]                    @ x == 0 loop: blend each row with the next one
  25.272 +        vmull.u8        q8,  d4,  d0
  25.273 +        vld1.32         {d4[0]},  [r1], r4
  25.274 +        vmull.u8        q9,  d4,  d1
  25.275 +        vld1.32         {d4[1]},  [r5], r4
  25.276 +        vadd.i16        d16, d16, d17           @ add the two weighted rows
  25.277 +        vadd.i16        d17, d18, d19
  25.278 +        vrshrn.u16      d16, q8,  #6
  25.279 +.ifc \type,avg
  25.280 +        vld1.32         {d20[0]}, [lr,:32], r2
  25.281 +        vld1.32         {d20[1]}, [lr,:32], r2
  25.282 +        vrhadd.u8       d16, d16, d20           @ rounding average with the existing dst rows
  25.283 +.endif
  25.284 +        subs            r3,  r3,  #2
  25.285 +        pld             [r1]
  25.286 +        vst1.32         {d16[0]}, [r0,:32], r2
  25.287 +        vst1.32         {d16[1]}, [r0,:32], r2
  25.288 +        bgt             3b
  25.289 +
  25.290 +        pop             {r4-r7, pc}
  25.291 +
  25.292 +4:      vld1.64         {d4},     [r1], r2      @ y == 0 path: rows are read one at a time
  25.293 +        vld1.64         {d6},     [r1], r2
  25.294 +        vext.8          d5,  d4,  d5,  #1
  25.295 +        vext.8          d7,  d6,  d7,  #1
  25.296 +        vtrn.32         d4,  d5                 @ d4 = {row[0..3], row[1..4]}
  25.297 +        vtrn.32         d6,  d7                 @ d6 = {row[0..3], row[1..4]} of the next row
  25.298 +
  25.299 +5:      vmull.u8        q8,  d4,  d0            @ y == 0 loop: blend each byte with its right neighbour
  25.300 +        vmull.u8        q9,  d6,  d0
  25.301 +        subs            r3,  r3,  #2
  25.302 +        vld1.64         {d4},     [r1], r2
  25.303 +        vext.8          d5,  d4,  d5,  #1
  25.304 +        vtrn.32         d4,  d5
  25.305 +        vadd.i16        d16, d16, d17           @ add the two weighted taps, first row
  25.306 +        vadd.i16        d17, d18, d19           @ same for the second row
  25.307 +        pld             [r1]
  25.308 +        vrshrn.u16      d16, q8,  #6
  25.309 +.ifc \type,avg
  25.310 +        vld1.32         {d20[0]}, [lr,:32], r2
  25.311 +        vld1.32         {d20[1]}, [lr,:32], r2
  25.312 +        vrhadd.u8       d16, d16, d20           @ rounding average with the existing dst rows
  25.313 +.endif
  25.314 +        vld1.64         {d6},     [r1], r2
  25.315 +        vext.8          d7,  d6,  d7,  #1
  25.316 +        vtrn.32         d6,  d7
  25.317 +        pld             [r1]
  25.318 +        vst1.32         {d16[0]}, [r0,:32], r2
  25.319 +        vst1.32         {d16[1]}, [r0,:32], r2
  25.320 +        bgt             5b
  25.321 +
  25.322 +        pop             {r4-r7, pc}
  25.323 +endfunc
  25.324 +        .endm
  25.325 +        @ Generates ff_<type>_h264_chroma_mc2_neon: 2-pixel-wide bilinear chroma MC, two rows per loop pass. r0 = dst, r1 = src, r2 = stride used for both, r3 = row count; \type is put (store) or avg (rounding-average with dst).
  25.326 +        .macro  h264_chroma_mc2 type
  25.327 +function ff_\type\()_h264_chroma_mc2_neon, export=1
  25.328 +        push            {r4-r6, lr}             @ 16 bytes pushed, so the stack arguments start at [sp, #16]
  25.329 +        ldr             r4,  [sp, #16]          @ first stack argument, called x below (presumably the horizontal fraction - confirm against the C prototype)
  25.330 +        ldr             lr,  [sp, #20]          @ second stack argument, called y below (presumably the vertical fraction - confirm)
  25.331 +        pld             [r1]
  25.332 +        pld             [r1, r2]
  25.333 +        orrs            r5,  r4,  lr
  25.334 +        beq             2f                      @ x == 0 and y == 0: no interpolation, take the copy/average path
  25.335 +
  25.336 +        mul             r5,  r4,  lr            @ r5  = x*y
  25.337 +        rsb             r6,  r5,  lr,  lsl #3   @ r6  = 8*y - x*y = (8-x)*y
  25.338 +        rsb             r12, r5,  r4,  lsl #3   @ r12 = 8*x - x*y = x*(8-y)
  25.339 +        sub             r4,  r5,  r4,  lsl #3
  25.340 +        sub             r4,  r4,  lr,  lsl #3
  25.341 +        add             r4,  r4,  #64           @ r4  = x*y - 8*x - 8*y + 64 = (8-x)*(8-y)
  25.342 +        vdup.8          d0,  r4
  25.343 +        vdup.8          d2,  r12
  25.344 +        vdup.8          d1,  r6
  25.345 +        vdup.8          d3,  r5
  25.346 +        vtrn.16         q0,  q1                 @ interleave the four weights in 16-bit units, mirroring the vtrn.16 applied to the pixels in the loop
  25.347 +1:
  25.348 +        vld1.32         {d4[0]},  [r1], r2      @ 4 bytes of source row n
  25.349 +        vld1.32         {d4[1]},  [r1], r2      @ 4 bytes of source row n+1
  25.350 +        vrev64.32       d5,  d4                 @ d5[0] = row n+1
  25.351 +        vld1.32         {d5[1]},  [r1]          @ d5[1] = row n+2; r1 is not advanced, so the next pass starts at this row
  25.352 +        vext.8          q3,  q2,  q2,  #1       @ same pixels shifted by one byte (right-hand neighbours)
  25.353 +        vtrn.16         q2,  q3
  25.354 +        vmull.u8        q8,  d4,  d0            @ 16-bit products of pixels and weights
  25.355 +        vmlal.u8        q8,  d5,  d1
  25.356 +.ifc \type,avg
  25.357 +        vld1.16         {d18[0]}, [r0,:16], r2  @ fetch the two existing 2-byte dst rows
  25.358 +        vld1.16         {d18[1]}, [r0,:16]
  25.359 +        sub             r0,  r0,  r2            @ rewind dst to the first row for the stores below
  25.360 +.endif
  25.361 +        vtrn.32         d16, d17
  25.362 +        vadd.i16        d16, d16, d17           @ add the partial sums
  25.363 +        vrshrn.u16      d16, q8,  #6            @ rounding shift right by 6 and narrow to bytes
  25.364 +.ifc \type,avg
  25.365 +        vrhadd.u8       d16, d16, d18           @ rounding average with the existing dst pixels
  25.366 +.endif
  25.367 +        vst1.16         {d16[0]}, [r0,:16], r2  @ store 2 bytes per row
  25.368 +        vst1.16         {d16[1]}, [r0,:16], r2
  25.369 +        subs            r3,  r3,  #2            @ two rows done
  25.370 +        bgt             1b
  25.371 +        pop             {r4-r6, pc}
  25.372 +2:                                              @ x == 0 and y == 0 path: two rows per pass
  25.373 +.ifc \type,put
  25.374 +        ldrh            r5,  [r1], r2           @ plain 2-byte copy of each row
  25.375 +        strh            r5,  [r0], r2
  25.376 +        ldrh            r6,  [r1], r2
  25.377 +        strh            r6,  [r0], r2
  25.378 +.else
  25.379 +        vld1.16         {d16[0]}, [r1], r2      @ two source rows
  25.380 +        vld1.16         {d16[1]}, [r1], r2
  25.381 +        vld1.16         {d18[0]}, [r0,:16], r2  @ two existing dst rows
  25.382 +        vld1.16         {d18[1]}, [r0,:16]
  25.383 +        sub             r0,  r0,  r2            @ rewind dst to the first row
  25.384 +        vrhadd.u8       d16, d16, d18           @ rounding average of src and dst
  25.385 +        vst1.16         {d16[0]}, [r0,:16], r2
  25.386 +        vst1.16         {d16[1]}, [r0,:16], r2
  25.387 +.endif
  25.388 +        subs            r3,  r3,  #2
  25.389 +        bgt             2b
  25.390 +        pop             {r4-r6, pc}
  25.391 +endfunc
  25.392 +.endm
  25.393 +        @ Expand the chroma MC macros once for each of the two types (put, avg); the mc8/mc4 macros are defined earlier in the file.
  25.394 +        .text
  25.395 +        .align
  25.396 +
  25.397 +        h264_chroma_mc8 put
  25.398 +        h264_chroma_mc8 avg
  25.399 +        h264_chroma_mc4 put
  25.400 +        h264_chroma_mc4 avg
  25.401 +        h264_chroma_mc2 put
  25.402 +        h264_chroma_mc2 avg
  25.403 +
  25.404 +        /* H.264 loop filter */
  25.405 +        @ Common prologue of the loop filter functions: returns early when nothing has to be filtered, otherwise leaves the four bytes read through the first stack argument in d24[0]. r2 = alpha, r3 = beta; clobbers ip and the flags.
  25.406 +        .macro h264_loop_filter_start
  25.407 +        ldr             ip,  [sp]               @ first stack argument: a pointer (presumably the tc0 table - confirm against the C prototype)
  25.408 +        tst             r2,  r2                 @ Z set if alpha == 0
  25.409 +        ldr             ip,  [ip]               @ load the four bytes it points to as one word
  25.410 +        tstne           r3,  r3                 @ ...otherwise Z set if beta == 0
  25.411 +        vmov.32         d24[0], ip              @ keep the four bytes for the filter macros
  25.412 +        and             ip,  ip,  ip, lsl #16   @ no flag update here, so the bxeq below still tests alpha/beta
  25.413 +        bxeq            lr                      @ return if alpha == 0 or beta == 0
  25.414 +        ands            ip,  ip,  ip, lsl #8    @ bit 31 = AND of the sign bits of all four bytes
  25.415 +        bxlt            lr                      @ return if all four bytes are negative
  25.416 +        .endm
  25.417 +        @ Saves d8-d15 in a 16-byte aligned stack area. Leaves the alignment adjustment in ip, which align_pop_regs needs to restore sp.
  25.418 +        .macro align_push_regs
  25.419 +        and             ip,  sp,  #15           @ misalignment of sp relative to 16 bytes
  25.420 +        add             ip,  ip,  #32           @ plus room for d12-d15
  25.421 +        sub             sp,  sp,  ip            @ sp is now 16-byte aligned
  25.422 +        vst1.64         {d12-d15}, [sp,:128]
  25.423 +        sub             sp,  sp,  #32           @ room for d8-d11
  25.424 +        vst1.64         {d8-d11},  [sp,:128]
  25.425 +        .endm
  25.426 +        @ Restores d8-d15 and sp after align_push_regs; ip must still hold the value align_push_regs computed.
  25.427 +        .macro align_pop_regs
  25.428 +        vld1.64         {d8-d11},  [sp,:128]!   @ writeback advances sp by 32
  25.429 +        vld1.64         {d12-d15}, [sp,:128], ip @ post-increment by ip puts sp back to its value before align_push_regs
  25.430 +        .endm
  25.431 +        @ Luma edge filter over 16 pixels. In: q10/q9/q8 = p2/p1/p0, q0/q1/q2 = q0/q1/q2, r2 = alpha, r3 = beta, d24[0] = four clip bytes from h264_loop_filter_start. Out: q4 = p1', q8 = p0', q0 = q0', q5 = q1'. Overwrites q2, q6, q7 and q10-q15.
  25.432 +        .macro h264_loop_filter_luma
  25.433 +        vdup.8          q11, r2         @ alpha
  25.434 +        vmovl.u8        q12, d24        @ start replicating each of the 4 clip bytes...
  25.435 +        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
  25.436 +        vmovl.u16       q12, d24
  25.437 +        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
  25.438 +        vsli.16         q12, q12, #8
  25.439 +        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
  25.440 +        vsli.32         q12, q12, #16   @ ...q12 now holds every clip byte in 4 adjacent byte lanes
  25.441 +        vclt.u8         q6,  q6,  q11   @ < alpha
  25.442 +        vdup.8          q11, r3         @ beta
  25.443 +        vclt.s8         q7,  q12, #0    @ lanes whose clip byte is negative
  25.444 +        vclt.u8         q14, q14, q11   @ < beta
  25.445 +        vclt.u8         q15, q15, q11   @ < beta
  25.446 +        vbic            q6,  q6,  q7    @ drop lanes with a negative clip byte from the filter mask
  25.447 +        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
  25.448 +        vand            q6,  q6,  q14
  25.449 +        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
  25.450 +        vclt.u8         q4,  q4,  q11   @ < beta
  25.451 +        vand            q6,  q6,  q15   @ q6 = complete per-pixel filter mask
  25.452 +        vclt.u8         q5,  q5,  q11   @ < beta
  25.453 +        vand            q4,  q4,  q6    @ q4 = mask of lanes where p1 gets modified
  25.454 +        vand            q5,  q5,  q6    @ q5 = mask of lanes where q1 gets modified
  25.455 +        vand            q12, q12, q6    @ clip value forced to 0 where the filter mask is clear
  25.456 +        vrhadd.u8       q14, q8,  q0    @ (p0 + q0 + 1) >> 1
  25.457 +        vsub.i8         q6,  q12, q4    @ subtracting an all-ones mask adds 1 to the clip value per set lane
  25.458 +        vqadd.u8        q7,  q9,  q12   @ p1 + clip (saturating)
  25.459 +        vhadd.u8        q10, q10, q14   @ (p2 + ((p0 + q0 + 1) >> 1)) >> 1
  25.460 +        vsub.i8         q6,  q6,  q5    @ q6 = clip + (p-side mask set) + (q-side mask set): the bound for delta
  25.461 +        vhadd.u8        q14, q2,  q14   @ (q2 + ((p0 + q0 + 1) >> 1)) >> 1
  25.462 +        vmin.u8         q7,  q7,  q10
  25.463 +        vqsub.u8        q11, q9,  q12   @ p1 - clip (saturating)
  25.464 +        vqadd.u8        q2,  q1,  q12   @ q1 + clip (saturating); the q2 input is no longer needed
  25.465 +        vmax.u8         q7,  q7,  q11   @ q7 = p-side average clamped to [p1 - clip, p1 + clip]
  25.466 +        vqsub.u8        q11, q1,  q12   @ q1 - clip (saturating)
  25.467 +        vmin.u8         q14, q2,  q14
  25.468 +        vmovl.u8        q2,  d0         @ widen q0 to 16 bits (low half)
  25.469 +        vmax.u8         q14, q14, q11   @ q14 = q-side average clamped to [q1 - clip, q1 + clip]
  25.470 +        vmovl.u8        q10, d1         @ widen q0 (high half)
  25.471 +        vsubw.u8        q2,  q2,  d16   @ q0 - p0
  25.472 +        vsubw.u8        q10, q10, d17
  25.473 +        vshl.i16        q2,  q2,  #2    @ (q0 - p0) << 2
  25.474 +        vshl.i16        q10, q10, #2
  25.475 +        vaddw.u8        q2,  q2,  d18   @ + p1
  25.476 +        vaddw.u8        q10, q10, d19
  25.477 +        vsubw.u8        q2,  q2,  d2    @ - q1
  25.478 +        vsubw.u8        q10, q10, d3
  25.479 +        vrshrn.i16      d4,  q2,  #3    @ delta = rounding shift right by 3, narrowed to bytes
  25.480 +        vrshrn.i16      d5,  q10, #3
  25.481 +        vbsl            q4,  q7,  q9    @ q4 = clamped value where its mask is set, original p1 elsewhere
  25.482 +        vbsl            q5,  q14, q1    @ q5 = likewise for q1
  25.483 +        vneg.s8         q7,  q6
  25.484 +        vmovl.u8        q14, d16        @ widen p0 (low half)
  25.485 +        vmin.s8         q2,  q2,  q6    @ clamp delta to at most +q6
  25.486 +        vmovl.u8        q6,  d17        @ widen p0 (high half)
  25.487 +        vmax.s8         q2,  q2,  q7    @ clamp delta to at least -q6
  25.488 +        vmovl.u8        q11, d0         @ widen q0
  25.489 +        vmovl.u8        q12, d1
  25.490 +        vaddw.s8        q14, q14, d4    @ p0 + delta
  25.491 +        vaddw.s8        q6,  q6,  d5
  25.492 +        vsubw.s8        q11, q11, d4    @ q0 - delta
  25.493 +        vsubw.s8        q12, q12, d5
  25.494 +        vqmovun.s16     d16, q14        @ saturate to unsigned bytes: q8 = p0'
  25.495 +        vqmovun.s16     d17, q6
  25.496 +        vqmovun.s16     d0,  q11        @ q0 = q0'
  25.497 +        vqmovun.s16     d1,  q12
  25.498 +        .endm
  25.499 +        @ Luma filter across a horizontal edge, 16 pixels wide. r0 = 16-byte aligned pointer to the first row below the edge, r1 = row stride, r2 = alpha, r3 = beta, [sp] = pointer read by h264_loop_filter_start.
  25.500 +function ff_h264_v_loop_filter_luma_neon, export=1
  25.501 +        h264_loop_filter_start
  25.502 +
  25.503 +        vld1.64         {d0, d1},  [r0,:128], r1        @ q0 row
  25.504 +        vld1.64         {d2, d3},  [r0,:128], r1        @ q1 row
  25.505 +        vld1.64         {d4, d5},  [r0,:128], r1        @ q2 row
  25.506 +        sub             r0,  r0,  r1, lsl #2
  25.507 +        sub             r0,  r0,  r1, lsl #1            @ back 6 rows in total: r0 is now 3 rows above the edge
  25.508 +        vld1.64         {d20,d21}, [r0,:128], r1        @ p2 row
  25.509 +        vld1.64         {d18,d19}, [r0,:128], r1        @ p1 row
  25.510 +        vld1.64         {d16,d17}, [r0,:128], r1        @ p0 row; r0 is back at the q0 row
  25.511 +
  25.512 +        align_push_regs
  25.513 +
  25.514 +        h264_loop_filter_luma
  25.515 +
  25.516 +        sub             r0,  r0,  r1, lsl #1            @ r0 -> p1 row
  25.517 +        vst1.64         {d8, d9},  [r0,:128], r1        @ p1'
  25.518 +        vst1.64         {d16,d17}, [r0,:128], r1        @ p0'
  25.519 +        vst1.64         {d0, d1},  [r0,:128], r1        @ q0'
  25.520 +        vst1.64         {d10,d11}, [r0,:128]            @ q1'
  25.521 +
  25.522 +        align_pop_regs
  25.523 +        bx              lr
  25.524 +endfunc
  25.525 +        @ Luma filter across a vertical edge, 16 rows high. r0 = pointer to the first pixel right of the edge in the top row, r1 = row stride, r2 = alpha, r3 = beta, [sp] = pointer read by h264_loop_filter_start.
  25.526 +function ff_h264_h_loop_filter_luma_neon, export=1
  25.527 +        h264_loop_filter_start
  25.528 +
  25.529 +        sub             r0,  r0,  #4            @ start 4 pixels left of the edge
  25.530 +        vld1.64         {d6},  [r0], r1         @ rows 0-7: 8 bytes each into the low halves of q3, q10, q9, q8, q0, q1, q2, q13
  25.531 +        vld1.64         {d20}, [r0], r1
  25.532 +        vld1.64         {d18}, [r0], r1
  25.533 +        vld1.64         {d16}, [r0], r1
  25.534 +        vld1.64         {d0},  [r0], r1
  25.535 +        vld1.64         {d2},  [r0], r1
  25.536 +        vld1.64         {d4},  [r0], r1
  25.537 +        vld1.64         {d26}, [r0], r1
  25.538 +        vld1.64         {d7},  [r0], r1         @ rows 8-15: into the high halves of the same registers
  25.539 +        vld1.64         {d21}, [r0], r1
  25.540 +        vld1.64         {d19}, [r0], r1
  25.541 +        vld1.64         {d17}, [r0], r1
  25.542 +        vld1.64         {d1},  [r0], r1
  25.543 +        vld1.64         {d3},  [r0], r1
  25.544 +        vld1.64         {d5},  [r0], r1
  25.545 +        vld1.64         {d27}, [r0], r1
  25.546 +
  25.547 +        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
  25.548 +
  25.549 +        align_push_regs
  25.550 +
  25.551 +        h264_loop_filter_luma
  25.552 +
  25.553 +        transpose_4x4   q4, q8, q0, q5
  25.554 +
  25.555 +        sub             r0,  r0,  r1, lsl #4    @ back to the top row (16 rows were read)
  25.556 +        add             r0,  r0,  #2            @ 2 pixels left of the edge: only p1, p0, q0, q1 are written back
  25.557 +        vst1.32         {d8[0]},  [r0], r1      @ 4 bytes per row, 16 rows
  25.558 +        vst1.32         {d16[0]}, [r0], r1
  25.559 +        vst1.32         {d0[0]},  [r0], r1
  25.560 +        vst1.32         {d10[0]}, [r0], r1
  25.561 +        vst1.32         {d8[1]},  [r0], r1
  25.562 +        vst1.32         {d16[1]}, [r0], r1
  25.563 +        vst1.32         {d0[1]},  [r0], r1
  25.564 +        vst1.32         {d10[1]}, [r0], r1
  25.565 +        vst1.32         {d9[0]},  [r0], r1
  25.566 +        vst1.32         {d17[0]}, [r0], r1
  25.567 +        vst1.32         {d1[0]},  [r0], r1
  25.568 +        vst1.32         {d11[0]}, [r0], r1
  25.569 +        vst1.32         {d9[1]},  [r0], r1
  25.570 +        vst1.32         {d17[1]}, [r0], r1
  25.571 +        vst1.32         {d1[1]},  [r0], r1
  25.572 +        vst1.32         {d11[1]}, [r0], r1
  25.573 +
  25.574 +        align_pop_regs
  25.575 +        bx              lr
  25.576 +endfunc
  25.577 +        @ Chroma edge filter over 8 pixels. In: d18 = p1, d16 = p0, d0 = q0, d2 = q1, r2 = alpha, r3 = beta, d24[0] = four clip bytes from h264_loop_filter_start. Out: d16 = p0', d0 = q0'. Overwrites q2, q11-q15.
  25.578 +        .macro h264_loop_filter_chroma
  25.579 +        vdup.8          d22, r2         @ alpha
  25.580 +        vmovl.u8        q12, d24        @ widen the clip bytes to 16 bits...
  25.581 +        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
  25.582 +        vmovl.u8        q2,  d0         @ widen q0
  25.583 +        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
  25.584 +        vsubw.u8        q2,  q2,  d16   @ q0 - p0
  25.585 +        vsli.16         d24, d24, #8    @ ...d24 now holds every clip byte in 2 adjacent byte lanes
  25.586 +        vshl.i16        q2,  q2,  #2    @ (q0 - p0) << 2
  25.587 +        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
  25.588 +        vaddw.u8        q2,  q2,  d18   @ + p1
  25.589 +        vclt.u8         d26, d26, d22   @ < alpha
  25.590 +        vsubw.u8        q2,  q2,  d2    @ - q1
  25.591 +        vdup.8          d22, r3         @ beta
  25.592 +        vclt.s8         d25, d24, #0    @ lanes whose clip byte is negative
  25.593 +        vrshrn.i16      d4,  q2,  #3    @ delta = rounding shift right by 3, narrowed to bytes
  25.594 +        vclt.u8         d28, d28, d22   @ < beta
  25.595 +        vbic            d26, d26, d25   @ drop lanes with a negative clip byte from the filter mask
  25.596 +        vclt.u8         d30, d30, d22   @ < beta
  25.597 +        vand            d26, d26, d28
  25.598 +        vneg.s8         d25, d24        @ lower bound for delta
  25.599 +        vand            d26, d26, d30   @ d26 = complete per-pixel filter mask
  25.600 +        vmin.s8         d4,  d4,  d24   @ clamp delta to at most +clip
  25.601 +        vmovl.u8        q14, d16        @ widen p0
  25.602 +        vand            d4,  d4,  d26   @ delta = 0 where the mask is clear
  25.603 +        vmax.s8         d4,  d4,  d25   @ clamp delta to at least -clip
  25.604 +        vmovl.u8        q11, d0         @ widen q0
  25.605 +        vaddw.s8        q14, q14, d4    @ p0 + delta
  25.606 +        vsubw.s8        q11, q11, d4    @ q0 - delta
  25.607 +        vqmovun.s16     d16, q14        @ saturate to unsigned bytes: p0'
  25.608 +        vqmovun.s16     d0,  q11        @ q0'
  25.609 +        .endm
  25.610 +        @ Chroma filter across a horizontal edge, 8 pixels wide. r0 = 8-byte aligned pointer to the first row below the edge, r1 = row stride, r2 = alpha, r3 = beta, [sp] = pointer read by h264_loop_filter_start.
  25.611 +function ff_h264_v_loop_filter_chroma_neon, export=1
  25.612 +        h264_loop_filter_start
  25.613 +
  25.614 +        sub             r0,  r0,  r1, lsl #1    @ two rows above the edge
  25.615 +        vld1.64         {d18}, [r0,:64], r1     @ p1 row
  25.616 +        vld1.64         {d16}, [r0,:64], r1     @ p0 row
  25.617 +        vld1.64         {d0},  [r0,:64], r1     @ q0 row
  25.618 +        vld1.64         {d2},  [r0,:64]         @ q1 row (r0 stays here)
  25.619 +
  25.620 +        h264_loop_filter_chroma
  25.621 +
  25.622 +        sub             r0,  r0,  r1, lsl #1    @ from the q1 row back to the p0 row
  25.623 +        vst1.64         {d16}, [r0,:64], r1     @ p0'
  25.624 +        vst1.64         {d0},  [r0,:64], r1     @ q0'
  25.625 +
  25.626 +        bx              lr
  25.627 +endfunc
  25.628 +        @ Chroma filter across a vertical edge, 8 rows high. r0 = pointer to the first pixel right of the edge in the top row, r1 = row stride, r2 = alpha, r3 = beta, [sp] = pointer read by h264_loop_filter_start.
  25.629 +function ff_h264_h_loop_filter_chroma_neon, export=1
  25.630 +        h264_loop_filter_start
  25.631 +
  25.632 +        sub             r0,  r0,  #2            @ start 2 pixels left of the edge
  25.633 +        vld1.32         {d18[0]}, [r0], r1      @ 4 bytes (p1 p0 q0 q1) from each of 8 rows
  25.634 +        vld1.32         {d16[0]}, [r0], r1
  25.635 +        vld1.32         {d0[0]},  [r0], r1
  25.636 +        vld1.32         {d2[0]},  [r0], r1
  25.637 +        vld1.32         {d18[1]}, [r0], r1
  25.638 +        vld1.32         {d16[1]}, [r0], r1
  25.639 +        vld1.32         {d0[1]},  [r0], r1
  25.640 +        vld1.32         {d2[1]},  [r0], r1
  25.641 +
  25.642 +        vtrn.16         d18, d0                 @ transpose so that d18/d16/d0/d2 each hold one pixel column (p1/p0/q0/q1)
  25.643 +        vtrn.16         d16, d2
  25.644 +        vtrn.8          d18, d16
  25.645 +        vtrn.8          d0,  d2
  25.646 +
  25.647 +        h264_loop_filter_chroma
  25.648 +
  25.649 +        vtrn.16         d18, d0                 @ same transpose again restores the row layout used by the loads
  25.650 +        vtrn.16         d16, d2
  25.651 +        vtrn.8          d18, d16
  25.652 +        vtrn.8          d0,  d2
  25.653 +
  25.654 +        sub             r0,  r0,  r1, lsl #3    @ back to the top row (8 rows were read)
  25.655 +        vst1.32         {d18[0]}, [r0], r1      @ write all 4 bytes of each row back in load order
  25.656 +        vst1.32         {d16[0]}, [r0], r1
  25.657 +        vst1.32         {d0[0]},  [r0], r1
  25.658 +        vst1.32         {d2[0]},  [r0], r1
  25.659 +        vst1.32         {d18[1]}, [r0], r1
  25.660 +        vst1.32         {d16[1]}, [r0], r1
  25.661 +        vst1.32         {d0[1]},  [r0], r1
  25.662 +        vst1.32         {d2[1]},  [r0], r1
  25.663 +
  25.664 +        bx              lr
  25.665 +endfunc
  25.666 +
  25.667 +        /* H.264 qpel MC */
  25.668 +        @ Loads the filter multipliers read by lowpass_8 / lowpass_8_1 into d6: 16-bit lane d6[0] = 5, d6[1] = 20. \r is a scratch core register.
  25.669 +        .macro  lowpass_const r
  25.670 +        movw            \r,  #5                 @ low halfword  = 5
  25.671 +        movt            \r,  #20                @ high halfword = 20
  25.672 +        vmov.32         d6[0], \r
  25.673 +        .endm
  25.674 +        @ 6-tap filter x[0] - 5*x[1] + 20*x[2] + 20*x[3] - 5*x[4] + x[5] on two 16-byte rows (\r0:\r1 and \r2:\r3), 8 outputs each. The multipliers come from d6[0]/d6[1] (values as set by lowpass_const). narrow=1: \d0/\d1 are d registers receiving the result after a saturating rounding shift right by 5; narrow=0: \d0/\d1 are q registers receiving the raw 16-bit sums. Overwrites q1, q2, q9, q10, d30, d31 (and q0, q8 when narrow=1).
  25.675 +        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
  25.676 +.if \narrow
  25.677 +        t0 .req q0
  25.678 +        t1 .req q8
  25.679 +.else
  25.680 +        t0 .req \d0
  25.681 +        t1 .req \d1
  25.682 +.endif
  25.683 +        vext.8          d2,  \r0, \r1, #2       @ x[2]
  25.684 +        vext.8          d3,  \r0, \r1, #3       @ x[3]
  25.685 +        vaddl.u8        q1,  d2,  d3            @ x[2] + x[3]
  25.686 +        vext.8          d4,  \r0, \r1, #1       @ x[1]
  25.687 +        vext.8          d5,  \r0, \r1, #4       @ x[4]
  25.688 +        vaddl.u8        q2,  d4,  d5            @ x[1] + x[4]
  25.689 +        vext.8          d30, \r0, \r1, #5       @ x[5]
  25.690 +        vaddl.u8        t0,  \r0, d30           @ x[0] + x[5]
  25.691 +        vext.8          d18, \r2, \r3, #2       @ second row, interleaved with the first for scheduling
  25.692 +        vmla.i16        t0,  q1,  d6[1]         @ + d6[1] * (x[2] + x[3])
  25.693 +        vext.8          d19, \r2, \r3, #3
  25.694 +        vaddl.u8        q9,  d18, d19
  25.695 +        vext.8          d20, \r2, \r3, #1
  25.696 +        vmls.i16        t0,  q2,  d6[0]         @ - d6[0] * (x[1] + x[4])
  25.697 +        vext.8          d21, \r2, \r3, #4
  25.698 +        vaddl.u8        q10, d20, d21
  25.699 +        vext.8          d31, \r2, \r3, #5
  25.700 +        vaddl.u8        t1,  \r2, d31
  25.701 +        vmla.i16        t1,  q9,  d6[1]
  25.702 +        vmls.i16        t1,  q10, d6[0]
  25.703 +.if \narrow
  25.704 +        vqrshrun.s16    \d0, t0,  #5            @ round, shift right by 5, saturate to unsigned bytes
  25.705 +        vqrshrun.s16    \d1, t1,  #5
  25.706 +.endif
  25.707 +        .unreq  t0
  25.708 +        .unreq  t1
  25.709 +        .endm
  25.710 +        @ Single-row form of lowpass_8: filters the 16-byte row \r0:\r1 into \d0 (d register after narrowing when narrow=1, q register with raw 16-bit sums when narrow=0). Reads the multipliers from d6[0]/d6[1]; overwrites q1, q2, d30 (and q0 when narrow=1).
  25.711 +        .macro  lowpass_8_1 r0, r1, d0, narrow=1
  25.712 +.if \narrow
  25.713 +        t0 .req q0
  25.714 +.else
  25.715 +        t0 .req \d0
  25.716 +.endif
  25.717 +        vext.8          d2,  \r0, \r1, #2       @ x[2]
  25.718 +        vext.8          d3,  \r0, \r1, #3       @ x[3]
  25.719 +        vaddl.u8        q1,  d2,  d3            @ x[2] + x[3]
  25.720 +        vext.8          d4,  \r0, \r1, #1       @ x[1]
  25.721 +        vext.8          d5,  \r0, \r1, #4       @ x[4]
  25.722 +        vaddl.u8        q2,  d4,  d5            @ x[1] + x[4]
  25.723 +        vext.8          d30, \r0, \r1, #5       @ x[5]
  25.724 +        vaddl.u8        t0,  \r0, d30           @ x[0] + x[5]
  25.725 +        vmla.i16        t0,  q1,  d6[1]         @ + d6[1] * (x[2] + x[3])
  25.726 +        vmls.i16        t0,  q2,  d6[0]         @ - d6[0] * (x[1] + x[4])
  25.727 +.if \narrow
  25.728 +        vqrshrun.s16    \d0, t0,  #5            @ round, shift right by 5, saturate to unsigned bytes
  25.729 +.endif
  25.730 +        .unreq  t0
  25.731 +        .endm
  25.732 +        @ Same 6-tap filter applied to signed 16-bit samples with 32-bit accumulation. \r0:\r1 = q registers holding 16 samples, \l0/\h0 and \l1/\h1 = d-register halves supplying x[0] and x[5] (the callers in this file pass the halves of \r0 and \r1), \d = d register receiving 8 unsigned bytes. \r1 is overwritten; q0-q3, q8-q10 and q15 are clobbered (d6 no longer holds the lowpass_const values afterwards).
  25.733 +        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
  25.734 +        vext.16         q1,  \r0, \r1, #2       @ x[2]
  25.735 +        vext.16         q0,  \r0, \r1, #3       @ x[3]
  25.736 +        vaddl.s16       q9,  d2,  d0            @ x[2] + x[3], low 4 samples
  25.737 +        vext.16         q2,  \r0, \r1, #1       @ x[1]
  25.738 +        vaddl.s16       q1,  d3,  d1            @ x[2] + x[3], high 4 samples
  25.739 +        vext.16         q3,  \r0, \r1, #4       @ x[4]
  25.740 +        vaddl.s16       q10, d4,  d6            @ x[1] + x[4], low
  25.741 +        vext.16         \r1, \r0, \r1, #5       @ x[5], written over \r1
  25.742 +        vaddl.s16       q2,  d5,  d7            @ x[1] + x[4], high
  25.743 +        vaddl.s16       q0,  \h0, \h1           @ high halves of the two outer taps
  25.744 +        vaddl.s16       q8,  \l0, \l1           @ low halves of the two outer taps
  25.745 +
  25.746 +        vshl.i32        q3,  q9,  #4            @ 16*a
  25.747 +        vshl.i32        q9,  q9,  #2            @ 4*a
  25.748 +        vshl.i32        q15, q10, #2            @ 4*b
  25.749 +        vadd.i32        q9,  q9,  q3            @ 20*(x[2] + x[3]), low
  25.750 +        vadd.i32        q10, q10, q15           @ 5*(x[1] + x[4]), low
  25.751 +
  25.752 +        vshl.i32        q3,  q1,  #4            @ same for the high 4 samples
  25.753 +        vshl.i32        q1,  q1,  #2
  25.754 +        vshl.i32        q15, q2,  #2
  25.755 +        vadd.i32        q1,  q1,  q3
  25.756 +        vadd.i32        q2,  q2,  q15
  25.757 +
  25.758 +        vadd.i32        q9,  q9,  q8            @ + outer taps
  25.759 +        vsub.i32        q9,  q9,  q10           @ - 5*(x[1] + x[4])
  25.760 +
  25.761 +        vadd.i32        q1,  q1,  q0
  25.762 +        vsub.i32        q1,  q1,  q2
  25.763 +
  25.764 +        vrshrn.s32      d18, q9,  #10           @ rounding shift right by 10, narrowed to 16 bits
  25.765 +        vrshrn.s32      d19, q1,  #10
  25.766 +
  25.767 +        vqmovun.s16     \d,  q9                 @ saturate to unsigned bytes
  25.768 +        .endm
  25.769 +        @ 16x16 horizontal lowpass built from two 8-wide passes with the output stride forced to 8. r0 = dst (not rewound between passes, so the right half's output follows the left half's), r1 = src, r2 = src stride. Clobbers r3, r4, ip. NOTE(review): d6 is not set here; presumably the caller has run lowpass_const - confirm.
  25.770 +function put_h264_qpel16_h_lowpass_neon_packed
  25.771 +        mov             r4,  lr                 @ keep the return address across the bl
  25.772 +        mov             ip,  #16                @ row count consumed by the qpel8 loop
  25.773 +        mov             r3,  #8                 @ dst stride for the qpel8 routine
  25.774 +        bl              put_h264_qpel8_h_lowpass_neon
  25.775 +        sub             r1,  r1,  r2, lsl #4    @ src back up 16 rows...
  25.776 +        add             r1,  r1,  #8            @ ...and 8 pixels to the right
  25.777 +        mov             ip,  #16
  25.778 +        mov             lr,  r4
  25.779 +        b               put_h264_qpel8_h_lowpass_neon   @ tail call: the second pass returns to our caller
  25.780 +endfunc
  25.781 +        @ Generates <type>_h264_qpel16_h_lowpass_neon and <type>_h264_qpel8_h_lowpass_neon (type = put or avg). r0 = dst, r3 = dst stride, r1 = src, r2 = src stride; the qpel8 routine takes its row count in ip. NOTE(review): d6 must already hold the lowpass_const values - set by the callers, confirm.
  25.782 +        .macro h264_qpel_h_lowpass type
  25.783 +function \type\()_h264_qpel16_h_lowpass_neon
  25.784 +        push            {lr}
  25.785 +        mov             ip,  #16                @ 16 rows for the left half
  25.786 +        bl              \type\()_h264_qpel8_h_lowpass_neon
  25.787 +        sub             r0,  r0,  r3, lsl #4    @ rewind dst and src by 16 rows
  25.788 +        sub             r1,  r1,  r2, lsl #4
  25.789 +        add             r0,  r0,  #8            @ move both to the right half
  25.790 +        add             r1,  r1,  #8
  25.791 +        mov             ip,  #16
  25.792 +        pop             {lr}                    @ no return here: execution appears to continue into the qpel8 routine emitted next (depends on function/endfunc, defined elsewhere - confirm)
  25.793 +endfunc
  25.794 +
  25.795 +function \type\()_h264_qpel8_h_lowpass_neon
  25.796 +1:      vld1.64         {d0, d1},  [r1], r2     @ two source rows of 16 bytes per pass
  25.797 +        vld1.64         {d16,d17}, [r1], r2
  25.798 +        subs            ip,  ip,  #2            @ flags are consumed by the bne at the end of the loop
  25.799 +        lowpass_8       d0,  d1,  d16, d17, d0,  d16
  25.800 +.ifc \type,avg
  25.801 +        vld1.8          {d2},     [r0,:64], r3  @ existing dst rows
  25.802 +        vrhadd.u8       d0,  d0,  d2            @ rounding average with the filtered rows
  25.803 +        vld1.8          {d3},     [r0,:64]
  25.804 +        vrhadd.u8       d16, d16, d3
  25.805 +        sub             r0,  r0,  r3            @ rewind dst to the first row of the pair
  25.806 +.endif
  25.807 +        vst1.64         {d0},     [r0,:64], r3
  25.808 +        vst1.64         {d16},    [r0,:64], r3
  25.809 +        bne             1b
  25.810 +        bx              lr
  25.811 +endfunc
  25.812 +        .endm
  25.813 +        @ Instantiate the put and avg variants.
  25.814 +        h264_qpel_h_lowpass put
  25.815 +        h264_qpel_h_lowpass avg
  25.816 +        @ Generates <type>_h264_qpel{16,8}_h_lowpass_l2_neon: horizontal lowpass whose result is rounding-averaged with a second source. r0 = dst, r1 = filter source, r3 = second source, r2 = stride used for all three; the qpel8 routine takes its row count in ip. NOTE(review): d6 must already hold the lowpass_const values - confirm.
  25.817 +        .macro h264_qpel_h_lowpass_l2 type
  25.818 +function \type\()_h264_qpel16_h_lowpass_l2_neon
  25.819 +        push            {lr}
  25.820 +        mov             ip,  #16                @ 16 rows for the left half
  25.821 +        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
  25.822 +        sub             r0,  r0,  r2, lsl #4    @ rewind all three pointers by 16 rows
  25.823 +        sub             r1,  r1,  r2, lsl #4
  25.824 +        sub             r3,  r3,  r2, lsl #4
  25.825 +        add             r0,  r0,  #8            @ move them to the right half
  25.826 +        add             r1,  r1,  #8
  25.827 +        add             r3,  r3,  #8
  25.828 +        mov             ip,  #16
  25.829 +        pop             {lr}                    @ no return here: execution appears to continue into the qpel8 routine emitted next (depends on function/endfunc - confirm)
  25.830 +endfunc
  25.831 +
  25.832 +function \type\()_h264_qpel8_h_lowpass_l2_neon
  25.833 +1:      vld1.64         {d0, d1},  [r1], r2     @ two rows of the filter source
  25.834 +        vld1.64         {d16,d17}, [r1], r2
  25.835 +        vld1.64         {d28},     [r3], r2     @ two rows of the second source
  25.836 +        vld1.64         {d29},     [r3], r2
  25.837 +        subs            ip,  ip,  #2            @ flags are consumed by the bne at the end of the loop
  25.838 +        lowpass_8       d0,  d1,  d16, d17, d0,  d1
  25.839 +        vrhadd.u8       q0,  q0,  q14           @ rounding average of the filtered rows with the second source
  25.840 +.ifc \type,avg
  25.841 +        vld1.8          {d2},      [r0,:64], r2 @ existing dst rows
  25.842 +        vrhadd.u8       d0,  d0,  d2
  25.843 +        vld1.8          {d3},      [r0,:64]
  25.844 +        vrhadd.u8       d1,  d1,  d3
  25.845 +        sub             r0,  r0,  r2            @ rewind dst to the first row of the pair
  25.846 +.endif
  25.847 +        vst1.64         {d0},      [r0,:64], r2
  25.848 +        vst1.64         {d1},      [r0,:64], r2
  25.849 +        bne             1b
  25.850 +        bx              lr
  25.851 +endfunc
  25.852 +        .endm
  25.853 +        @ Instantiate the put and avg variants.
  25.854 +        h264_qpel_h_lowpass_l2 put
  25.855 +        h264_qpel_h_lowpass_l2 avg
  25.856 +        @ 16x16 vertical lowpass as four 8x8 passes with the output stride forced to 8; r0 is never rewound, so the four 8x8 results are written back to back. r0 = dst, r1 = src, r3 = src stride. Clobbers r2 and r4.
  25.857 +function put_h264_qpel16_v_lowpass_neon_packed
  25.858 +        mov             r4,  lr                 @ keep the return address across the bl calls
  25.859 +        mov             r2,  #8                 @ dst stride for the qpel8 routine
  25.860 +        bl              put_h264_qpel8_v_lowpass_neon
  25.861 +        sub             r1,  r1,  r3, lsl #2    @ the 8x8 pass advanced src by 12 rows; step back 4 for a net 8
  25.862 +        bl              put_h264_qpel8_v_lowpass_neon
  25.863 +        sub             r1,  r1,  r3, lsl #4
  25.864 +        sub             r1,  r1,  r3, lsl #2    @ back 20 rows in total: src is at its starting row again
  25.865 +        add             r1,  r1,  #8            @ right half
  25.866 +        bl              put_h264_qpel8_v_lowpass_neon
  25.867 +        sub             r1,  r1,  r3, lsl #2
  25.868 +        mov             lr,  r4
  25.869 +        b               put_h264_qpel8_v_lowpass_neon   @ tail call: the last pass returns to our caller
  25.870 +endfunc
  25.871 +        @ Generates <type>_h264_qpel{16,8}_v_lowpass_neon (type = put or avg): vertical lowpass done by transposing, running the horizontal filter, and transposing back. r0 = dst, r2 = dst stride, r1 = src, r3 = src stride. The qpel16 entry clobbers r4; the qpel8 routine overwrites d8-d15 without saving them. NOTE(review): d6 must already hold the lowpass_const values - confirm.
  25.872 +        .macro h264_qpel_v_lowpass type
  25.873 +function \type\()_h264_qpel16_v_lowpass_neon
  25.874 +        mov             r4,  lr                 @ keep the return address across the bl calls
  25.875 +        bl              \type\()_h264_qpel8_v_lowpass_neon
  25.876 +        sub             r1,  r1,  r3, lsl #2    @ the 8x8 pass advanced src by 12 rows; step back 4 for a net 8
  25.877 +        bl              \type\()_h264_qpel8_v_lowpass_neon
  25.878 +        sub             r0,  r0,  r2, lsl #4    @ dst back to the top...
  25.879 +        add             r0,  r0,  #8            @ ...of the right half
  25.880 +        sub             r1,  r1,  r3, lsl #4
  25.881 +        sub             r1,  r1,  r3, lsl #2    @ src back 20 rows to its starting row
  25.882 +        add             r1,  r1,  #8
  25.883 +        bl              \type\()_h264_qpel8_v_lowpass_neon
  25.884 +        sub             r1,  r1,  r3, lsl #2
  25.885 +        mov             lr,  r4                 @ no return here: execution appears to continue into the qpel8 routine emitted next (depends on function/endfunc - confirm)
  25.886 +endfunc
  25.887 +
  25.888 +function \type\()_h264_qpel8_v_lowpass_neon
  25.889 +        vld1.64         {d8},  [r1], r3         @ 13 source rows of 8 bytes: 8 outputs need rows 0..12
  25.890 +        vld1.64         {d10}, [r1], r3
  25.891 +        vld1.64         {d12}, [r1], r3
  25.892 +        vld1.64         {d14}, [r1], r3
  25.893 +        vld1.64         {d22}, [r1], r3
  25.894 +        vld1.64         {d24}, [r1], r3
  25.895 +        vld1.64         {d26}, [r1], r3
  25.896 +        vld1.64         {d28}, [r1], r3
  25.897 +        vld1.64         {d9},  [r1], r3
  25.898 +        vld1.64         {d11}, [r1], r3
  25.899 +        vld1.64         {d13}, [r1], r3
  25.900 +        vld1.64         {d15}, [r1], r3
  25.901 +        vld1.64         {d23}, [r1]             @ last row: r1 is left 12 rows past its entry value
  25.902 +
  25.903 +        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
  25.904 +        lowpass_8       d8,  d9,  d10, d11, d8,  d10
  25.905 +        lowpass_8       d12, d13, d14, d15, d12, d14
  25.906 +        lowpass_8       d22, d23, d24, d25, d22, d24
  25.907 +        lowpass_8       d26, d27, d28, d29, d26, d28
  25.908 +        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
  25.909 +
  25.910 +.ifc \type,avg
  25.911 +        vld1.8          {d9},  [r0,:64], r2     @ rounding-average each result row with the existing dst row
  25.912 +        vrhadd.u8       d8,  d8,  d9
  25.913 +        vld1.8          {d11}, [r0,:64], r2
  25.914 +        vrhadd.u8       d10, d10, d11
  25.915 +        vld1.8          {d13}, [r0,:64], r2
  25.916 +        vrhadd.u8       d12, d12, d13
  25.917 +        vld1.8          {d15}, [r0,:64], r2
  25.918 +        vrhadd.u8       d14, d14, d15
  25.919 +        vld1.8          {d23}, [r0,:64], r2
  25.920 +        vrhadd.u8       d22, d22, d23
  25.921 +        vld1.8          {d25}, [r0,:64], r2
  25.922 +        vrhadd.u8       d24, d24, d25
  25.923 +        vld1.8          {d27}, [r0,:64], r2
  25.924 +        vrhadd.u8       d26, d26, d27
  25.925 +        vld1.8          {d29}, [r0,:64], r2
  25.926 +        vrhadd.u8       d28, d28, d29
  25.927 +        sub             r0,  r0,  r2,  lsl #3   @ rewind dst by the 8 rows just read
  25.928 +.endif
  25.929 +
  25.930 +        vst1.64         {d8},  [r0,:64], r2     @ store 8 rows; r0 ends 8 rows below its entry value
  25.931 +        vst1.64         {d10}, [r0,:64], r2
  25.932 +        vst1.64         {d12}, [r0,:64], r2
  25.933 +        vst1.64         {d14}, [r0,:64], r2
  25.934 +        vst1.64         {d22}, [r0,:64], r2
  25.935 +        vst1.64         {d24}, [r0,:64], r2
  25.936 +        vst1.64         {d26}, [r0,:64], r2
  25.937 +        vst1.64         {d28}, [r0,:64], r2
  25.938 +
  25.939 +        bx              lr
  25.940 +endfunc
  25.941 +        .endm
  25.942 +        @ Instantiate the put and avg variants.
  25.943 +        h264_qpel_v_lowpass put
  25.944 +        h264_qpel_v_lowpass avg
  25.945 +        @ Generates <type>_h264_qpel{16,8}_v_lowpass_l2_neon: vertical lowpass whose result is rounding-averaged with a second source. r0 = dst and r1 = filter source (both stepped by r3), ip = second source (stepped by r2). The qpel16 entry clobbers r4; the qpel8 routine overwrites d8-d15 without saving them. NOTE(review): d6 must already hold the lowpass_const values - confirm.
  25.946 +        .macro h264_qpel_v_lowpass_l2 type
  25.947 +function \type\()_h264_qpel16_v_lowpass_l2_neon
  25.948 +        mov             r4,  lr                 @ keep the return address across the bl calls
  25.949 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
  25.950 +        sub             r1,  r1,  r3, lsl #2    @ the 8x8 pass advanced src by 12 rows; step back 4 for a net 8
  25.951 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
  25.952 +        sub             r0,  r0,  r3, lsl #4    @ dst and second source back 16 rows...
  25.953 +        sub             ip,  ip,  r2, lsl #4
  25.954 +        add             r0,  r0,  #8            @ ...and over to the right half
  25.955 +        add             ip,  ip,  #8
  25.956 +        sub             r1,  r1,  r3, lsl #4
  25.957 +        sub             r1,  r1,  r3, lsl #2    @ src back 20 rows to its starting row
  25.958 +        add             r1,  r1,  #8
  25.959 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
  25.960 +        sub             r1,  r1,  r3, lsl #2
  25.961 +        mov             lr,  r4                 @ no return here: execution appears to continue into the qpel8 routine emitted next (depends on function/endfunc - confirm)
  25.962 +endfunc
  25.963 +
  25.964 +function \type\()_h264_qpel8_v_lowpass_l2_neon
  25.965 +        vld1.64         {d8},  [r1], r3         @ 13 source rows of 8 bytes: 8 outputs need rows 0..12
  25.966 +        vld1.64         {d10}, [r1], r3
  25.967 +        vld1.64         {d12}, [r1], r3
  25.968 +        vld1.64         {d14}, [r1], r3
  25.969 +        vld1.64         {d22}, [r1], r3
  25.970 +        vld1.64         {d24}, [r1], r3
  25.971 +        vld1.64         {d26}, [r1], r3
  25.972 +        vld1.64         {d28}, [r1], r3
  25.973 +        vld1.64         {d9},  [r1], r3
  25.974 +        vld1.64         {d11}, [r1], r3
  25.975 +        vld1.64         {d13}, [r1], r3
  25.976 +        vld1.64         {d15}, [r1], r3
  25.977 +        vld1.64         {d23}, [r1]             @ last row: r1 is left 12 rows past its entry value
  25.978 +
  25.979 +        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
  25.980 +        lowpass_8       d8,  d9,  d10, d11, d8,  d9
  25.981 +        lowpass_8       d12, d13, d14, d15, d12, d13
  25.982 +        lowpass_8       d22, d23, d24, d25, d22, d23
  25.983 +        lowpass_8       d26, d27, d28, d29, d26, d27
  25.984 +        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
  25.985 +
  25.986 +        vld1.64         {d0},  [ip], r2         @ 8 rows of the second source into q0, q1, q2, q5
  25.987 +        vld1.64         {d1},  [ip], r2
  25.988 +        vld1.64         {d2},  [ip], r2
  25.989 +        vld1.64         {d3},  [ip], r2
  25.990 +        vld1.64         {d4},  [ip], r2
  25.991 +        vrhadd.u8       q0,  q0,  q4            @ rows 0-1: rounding average with the filtered rows (d8, d9)
  25.992 +        vld1.64         {d5},  [ip], r2
  25.993 +        vrhadd.u8       q1,  q1,  q6            @ rows 2-3 (d12, d13)
  25.994 +        vld1.64         {d10}, [ip], r2
  25.995 +        vrhadd.u8       q2,  q2,  q11           @ rows 4-5 (d22, d23)
  25.996 +        vld1.64         {d11}, [ip], r2
  25.997 +        vrhadd.u8       q5,  q5,  q13           @ rows 6-7 (d26, d27)
  25.998 +
  25.999 +.ifc \type,avg
 25.1000 +        vld1.8          {d16}, [r0,:64], r3     @ rounding-average each result row with the existing dst row
 25.1001 +        vrhadd.u8       d0,  d0,  d16
 25.1002 +        vld1.8          {d17}, [r0,:64], r3
 25.1003 +        vrhadd.u8       d1,  d1,  d17
 25.1004 +        vld1.8          {d16}, [r0,:64], r3
 25.1005 +        vrhadd.u8       d2,  d2,  d16
 25.1006 +        vld1.8          {d17}, [r0,:64], r3
 25.1007 +        vrhadd.u8       d3,  d3,  d17
 25.1008 +        vld1.8          {d16}, [r0,:64], r3
 25.1009 +        vrhadd.u8       d4,  d4,  d16
 25.1010 +        vld1.8          {d17}, [r0,:64], r3
 25.1011 +        vrhadd.u8       d5,  d5,  d17
 25.1012 +        vld1.8          {d16}, [r0,:64], r3
 25.1013 +        vrhadd.u8       d10, d10, d16
 25.1014 +        vld1.8          {d17}, [r0,:64], r3
 25.1015 +        vrhadd.u8       d11, d11, d17
 25.1016 +        sub             r0,  r0,  r3,  lsl #3   @ rewind dst by the 8 rows just read
 25.1017 +.endif
 25.1018 +
 25.1019 +        vst1.64         {d0},  [r0,:64], r3     @ store 8 rows; r0 ends 8 rows below its entry value
 25.1020 +        vst1.64         {d1},  [r0,:64], r3
 25.1021 +        vst1.64         {d2},  [r0,:64], r3
 25.1022 +        vst1.64         {d3},  [r0,:64], r3
 25.1023 +        vst1.64         {d4},  [r0,:64], r3
 25.1024 +        vst1.64         {d5},  [r0,:64], r3
 25.1025 +        vst1.64         {d10}, [r0,:64], r3
 25.1026 +        vst1.64         {d11}, [r0,:64], r3
 25.1027 +
 25.1028 +        bx              lr
 25.1029 +endfunc
 25.1030 +        .endm
 25.1031 +        @ Instantiate the put and avg variants.
 25.1032 +        h264_qpel_v_lowpass_l2 put
 25.1033 +        h264_qpel_v_lowpass_l2 avg
 25.1034 +        @ Core of the 8x8 horizontal+vertical lowpass: filters 13 source rows horizontally at 16-bit precision, then filters vertically with lowpass_8.16. r1 = src, r3 = src stride, r4 = 16-byte aligned scratch buffer (at least 192 bytes are written). The 8 result rows are left in d8-d15 for the caller to store. Clobbers ip and d0-d31.
 25.1035 +function put_h264_qpel8_hv_lowpass_neon_top
 25.1036 +        lowpass_const   ip                      @ d6 = {5, 20}; ip is only a scratch register here
 25.1037 +        mov             ip,  #12                @ 12 rows are handled in the loop, two per pass
 25.1038 +1:      vld1.64         {d0, d1},  [r1], r3
 25.1039 +        vld1.64         {d16,d17}, [r1], r3
 25.1040 +        subs            ip,  ip,  #2            @ flags are consumed by the bne below
 25.1041 +        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
 25.1042 +        vst1.64         {d22-d25}, [r4,:128]!   @ append both unnarrowed 16-bit rows (32 bytes) to the scratch buffer
 25.1043 +        bne             1b
 25.1044 +
 25.1045 +        vld1.64         {d0, d1},  [r1]         @ 13th row; r1 is not advanced
 25.1046 +        lowpass_8_1     d0,  d1,  q12, narrow=0
 25.1047 +
 25.1048 +        mov             ip,  #-16               @ walk the scratch buffer backwards, one row at a time
 25.1049 +        add             r4,  r4,  ip            @ r4 -> last row stored
 25.1050 +        vld1.64         {d30,d31}, [r4,:128], ip        @ reload the 12 stored rows, last to first; the 13th row is still in q12
 25.1051 +        vld1.64         {d20,d21}, [r4,:128], ip
 25.1052 +        vld1.64         {d18,d19}, [r4,:128], ip
 25.1053 +        vld1.64         {d16,d17}, [r4,:128], ip
 25.1054 +        vld1.64         {d14,d15}, [r4,:128], ip
 25.1055 +        vld1.64         {d12,d13}, [r4,:128], ip
 25.1056 +        vld1.64         {d10,d11}, [r4,:128], ip
 25.1057 +        vld1.64         {d8, d9},  [r4,:128], ip
 25.1058 +        vld1.64         {d6, d7},  [r4,:128], ip
 25.1059 +        vld1.64         {d4, d5},  [r4,:128], ip
 25.1060 +        vld1.64         {d2, d3},  [r4,:128], ip
 25.1061 +        vld1.64         {d0, d1},  [r4,:128]            @ first row; r4 is back at the start of the scratch buffer
 25.1062 +        @ NOTE(review): swap4 and transpose16_4x4 are defined outside this chunk; presumably together they transpose the 16-bit rows into columns - confirm.
 25.1063 +        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
 25.1064 +        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
 25.1065 +
 25.1066 +        swap4           d17, d19, d21, d31, d24, d26, d28, d22
 25.1067 +        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
 25.1068 +
 25.1069 +        vst1.64         {d30,d31}, [r4,:128]!   @ spill eight registers; they are reloaded in reverse order, two at a time, further down
 25.1070 +        vst1.64         {d6, d7},  [r4,:128]!
 25.1071 +        vst1.64         {d20,d21}, [r4,:128]!
 25.1072 +        vst1.64         {d4, d5},  [r4,:128]!
 25.1073 +        vst1.64         {d18,d19}, [r4,:128]!
 25.1074 +        vst1.64         {d2, d3},  [r4,:128]!
 25.1075 +        vst1.64         {d16,d17}, [r4,:128]!
 25.1076 +        vst1.64         {d0, d1},  [r4,:128]    @ no writeback: r4 stays on the last spilled register
 25.1077 +
 25.1078 +        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8        @ results go to d8-d11 here and d12-d15 below
 25.1079 +        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
 25.1080 +        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
 25.1081 +        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
 25.1082 +
 25.1083 +        vld1.64         {d16,d17}, [r4,:128], ip        @ spilled pair: old q0 and old q8 (ip is still -16)
 25.1084 +        vld1.64         {d30,d31}, [r4,:128], ip
 25.1085 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
 25.1086 +        vld1.64         {d16,d17}, [r4,:128], ip        @ old q1 and old q9
 25.1087 +        vld1.64         {d30,d31}, [r4,:128], ip
 25.1088 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
 25.1089 +        vld1.64         {d16,d17}, [r4,:128], ip        @ old q2 and old q10
 25.1090 +        vld1.64         {d30,d31}, [r4,:128], ip
 25.1091 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
 25.1092 +        vld1.64         {d16,d17}, [r4,:128], ip        @ old q3 and old q15
 25.1093 +        vld1.64         {d30,d31}, [r4,:128]            @ r4 ends at the start of the scratch buffer
 25.1094 +        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
 25.1095 +
 25.1096 +        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
 25.1097 +
 25.1098 +        bx              lr
 25.1099 +endfunc
 25.1100 +@ Emits <type>_h264_qpel8_hv_lowpass_neon (type = put or avg): runs the shared hv stage and writes the eight result rows to r0 with stride r2. The avg variant first averages them with the rows already at r0. r10 is clobbered (it holds the return address).
 25.1101 +        .macro h264_qpel8_hv_lowpass type
 25.1102 +function \type\()_h264_qpel8_hv_lowpass_neon
 25.1103 +        mov             r10, lr                 @ keep the return address, bl overwrites lr
 25.1104 +        bl              put_h264_qpel8_hv_lowpass_neon_top
 25.1105 +.ifc \type,avg
 25.1106 +        vld1.8          {d0},      [r0,:64], r2 @ avg only: fetch the 8 bytes already at r0, r0 += r2
 25.1107 +        vrhadd.u8       d12, d12, d0            @ rounded average with the new row
 25.1108 +        vld1.8          {d1},      [r0,:64], r2
 25.1109 +        vrhadd.u8       d13, d13, d1
 25.1110 +        vld1.8          {d2},      [r0,:64], r2
 25.1111 +        vrhadd.u8       d14, d14, d2
 25.1112 +        vld1.8          {d3},      [r0,:64], r2
 25.1113 +        vrhadd.u8       d15, d15, d3
 25.1114 +        vld1.8          {d4},      [r0,:64], r2
 25.1115 +        vrhadd.u8       d8,  d8,  d4
 25.1116 +        vld1.8          {d5},      [r0,:64], r2
 25.1117 +        vrhadd.u8       d9,  d9,  d5
 25.1118 +        vld1.8          {d6},      [r0,:64], r2
 25.1119 +        vrhadd.u8       d10, d10, d6
 25.1120 +        vld1.8          {d7},      [r0,:64], r2
 25.1121 +        vrhadd.u8       d11, d11, d7
 25.1122 +        sub             r0,  r0,  r2,  lsl #3   @ rewind r0 by the 8 rows just read
 25.1123 +.endif
 25.1124 +        vst1.64         {d12},     [r0,:64], r2 @ write the 8 rows, r0 advances by r2 after each
 25.1125 +        vst1.64         {d13},     [r0,:64], r2
 25.1126 +        vst1.64         {d14},     [r0,:64], r2
 25.1127 +        vst1.64         {d15},     [r0,:64], r2
 25.1128 +        vst1.64         {d8},      [r0,:64], r2
 25.1129 +        vst1.64         {d9},      [r0,:64], r2
 25.1130 +        vst1.64         {d10},     [r0,:64], r2
 25.1131 +        vst1.64         {d11},     [r0,:64], r2 @ r0 ends 8 rows below its entry value
 25.1132 +
 25.1133 +        mov             lr,  r10                @ restore the return address saved on entry
 25.1134 +        bx              lr
 25.1135 +endfunc
 25.1136 +        .endm
 25.1137 +
 25.1138 +        h264_qpel8_hv_lowpass put
 25.1139 +        h264_qpel8_hv_lowpass avg
 25.1140 +@ Emits <type>_h264_qpel8_hv_lowpass_l2_neon: runs the shared hv stage, averages the result with 64 bytes read from r2 (r2 advances by 64) and writes eight rows to r0 with stride r3. The avg variant also averages with the rows already at r0. r10 is clobbered.
 25.1141 +        .macro h264_qpel8_hv_lowpass_l2 type
 25.1142 +function \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1143 +        mov             r10, lr                 @ keep the return address, bl overwrites lr
 25.1144 +        bl              put_h264_qpel8_hv_lowpass_neon_top
 25.1145 +        @ q6, q7, q4, q5 alias d12-d15, d8-d11, i.e. the eight rows left by the call above
 25.1146 +        vld1.64         {d0, d1},  [r2,:128]!   @ 16 bytes of the second input, r2 += 16
 25.1147 +        vld1.64         {d2, d3},  [r2,:128]!
 25.1148 +        vrhadd.u8       q0,  q0,  q6            @ rounded average with rows 0-1
 25.1149 +        vld1.64         {d4, d5},  [r2,:128]!
 25.1150 +        vrhadd.u8       q1,  q1,  q7            @ rows 2-3
 25.1151 +        vld1.64         {d6, d7},  [r2,:128]!
 25.1152 +        vrhadd.u8       q2,  q2,  q4            @ rows 4-5
 25.1153 +        vrhadd.u8       q3,  q3,  q5            @ rows 6-7
 25.1154 +.ifc \type,avg
 25.1155 +        vld1.8          {d16},     [r0,:64], r3 @ avg only: fetch the 8 bytes already at r0, r0 += r3
 25.1156 +        vrhadd.u8       d0,  d0,  d16
 25.1157 +        vld1.8          {d17},     [r0,:64], r3
 25.1158 +        vrhadd.u8       d1,  d1,  d17
 25.1159 +        vld1.8          {d18},     [r0,:64], r3
 25.1160 +        vrhadd.u8       d2,  d2,  d18
 25.1161 +        vld1.8          {d19},     [r0,:64], r3
 25.1162 +        vrhadd.u8       d3,  d3,  d19
 25.1163 +        vld1.8          {d20},     [r0,:64], r3
 25.1164 +        vrhadd.u8       d4,  d4,  d20
 25.1165 +        vld1.8          {d21},     [r0,:64], r3
 25.1166 +        vrhadd.u8       d5,  d5,  d21
 25.1167 +        vld1.8          {d22},     [r0,:64], r3
 25.1168 +        vrhadd.u8       d6,  d6,  d22
 25.1169 +        vld1.8          {d23},     [r0,:64], r3
 25.1170 +        vrhadd.u8       d7,  d7,  d23
 25.1171 +        sub             r0,  r0,  r3,  lsl #3   @ rewind r0 by the 8 rows just read
 25.1172 +.endif
 25.1173 +        vst1.64         {d0},      [r0,:64], r3 @ write the 8 rows, r0 advances by r3 after each
 25.1174 +        vst1.64         {d1},      [r0,:64], r3
 25.1175 +        vst1.64         {d2},      [r0,:64], r3
 25.1176 +        vst1.64         {d3},      [r0,:64], r3
 25.1177 +        vst1.64         {d4},      [r0,:64], r3
 25.1178 +        vst1.64         {d5},      [r0,:64], r3
 25.1179 +        vst1.64         {d6},      [r0,:64], r3
 25.1180 +        vst1.64         {d7},      [r0,:64], r3 @ r0 ends 8 rows below its entry value
 25.1181 +
 25.1182 +        mov             lr,  r10                @ restore the return address saved on entry
 25.1183 +        bx              lr
 25.1184 +endfunc
 25.1185 +        .endm
 25.1186 +
 25.1187 +        h264_qpel8_hv_lowpass_l2 put
 25.1188 +        h264_qpel8_hv_lowpass_l2 avg
 25.1189 +@ 16x16 hv lowpass built from four calls to the 8x8 routine, in the order top-left, bottom-left, top-right, bottom-right. r9 holds the return address because the 8x8 routines use r10 for theirs.
 25.1190 +        .macro h264_qpel16_hv type
 25.1191 +function \type\()_h264_qpel16_hv_lowpass_neon
 25.1192 +        mov             r9,  lr
 25.1193 +        bl              \type\()_h264_qpel8_hv_lowpass_neon
 25.1194 +        sub             r1,  r1,  r3, lsl #2    @ the callee left r1 12 rows down, step back 4 to start 8 rows below the first block
 25.1195 +        bl              \type\()_h264_qpel8_hv_lowpass_neon
 25.1196 +        sub             r1,  r1,  r3, lsl #4    @ r1 is now 20 rows below the top: rewind 16 ...
 25.1197 +        sub             r1,  r1,  r3, lsl #2    @ ... plus 4 rows
 25.1198 +        add             r1,  r1,  #8            @ right half of the source
 25.1199 +        sub             r0,  r0,  r2, lsl #4    @ the two calls advanced r0 by 16 rows, rewind
 25.1200 +        add             r0,  r0,  #8            @ right half of the destination
 25.1201 +        bl              \type\()_h264_qpel8_hv_lowpass_neon
 25.1202 +        sub             r1,  r1,  r3, lsl #2
 25.1203 +        mov             lr,  r9                 @ restore the return address so the last block can be a tail call
 25.1204 +        b               \type\()_h264_qpel8_hv_lowpass_neon
 25.1205 +endfunc
 25.1206 +        @ Same tiling for the l2 variant. Here the destination stride is r3 and r2 is the second-input pointer, which each 8x8 call advances by 64 bytes.
 25.1207 +function \type\()_h264_qpel16_hv_lowpass_l2_neon
 25.1208 +        mov             r9,  lr
 25.1209 +        sub             r2,  r4,  #256          @ second input starts 256 bytes below r4 (presumably the intermediate the caller wrote just before the hv scratch - confirm against the callers)
 25.1210 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1211 +        sub             r1,  r1,  r3, lsl #2
 25.1212 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1213 +        sub             r1,  r1,  r3, lsl #4
 25.1214 +        sub             r1,  r1,  r3, lsl #2
 25.1215 +        add             r1,  r1,  #8
 25.1216 +        sub             r0,  r0,  r3, lsl #4    @ destination rewind uses r3 in this variant
 25.1217 +        add             r0,  r0,  #8
 25.1218 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1219 +        sub             r1,  r1,  r3, lsl #2
 25.1220 +        mov             lr,  r9
 25.1221 +        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1222 +endfunc
 25.1223 +        .endm
 25.1224 +
 25.1225 +        h264_qpel16_hv put
 25.1226 +        h264_qpel16_hv avg
 25.1227 +@ Exported 8x8 quarter-pel entry points ff_<type>_h264_qpel8_mcXY_neon. On entry r0, r1 and r2 are used as destination, source and stride. Several entry points push their own frame and then jump to a label inside a sibling function to share its body. d8-d15 are saved around the calls that use them (callee-saved under the AAPCS).
 25.1228 +        .macro h264_qpel8 type
 25.1229 +function ff_\type\()_h264_qpel8_mc10_neon, export=1
 25.1230 +        lowpass_const   r3                      @ macro defined outside this chunk, r3 is overwritten on the next line
 25.1231 +        mov             r3,  r1                 @ r3 = unshifted source, handed to the l2 routine
 25.1232 +        sub             r1,  r1,  #2            @ start 2 bytes left of the block
 25.1233 +        mov             ip,  #8                 @ presumably the row count for the callee (defined outside this chunk)
 25.1234 +        b               \type\()_h264_qpel8_h_lowpass_l2_neon
 25.1235 +endfunc
 25.1236 +        @ mc20: tail call to the plain h lowpass routine with r3 = stride
 25.1237 +function ff_\type\()_h264_qpel8_mc20_neon, export=1
 25.1238 +        lowpass_const   r3
 25.1239 +        sub             r1,  r1,  #2
 25.1240 +        mov             r3,  r2
 25.1241 +        mov             ip,  #8
 25.1242 +        b               \type\()_h264_qpel8_h_lowpass_neon
 25.1243 +endfunc
 25.1244 +        @ mc30: as mc10, but r3 points one byte to the right of the source
 25.1245 +function ff_\type\()_h264_qpel8_mc30_neon, export=1
 25.1246 +        lowpass_const   r3
 25.1247 +        add             r3,  r1,  #1
 25.1248 +        sub             r1,  r1,  #2
 25.1249 +        mov             ip,  #8
 25.1250 +        b               \type\()_h264_qpel8_h_lowpass_l2_neon
 25.1251 +endfunc
 25.1252 +        @ mc01: v lowpass l2 routine with ip = source. The label below is also the entry used by mc03.
 25.1253 +function ff_\type\()_h264_qpel8_mc01_neon, export=1
 25.1254 +        push            {lr}
 25.1255 +        mov             ip,  r1
 25.1256 +\type\()_h264_qpel8_mc01:
 25.1257 +        lowpass_const   r3
 25.1258 +        mov             r3,  r2
 25.1259 +        sub             r1,  r1,  r2, lsl #1    @ start 2 rows above the block
 25.1260 +        vpush           {d8-d15}
 25.1261 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
 25.1262 +        vpop            {d8-d15}
 25.1263 +        pop             {pc}
 25.1264 +endfunc
 25.1265 +        @ mc11: h lowpass into a 64-byte stack buffer, then v lowpass l2 with ip pointing at that buffer. The label is shared by mc31, mc13 and mc33, which differ only in the r1 they push and the r1 they enter with.
 25.1266 +function ff_\type\()_h264_qpel8_mc11_neon, export=1
 25.1267 +        push            {r0, r1, r11, lr}       @ r0, r1 are reloaded from here by ldrd below
 25.1268 +\type\()_h264_qpel8_mc11:
 25.1269 +        lowpass_const   r3
 25.1270 +        mov             r11, sp                 @ remember sp before aligning
 25.1271 +        bic             sp,  sp,  #15           @ 16-byte align
 25.1272 +        sub             sp,  sp,  #64           @ 64-byte buffer for the first pass
 25.1273 +        mov             r0,  sp                 @ first pass writes to the buffer
 25.1274 +        sub             r1,  r1,  #2
 25.1275 +        mov             r3,  #8
 25.1276 +        mov             ip,  #8
 25.1277 +        vpush           {d8-d15}                @ 64 bytes, so the buffer is now at sp + 64
 25.1278 +        bl              put_h264_qpel8_h_lowpass_neon
 25.1279 +        ldrd            r0,  [r11]              @ reload the r0, r1 pushed on entry
 25.1280 +        mov             r3,  r2
 25.1281 +        add             ip,  sp,  #64           @ ip = start of the first-pass buffer
 25.1282 +        sub             r1,  r1,  r2, lsl #1    @ start 2 rows above the block
 25.1283 +        mov             r2,  #8
 25.1284 +        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
 25.1285 +        vpop            {d8-d15}
 25.1286 +        add             sp,  r11, #8            @ drop the buffer and the saved r0, r1
 25.1287 +        pop             {r11, pc}
 25.1288 +endfunc
 25.1289 +        @ mc21: h lowpass into stack scratch, then hv lowpass l2 averaging against it. The label is shared with mc23.
 25.1290 +function ff_\type\()_h264_qpel8_mc21_neon, export=1
 25.1291 +        push            {r0, r1, r4, r10, r11, lr}  @ r4 and r10 are used by the hv routines
 25.1292 +\type\()_h264_qpel8_mc21:
 25.1293 +        lowpass_const   r3
 25.1294 +        mov             r11, sp
 25.1295 +        bic             sp,  sp,  #15
 25.1296 +        sub             sp,  sp,  #(8*8+16*12)  @ 64 bytes for the first pass plus 192 bytes of hv scratch
 25.1297 +        sub             r1,  r1,  #2
 25.1298 +        mov             r3,  #8
 25.1299 +        mov             r0,  sp
 25.1300 +        mov             ip,  #8
 25.1301 +        vpush           {d8-d15}
 25.1302 +        bl              put_h264_qpel8_h_lowpass_neon
 25.1303 +        mov             r4,  r0                 @ hv scratch = r0 as left by the callee (presumably just past the 64 bytes it wrote - confirm)
 25.1304 +        ldrd            r0,  [r11]              @ reload the r0, r1 pushed on entry
 25.1305 +        sub             r1,  r1,  r2, lsl #1    @ 2 rows up ...
 25.1306 +        sub             r1,  r1,  #2            @ ... and 2 bytes left for the hv stage
 25.1307 +        mov             r3,  r2
 25.1308 +        sub             r2,  r4,  #64           @ second input for the l2 average: 64 bytes below r4
 25.1309 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1310 +        vpop            {d8-d15}
 25.1311 +        add             sp,  r11,  #8           @ drop the scratch and the saved r0, r1
 25.1312 +        pop             {r4, r10, r11, pc}
 25.1313 +endfunc
 25.1314 +        @ mc31: pushes r1 + 1 as the saved source, then enters mc11 with the original r1
 25.1315 +function ff_\type\()_h264_qpel8_mc31_neon, export=1
 25.1316 +        add             r1,  r1,  #1
 25.1317 +        push            {r0, r1, r11, lr}
 25.1318 +        sub             r1,  r1,  #1
 25.1319 +        b               \type\()_h264_qpel8_mc11
 25.1320 +endfunc
 25.1321 +        @ mc02: plain v lowpass
 25.1322 +function ff_\type\()_h264_qpel8_mc02_neon, export=1
 25.1323 +        push            {lr}
 25.1324 +        lowpass_const   r3
 25.1325 +        sub             r1,  r1,  r2, lsl #1
 25.1326 +        mov             r3,  r2
 25.1327 +        vpush           {d8-d15}
 25.1328 +        bl              \type\()_h264_qpel8_v_lowpass_neon
 25.1329 +        vpop            {d8-d15}
 25.1330 +        pop             {pc}
 25.1331 +endfunc
 25.1332 +        @ mc12: v lowpass into stack scratch, then hv lowpass l2 averaging against it. The label is shared with mc32.
 25.1333 +function ff_\type\()_h264_qpel8_mc12_neon, export=1
 25.1334 +        push            {r0, r1, r4, r10, r11, lr}
 25.1335 +\type\()_h264_qpel8_mc12:
 25.1336 +        lowpass_const   r3
 25.1337 +        mov             r11, sp
 25.1338 +        bic             sp,  sp,  #15
 25.1339 +        sub             sp,  sp,  #(8*8+16*12)  @ 64 bytes for the first pass plus 192 bytes of hv scratch
 25.1340 +        sub             r1,  r1,  r2, lsl #1
 25.1341 +        mov             r3,  r2                 @ stride moves to r3 ...
 25.1342 +        mov             r2,  #8                 @ ... because r2 is set to 8 for the first pass
 25.1343 +        mov             r0,  sp
 25.1344 +        vpush           {d8-d15}
 25.1345 +        bl              put_h264_qpel8_v_lowpass_neon
 25.1346 +        mov             r4,  r0                 @ hv scratch = r0 as left by the callee (presumably just past the 64 bytes it wrote - confirm)
 25.1347 +        ldrd            r0,  [r11]              @ reload the r0, r1 pushed on entry
 25.1348 +        sub             r1,  r1,  r3, lsl #1    @ the stride is taken from r3 here
 25.1349 +        sub             r1,  r1,  #2
 25.1350 +        sub             r2,  r4,  #64           @ second input for the l2 average: 64 bytes below r4
 25.1351 +        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
 25.1352 +        vpop            {d8-d15}
 25.1353 +        add             sp,  r11,  #8
 25.1354 +        pop             {r4, r10, r11, pc}
 25.1355 +endfunc
 25.1356 +        @ mc22: hv lowpass only, with r4 pointing at 192 bytes of 16-byte aligned stack scratch
 25.1357 +function ff_\type\()_h264_qpel8_mc22_neon, export=1
 25.1358 +        push            {r4, r10, r11, lr}
 25.1359 +        mov             r11, sp
 25.1360 +        bic             sp,  sp,  #15
 25.1361 +        sub             r1,  r1,  r2, lsl #1
 25.1362 +        sub             r1,  r1,  #2
 25.1363 +        mov             r3,  r2
 25.1364 +        sub             sp,  sp,  #(16*12)
 25.1365 +        mov             r4,  sp
 25.1366 +        vpush           {d8-d15}
 25.1367 +        bl              \type\()_h264_qpel8_hv_lowpass_neon
 25.1368 +        vpop            {d8-d15}
 25.1369 +        mov             sp,  r11                @ no saved r0, r1 to skip in this frame
 25.1370 +        pop             {r4, r10, r11, pc}
 25.1371 +endfunc
 25.1372 +        @ mc32: as mc12, but the first (v) pass starts one byte to the right. The pushed r1 is the original.
 25.1373 +function ff_\type\()_h264_qpel8_mc32_neon, export=1
 25.1374 +        push            {r0, r1, r4, r10, r11, lr}
 25.1375 +        add             r1,  r1,  #1
 25.1376 +        b               \type\()_h264_qpel8_mc12
 25.1377 +endfunc
 25.1378 +        @ mc03: as mc01, but with ip one row below the source
 25.1379 +function ff_\type\()_h264_qpel8_mc03_neon, export=1
 25.1380 +        push            {lr}
 25.1381 +        add             ip,  r1,  r2
 25.1382 +        b               \type\()_h264_qpel8_mc01
 25.1383 +endfunc
 25.1384 +        @ mc13: as mc11, but the first (h) pass starts one row down. The pushed r1 is the original.
 25.1385 +function ff_\type\()_h264_qpel8_mc13_neon, export=1
 25.1386 +        push            {r0, r1, r11, lr}
 25.1387 +        add             r1,  r1,  r2
 25.1388 +        b               \type\()_h264_qpel8_mc11
 25.1389 +endfunc
 25.1390 +        @ mc23: as mc21, but the first (h) pass starts one row down. The pushed r1 is the original.
 25.1391 +function ff_\type\()_h264_qpel8_mc23_neon, export=1
 25.1392 +        push            {r0, r1, r4, r10, r11, lr}
 25.1393 +        add             r1,  r1,  r2
 25.1394 +        b               \type\()_h264_qpel8_mc21
 25.1395 +endfunc
 25.1396 +        @ mc33: pushes r1 + 1 as the saved source, then enters mc11 with r1 + r2 (one row down)
 25.1397 +function ff_\type\()_h264_qpel8_mc33_neon, export=1
 25.1398 +        add             r1,  r1,  #1
 25.1399 +        push            {r0, r1, r11, lr}
 25.1400 +        add             r1,  r1,  r2
 25.1401 +        sub             r1,  r1,  #1
 25.1402 +        b               \type\()_h264_qpel8_mc11
 25.1403 +endfunc
 25.1404 +        .endm
 25.1405 +
 25.1406 +        h264_qpel8 put
 25.1407 +        h264_qpel8 avg
 25.1408 +@ Exported 16x16 quarter-pel entry points ff_<type>_h264_qpel16_mcXY_neon. Same structure as the 8x8 set: r0, r1, r2 are used as destination, source and stride, and several entry points push their own frame and then jump to a label inside a sibling function.
 25.1409 +        .macro h264_qpel16 type
 25.1410 +function ff_\type\()_h264_qpel16_mc10_neon, export=1
 25.1411 +        lowpass_const   r3                      @ macro defined outside this chunk, r3 is overwritten on the next line
 25.1412 +        mov             r3,  r1                 @ r3 = unshifted source, handed to the l2 routine
 25.1413 +        sub             r1,  r1,  #2            @ start 2 bytes left of the block
 25.1414 +        b               \type\()_h264_qpel16_h_lowpass_l2_neon
 25.1415 +endfunc
 25.1416 +        @ mc20: tail call to the plain h lowpass routine with r3 = stride
 25.1417 +function ff_\type\()_h264_qpel16_mc20_neon, export=1
 25.1418 +        lowpass_const   r3
 25.1419 +        sub             r1,  r1,  #2
 25.1420 +        mov             r3,  r2
 25.1421 +        b               \type\()_h264_qpel16_h_lowpass_neon
 25.1422 +endfunc
 25.1423 +        @ mc30: as mc10, but r3 points one byte to the right of the source
 25.1424 +function ff_\type\()_h264_qpel16_mc30_neon, export=1
 25.1425 +        lowpass_const   r3
 25.1426 +        add             r3,  r1,  #1
 25.1427 +        sub             r1,  r1,  #2
 25.1428 +        b               \type\()_h264_qpel16_h_lowpass_l2_neon
 25.1429 +endfunc
 25.1430 +        @ mc01: v lowpass l2 routine with ip = source. The label below is also the entry used by mc03.
 25.1431 +function ff_\type\()_h264_qpel16_mc01_neon, export=1
 25.1432 +        push            {r4, lr}                @ r4 is saved too, presumably because the 16-wide callee clobbers it (callee is outside this chunk)
 25.1433 +        mov             ip,  r1
 25.1434 +\type\()_h264_qpel16_mc01:
 25.1435 +        lowpass_const   r3
 25.1436 +        mov             r3,  r2
 25.1437 +        sub             r1,  r1,  r2, lsl #1    @ start 2 rows above the block
 25.1438 +        vpush           {d8-d15}
 25.1439 +        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
 25.1440 +        vpop            {d8-d15}
 25.1441 +        pop             {r4, pc}
 25.1442 +endfunc
 25.1443 +        @ mc11: h lowpass into a 256-byte stack buffer, then v lowpass l2 with ip pointing at that buffer. The label is shared by mc31, mc13 and mc33.
 25.1444 +function ff_\type\()_h264_qpel16_mc11_neon, export=1
 25.1445 +        push            {r0, r1, r4, r11, lr}   @ r0, r1 are reloaded from here by ldrd below
 25.1446 +\type\()_h264_qpel16_mc11:
 25.1447 +        lowpass_const   r3
 25.1448 +        mov             r11, sp                 @ remember sp before aligning
 25.1449 +        bic             sp,  sp,  #15           @ 16-byte align
 25.1450 +        sub             sp,  sp,  #256          @ 256-byte buffer for the first pass
 25.1451 +        mov             r0,  sp
 25.1452 +        sub             r1,  r1,  #2
 25.1453 +        mov             r3,  #16
 25.1454 +        vpush           {d8-d15}                @ 64 bytes, so the buffer is now at sp + 64
 25.1455 +        bl              put_h264_qpel16_h_lowpass_neon
 25.1456 +        ldrd            r0,  [r11]              @ reload the r0, r1 pushed on entry
 25.1457 +        mov             r3,  r2
 25.1458 +        add             ip,  sp,  #64           @ ip = start of the first-pass buffer
 25.1459 +        sub             r1,  r1,  r2, lsl #1    @ start 2 rows above the block
 25.1460 +        mov             r2,  #16
 25.1461 +        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
 25.1462 +        vpop            {d8-d15}
 25.1463 +        add             sp,  r11, #8            @ drop the buffer and the saved r0, r1
 25.1464 +        pop             {r4, r11, pc}
 25.1465 +endfunc
 25.1466 +        @ mc21: packed h lowpass into stack scratch, then hv lowpass l2 averaging against it. The label is shared with mc23.
 25.1467 +function ff_\type\()_h264_qpel16_mc21_neon, export=1
 25.1468 +        push            {r0, r1, r4-r5, r9-r11, lr}  @ r9 and r10 hold return addresses inside the hv routines
 25.1469 +\type\()_h264_qpel16_mc21:
 25.1470 +        lowpass_const   r3
 25.1471 +        mov             r11, sp
 25.1472 +        bic             sp,  sp,  #15
 25.1473 +        sub             sp,  sp,  #(16*16+16*12) @ 256 bytes for the first pass plus 192 bytes of hv scratch
 25.1474 +        sub             r1,  r1,  #2
 25.1475 +        mov             r0,  sp
 25.1476 +        vpush           {d8-d15}
 25.1477 +        bl              put_h264_qpel16_h_lowpass_neon_packed
 25.1478 +        mov             r4,  r0                 @ hv scratch = r0 as left by the callee (presumably just past the 256 bytes it wrote - confirm)
 25.1479 +        ldrd            r0,  [r11]              @ reload the r0, r1 pushed on entry
 25.1480 +        sub             r1,  r1,  r2, lsl #1    @ 2 rows up ...
 25.1481 +        sub             r1,  r1,  #2            @ ... and 2 bytes left for the hv stage
 25.1482 +        mov             r3,  r2
 25.1483 +        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon  @ the callee derives its second input from r4 - 256
 25.1484 +        vpop            {d8-d15}
 25.1485 +        add             sp,  r11,  #8           @ drop the scratch and the saved r0, r1
 25.1486 +        pop             {r4-r5, r9-r11, pc}
 25.1487 +endfunc
 25.1488 +        @ mc31: pushes r1 + 1 as the saved source, then enters mc11 with the original r1
 25.1489 +function ff_\type\()_h264_qpel16_mc31_neon, export=1
 25.1490 +        add             r1,  r1,  #1
 25.1491 +        push            {r0, r1, r4, r11, lr}
 25.1492 +        sub             r1,  r1,  #1
 25.1493 +        b               \type\()_h264_qpel16_mc11
 25.1494 +endfunc
 25.1495 +        @ mc02: plain v lowpass
 25.1496 +function ff_\type\()_h264_qpel16_mc02_neon, export=1
 25.1497 +        push            {r4, lr}
 25.1498 +        lowpass_const   r3
 25.1499 +        sub             r1,  r1,  r2, lsl #1
 25.1500 +        mov             r3,  r2
 25.1501 +        vpush           {d8-d15}
 25.1502 +        bl              \type\()_h264_qpel16_v_lowpass_neon
 25.1503 +        vpop            {d8-d15}
 25.1504 +        pop             {r4, pc}
 25.1505 +endfunc
 25.1506 +        @ mc12: packed v lowpass into stack scratch, then hv lowpass l2 averaging against it. The label is shared with mc32.
 25.1507 +function ff_\type\()_h264_qpel16_mc12_neon, export=1
 25.1508 +        push            {r0, r1, r4-r5, r9-r11, lr}
 25.1509 +\type\()_h264_qpel16_mc12:
 25.1510 +        lowpass_const   r3
 25.1511 +        mov             r11, sp
 25.1512 +        bic             sp,  sp,  #15
 25.1513 +        sub             sp,  sp,  #(16*16+16*12) @ 256 bytes for the first pass plus 192 bytes of hv scratch
 25.1514 +        sub             r1,  r1,  r2, lsl #1
 25.1515 +        mov             r0,  sp
 25.1516 +        mov             r3,  r2
 25.1517 +        vpush           {d8-d15}
 25.1518 +        bl              put_h264_qpel16_v_lowpass_neon_packed
 25.1519 +        mov             r4,  r0                 @ hv scratch = r0 as left by the callee (presumably just past the 256 bytes it wrote - confirm)
 25.1520 +        ldrd            r0,  [r11]              @ reload the r0, r1 pushed on entry
 25.1521 +        sub             r1,  r1,  r3, lsl #1    @ the stride is taken from r3 here
 25.1522 +        sub             r1,  r1,  #2
 25.1523 +        mov             r2,  r3                 @ NOTE(review): the callee starts by setting r2 = r4 - 256, so this copy looks unused - confirm
 25.1524 +        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
 25.1525 +        vpop            {d8-d15}
 25.1526 +        add             sp,  r11,  #8
 25.1527 +        pop             {r4-r5, r9-r11, pc}
 25.1528 +endfunc
 25.1529 +        @ mc22: hv lowpass only, with r4 pointing at 192 bytes of 16-byte aligned stack scratch
 25.1530 +function ff_\type\()_h264_qpel16_mc22_neon, export=1
 25.1531 +        push            {r4, r9-r11, lr}
 25.1532 +        lowpass_const   r3
 25.1533 +        mov             r11, sp
 25.1534 +        bic             sp,  sp,  #15
 25.1535 +        sub             r1,  r1,  r2, lsl #1
 25.1536 +        sub             r1,  r1,  #2
 25.1537 +        mov             r3,  r2
 25.1538 +        sub             sp,  sp,  #(16*12)
 25.1539 +        mov             r4,  sp
 25.1540 +        vpush           {d8-d15}
 25.1541 +        bl              \type\()_h264_qpel16_hv_lowpass_neon
 25.1542 +        vpop            {d8-d15}
 25.1543 +        mov             sp,  r11                @ no saved r0, r1 to skip in this frame
 25.1544 +        pop             {r4, r9-r11, pc}
 25.1545 +endfunc
 25.1546 +        @ mc32: as mc12, but the first (v) pass starts one byte to the right. The pushed r1 is the original.
 25.1547 +function ff_\type\()_h264_qpel16_mc32_neon, export=1
 25.1548 +        push            {r0, r1, r4-r5, r9-r11, lr}
 25.1549 +        add             r1,  r1,  #1
 25.1550 +        b               \type\()_h264_qpel16_mc12
 25.1551 +endfunc
 25.1552 +        @ mc03: as mc01, but with ip one row below the source
 25.1553 +function ff_\type\()_h264_qpel16_mc03_neon, export=1
 25.1554 +        push            {r4, lr}
 25.1555 +        add             ip,  r1,  r2
 25.1556 +        b               \type\()_h264_qpel16_mc01
 25.1557 +endfunc
 25.1558 +        @ mc13: as mc11, but the first (h) pass starts one row down. The pushed r1 is the original.
 25.1559 +function ff_\type\()_h264_qpel16_mc13_neon, export=1
 25.1560 +        push            {r0, r1, r4, r11, lr}
 25.1561 +        add             r1,  r1,  r2
 25.1562 +        b               \type\()_h264_qpel16_mc11
 25.1563 +endfunc
 25.1564 +        @ mc23: as mc21, but the first (h) pass starts one row down. The pushed r1 is the original.
 25.1565 +function ff_\type\()_h264_qpel16_mc23_neon, export=1
 25.1566 +        push            {r0, r1, r4-r5, r9-r11, lr}
 25.1567 +        add             r1,  r1,  r2
 25.1568 +        b               \type\()_h264_qpel16_mc21
 25.1569 +endfunc
 25.1570 +        @ mc33: pushes r1 + 1 as the saved source, then enters mc11 with r1 + r2 (one row down)
 25.1571 +function ff_\type\()_h264_qpel16_mc33_neon, export=1
 25.1572 +        add             r1,  r1,  #1
 25.1573 +        push            {r0, r1, r4, r11, lr}
 25.1574 +        add             r1,  r1,  r2
 25.1575 +        sub             r1,  r1,  #1
 25.1576 +        b               \type\()_h264_qpel16_mc11
 25.1577 +endfunc
 25.1578 +        .endm
 25.1579 +
 25.1580 +        h264_qpel16 put
 25.1581 +        h264_qpel16 avg
 25.1582 +
 25.1583 +@ Biweighted prediction
 25.1584 +@ Loop body for 16-byte-wide rows, expanded inside biweight_h264_pixels_16_neon. macd/macs are the multiply-accumulate mnemonics applied to the r0 rows with d0 (byte from r4) and to the r1 rows with d1 (byte from r5). Rows are read with stride r2 and written through r6, ip counts rows (two per pass), q8 is the accumulator start value and q9 the per-lane shift. Ends by returning from the enclosing function.
 25.1585 +        .macro  biweight_16 macs, macd
 25.1586 +        vdup.8          d0,  r4                 @ low byte of r4 in every lane
 25.1587 +        vdup.8          d1,  r5                 @ low byte of r5 in every lane
 25.1588 +        vmov            q2,  q8                 @ accumulators for the first row start from q8
 25.1589 +        vmov            q3,  q8
 25.1590 +1:      subs            ip,  ip,  #2            @ sets the flags tested by bne at the end of the pass
 25.1591 +        vld1.8          {d20-d21},[r0,:128], r2 @ first row from r0
 25.1592 +        \macd           q2,  d0,  d20
 25.1593 +        pld             [r0]
 25.1594 +        \macd           q3,  d0,  d21
 25.1595 +        vld1.8          {d22-d23},[r1,:128], r2 @ first row from r1
 25.1596 +        \macs           q2,  d1,  d22
 25.1597 +        pld             [r1]
 25.1598 +        \macs           q3,  d1,  d23
 25.1599 +        vmov            q12, q8                 @ accumulators for the second row
 25.1600 +        vld1.8          {d28-d29},[r0,:128], r2
 25.1601 +        vmov            q13, q8
 25.1602 +        \macd           q12, d0,  d28
 25.1603 +        pld             [r0]
 25.1604 +        \macd           q13, d0,  d29
 25.1605 +        vld1.8          {d30-d31},[r1,:128], r2
 25.1606 +        \macs           q12, d1,  d30
 25.1607 +        pld             [r1]
 25.1608 +        \macs           q13, d1,  d31
 25.1609 +        vshl.s16        q2,  q2,  q9            @ shift by the per-lane count in q9 (biweight_func sets it to the bitwise NOT of r3, so this is a right shift for r3 >= 0)
 25.1610 +        vshl.s16        q3,  q3,  q9
 25.1611 +        vqmovun.s16     d4,  q2                 @ saturate to unsigned 8 bit
 25.1612 +        vqmovun.s16     d5,  q3
 25.1613 +        vshl.s16        q12, q12, q9
 25.1614 +        vshl.s16        q13, q13, q9
 25.1615 +        vqmovun.s16     d24, q12
 25.1616 +        vqmovun.s16     d25, q13
 25.1617 +        vmov            q3,  q8                 @ re-arm the accumulator for the next pass (d5 is stored first, from d4-d5 below, before q2 is reset)
 25.1618 +        vst1.8          {d4- d5}, [r6,:128], r2
 25.1619 +        vmov            q2,  q8
 25.1620 +        vst1.8          {d24-d25},[r6,:128], r2
 25.1621 +        bne             1b
 25.1622 +        pop             {r4-r6, pc}             @ matches the push in biweight_func and returns
 25.1623 +        .endm
 25.1624 +@ Loop body for 8-byte-wide rows, expanded inside biweight_h264_pixels_8_neon. Same register roles as biweight_16: rows from r0 and r1 (stride r2), output through r6, bytes from r4/r5 in d0/d1, ip counts rows (two per pass), q8 start value, q9 shift. Ends by returning from the enclosing function.
 25.1625 +        .macro  biweight_8 macs, macd
 25.1626 +        vdup.8          d0,  r4
 25.1627 +        vdup.8          d1,  r5
 25.1628 +        vmov            q1,  q8                 @ accumulator for the first row
 25.1629 +        vmov            q10, q8                 @ accumulator for the second row
 25.1630 +1:      subs            ip,  ip,  #2            @ sets the flags tested by bne at the end of the pass
 25.1631 +        vld1.8          {d4},[r0,:64], r2
 25.1632 +        \macd           q1,  d0,  d4
 25.1633 +        pld             [r0]
 25.1634 +        vld1.8          {d5},[r1,:64], r2
 25.1635 +        \macs           q1,  d1,  d5
 25.1636 +        pld             [r1]
 25.1637 +        vld1.8          {d6},[r0,:64], r2
 25.1638 +        \macd           q10, d0,  d6
 25.1639 +        pld             [r0]
 25.1640 +        vld1.8          {d7},[r1,:64], r2
 25.1641 +        \macs           q10, d1,  d7
 25.1642 +        pld             [r1]
 25.1643 +        vshl.s16        q1,  q1,  q9            @ shift by the per-lane count in q9
 25.1644 +        vqmovun.s16     d2,  q1                 @ saturate to unsigned 8 bit
 25.1645 +        vshl.s16        q10, q10, q9
 25.1646 +        vqmovun.s16     d4,  q10
 25.1647 +        vmov            q10, q8                 @ re-arm for the next pass
 25.1648 +        vst1.8          {d2},[r6,:64], r2       @ d2 is stored before q1 (which contains d2) is reset
 25.1649 +        vmov            q1,  q8
 25.1650 +        vst1.8          {d4},[r6,:64], r2
 25.1651 +        bne             1b
 25.1652 +        pop             {r4-r6, pc}             @ matches the push in biweight_func and returns
 25.1653 +        .endm
 25.1654 +@ Loop body for 4-byte-wide rows, expanded inside biweight_h264_pixels_4_neon. Two rows are packed into each d register, so a full pass handles four rows. When ip drops below zero after the first two rows (row counts that are not a multiple of 4, such as the 4x2 entry) the code at label 2 finishes those two rows and returns. Other register roles as in biweight_16.
 25.1655 +        .macro  biweight_4 macs, macd
 25.1656 +        vdup.8          d0,  r4
 25.1657 +        vdup.8          d1,  r5
 25.1658 +        vmov            q1,  q8
 25.1659 +        vmov            q10, q8
 25.1660 +1:      subs            ip,  ip,  #4            @ flags are used by blt and by bne below
 25.1661 +        vld1.32         {d4[0]},[r0,:32], r2    @ row n into the low half ...
 25.1662 +        vld1.32         {d4[1]},[r0,:32], r2    @ ... row n+1 into the high half
 25.1663 +        \macd           q1,  d0,  d4
 25.1664 +        pld             [r0]
 25.1665 +        vld1.32         {d5[0]},[r1,:32], r2
 25.1666 +        vld1.32         {d5[1]},[r1,:32], r2
 25.1667 +        \macs           q1,  d1,  d5
 25.1668 +        pld             [r1]
 25.1669 +        blt             2f                      @ only two rows were left
 25.1670 +        vld1.32         {d6[0]},[r0,:32], r2
 25.1671 +        vld1.32         {d6[1]},[r0,:32], r2
 25.1672 +        \macd           q10, d0,  d6
 25.1673 +        pld             [r0]
 25.1674 +        vld1.32         {d7[0]},[r1,:32], r2
 25.1675 +        vld1.32         {d7[1]},[r1,:32], r2
 25.1676 +        \macs           q10, d1,  d7
 25.1677 +        pld             [r1]
 25.1678 +        vshl.s16        q1,  q1,  q9            @ shift by the per-lane count in q9
 25.1679 +        vqmovun.s16     d2,  q1                 @ saturate to unsigned 8 bit
 25.1680 +        vshl.s16        q10, q10, q9
 25.1681 +        vqmovun.s16     d4,  q10
 25.1682 +        vmov            q10, q8                 @ re-arm for the next pass
 25.1683 +        vst1.32         {d2[0]},[r6,:32], r2
 25.1684 +        vst1.32         {d2[1]},[r6,:32], r2
 25.1685 +        vmov            q1,  q8
 25.1686 +        vst1.32         {d4[0]},[r6,:32], r2
 25.1687 +        vst1.32         {d4[1]},[r6,:32], r2
 25.1688 +        bne             1b
 25.1689 +        pop             {r4-r6, pc}             @ matches the push in biweight_func and returns
 25.1690 +2:      vshl.s16        q1,  q1,  q9            @ tail: finish the two rows accumulated in q1
 25.1691 +        vqmovun.s16     d2,  q1
 25.1692 +        vst1.32         {d2[0]},[r6,:32], r2
 25.1693 +        vst1.32         {d2[1]},[r6,:32], r2
 25.1694 +        pop             {r4-r6, pc}
 25.1695 +        .endm
 25.1696 +@ Emits the core routine biweight_h264_pixels_<w>_neon. It loads three stack arguments into r4-r6, builds q8 (accumulator start value) and q9 (shift), copies r0 to r6 as the store pointer, and then dispatches on the signs of r4 and r5 to one of four expansions of biweight_<w>. Each expansion ends with its own pop and return, so nothing falls through between the labels.
 25.1697 +        .macro  biweight_func w
 25.1698 +function biweight_h264_pixels_\w\()_neon
 25.1699 +        push            {r4-r6, lr}
 25.1700 +        add             r4,  sp,  #16           @ skip the 16 bytes just pushed
 25.1701 +        ldm             r4,  {r4-r6}            @ r4-r6 = the three words that were at the top of the stack on entry
 25.1702 +        lsr             lr,  r4,  #31           @ lr = sign bit of r4
 25.1703 +        add             r6,  r6,  #1
 25.1704 +        eors            lr,  lr,  r5,  lsr #30  @ xor with the top two bits of r5: 0, 1, 2 or 3 selects the case below (assumes r4, r5 are small signed values)
 25.1705 +        orr             r6,  r6,  #1            @ r6 = (r6 + 1) | 1 ...
 25.1706 +        vdup.16         q9,  r3
 25.1707 +        lsl             r6,  r6,  r3            @ ... shifted left by r3
 25.1708 +        vmvn            q9,  q9                 @ q9 = bitwise NOT of r3 in every 16-bit lane, i.e. -(r3 + 1)
 25.1709 +        vdup.16         q8,  r6
 25.1710 +        mov             r6,  r0                 @ output is written through r6, starting at the r0 address
 25.1711 +        beq             10f                     @ flags are still those of eors: lr == 0
 25.1712 +        subs            lr,  lr,  #1
 25.1713 +        beq             20f                     @ lr == 1
 25.1714 +        subs            lr,  lr,  #1
 25.1715 +        beq             30f                     @ lr == 2
 25.1716 +        b               40f                     @ lr == 3
 25.1717 +10:     biweight_\w     vmlal.u8, vmlal.u8
 25.1718 +20:     rsb             r4,  r4,  #0            @ negate r4 and subtract its products instead
 25.1719 +        biweight_\w     vmlal.u8, vmlsl.u8
 25.1720 +30:     rsb             r4,  r4,  #0            @ negate both and subtract both products
 25.1721 +        rsb             r5,  r5,  #0
 25.1722 +        biweight_\w     vmlsl.u8, vmlsl.u8
 25.1723 +40:     rsb             r5,  r5,  #0            @ negate r5 and subtract its products instead
 25.1724 +        biweight_\w     vmlsl.u8, vmlal.u8
 25.1725 +endfunc
 25.1726 +        .endm
 25.1727 +@ Emits the exported entry ff_biweight_h264_pixels_<w>x<h>_neon: sets ip = h and branches to the shared core for width w. With b=0 the branch is omitted, so execution continues into whatever code follows the entry (this relies on endfunc emitting no instructions - confirm in its definition).
 25.1728 +        .macro  biweight_entry w, h, b=1
 25.1729 +function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
 25.1730 +        mov             ip,  #\h
 25.1731 +.if \b
 25.1732 +        b               biweight_h264_pixels_\w\()_neon
 25.1733 +.endif
 25.1734 +endfunc
 25.1735 +        .endm
 25.1736 +@ Instantiations. In each group the b=0 entry is placed last, directly before biweight_func, so that it continues into the core without a branch.
 25.1737 +        biweight_entry  16, 8
 25.1738 +        biweight_entry  16, 16, b=0
 25.1739 +        biweight_func   16
 25.1740 +
 25.1741 +        biweight_entry  8,  16
 25.1742 +        biweight_entry  8,  4
 25.1743 +        biweight_entry  8,  8,  b=0
 25.1744 +        biweight_func   8
 25.1745 +
 25.1746 +        biweight_entry  4,  8
 25.1747 +        biweight_entry  4,  2
 25.1748 +        biweight_entry  4,  4,  b=0
 25.1749 +        biweight_func   4
 25.1750 +
 25.1751 +@ Weighted prediction
 25.1752 +@ Loop body for 16-byte-wide rows. add is the mnemonic used to combine q8 with the products. Rows are read from r0 (stride r1) and written through r4, the byte in r3 is the multiplier, ip counts rows (two per pass), q9 is the per-lane rounding-shift count. q8, q9, r4 and ip are expected to be set up by the code that expands the macro, which is outside this chunk. Ends with pop {r4, pc}, i.e. returns from the enclosing function.
 25.1753 +        .macro  weight_16 add
 25.1754 +        vdup.8          d0,  r3                 @ low byte of r3 in every lane
 25.1755 +1:      subs            ip,  ip,  #2            @ sets the flags tested by bne at the end of the pass
 25.1756 +        vld1.8          {d20-d21},[r0,:128], r1 @ first row
 25.1757 +        vmull.u8        q2,  d0,  d20           @ widen to 16 bit while multiplying
 25.1758 +        pld             [r0]
 25.1759 +        vmull.u8        q3,  d0,  d21
 25.1760 +        vld1.8          {d28-d29},[r0,:128], r1 @ second row
 25.1761 +        vmull.u8        q12, d0,  d28
 25.1762 +        pld             [r0]
 25.1763 +        vmull.u8        q13, d0,  d29
 25.1764 +        \add            q2,  q8,  q2            @ combine q8 with the product
 25.1765 +        vrshl.s16       q2,  q2,  q9            @ rounding shift by the per-lane count in q9
 25.1766 +        \add            q3,  q8,  q3
 25.1767 +        vrshl.s16       q3,  q3,  q9
 25.1768 +        vqmovun.s16     d4,  q2                 @ saturate to unsigned 8 bit
 25.1769 +        vqmovun.s16     d5,  q3
 25.1770 +        \add            q12, q8,  q12
 25.1771 +        vrshl.s16       q12, q12, q9
 25.1772 +        \add            q13, q8,  q13
 25.1773 +        vrshl.s16       q13, q13, q9
 25.1774 +        vqmovun.s16     d24, q12
 25.1775 +        vqmovun.s16     d25, q13
 25.1776 +        vst1.8          {d4- d5}, [r4,:128], r1
 25.1777 +        vst1.8          {d24-d25},[r4,:128], r1
 25.1778 +        bne             1b
 25.1779 +        pop             {r4, pc}
 25.1780 +        .endm
 25.1781 +@ Loop body for 8-byte-wide rows. Same register roles as weight_16: rows from r0 (stride r1), output through r4, multiplier byte in r3, ip counts rows (two per pass), q8 combined via the add mnemonic, q9 rounding-shift count. Ends with pop {r4, pc}.
 25.1782 +        .macro  weight_8 add
 25.1783 +        vdup.8          d0,  r3
 25.1784 +1:      subs            ip,  ip,  #2            @ sets the flags tested by bne at the end of the pass
 25.1785 +        vld1.8          {d4},[r0,:64], r1
 25.1786 +        vmull.u8        q1,  d0,  d4
 25.1787 +        pld             [r0]
 25.1788 +        vld1.8          {d6},[r0,:64], r1
 25.1789 +        vmull.u8        q10, d0,  d6
 25.1790 +        \add            q1,  q8,  q1
 25.1791 +        pld             [r0]
 25.1792 +        vrshl.s16       q1,  q1,  q9            @ rounding shift by the per-lane count in q9
 25.1793 +        vqmovun.s16     d2,  q1                 @ saturate to unsigned 8 bit
 25.1794 +        \add            q10, q8,  q10
 25.1795 +        vrshl.s16       q10, q10, q9
 25.1796 +        vqmovun.s16     d4,  q10
 25.1797 +        vst1.8          {d2},[r4,:64], r1
 25.1798 +        vst1.8          {d4},[r4,:64], r1
 25.1799 +        bne             1b
 25.1800 +        pop             {r4, pc}
 25.1801 +        .endm
 25.1802 +
 25.1803 +        .macro  weight_4 add                     @ loop for 4-byte rows, four rows per pass, with a two-row tail at label 2
 25.1804 +        vdup.8          d0,  r3                  @ low byte of r3 in all 8 lanes of d0: the multiplier
 25.1805 +        vmov            q1,  q8                  @ NOTE(review): q1 and q10 are overwritten by vmull before being read; these copies look unused
 25.1806 +        vmov            q10, q8
 25.1807 +1:      subs            ip,  ip,  #4             @ ip = rows left; flags feed both the blt and the bne below
 25.1808 +        vld1.32         {d4[0]},[r0,:32], r1     @ rows 0 and 1 packed into the two halves of d4
 25.1809 +        vld1.32         {d4[1]},[r0,:32], r1
 25.1810 +        vmull.u8        q1,  d0,  d4
 25.1811 +        pld             [r0]
 25.1812 +        blt             2f                       @ fewer than four rows were left: finish the two loaded rows at 2
 25.1813 +        vld1.32         {d6[0]},[r0,:32], r1     @ rows 2 and 3 packed into d6
 25.1814 +        vld1.32         {d6[1]},[r0,:32], r1
 25.1815 +        vmull.u8        q10, d0,  d6
 25.1816 +        pld             [r0]
 25.1817 +        \add            q1,  q8,  q1             @ combine with the per-lane constant in q8
 25.1818 +        vrshl.s16       q1,  q1,  q9             @ rounding shift by the signed per-lane count in q9
 25.1819 +        vqmovun.s16     d2,  q1                  @ saturate to unsigned 8 bit
 25.1820 +        \add            q10, q8,  q10
 25.1821 +        vrshl.s16       q10, q10, q9
 25.1822 +        vqmovun.s16     d4,  q10
 25.1823 +        vmov            q10, q8
 25.1824 +        vst1.32         {d2[0]},[r4,:32], r1     @ store via r4, one 4-byte row at a time
 25.1825 +        vst1.32         {d2[1]},[r4,:32], r1
 25.1826 +        vmov            q1,  q8
 25.1827 +        vst1.32         {d4[0]},[r4,:32], r1
 25.1828 +        vst1.32         {d4[1]},[r4,:32], r1
 25.1829 +        bne             1b
 25.1830 +        pop             {r4, pc}                 @ return; pairs with the push {r4, lr} in weight_func
 25.1831 +2:      \add            q1,  q8,  q1             @ tail: only the first two rows of this pass exist
 25.1832 +        vrshl.s16       q1,  q1,  q9
 25.1833 +        vqmovun.s16     d2,  q1
 25.1834 +        vst1.32         {d2[0]},[r4,:32], r1
 25.1835 +        vst1.32         {d2[1]},[r4,:32], r1
 25.1836 +        pop             {r4, pc}
 25.1837 +        .endm
 25.1838 +
 25.1839 +        .macro  weight_func w                    @ emits weight_h264_pixels_W_neon: common setup, then one of four expansions of weight_W
 25.1840 +function weight_h264_pixels_\w\()_neon           @ in: r0 pixels, r1 row step, r2 shift, r3 multiplier, first stack word = constant term, ip = rows (presumably log2_denom, weight, offset of H.264 weighted prediction; confirm against the C prototype)
 25.1841 +        push            {r4, lr}
 25.1842 +        ldr             r4,  [sp, #8]            @ first stack argument, above the two words just pushed
 25.1843 +        cmp             r2,  #1                  @ flags are used by the ble below; lsl, vdup and mov leave them alone
 25.1844 +        lsl             r4,  r4,  r2             @ stack argument << r2
 25.1845 +        vdup.16         q8,  r4                  @ q8 = that value in every 16-bit lane
 25.1846 +        mov             r4,  r0                  @ r4 = write pointer, r0 stays the read pointer
 25.1847 +        ble             20f                      @ r2 <= 1: plain add or sub variants
 25.1848 +        rsb             lr,  r2,  #1             @ lr = 1 - r2
 25.1849 +        vdup.16         q9,  lr                  @ negative count: vrshl becomes a rounding right shift by r2 - 1
 25.1850 +        cmp             r3,  #0
 25.1851 +        blt             10f                      @ negative multiplier: nearest following label 10
 25.1852 +        weight_\w       vhadd.s16                @ the halving add supplies the remaining one-bit shift
 25.1853 +10:     rsb             r3,  r3,  #0             @ r3 = -r3, so the byte multiplier is positive
 25.1854 +        weight_\w       vhsub.s16                @ q8 minus product, halved
 25.1855 +20:     rsb             lr,  r2,  #0             @ lr = -r2
 25.1856 +        vdup.16         q9,  lr                  @ rounding right shift by r2
 25.1857 +        cmp             r3,  #0
 25.1858 +        blt             10f
 25.1859 +        weight_\w       vadd.s16
 25.1860 +10:     rsb             r3,  r3,  #0
 25.1861 +        weight_\w       vsub.s16                 @ every expansion ends in pop {r4, pc}, so none falls into the next
 25.1862 +endfunc
 25.1863 +        .endm
 25.1864 +
 25.1865 +        .macro  weight_entry w, h, b=1           @ emits the exported entry point for one block size
 25.1866 +function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
 25.1867 +        mov             ip,  #\h                 @ ip = row count read by the weight_W loops
 25.1868 +.if \b
 25.1869 +        b               weight_h264_pixels_\w\()_neon
 25.1870 +.endif
 25.1871 +endfunc                                          @ with b=0 no branch is emitted: execution runs on into whatever is assembled next
 25.1872 +        .endm
 25.1873 +
 25.1874 +        weight_entry    16, 8                    @ branches to weight_h264_pixels_16_neon
 25.1875 +        weight_entry    16, 16, b=0              @ no branch: must stay directly in front of weight_func 16
 25.1876 +        weight_func     16
 25.1877 +
 25.1878 +        weight_entry    8,  16
 25.1879 +        weight_entry    8,  4
 25.1880 +        weight_entry    8,  8,  b=0              @ falls through into weight_func 8
 25.1881 +        weight_func     8
 25.1882 +
 25.1883 +        weight_entry    4,  8
 25.1884 +        weight_entry    4,  2                    @ two rows: handled by the tail at label 2 of weight_4
 25.1885 +        weight_entry    4,  4,  b=0              @ falls through into weight_func 4
 25.1886 +        weight_func     4

    26.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    26.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264idct_neon.S	Mon Aug 27 12:09:56 2012 +0200
    26.3 @@ -0,0 +1,180 @@
    26.4 +/*
    26.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
    26.6 + *
    26.7 + * This file is part of FFmpeg.
    26.8 + *
    26.9 + * FFmpeg is free software; you can redistribute it and/or
   26.10 + * modify it under the terms of the GNU Lesser General Public
   26.11 + * License as published by the Free Software Foundation; either
   26.12 + * version 2.1 of the License, or (at your option) any later version.
   26.13 + *
   26.14 + * FFmpeg is distributed in the hope that it will be useful,
   26.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   26.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   26.17 + * Lesser General Public License for more details.
   26.18 + *
   26.19 + * You should have received a copy of the GNU Lesser General Public
   26.20 + * License along with FFmpeg; if not, write to the Free Software
   26.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   26.22 + */
   26.23 +
   26.24 +#include "asm.S"
   26.25 +
   26.26 +        preserve8
   26.27 +        .text
   26.28 +
   26.29 +function ff_h264_idct_add_neon, export=1         @ 4x4 inverse transform of 16 coefficients at r1, added to four 4-byte pixel rows at r0 (row step r2)
   26.30 +        vld1.64         {d0-d3},  [r1,:128]      @ 16 signed 16-bit coefficients, one group of four per d register
   26.31 +
   26.32 +        vswp            d1,  d2                  @ q0 = groups 0 and 2, q1 = groups 1 and 3
   26.33 +        vadd.i16        d4,  d0,  d1             @ first butterfly pass
   26.34 +        vshr.s16        q8,  q1,  #1             @ d16 = group1 >> 1, d17 = group3 >> 1
   26.35 +        vsub.i16        d5,  d0,  d1
   26.36 +        vadd.i16        d6,  d2,  d17
   26.37 +        vsub.i16        d7,  d16, d3
   26.38 +        vadd.i16        q0,  q2,  q3
   26.39 +        vsub.i16        q1,  q2,  q3
   26.40 +
   26.41 +        vtrn.16         d0,  d1                  @ transpose the 4x4 block for the second pass
   26.42 +        vtrn.16         d3,  d2
   26.43 +        vtrn.32         d0,  d3
   26.44 +        vtrn.32         d1,  d2
   26.45 +
   26.46 +        vadd.i16        d4,  d0,  d3             @ second butterfly pass, interleaved with the pixel loads
   26.47 +        vld1.32         {d18[0]}, [r0,:32], r2   @ pixel rows land in d18[0], d19[1], d18[1], d19[0]; the stores below use the same lane order
   26.48 +        vswp            d1,  d3
   26.49 +        vshr.s16        q8,  q1,  #1
   26.50 +        vld1.32         {d19[1]}, [r0,:32], r2
   26.51 +        vsub.i16        d5,  d0,  d1
   26.52 +        vld1.32         {d18[1]}, [r0,:32], r2
   26.53 +        vadd.i16        d6,  d16, d3
   26.54 +        vld1.32         {d19[0]}, [r0,:32], r2
   26.55 +        vsub.i16        d7,  d2,  d17
   26.56 +        sub             r0,  r0,  r2, lsl #2     @ rewind r0 by four rows for the stores
   26.57 +        vadd.i16        q0,  q2,  q3
   26.58 +        vsub.i16        q1,  q2,  q3
   26.59 +
   26.60 +        vrshr.s16       q0,  q0,  #6             @ rounding shift right by 6
   26.61 +        vrshr.s16       q1,  q1,  #6
   26.62 +
   26.63 +        vaddw.u8        q0,  q0,  d18            @ add the widened pixels
   26.64 +        vaddw.u8        q1,  q1,  d19
   26.65 +
   26.66 +        vqmovun.s16     d0,  q0                  @ saturate to unsigned 8 bit
   26.67 +        vqmovun.s16     d1,  q1
   26.68 +
   26.69 +        vst1.32         {d0[0]},  [r0,:32], r2
   26.70 +        vst1.32         {d1[1]},  [r0,:32], r2
   26.71 +        vst1.32         {d0[1]},  [r0,:32], r2
   26.72 +        vst1.32         {d1[0]},  [r0,:32], r2
   26.73 +
   26.74 +        bx              lr
   26.75 +endfunc
   26.76 +
   26.77 +function ff_h264_idct_dc_add_neon, export=1      @ adds one rounded value, taken from the first coefficient at r1, to a 4x4 pixel block at r0 (row step r2)
   26.78 +        vld1.16         {d2[],d3[]}, [r1,:16]    @ first 16-bit coefficient replicated into every lane of q1
   26.79 +        vrshr.s16       q1,  q1,  #6             @ rounding shift right by 6
   26.80 +        vld1.32         {d0[0]},  [r0,:32], r2   @ rows 0 and 1 in d0
   26.81 +        vld1.32         {d0[1]},  [r0,:32], r2
   26.82 +        vaddw.u8        q2,  q1,  d0             @ widened pixels plus the constant
   26.83 +        vld1.32         {d1[0]},  [r0,:32], r2   @ rows 2 and 3 in d1
   26.84 +        vld1.32         {d1[1]},  [r0,:32], r2
   26.85 +        vaddw.u8        q1,  q1,  d1
   26.86 +        vqmovun.s16     d0,  q2                  @ saturate to unsigned 8 bit
   26.87 +        vqmovun.s16     d1,  q1
   26.88 +        sub             r0,  r0,  r2, lsl #2     @ rewind r0 by four rows for the stores
   26.89 +        vst1.32         {d0[0]},  [r0,:32], r2
   26.90 +        vst1.32         {d0[1]},  [r0,:32], r2
   26.91 +        vst1.32         {d1[0]},  [r0,:32], r2
   26.92 +        vst1.32         {d1[1]},  [r0,:32], r2
   26.93 +        bx              lr
   26.94 +endfunc
   26.95 +
   26.96 +function ff_h264_idct_add16_neon, export=1       @ runs the 4x4 transform-and-add over 16 blocks, skipping blocks whose count byte is zero
   26.97 +        push            {r4-r8,lr}
   26.98 +        mov             r4,  r0                  @ r4 = base pixel pointer
   26.99 +        mov             r5,  r1                  @ r5 = table of 32-bit per-block offsets
  26.100 +        mov             r1,  r2                  @ r1 = coefficients, as the two callees expect
  26.101 +        mov             r2,  r3                  @ r2 = row step, as the two callees expect
  26.102 +        ldr             r6,  [sp, #24]           @ first stack argument (six words were pushed): per-block byte table
  26.103 +        movrel          r7,  scan8
  26.104 +        mov             ip,  #16                 @ block counter; not saved around blx, the two callees in this file leave ip alone
  26.105 +1:      ldrb            r8,  [r7], #1            @ next scan8 index
  26.106 +        ldr             r0,  [r5], #4            @ offset of this block
  26.107 +        ldrb            r8,  [r6, r8]            @ count byte for this block
  26.108 +        subs            r8,  r8,  #1
  26.109 +        blt             2f                       @ count was 0: nothing to add
  26.110 +        ldrsh           lr,  [r1]                @ first coefficient of the block
  26.111 +        add             r0,  r0,  r4             @ r0 = destination of this block
  26.112 +        movne           lr,  #0                  @ flags still from subs: count != 1 forces the full transform
  26.113 +        cmp             lr,  #0
  26.114 +        adrne           lr,  ff_h264_idct_dc_add_neon  @ count == 1 and first coefficient nonzero
  26.115 +        adreq           lr,  ff_h264_idct_add_neon
  26.116 +        blx             lr
  26.117 +2:      subs            ip,  ip,  #1
  26.118 +        add             r1,  r1,  #32            @ next block: 16 coefficients of 2 bytes
  26.119 +        bne             1b
  26.120 +        pop             {r4-r8,pc}
  26.121 +endfunc
  26.122 +
  26.123 +function ff_h264_idct_add16intra_neon, export=1  @ like ff_h264_idct_add16_neon, but a zero count byte still gets the dc add when the first coefficient is nonzero
  26.124 +        push            {r4-r8,lr}
  26.125 +        mov             r4,  r0                  @ r4 = base pixel pointer
  26.126 +        mov             r5,  r1                  @ r5 = table of 32-bit per-block offsets
  26.127 +        mov             r1,  r2                  @ r1 = coefficients, as the two callees expect
  26.128 +        mov             r2,  r3                  @ r2 = row step, as the two callees expect
  26.129 +        ldr             r6,  [sp, #24]           @ first stack argument (six words were pushed): per-block byte table
  26.130 +        movrel          r7,  scan8
  26.131 +        mov             ip,  #16                 @ block counter; the two callees in this file leave ip alone
  26.132 +1:      ldrb            r8,  [r7], #1            @ next scan8 index
  26.133 +        ldr             r0,  [r5], #4            @ offset of this block
  26.134 +        ldrb            r8,  [r6, r8]            @ count byte for this block
  26.135 +        add             r0,  r0,  r4             @ r0 = destination of this block
  26.136 +        cmp             r8,  #0
  26.137 +        ldrsh           r8,  [r1]                @ first coefficient; the load leaves the flags alone
  26.138 +        adrne           lr,  ff_h264_idct_add_neon     @ count nonzero: full transform
  26.139 +        adreq           lr,  ff_h264_idct_dc_add_neon  @ count zero: dc add, if anything
  26.140 +        cmpeq           r8,  #0                  @ only when count was zero: is the first coefficient zero too
  26.141 +        blxne           lr                       @ skipped when both count and first coefficient are zero
  26.142 +        subs            ip,  ip,  #1
  26.143 +        add             r1,  r1,  #32            @ next block: 16 coefficients of 2 bytes
  26.144 +        bne             1b
  26.145 +        pop             {r4-r8,pc}
  26.146 +endfunc
  26.147 +
  26.148 +function ff_h264_idct_add8_neon, export=1        @ transform-and-add for 8 blocks: the first four go to the first pointer at r0, the last four to the second
  26.149 +        push            {r4-r10,lr}
  26.150 +        ldm             r0,  {r4,r9}             @ r4, r9 = the two destination base pointers
  26.151 +        add             r5,  r1,  #16*4          @ skip the first 16 entries of the 32-bit offset table
  26.152 +        add             r1,  r2,  #16*32         @ skip the first 16 coefficient blocks of 32 bytes
  26.153 +        mov             r2,  r3                  @ r2 = row step, as the two callees expect
  26.154 +        ldr             r6,  [sp, #32]           @ first stack argument (eight words were pushed): per-block byte table
  26.155 +        movrel          r7,  scan8+16            @ entries 16..23 of scan8
  26.156 +        mov             ip,  #8                  @ block counter 8..1; the two callees in this file leave ip alone
  26.157 +1:      ldrb            r8,  [r7], #1            @ next scan8 index
  26.158 +        ldr             r0,  [r5], #4            @ offset of this block
  26.159 +        ldrb            r8,  [r6, r8]            @ count byte for this block
  26.160 +        cmp             ip,  #4                  @ ip 8..5 are blocks 0-3, ip 4..1 are blocks 4-7 (testing bit 2 of the down-counter picked r4 only for ip = 8, 3, 2, 1)
  26.161 +        addgt           r0,  r0,  r4             @ blocks 0-3: first destination
  26.162 +        addle           r0,  r0,  r9             @ blocks 4-7: second destination
  26.163 +        cmp             r8,  #0
  26.164 +        ldrsh           r8,  [r1]                @ first coefficient; the load leaves the flags alone
  26.165 +        adrne           lr,  ff_h264_idct_add_neon     @ count nonzero: full transform
  26.166 +        adreq           lr,  ff_h264_idct_dc_add_neon  @ count zero: dc add, if anything
  26.167 +        cmpeq           r8,  #0                  @ only when count was zero: is the first coefficient zero too
  26.168 +        blxne           lr                       @ skipped when both count and first coefficient are zero
  26.169 +        subs            ip,  ip,  #1
  26.170 +        add             r1,  r1,  #32            @ next block: 16 coefficients of 2 bytes
  26.171 +        bne             1b
  26.172 +        pop             {r4-r10,pc}
  26.173 +endfunc
  26.174 +
  26.175 +        .section .rodata
  26.176 +scan8:  .byte           4+1*8, 5+1*8, 4+2*8, 5+2*8   @ 24 byte indices into the per-block count table, each written as x + y*8
  26.177 +        .byte           6+1*8, 7+1*8, 6+2*8, 7+2*8
  26.178 +        .byte           4+3*8, 5+3*8, 4+4*8, 5+4*8
  26.179 +        .byte           6+3*8, 7+3*8, 6+4*8, 7+4*8   @ entries 0..15 end here: read by the two add16 loops
  26.180 +        .byte           1+1*8, 2+1*8                 @ entries 16..23: read through scan8+16 by ff_h264_idct_add8_neon
  26.181 +        .byte           1+2*8, 2+2*8
  26.182 +        .byte           1+4*8, 2+4*8
  26.183 +        .byte           1+5*8, 2+5*8

    27.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    27.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_init_arm.c	Mon Aug 27 12:09:56 2012 +0200
    27.3 @@ -0,0 +1,75 @@
    27.4 +/*
    27.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    27.6 + *
    27.7 + * This file is part of FFmpeg.
    27.8 + *
    27.9 + * FFmpeg is free software; you can redistribute it and/or
   27.10 + * modify it under the terms of the GNU Lesser General Public
   27.11 + * License as published by the Free Software Foundation; either
   27.12 + * version 2.1 of the License, or (at your option) any later version.
   27.13 + *
   27.14 + * FFmpeg is distributed in the hope that it will be useful,
   27.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   27.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   27.17 + * Lesser General Public License for more details.
   27.18 + *
   27.19 + * You should have received a copy of the GNU Lesser General Public
   27.20 + * License along with FFmpeg; if not, write to the Free Software
   27.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   27.22 + */
   27.23 +
   27.24 +#include <stdint.h>
   27.25 +
   27.26 +#include "libavcodec/h264pred.h"
   27.27 +
   27.28 +void ff_pred16x16_vert_neon(uint8_t *src, int stride);    /* 16x16 predictors; bodies are the exported functions in h264pred_neon.S */
   27.29 +void ff_pred16x16_hor_neon(uint8_t *src, int stride);
   27.30 +void ff_pred16x16_plane_neon(uint8_t *src, int stride);
   27.31 +void ff_pred16x16_dc_neon(uint8_t *src, int stride);
   27.32 +void ff_pred16x16_128_dc_neon(uint8_t *src, int stride);
   27.33 +void ff_pred16x16_left_dc_neon(uint8_t *src, int stride);
   27.34 +void ff_pred16x16_top_dc_neon(uint8_t *src, int stride);
   27.35 +
   27.36 +void ff_pred8x8_vert_neon(uint8_t *src, int stride);      /* 8x8 predictors; bodies are the exported functions in h264pred_neon.S */
   27.37 +void ff_pred8x8_hor_neon(uint8_t *src, int stride);
   27.38 +void ff_pred8x8_plane_neon(uint8_t *src, int stride);
   27.39 +void ff_pred8x8_dc_neon(uint8_t *src, int stride);
   27.40 +void ff_pred8x8_128_dc_neon(uint8_t *src, int stride);
   27.41 +void ff_pred8x8_left_dc_neon(uint8_t *src, int stride);
   27.42 +void ff_pred8x8_top_dc_neon(uint8_t *src, int stride);
   27.43 +void ff_pred8x8_l0t_dc_neon(uint8_t *src, int stride);    /* dc variants that read only part of the left and top neighbours */
   27.44 +void ff_pred8x8_0lt_dc_neon(uint8_t *src, int stride);
   27.45 +void ff_pred8x8_l00_dc_neon(uint8_t *src, int stride);
   27.46 +void ff_pred8x8_0l0_dc_neon(uint8_t *src, int stride);
   27.47 +
   27.48 +#if HAVE_NEON
   27.49 +/* Point the pred8x8[] and pred16x16[] slots listed below at their NEON routines. */
   27.50 +static void ff_h264_pred_init_neon(H264PredContext *h)
   27.51 +{
   27.52 +    h->pred8x8[VERT_PRED8x8     ] = ff_pred8x8_vert_neon;
   27.53 +    h->pred8x8[HOR_PRED8x8      ] = ff_pred8x8_hor_neon;
   27.54 +    h->pred8x8[PLANE_PRED8x8    ] = ff_pred8x8_plane_neon;
   27.55 +    h->pred8x8[DC_128_PRED8x8   ] = ff_pred8x8_128_dc_neon;
   27.56 +    h->pred8x8[DC_PRED8x8       ] = ff_pred8x8_dc_neon;
   27.57 +    h->pred8x8[LEFT_DC_PRED8x8  ] = ff_pred8x8_left_dc_neon;
   27.58 +    h->pred8x8[TOP_DC_PRED8x8   ] = ff_pred8x8_top_dc_neon;
   27.59 +    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
   27.60 +    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
   27.61 +    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
   27.62 +    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
   27.63 +    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;   /* the 16x16 table is indexed with the *_PRED8x8 constants too */
   27.64 +    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
   27.65 +    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
   27.66 +    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
   27.67 +    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
   27.68 +    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
   27.69 +    h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
   27.70 +}
   27.71 +#endif
   27.72 +/* ARM entry point: installs the NEON predictors when built with HAVE_NEON, otherwise leaves h untouched. */
   27.73 +void ff_h264_pred_init_arm(H264PredContext *h)
   27.74 +{
   27.75 +#if HAVE_NEON   /* same guard as the helper definition, so a non-NEON build never references an undeclared function */
   27.76 +    ff_h264_pred_init_neon(h);
   27.77 +#endif
   27.78 +}

    28.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    28.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/h264pred_neon.S	Mon Aug 27 12:09:56 2012 +0200
    28.3 @@ -0,0 +1,362 @@
    28.4 +/*
    28.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    28.6 + *
    28.7 + * This file is part of FFmpeg.
    28.8 + *
    28.9 + * FFmpeg is free software; you can redistribute it and/or
   28.10 + * modify it under the terms of the GNU Lesser General Public
   28.11 + * License as published by the Free Software Foundation; either
   28.12 + * version 2.1 of the License, or (at your option) any later version.
   28.13 + *
   28.14 + * FFmpeg is distributed in the hope that it will be useful,
   28.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   28.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   28.17 + * Lesser General Public License for more details.
   28.18 + *
   28.19 + * You should have received a copy of the GNU Lesser General Public
   28.20 + * License along with FFmpeg; if not, write to the Free Software
   28.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   28.22 + */
   28.23 +
   28.24 +#include "asm.S"
   28.25 +
   28.26 +        .macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0    @ gathers bytes into lanes of rd, one byte per load, adding rt to rs after each load
   28.27 +.if \n == 8 || \hi == 0
   28.28 +        vld1.8          {\rd[0]}, [\rs], \rt         @ lanes 0-3: loaded when n is 8 or hi is 0
   28.29 +        vld1.8          {\rd[1]}, [\rs], \rt
   28.30 +        vld1.8          {\rd[2]}, [\rs], \rt
   28.31 +        vld1.8          {\rd[3]}, [\rs], \rt
   28.32 +.endif
   28.33 +.if \n == 8 || \hi == 1
   28.34 +        vld1.8          {\rd[4]}, [\rs], \rt         @ lanes 4-7: loaded when n is 8 or hi is 1
   28.35 +        vld1.8          {\rd[5]}, [\rs], \rt
   28.36 +        vld1.8          {\rd[6]}, [\rs], \rt
   28.37 +        vld1.8          {\rd[7]}, [\rs], \rt
   28.38 +.endif
   28.39 +        .endm
   28.40 +
   28.41 +        .macro add16x8  dq,  dl,  dh,  rl,  rh       @ sums the 16 bytes held in rl and rh; callers pass dl and dh as the two halves of dq
   28.42 +        vaddl.u8        \dq, \rl, \rh                @ eight 16-bit sums of byte pairs
   28.43 +        vadd.u16        \dl, \dl, \dh                @ fold to four sums
   28.44 +        vpadd.u16       \dl, \dl, \dl                @ fold to two
   28.45 +        vpadd.u16       \dl, \dl, \dl                @ every lane of dl now holds the total
   28.46 +        .endm
   28.47 +
   28.48 +function ff_pred16x16_128_dc_neon, export=1      @ fills the 16x16 block at r0 (row step r1) with the byte 128
   28.49 +        vmov.i8         q0,  #128
   28.50 +        b               .L_pred16x16_dc_end      @ shared store loop inside ff_pred16x16_dc_neon
   28.51 +endfunc
   28.52 +
   28.53 +function ff_pred16x16_top_dc_neon, export=1      @ fills the 16x16 block with the rounded mean of the 16 bytes in the row above it
   28.54 +        sub             r2,  r0,  r1             @ r2 = row above the block
   28.55 +        vld1.8          {q0},     [r2,:128]
   28.56 +        add16x8         q0,  d0,  d1,  d0,  d1   @ d0 lanes = sum of the 16 bytes
   28.57 +        vrshrn.u16      d0,  q0,  #4             @ rounding divide by 16
   28.58 +        vdup.8          q0,  d0[0]
   28.59 +        b               .L_pred16x16_dc_end      @ shared store loop inside ff_pred16x16_dc_neon
   28.60 +endfunc
   28.61 +
   28.62 +function ff_pred16x16_left_dc_neon, export=1     @ fills the 16x16 block with the rounded mean of the 16 bytes in the column left of it
   28.63 +        sub             r2,  r0,  #1             @ r2 = byte left of the first row
   28.64 +        ldcol.8         d0,  r2,  r1             @ rows 0-7 of that column
   28.65 +        ldcol.8         d1,  r2,  r1             @ rows 8-15
   28.66 +        add16x8         q0,  d0,  d1,  d0,  d1   @ d0 lanes = sum of the 16 bytes
   28.67 +        vrshrn.u16      d0,  q0,  #4             @ rounding divide by 16
   28.68 +        vdup.8          q0,  d0[0]
   28.69 +        b               .L_pred16x16_dc_end      @ shared store loop inside ff_pred16x16_dc_neon
   28.70 +endfunc
   28.71 +
   28.72 +function ff_pred16x16_dc_neon, export=1          @ fills the 16x16 block with the rounded mean of the row above and the column to the left (32 bytes)
   28.73 +        sub             r2,  r0,  r1             @ row above the block
   28.74 +        vld1.8          {q0},     [r2,:128]
   28.75 +        sub             r2,  r0,  #1             @ column left of the block
   28.76 +        ldcol.8         d2,  r2,  r1
   28.77 +        ldcol.8         d3,  r2,  r1
   28.78 +        vaddl.u8        q0,  d0,  d1             @ pairwise 16-bit sums of the row
   28.79 +        vaddl.u8        q1,  d2,  d3             @ and of the column
   28.80 +        vadd.u16        q0,  q0,  q1
   28.81 +        vadd.u16        d0,  d0,  d1
   28.82 +        vpadd.u16       d0,  d0,  d0
   28.83 +        vpadd.u16       d0,  d0,  d0             @ every lane of d0 = sum of all 32 bytes
   28.84 +        vrshrn.u16      d0,  q0,  #5             @ rounding divide by 32
   28.85 +        vdup.8          q0,  d0[0]
   28.86 +.L_pred16x16_dc_end:                             @ shared tail: the other 16x16 dc functions branch here with q0 set
   28.87 +        mov             r3,  #8                  @ 8 passes of two rows
   28.88 +6:      vst1.8          {q0},     [r0,:128], r1
   28.89 +        vst1.8          {q0},     [r0,:128], r1
   28.90 +        subs            r3,  r3,  #1
   28.91 +        bne             6b
   28.92 +        bx              lr
   28.93 +endfunc
   28.94 +
   28.95 +function ff_pred16x16_hor_neon, export=1         @ fills each of the 16 rows with the byte immediately left of that row
   28.96 +        sub             r2,  r0,  #1             @ r2 walks the column left of the block
   28.97 +        mov             r3,  #16
   28.98 +1:      vld1.8          {d0[],d1[]},[r2],      r1   @ one byte replicated to all 16 lanes
   28.99 +        vst1.8          {q0},       [r0,:128], r1
  28.100 +        subs            r3,  r3,  #1
  28.101 +        bne             1b
  28.102 +        bx              lr
  28.103 +endfunc
  28.104 +
  28.105 +function ff_pred16x16_vert_neon, export=1        @ copies the 16-byte row above the block into each of its 16 rows
  28.106 +        sub             r0,  r0,  r1             @ step back to the row above
  28.107 +        vld1.8          {q0},     [r0,:128], r1  @ load it; r0 is back on the first block row afterwards
  28.108 +        mov             r3,  #8                  @ 8 passes of two rows
  28.109 +1:      vst1.8          {q0},     [r0,:128], r1
  28.110 +        vst1.8          {q0},     [r0,:128], r1
  28.111 +        subs            r3,  r3,  #1
  28.112 +        bne             1b
  28.113 +        bx              lr
  28.114 +endfunc
  28.115 +
  28.116 +function ff_pred16x16_plane_neon, export=1       @ 16x16 plane prediction: fits a base value and x and y gradients to the row above and column left, then writes saturated (base + x*b + y*c) >> 5
  28.117 +        sub             r3,  r0,  r1             @ row above the block
  28.118 +        add             r2,  r3,  #8
  28.119 +        sub             r3,  r3,  #1             @ r3 = corner byte above-left of the block
  28.120 +        vld1.8          {d0},     [r3]           @ above row, x = -1..6
  28.121 +        vld1.8          {d2},     [r2,:64], r1   @ above row, x = 8..15
  28.122 +        ldcol.8         d1,  r3,  r1             @ left column, y = -1..6
  28.123 +        add             r3,  r3,  r1             @ skip y = 7
  28.124 +        ldcol.8         d3,  r3,  r1             @ left column, y = 8..15
  28.125 +        vrev64.8        q0,  q0                  @ reverse the near halves so lane i mirrors lane i of the far halves
  28.126 +        vaddl.u8        q8,  d2,  d3             @ last lane of d17 = above[15] + left[15]
  28.127 +        vsubl.u8        q2,  d2,  d0             @ mirrored differences along the row
  28.128 +        vsubl.u8        q3,  d3,  d1             @ mirrored differences along the column
  28.129 +        movrel          r3,  p16weight
  28.130 +        vld1.8          {q0},     [r3,:128]      @ weights 1..8
  28.131 +        vmul.s16        q2,  q2,  q0
  28.132 +        vmul.s16        q3,  q3,  q0
  28.133 +        vadd.i16        d4,  d4,  d5
  28.134 +        vadd.i16        d5,  d6,  d7
  28.135 +        vpadd.i16       d4,  d4,  d5
  28.136 +        vpadd.i16       d4,  d4,  d4             @ d4 = weighted sums: row sum in lane 0, column sum in lane 1
  28.137 +        vshl.i16        d5,  d4,  #2
  28.138 +        vaddl.s16       q2,  d4,  d5             @ 5 * sum, widened to 32 bit
  28.139 +        vrshrn.s32      d4,  q2,  #6             @ (5 * sum + 32) >> 6: gradients b (lane 0) and c (lane 1)
  28.140 +        mov             r3,  #0
  28.141 +        vtrn.16         d4,  d5                  @ d4[0] = b, d5[0] = c
  28.142 +        vadd.i16        d2,  d4,  d5             @ b + c
  28.143 +        vshl.i16        d3,  d2,  #3
  28.144 +        vrev64.16       d16, d17                 @ d16[0] = above[15] + left[15]
  28.145 +        vsub.i16        d3,  d3,  d2             @ 7 * (b + c)
  28.146 +        vadd.i16        d16, d16, d0             @ d0[0] is the weight 1, so lane 0 gets + 1
  28.147 +        vshl.i16        d2,  d16, #4
  28.148 +        vsub.i16        d2,  d2,  d3             @ lane 0 = 16 * (above[15] + left[15] + 1) - 7 * (b + c): value at x = 0, y = 0
  28.149 +        vshl.i16        d3,  d4,  #4             @ 16 * b
  28.150 +        vext.16         q0,  q0,  q0,  #7        @ rotate the weights to 8,1,2..7
  28.151 +        vsub.i16        d6,  d5,  d3             @ c - 16 * b
  28.152 +        vmov.16         d0[0], r3                @ weights become 0,1,2..7
  28.153 +        vmul.i16        q0,  q0,  d4[0]          @ x * b for x = 0..7
  28.154 +        vdup.16         q1,  d2[0]
  28.155 +        vdup.16         q2,  d4[0]
  28.156 +        vdup.16         q3,  d6[0]
  28.157 +        vshl.i16        q2,  q2,  #3             @ q2 = 8 * b: step from the left to the right half of a row
  28.158 +        vadd.i16        q1,  q1,  q0             @ q1 = first row, x = 0..7
  28.159 +        vadd.i16        q3,  q3,  q2             @ q3 = c - 8 * b: step from the right half to the next row start
  28.160 +        mov             r3,  #16                 @ 16 rows
  28.161 +1:
  28.162 +        vqshrun.s16     d0,  q1,  #5             @ left half: shift and saturate to unsigned 8 bit
  28.163 +        vadd.i16        q1,  q1,  q2
  28.164 +        vqshrun.s16     d1,  q1,  #5             @ right half
  28.165 +        vadd.i16        q1,  q1,  q3
  28.166 +        vst1.8          {q0},     [r0,:128], r1
  28.167 +        subs            r3,  r3,  #1
  28.168 +        bne             1b
  28.169 +        bx              lr
  28.170 +endfunc
  28.171 +
  28.172 +        .section        .rodata
  28.173 +        .align          4                        @ the plane functions load this table with a :128 hint; on ARM gas .align 4 should mean 2^4 bytes (confirm for the target assembler)
  28.174 +p16weight:                                       @ 16-bit weights 1..8 multiplied into the mirrored differences by the two plane predictors
  28.175 +        .short          1,2,3,4,5,6,7,8
  28.176 +
  28.177 +        .text
  28.178 +
  28.179 +function ff_pred8x8_hor_neon, export=1           @ fills each of the 8 rows with the byte immediately left of that row
  28.180 +        sub             r2,  r0,  #1             @ r2 walks the column left of the block
  28.181 +        mov             r3,  #8
  28.182 +1:      vld1.8          {d0[]},   [r2],     r1   @ one byte replicated to all 8 lanes
  28.183 +        vst1.8          {d0},     [r0,:64], r1
  28.184 +        subs            r3,  r3,  #1
  28.185 +        bne             1b
  28.186 +        bx              lr
  28.187 +endfunc
  28.188 +
  28.189 +function ff_pred8x8_vert_neon, export=1          @ copies the 8-byte row above the block into each of its 8 rows
  28.190 +        sub             r0,  r0,  r1             @ step back to the row above
  28.191 +        vld1.8          {d0},     [r0,:64], r1   @ load it; r0 is back on the first block row afterwards
  28.192 +        mov             r3,  #4                  @ 4 passes of two rows
  28.193 +1:      vst1.8          {d0},     [r0,:64], r1
  28.194 +        vst1.8          {d0},     [r0,:64], r1
  28.195 +        subs            r3,  r3,  #1
  28.196 +        bne             1b
  28.197 +        bx              lr
  28.198 +endfunc
  28.199 +
  28.200 +function ff_pred8x8_plane_neon, export=1         @ 8x8 plane prediction: same scheme as the 16x16 version, with four mirrored pairs per direction
  28.201 +        sub             r3,  r0,  r1             @ row above the block
  28.202 +        add             r2,  r3,  #4
  28.203 +        sub             r3,  r3,  #1             @ r3 = corner byte above-left of the block
  28.204 +        vld1.32         {d0[0]},  [r3]           @ above row, x = -1..2, in the low half of d0
  28.205 +        vld1.32         {d2[0]},  [r2,:32], r1   @ above row, x = 4..7
  28.206 +        ldcol.8         d0,  r3,  r1,  4,  hi=1  @ left column, y = -1..2, in the high half of d0
  28.207 +        add             r3,  r3,  r1             @ skip y = 3
  28.208 +        ldcol.8         d3,  r3,  r1,  4         @ left column, y = 4..7
  28.209 +        vaddl.u8        q8,  d2,  d3             @ d16[3] = above[7] + left[7]
  28.210 +        vrev32.8        d0,  d0                  @ reverse each near half so it mirrors the far half
  28.211 +        vtrn.32         d2,  d3                  @ d2 = far row bytes, far column bytes
  28.212 +        vsubl.u8        q2,  d2,  d0             @ mirrored differences: row in d4, column in d5
  28.213 +        movrel          r3,  p16weight
  28.214 +        vld1.16         {q0},     [r3,:128]      @ weights; only 1..4 in d0 are used as multipliers
  28.215 +        vmul.s16        d4,  d4,  d0
  28.216 +        vmul.s16        d5,  d5,  d0
  28.217 +        vpadd.i16       d4,  d4,  d5
  28.218 +        vpaddl.s16      d4,  d4                  @ two 32-bit weighted sums: row, column
  28.219 +        vshl.i32        d5,  d4,  #4
  28.220 +        vadd.s32        d4,  d4,  d5             @ 17 * sum
  28.221 +        vrshrn.s32      d4,  q2,  #5             @ (17 * sum + 16) >> 5: gradients b (lane 0) and c (lane 1)
  28.222 +        mov             r3,  #0
  28.223 +        vtrn.16         d4,  d5                  @ d4[0] = b, d5[0] = c
  28.224 +        vadd.i16        d2,  d4,  d5             @ b + c
  28.225 +        vshl.i16        d3,  d2,  #2
  28.226 +        vrev64.16       d16, d16                 @ d16[0] = above[7] + left[7]
  28.227 +        vsub.i16        d3,  d3,  d2             @ 3 * (b + c)
  28.228 +        vadd.i16        d16, d16, d0             @ d0[0] is the weight 1, so lane 0 gets + 1
  28.229 +        vshl.i16        d2,  d16, #4
  28.230 +        vsub.i16        d2,  d2,  d3             @ lane 0 = 16 * (above[7] + left[7] + 1) - 3 * (b + c): value at x = 0, y = 0
  28.231 +        vshl.i16        d3,  d4,  #3             @ 8 * b
  28.232 +        vext.16         q0,  q0,  q0,  #7        @ rotate the weights to 8,1,2..7
  28.233 +        vsub.i16        d6,  d5,  d3             @ c - 8 * b
  28.234 +        vmov.16         d0[0], r3                @ weights become 0,1,2..7
  28.235 +        vmul.i16        q0,  q0,  d4[0]          @ x * b for x = 0..7
  28.236 +        vdup.16         q1,  d2[0]
  28.237 +        vdup.16         q2,  d4[0]
  28.238 +        vdup.16         q3,  d6[0]
  28.239 +        vshl.i16        q2,  q2,  #3             @ 8 * b
  28.240 +        vadd.i16        q1,  q1,  q0             @ q1 = first row, x = 0..7
  28.241 +        vadd.i16        q3,  q3,  q2             @ q3 = c: step to the next row
  28.242 +        mov             r3,  #8                  @ 8 rows
  28.243 +1:
  28.244 +        vqshrun.s16     d0,  q1,  #5             @ shift and saturate to unsigned 8 bit
  28.245 +        vadd.i16        q1,  q1,  q3
  28.246 +        vst1.8          {d0},     [r0,:64], r1
  28.247 +        subs            r3,  r3,  #1
  28.248 +        bne             1b
  28.249 +        bx              lr
  28.250 +endfunc
  28.251 +
  28.252 +function ff_pred8x8_128_dc_neon, export=1        @ fills the 8x8 block at r0 (row step r1) with the byte 128
  28.253 +        vmov.i8         q0,  #128                @ sets both d0 (rows 0-3) and d1 (rows 4-7)
  28.254 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.255 +endfunc
  28.256 +
  28.257 +function ff_pred8x8_top_dc_neon, export=1        @ every row: left 4 bytes = mean of above[0..3], right 4 bytes = mean of above[4..7]
  28.258 +        sub             r2,  r0,  r1             @ row above the block
  28.259 +        vld1.8          {d0},     [r2,:64]
  28.260 +        vpaddl.u8       d0,  d0
  28.261 +        vpadd.u16       d0,  d0,  d0             @ lane 0 = sum of bytes 0-3, lane 1 = sum of bytes 4-7
  28.262 +        vrshrn.u16      d0,  q0,  #2             @ rounding divide by 4; only the bytes taken from d0 are used
  28.263 +        vdup.8          d1,  d0[1]
  28.264 +        vdup.8          d0,  d0[0]
  28.265 +        vtrn.32         d0,  d1                  @ d0 = d1 = left mean x4, right mean x4
  28.266 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.267 +endfunc
  28.268 +
  28.269 +function ff_pred8x8_left_dc_neon, export=1       @ rows 0-3 = mean of left[0..3], rows 4-7 = mean of left[4..7]
  28.270 +        sub             r2,  r0,  #1             @ column left of the block
  28.271 +        ldcol.8         d0,  r2,  r1
  28.272 +        vpaddl.u8       d0,  d0
  28.273 +        vpadd.u16       d0,  d0,  d0             @ lane 0 = sum of rows 0-3, lane 1 = sum of rows 4-7
  28.274 +        vrshrn.u16      d0,  q0,  #2             @ rounding divide by 4; only the bytes taken from d0 are used
  28.275 +        vdup.8          d1,  d0[1]               @ d1 is stored to rows 4-7
  28.276 +        vdup.8          d0,  d0[0]               @ d0 is stored to rows 0-3
  28.277 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.278 +endfunc
  28.279 +
  28.280 +function ff_pred8x8_dc_neon, export=1            @ per-4x4-quadrant dc prediction from the row above and the column to the left
  28.281 +        sub             r2,  r0,  r1             @ row above the block
  28.282 +        vld1.8          {d0},     [r2,:64]
  28.283 +        sub             r2,  r0,  #1             @ column left of the block
  28.284 +        ldcol.8         d1,  r2,  r1
  28.285 +        vtrn.32         d0,  d1                  @ d0 = above[0..3], left[0..3]; d1 = above[4..7], left[4..7]
  28.286 +        vpaddl.u8       q0,  q0
  28.287 +        vpadd.u16       d0,  d0,  d1             @ four sums: above 0-3, left 0-3, above 4-7, left 4-7
  28.288 +        vpadd.u16       d1,  d0,  d0             @ above 0-3 + left 0-3, above 4-7 + left 4-7
  28.289 +        vrshrn.u16      d2,  q0,  #3             @ bytes 4 and 5: means over 8 bytes
  28.290 +        vrshrn.u16      d3,  q0,  #2             @ bytes 0-3: means over 4 bytes
  28.291 +        vdup.8          d0,  d2[4]               @ top-left quadrant: above 0-3 with left 0-3
  28.292 +        vdup.8          d1,  d3[3]               @ bottom-left: left 4-7
  28.293 +        vdup.8          d4,  d3[2]               @ top-right: above 4-7
  28.294 +        vdup.8          d5,  d2[5]               @ bottom-right: above 4-7 with left 4-7
  28.295 +        vtrn.32         q0,  q2                  @ d0 = top-left, top-right; d1 = bottom-left, bottom-right
  28.296 +.L_pred8x8_dc_end:                               @ shared tail: the other 8x8 dc functions branch here with d0 and d1 set
  28.297 +        mov             r3,  #4
  28.298 +        add             r2,  r0,  r1,  lsl #2    @ r2 = row 4
  28.299 +6:      vst1.8          {d0},     [r0,:64], r1   @ d0 goes to rows 0-3
  28.300 +        vst1.8          {d1},     [r2,:64], r1   @ d1 goes to rows 4-7
  28.301 +        subs            r3,  r3,  #1
  28.302 +        bne             6b
  28.303 +        bx              lr
  28.304 +endfunc
  28.305 +
  28.306 +function ff_pred8x8_l0t_dc_neon, export=1        @ dc variant that reads the row above and only rows 0-3 of the left column
  28.307 +        sub             r2,  r0,  r1             @ row above the block
  28.308 +        vld1.8          {d0},     [r2,:64]
  28.309 +        sub             r2,  r0,  #1             @ column left of the block
  28.310 +        ldcol.8         d1,  r2,  r1,  4         @ left[0..3] into lanes 0-3; lanes 4-7 of d1 are not loaded
  28.311 +        vtrn.32         d0,  d1                  @ d0 = above[0..3], left[0..3]; d1 low half = above[4..7]
  28.312 +        vpaddl.u8       q0,  q0
  28.313 +        vpadd.u16       d0,  d0,  d1             @ lanes 0-2: sums of above 0-3, left 0-3, above 4-7
  28.314 +        vpadd.u16       d1,  d0,  d0             @ lane 0: above 0-3 + left 0-3
  28.315 +        vrshrn.u16      d2,  q0,  #3
  28.316 +        vrshrn.u16      d3,  q0,  #2
  28.317 +        vdup.8          d0,  d2[4]               @ top-left quadrant: above 0-3 with left 0-3
  28.318 +        vdup.8          d1,  d3[0]               @ bottom-left: above 0-3
  28.319 +        vdup.8          q2,  d3[2]               @ both right quadrants: above 4-7
  28.320 +        vtrn.32         q0,  q2                  @ d0 = top-left, top-right; d1 = bottom-left, bottom-right
  28.321 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.322 +endfunc
  28.323 +
  28.324 +function ff_pred8x8_l00_dc_neon, export=1        @ rows 0-3 = mean of left[0..3], rows 4-7 = 128
  28.325 +        sub             r2,  r0,  #1             @ column left of the block
  28.326 +        ldcol.8         d0,  r2,  r1,  4         @ left[0..3] into lanes 0-3
  28.327 +        vpaddl.u8       d0,  d0
  28.328 +        vpadd.u16       d0,  d0,  d0             @ lane 0 = sum of the four bytes
  28.329 +        vrshrn.u16      d0,  q0,  #2             @ rounding divide by 4; only byte 0 is used
  28.330 +        vmov.i8         d1,  #128                @ d1 is stored to rows 4-7
  28.331 +        vdup.8          d0,  d0[0]               @ d0 is stored to rows 0-3
  28.332 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.333 +endfunc
  28.334 +
  28.335 +function ff_pred8x8_0lt_dc_neon, export=1        @ dc variant that reads the row above and only rows 4-7 of the left column
  28.336 +        sub             r2,  r0,  r1             @ row above the block
  28.337 +        vld1.8          {d0},     [r2,:64]
  28.338 +        add             r2,  r0,  r1,  lsl #2    @ row 4
  28.339 +        sub             r2,  r2,  #1             @ byte left of row 4
  28.340 +        ldcol.8         d1,  r2,  r1,  4,  hi=1  @ left[4..7] into lanes 4-7; lanes 0-3 of d1 are not loaded
  28.341 +        vtrn.32         d0,  d1                  @ d0 low half = above[0..3]; d1 = above[4..7], left[4..7]
  28.342 +        vpaddl.u8       q0,  q0
  28.343 +        vpadd.u16       d0,  d0,  d1             @ lanes 0, 2, 3: sums of above 0-3, above 4-7, left 4-7
  28.344 +        vpadd.u16       d1,  d0,  d0             @ lane 1: above 4-7 + left 4-7
  28.345 +        vrshrn.u16      d3,  q0,  #2
  28.346 +        vrshrn.u16      d2,  q0,  #3
  28.347 +        vdup.8          d0,  d3[0]               @ top-left quadrant: above 0-3
  28.348 +        vdup.8          d1,  d3[3]               @ bottom-left: left 4-7
  28.349 +        vdup.8          d4,  d3[2]               @ top-right: above 4-7
  28.350 +        vdup.8          d5,  d2[5]               @ bottom-right: above 4-7 with left 4-7
  28.351 +        vtrn.32         q0,  q2                  @ d0 = top-left, top-right; d1 = bottom-left, bottom-right
  28.352 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.353 +endfunc
  28.354 +
  28.355 +function ff_pred8x8_0l0_dc_neon, export=1        @ rows 0-3 = 128, rows 4-7 = mean of left[4..7]
  28.356 +        add             r2,  r0,  r1,  lsl #2    @ row 4
  28.357 +        sub             r2,  r2,  #1             @ byte left of row 4
  28.358 +        ldcol.8         d1,  r2,  r1,  4         @ left[4..7] into lanes 0-3 of d1
  28.359 +        vpaddl.u8       d2,  d1
  28.360 +        vpadd.u16       d2,  d2,  d2             @ lane 0 = sum of the four bytes
  28.361 +        vrshrn.u16      d1,  q1,  #2             @ rounding divide by 4; only byte 0 is used
  28.362 +        vmov.i8         d0,  #128                @ d0 is stored to rows 0-3
  28.363 +        vdup.8          d1,  d1[0]               @ d1 is stored to rows 4-7
  28.364 +        b               .L_pred8x8_dc_end        @ shared store loop inside ff_pred8x8_dc_neon
  28.365 +endfunc

    29.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    29.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/int_neon.S	Mon Aug 27 12:09:56 2012 +0200
    29.3 @@ -0,0 +1,118 @@
    29.4 +/*
    29.5 + * ARM NEON optimised integer operations
    29.6 + * Copyright (c) 2009 Kostya Shishkov
    29.7 + *
    29.8 + * This file is part of FFmpeg.
    29.9 + *
   29.10 + * FFmpeg is free software; you can redistribute it and/or
   29.11 + * modify it under the terms of the GNU Lesser General Public
   29.12 + * License as published by the Free Software Foundation; either
   29.13 + * version 2.1 of the License, or (at your option) any later version.
   29.14 + *
   29.15 + * FFmpeg is distributed in the hope that it will be useful,
   29.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   29.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   29.18 + * Lesser General Public License for more details.
   29.19 + *
   29.20 + * You should have received a copy of the GNU Lesser General Public
   29.21 + * License along with FFmpeg; if not, write to the Free Software
   29.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   29.23 + */
   29.24 +
   29.25 +#include "asm.S"
   29.26 +
   29.27 +        preserve8
   29.28 +        .fpu neon
   29.29 +        .text
   29.30 +
   29.31 +function ff_scalarproduct_int16_neon, export=1  @ r0, r1 = int16 vectors (r1 16-byte aligned), r2 = element count, r3 = right shift; result in r0
   29.32 +        vmov.i16        q0,  #0                 @ q0-q3: four independent accumulators of 4 x 32 bit
   29.33 +        vmov.i16        q1,  #0
   29.34 +        vmov.i16        q2,  #0
   29.35 +        vmov.i16        q3,  #0
   29.36 +        negs            r3,  r3                 @ vshl with a negative count shifts right
   29.37 +        beq             2f                      @ shift == 0: take the multiply-accumulate loop
   29.38 +
   29.39 +        vdup.s32        q12, r3                 @ q12 = -shift in every lane; must stay live for the whole loop
   29.40 +1:      vld1.16         {d16-d17}, [r0]!        @ loop consumes 16 elements per pass; r2 must be a non-zero multiple of 16
   29.41 +        vld1.16         {d20-d21}, [r1,:128]!
   29.42 +        vmull.s16       q13, d16,  d20          @ products go to q13-q15 and q8 so that q12 is never overwritten
   29.43 +        vld1.16         {d18-d19}, [r0]!
   29.44 +        vmull.s16       q14, d17,  d21
   29.45 +        vld1.16         {d22-d23}, [r1,:128]!
   29.46 +        vmull.s16       q15, d18,  d22
   29.47 +        vmull.s16       q8,  d19,  d23          @ q8 is free again: d16/d17 were consumed above
   29.48 +        vshl.s32        q13, q13,  q12          @ product >> shift
   29.49 +        vshl.s32        q14, q14,  q12
   29.50 +        vadd.s32        q0,  q0,   q13
   29.51 +        vshl.s32        q15, q15,  q12
   29.52 +        vadd.s32        q1,  q1,   q14
   29.53 +        vshl.s32        q8,  q8,   q12
   29.54 +        vadd.s32        q2,  q2,   q15
   29.55 +        vadd.s32        q3,  q3,   q8
   29.56 +        subs            r2,  r2,   #16
   29.57 +        bne             1b
   29.58 +        b               3f
   29.59 +
   29.60 +2:      vld1.16         {d16-d17}, [r0]!        @ unshifted variant: accumulate the 32-bit products directly
   29.61 +        vld1.16         {d20-d21}, [r1,:128]!
   29.62 +        vmlal.s16       q0,  d16,  d20
   29.63 +        vld1.16         {d18-d19}, [r0]!
   29.64 +        vmlal.s16       q1,  d17,  d21
   29.65 +        vld1.16         {d22-d23}, [r1,:128]!
   29.66 +        vmlal.s16       q2,  d18,  d22
   29.67 +        vmlal.s16       q3,  d19,  d23
   29.68 +        subs            r2,  r2,   #16
   29.69 +        bne             2b
   29.70 +
   29.71 +3:      vpadd.s32       d16, d0,   d1           @ horizontal reduction of the 16 partial sums
   29.72 +        vpadd.s32       d17, d2,   d3
   29.73 +        vpadd.s32       d18, d4,   d5           @ scratch kept in d16-d19: d8-d15 are callee-saved and are not saved here
   29.74 +        vpadd.s32       d19, d6,   d7
   29.75 +        vpadd.s32       d0,  d16,  d17
   29.76 +        vpadd.s32       d1,  d18,  d19
   29.77 +        vpadd.s32       d2,  d0,   d1
   29.78 +        vpaddl.s32      d3,  d2                 @ final pair summed into a 64-bit lane
   29.79 +        vmov.32         r0,  d3[0]              @ return the low 32 bits
   29.80 +        bx              lr
   29.81 +endfunc
   29.82 +
   29.83 +@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul): returns sum(v0[i]*v1[i]) and updates v0[i] += v2[i]*mul
   29.84 +function ff_scalarproduct_and_madd_int16_neon, export=1 @ r0 = v0, r1 = v1, r2 = v2, r3 = order, mul on the stack
   29.85 +        vld1.16         {d28[],d29[]}, [sp]     @ q14 = mul replicated into all eight 16-bit lanes
   29.86 +        vmov.i16        q0,  #0                 @ q0-q3: four independent accumulators of 4 x 32 bit
   29.87 +        vmov.i16        q1,  #0
   29.88 +        vmov.i16        q2,  #0
   29.89 +        vmov.i16        q3,  #0
   29.90 +        mov             r12, r0                 @ r12 = write pointer into v0 (r0 is the read pointer)
   29.91 +
   29.92 +1:      vld1.16         {d16-d17}, [r0,:128]!   @ loop consumes 16 elements per pass; r3 must be a non-zero multiple of 16
   29.93 +        vld1.16         {d18-d19}, [r1]!
   29.94 +        vld1.16         {d20-d21}, [r2]!
   29.95 +        vld1.16         {d22-d23}, [r0,:128]!
   29.96 +        vld1.16         {d24-d25}, [r1]!
   29.97 +        vld1.16         {d26-d27}, [r2]!
   29.98 +        vmul.s16        q10, q10,  q14          @ v2 * mul (16-bit result)
   29.99 +        vmul.s16        q13, q13,  q14
  29.100 +        vmlal.s16       q0,  d16,  d18          @ accumulate v0 * v1 using the v0 values read before the update
  29.101 +        vmlal.s16       q1,  d17,  d19
  29.102 +        vadd.s16        q10, q8,   q10          @ new v0 = v0 + v2 * mul
  29.103 +        vadd.s16        q13, q11,  q13
  29.104 +        vmlal.s16       q2,  d22,  d24
  29.105 +        vmlal.s16       q3,  d23,  d25
  29.106 +        vst1.16         {q10},     [r12,:128]!
  29.107 +        subs            r3,  r3,   #16
  29.108 +        vst1.16         {q13},     [r12,:128]!
  29.109 +        bne             1b
  29.110 +
  29.111 +        vpadd.s32       d16, d0,   d1           @ horizontal reduction of the 16 partial sums
  29.112 +        vpadd.s32       d17, d2,   d3
  29.113 +        vpadd.s32       d18, d4,   d5           @ scratch kept in d16-d19: d8-d15 are callee-saved and are not saved here
  29.114 +        vpadd.s32       d19, d6,   d7
  29.115 +        vpadd.s32       d0,  d16,  d17
  29.116 +        vpadd.s32       d1,  d18,  d19
  29.117 +        vpadd.s32       d2,  d0,   d1
  29.118 +        vpaddl.s32      d3,  d2                 @ final pair summed into a 64-bit lane
  29.119 +        vmov.32         r0,  d3[0]              @ return the low 32 bits
  29.120 +        bx              lr
  29.121 +endfunc

    30.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    30.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/jrevdct_arm.S	Mon Aug 27 12:09:56 2012 +0200
    30.3 @@ -0,0 +1,388 @@
    30.4 +/*
    30.5 +   C-like prototype :
    30.6 +        void j_rev_dct_arm(DCTBLOCK data)
    30.7 +
    30.8 +   With DCTBLOCK being a pointer to an array of 64 'signed shorts'
    30.9 +
   30.10 +   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
   30.11 +
   30.12 +   Permission is hereby granted, free of charge, to any person obtaining a copy
   30.13 +   of this software and associated documentation files (the "Software"), to deal
   30.14 +   in the Software without restriction, including without limitation the rights
   30.15 +   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   30.16 +   copies of the Software, and to permit persons to whom the Software is
   30.17 +   furnished to do so, subject to the following conditions:
   30.18 +
   30.19 +   The above copyright notice and this permission notice shall be included in
   30.20 +   all copies or substantial portions of the Software.
   30.21 +
   30.22 +   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   30.23 +   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   30.24 +   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
   30.25 +   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
   30.26 +   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   30.27 +   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   30.28 +
   30.29 +*/
   30.30 +
   30.31 +#include "asm.S"
   30.32 +
   30.33 +#define FIX_0_298631336 2446     /* each FIX_x_y value is x.y scaled by 2^13 and rounded, e.g. 0.298631336 * 8192 = 2446.4 */
   30.34 +#define FIX_0_541196100 4433
   30.35 +#define FIX_0_765366865 6270
   30.36 +#define FIX_1_175875602 9633
   30.37 +#define FIX_1_501321110 12299
   30.38 +#define FIX_2_053119869 16819
   30.39 +#define FIX_3_072711026 25172
   30.40 +#define FIX_M_0_390180644 -3196  /* the _M_ constants are the negative ones */
   30.41 +#define FIX_M_0_899976223 -7373
   30.42 +#define FIX_M_1_847759065 -15137
   30.43 +#define FIX_M_1_961570560 -16069
   30.44 +#define FIX_M_2_562915447 -20995
   30.45 +#define FIX_0xFFFF 0xFFFF        /* 16-bit mask, stored in the same table as the multipliers */
   30.46 +
   30.47 +#define FIX_0_298631336_ID      0  /* the _ID values are byte offsets of the matching .word in const_array */
   30.48 +#define FIX_0_541196100_ID      4
   30.49 +#define FIX_0_765366865_ID      8
   30.50 +#define FIX_1_175875602_ID     12
   30.51 +#define FIX_1_501321110_ID     16
   30.52 +#define FIX_2_053119869_ID     20
   30.53 +#define FIX_3_072711026_ID     24
   30.54 +#define FIX_M_0_390180644_ID   28
   30.55 +#define FIX_M_0_899976223_ID   32
   30.56 +#define FIX_M_1_847759065_ID   36
   30.57 +#define FIX_M_1_961570560_ID   40
   30.58 +#define FIX_M_2_562915447_ID   44
   30.59 +#define FIX_0xFFFF_ID          48
   30.60 +        .text
   30.61 +        .align
   30.62 +
   30.63 +function ff_j_rev_dct_arm, export=1     @ in-place inverse DCT on the 64 signed 16-bit values at r0: a row pass, then a column pass
   30.64 +        stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
   30.65 +
   30.66 +        sub sp, sp, #4                  @ reserve some space on the stack
   30.67 +        str r0, [ sp ]                  @ save the DCT pointer to the stack
   30.68 +
   30.69 +        mov lr, r0                      @ lr = pointer to the current row
   30.70 +        mov r12, #8                     @ r12 = row-counter
   30.71 +        adr r11, const_array            @ r11 = base pointer to the constants array
   30.72 +row_loop:
   30.73 +        ldrsh r0, [lr, # 0]             @ r0 = 'd0'
   30.74 +        ldrsh r2, [lr, # 2]             @ r2 = 'd2'
   30.75 +
   30.76 +        @ Optimization for row that have all items except the first set to 0
   30.77 +        @ (this works as the DCTELEMS are always 4-byte aligned)
   30.78 +        ldr r5, [lr, # 0]
   30.79 +        ldr r6, [lr, # 4]
   30.80 +        ldr r3, [lr, # 8]
   30.81 +        ldr r4, [lr, #12]
   30.82 +        orr r3, r3, r4
   30.83 +        orr r3, r3, r6                  @ r3 = OR of elements 2..7 of the row
   30.84 +        orrs r5, r3, r5
   30.85 +        beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
   30.86 +        orrs r3, r3, r2                 @ zero only if elements 1..7 are all zero
   30.87 +        beq empty_row
   30.88 +
   30.89 +        ldrsh r1, [lr, # 8]             @ r1 = 'd1'
   30.90 +        ldrsh r4, [lr, # 4]             @ r4 = 'd4'
   30.91 +        ldrsh r6, [lr, # 6]             @ r6 = 'd6'
   30.92 +
   30.93 +        ldr r3, [r11, #FIX_0_541196100_ID]
   30.94 +        add r7, r2, r6
   30.95 +        ldr r5, [r11, #FIX_M_1_847759065_ID]
   30.96 +        mul r7, r3, r7                      @ r7 = z1
   30.97 +        ldr r3, [r11, #FIX_0_765366865_ID]
   30.98 +        mla r6, r5, r6, r7                  @ r6 = tmp2
   30.99 +        add r5, r0, r4                      @ r5 = tmp0
  30.100 +        mla r2, r3, r2, r7                  @ r2 = tmp3
  30.101 +        sub r3, r0, r4                      @ r3 = tmp1
  30.102 +
  30.103 +        add r0, r2, r5, lsl #13             @ r0 = tmp10
  30.104 +        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
  30.105 +        add r4, r6, r3, lsl #13             @ r4 = tmp11
  30.106 +        rsb r3, r6, r3, lsl #13             @ r3 = tmp12
  30.107 +
  30.108 +        stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
  30.109 +
  30.110 +        ldrsh r3, [lr, #10]             @ r3 = 'd3'
  30.111 +        ldrsh r5, [lr, #12]             @ r5 = 'd5'
  30.112 +        ldrsh r7, [lr, #14]             @ r7 = 'd7'
  30.113 +
  30.114 +        add r0, r3, r5                        @ r0 = 'z2'
  30.115 +        add r2, r1, r7                  @ r2 = 'z1'
  30.116 +        add r4, r3, r7                  @ r4 = 'z3'
  30.117 +        add r6, r1, r5                  @ r6 = 'z4'
  30.118 +        ldr r9, [r11, #FIX_1_175875602_ID]
  30.119 +        add r8, r4, r6                  @ r8 = z3 + z4
  30.120 +        ldr r10, [r11, #FIX_M_0_899976223_ID]
  30.121 +        mul r8, r9, r8                  @ r8 = 'z5'
  30.122 +        ldr r9, [r11, #FIX_M_2_562915447_ID]
  30.123 +        mul r2, r10, r2                 @ r2 = 'z1'
  30.124 +        ldr r10, [r11, #FIX_M_1_961570560_ID]
  30.125 +        mul r0, r9, r0                  @ r0 = 'z2'
  30.126 +        ldr r9, [r11, #FIX_M_0_390180644_ID]
  30.127 +        mla r4, r10, r4, r8             @ r4 = 'z3'
  30.128 +        ldr r10, [r11, #FIX_0_298631336_ID]
  30.129 +        mla r6, r9, r6, r8              @ r6 = 'z4'
  30.130 +        ldr r9, [r11, #FIX_2_053119869_ID]
  30.131 +        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
  30.132 +        ldr r10, [r11, #FIX_3_072711026_ID]
  30.133 +        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
  30.134 +        ldr r9, [r11, #FIX_1_501321110_ID]
  30.135 +        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
  30.136 +        add r7, r7, r4                  @ r7 = tmp0
  30.137 +        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
  30.138 +        add r5,        r5, r6                  @ r5 = tmp1
  30.139 +        add r3, r3, r4                  @ r3 = tmp2
  30.140 +        add r1, r1, r6                  @ r1 = tmp3
  30.141 +
  30.142 +        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
  30.143 +                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
  30.144 +
  30.145 +        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
  30.146 +        add r8, r0, r1
  30.147 +        add r8, r8, #(1<<10)            @ rounding term: half of 1<<11
  30.148 +        mov r8, r8, asr #11
  30.149 +        strh r8, [lr, # 0]
  30.150 +
  30.151 +        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
  30.152 +        sub r8, r0, r1
  30.153 +        add r8, r8, #(1<<10)
  30.154 +        mov r8, r8, asr #11
  30.155 +        strh r8, [lr, #14]
  30.156 +
  30.157 +        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
  30.158 +        add r8, r6, r3
  30.159 +        add r8, r8, #(1<<10)
  30.160 +        mov r8, r8, asr #11
  30.161 +        strh r8, [lr, # 2]
  30.162 +
  30.163 +        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
  30.164 +        sub r8, r6, r3
  30.165 +        add r8, r8, #(1<<10)
  30.166 +        mov r8, r8, asr #11
  30.167 +        strh r8, [lr, #12]
  30.168 +
  30.169 +        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
  30.170 +        add r8, r4, r5
  30.171 +        add r8, r8, #(1<<10)
  30.172 +        mov r8, r8, asr #11
  30.173 +        strh r8, [lr, # 4]
  30.174 +
  30.175 +        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
  30.176 +        sub r8, r4, r5
  30.177 +        add r8, r8, #(1<<10)
  30.178 +        mov r8, r8, asr #11
  30.179 +        strh r8, [lr, #10]
  30.180 +
  30.181 +        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
  30.182 +        add r8, r2, r7
  30.183 +        add r8, r8, #(1<<10)
  30.184 +        mov r8, r8, asr #11
  30.185 +        strh r8, [lr, # 6]
  30.186 +
  30.187 +        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
  30.188 +        sub r8, r2, r7
  30.189 +        add r8, r8, #(1<<10)
  30.190 +        mov r8, r8, asr #11
  30.191 +        strh r8, [lr, # 8]
  30.192 +
  30.193 +        @ End of row loop
  30.194 +        add lr, lr, #16
  30.195 +        subs r12, r12, #1
  30.196 +        bne row_loop
  30.197 +        beq start_column_loop           @ always taken when reached: Z is still set from the subs above
  30.198 +
  30.199 +empty_row:                              @ only element 0 of the row is non-zero
  30.200 +        ldr r1, [r11, #FIX_0xFFFF_ID]
  30.201 +        mov r0, r0, lsl #2              @ d0 << 2
  30.202 +        and r0, r0, r1                  @ keep the low 16 bits
  30.203 +        add r0, r0, r0, lsl #16         @ same value in both halfwords
  30.204 +        str r0, [lr, # 0]               @ fill all 8 elements of the row with it
  30.205 +        str r0, [lr, # 4]
  30.206 +        str r0, [lr, # 8]
  30.207 +        str r0, [lr, #12]
  30.208 +
  30.209 +end_of_row_loop:
  30.210 +        @ End of loop
  30.211 +        add lr, lr, #16
  30.212 +        subs r12, r12, #1
  30.213 +        bne row_loop
  30.214 +
  30.215 +start_column_loop:
  30.216 +        @ Start of column loop
  30.217 +        ldr lr, [ sp ]                  @ reload the DCT pointer saved on entry
  30.218 +        mov r12, #8                     @ r12 = column counter
  30.219 +column_loop:
  30.220 +        ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
  30.221 +        ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
  30.222 +        ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
  30.223 +        ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
  30.224 +
  30.225 +        ldr r3, [r11, #FIX_0_541196100_ID]
  30.226 +        add r1, r2, r6
  30.227 +        ldr r5, [r11, #FIX_M_1_847759065_ID]
  30.228 +        mul r1, r3, r1                      @ r1 = z1
  30.229 +        ldr r3, [r11, #FIX_0_765366865_ID]
  30.230 +        mla r6, r5, r6, r1                  @ r6 = tmp2
  30.231 +        add r5, r0, r4                      @ r5 = tmp0
  30.232 +        mla r2, r3, r2, r1                  @ r2 = tmp3
  30.233 +        sub r3, r0, r4                      @ r3 = tmp1
  30.234 +
  30.235 +        add r0, r2, r5, lsl #13             @ r0 = tmp10
  30.236 +        rsb r2, r2, r5, lsl #13             @ r2 = tmp13
  30.237 +        add r4, r6, r3, lsl #13             @ r4 = tmp11
  30.238 +        rsb r6, r6, r3, lsl #13             @ r6 = tmp12
  30.239 +
  30.240 +        ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
  30.241 +        ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
  30.242 +        ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
  30.243 +        ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
  30.244 +
  30.245 +        @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
  30.246 +        orr r9, r1, r3
  30.247 +        orr r10, r5, r7
  30.248 +        orrs r10, r9, r10
  30.249 +        beq empty_odd_column
  30.250 +
  30.251 +        stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp11, tmp12
  30.252 +
  30.253 +        add r0, r3, r5                  @ r0 = 'z2'
  30.254 +        add r2, r1, r7                  @ r2 = 'z1'
  30.255 +        add r4, r3, r7                  @ r4 = 'z3'
  30.256 +        add r6, r1, r5                  @ r6 = 'z4'
  30.257 +        ldr r9, [r11, #FIX_1_175875602_ID]
  30.258 +        add r8, r4, r6
  30.259 +        ldr r10, [r11, #FIX_M_0_899976223_ID]
  30.260 +        mul r8, r9, r8                  @ r8 = 'z5'
  30.261 +        ldr r9, [r11, #FIX_M_2_562915447_ID]
  30.262 +        mul r2, r10, r2                 @ r2 = 'z1'
  30.263 +        ldr r10, [r11, #FIX_M_1_961570560_ID]
  30.264 +        mul r0, r9, r0                  @ r0 = 'z2'
  30.265 +        ldr r9, [r11, #FIX_M_0_390180644_ID]
  30.266 +        mla r4, r10, r4, r8             @ r4 = 'z3'
  30.267 +        ldr r10, [r11, #FIX_0_298631336_ID]
  30.268 +        mla r6, r9, r6, r8              @ r6 = 'z4'
  30.269 +        ldr r9, [r11, #FIX_2_053119869_ID]
  30.270 +        mla r7, r10, r7, r2             @ r7 = tmp0 + z1
  30.271 +        ldr r10, [r11, #FIX_3_072711026_ID]
  30.272 +        mla r5, r9, r5, r0              @ r5 = tmp1 + z2
  30.273 +        ldr r9, [r11, #FIX_1_501321110_ID]
  30.274 +        mla r3, r10, r3, r0             @ r3 = tmp2 + z2
  30.275 +        add r7, r7, r4                  @ r7 = tmp0
  30.276 +        mla r1, r9, r1, r2              @ r1 = tmp3 + z1
  30.277 +        add r5,        r5, r6                  @ r5 = tmp1
  30.278 +        add r3, r3, r4                  @ r3 = tmp2
  30.279 +        add r1, r1, r6                  @ r1 = tmp3
  30.280 +
  30.281 +        ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
  30.282 +                                      @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
  30.283 +
  30.284 +        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
  30.285 +        add r8, r0, r1
  30.286 +        add r8, r8, #(1<<17)            @ rounding term: half of 1<<18
  30.287 +        mov r8, r8, asr #18
  30.288 +        strh r8, [lr, #( 0*8)]
  30.289 +
  30.290 +        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
  30.291 +        sub r8, r0, r1
  30.292 +        add r8, r8, #(1<<17)
  30.293 +        mov r8, r8, asr #18
  30.294 +        strh r8, [lr, #(14*8)]
  30.295 +
  30.296 +        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
  30.297 +        add r8, r4, r3
  30.298 +        add r8, r8, #(1<<17)
  30.299 +        mov r8, r8, asr #18
  30.300 +        strh r8, [lr, #( 2*8)]
  30.301 +
  30.302 +        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
  30.303 +        sub r8, r4, r3
  30.304 +        add r8, r8, #(1<<17)
  30.305 +        mov r8, r8, asr #18
  30.306 +        strh r8, [lr, #(12*8)]
  30.307 +
  30.308 +        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
  30.309 +        add r8, r6, r5
  30.310 +        add r8, r8, #(1<<17)
  30.311 +        mov r8, r8, asr #18
  30.312 +        strh r8, [lr, #( 4*8)]
  30.313 +
  30.314 +        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
  30.315 +        sub r8, r6, r5
  30.316 +        add r8, r8, #(1<<17)
  30.317 +        mov r8, r8, asr #18
  30.318 +        strh r8, [lr, #(10*8)]
  30.319 +
  30.320 +        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
  30.321 +        add r8, r2, r7
  30.322 +        add r8, r8, #(1<<17)
  30.323 +        mov r8, r8, asr #18
  30.324 +        strh r8, [lr, #( 6*8)]
  30.325 +
  30.326 +        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
  30.327 +        sub r8, r2, r7
  30.328 +        add r8, r8, #(1<<17)
  30.329 +        mov r8, r8, asr #18
  30.330 +        strh r8, [lr, #( 8*8)]
  30.331 +
  30.332 +        @ End of column loop
  30.333 +        add lr, lr, #2
  30.334 +        subs r12, r12, #1
  30.335 +        bne column_loop
  30.336 +        beq the_end                     @ always taken when reached: Z is still set from the subs above
  30.337 +
  30.338 +empty_odd_column:                       @ d1, d3, d5 and d7 are all zero, so each +/- pair gives the same value
  30.339 +        @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
  30.340 +        @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
  30.341 +        add r0, r0, #(1<<17)
  30.342 +        mov r0, r0, asr #18
  30.343 +        strh r0, [lr, #( 0*8)]
  30.344 +        strh r0, [lr, #(14*8)]
  30.345 +
  30.346 +        @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
  30.347 +        @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
  30.348 +        add r4, r4, #(1<<17)
  30.349 +        mov r4, r4, asr #18
  30.350 +        strh r4, [lr, #( 2*8)]
  30.351 +        strh r4, [lr, #(12*8)]
  30.352 +
  30.353 +        @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
  30.354 +        @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
  30.355 +        add r6, r6, #(1<<17)
  30.356 +        mov r6, r6, asr #18
  30.357 +        strh r6, [lr, #( 4*8)]
  30.358 +        strh r6, [lr, #(10*8)]
  30.359 +
  30.360 +        @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
  30.361 +        @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
  30.362 +        add r2, r2, #(1<<17)
  30.363 +        mov r2, r2, asr #18
  30.364 +        strh r2, [lr, #( 6*8)]
  30.365 +        strh r2, [lr, #( 8*8)]
  30.366 +
  30.367 +        @ End of column loop
  30.368 +        add lr, lr, #2
  30.369 +        subs r12, r12, #1
  30.370 +        bne column_loop
  30.371 +
  30.372 +the_end:
  30.373 +        @ The end....
  30.374 +        add sp, sp, #4                  @ drop the saved DCT pointer
  30.375 +        ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
  30.376 +
  30.377 +const_array:                            @ 32-bit constants, addressed through r11 plus one of the _ID byte offsets
  30.378 +        .align                          @ NOTE(review): the directive follows the label, so any padding emitted here would sit between const_array and the first .word - confirm
  30.379 +        .word FIX_0_298631336
  30.380 +        .word FIX_0_541196100
  30.381 +        .word FIX_0_765366865
  30.382 +        .word FIX_1_175875602
  30.383 +        .word FIX_1_501321110
  30.384 +        .word FIX_2_053119869
  30.385 +        .word FIX_3_072711026
  30.386 +        .word FIX_M_0_390180644
  30.387 +        .word FIX_M_0_899976223
  30.388 +        .word FIX_M_1_847759065
  30.389 +        .word FIX_M_1_961570560
  30.390 +        .word FIX_M_2_562915447
  30.391 +        .word FIX_0xFFFF

    31.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    31.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mathops.h	Mon Aug 27 12:09:56 2012 +0200
    31.3 @@ -0,0 +1,116 @@
    31.4 +/*
    31.5 + * simple math operations
    31.6 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
    31.7 + *
    31.8 + * This file is part of FFmpeg.
    31.9 + *
   31.10 + * FFmpeg is free software; you can redistribute it and/or
   31.11 + * modify it under the terms of the GNU Lesser General Public
   31.12 + * License as published by the Free Software Foundation; either
   31.13 + * version 2.1 of the License, or (at your option) any later version.
   31.14 + *
   31.15 + * FFmpeg is distributed in the hope that it will be useful,
   31.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   31.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   31.18 + * Lesser General Public License for more details.
   31.19 + *
   31.20 + * You should have received a copy of the GNU Lesser General Public
   31.21 + * License along with FFmpeg; if not, write to the Free Software
   31.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   31.23 + */
   31.24 +
   31.25 +#ifndef AVCODEC_ARM_MATHOPS_H
   31.26 +#define AVCODEC_ARM_MATHOPS_H
   31.27 +
   31.28 +#include <stdint.h>
   31.29 +#include "config.h"
   31.30 +#include "libavutil/common.h"
   31.31 +
   31.32 +#if HAVE_INLINE_ASM
   31.33 +
   31.34 +#   define MULL MULL /* self-define; presumably lets generic code detect this ARM version - TODO confirm */
   31.35 +static inline av_const int MULL(int a, int b, unsigned shift) /* 64-bit product a*b shifted right by shift, truncated to int */
   31.36 +{
   31.37 +    int lo, hi;
   31.38 +    __asm__("smull %0, %1, %2, %3     \n\t" /* hi:lo = signed 64-bit product */
   31.39 +            "mov   %0, %0,     lsr %4 \n\t" /* lo = lo >> shift (logical) */
   31.40 +            "add   %1, %0, %1, lsl %5 \n\t" /* hi = lo + (hi << (32 - shift)) */
   31.41 +            : "=&r"(lo), "=&r"(hi)          /* early-clobber: both are written before the inputs are dead */
   31.42 +            : "r"(b), "r"(a), "ir"(shift), "ir"(32-shift));
   31.43 +    return hi;
   31.44 +}
   31.45 +
   31.46 +#define MULH MULH /* self-define; presumably lets generic code detect this ARM version - TODO confirm */
   31.47 +#if HAVE_ARMV6
   31.48 +static inline av_const int MULH(int a, int b) /* high 32 bits of the signed 64-bit product a*b */
   31.49 +{
   31.50 +    int r;
   31.51 +    __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); /* single-instruction form */
   31.52 +    return r;
   31.53 +}
   31.54 +#else
   31.55 +static inline av_const int MULH(int a, int b) /* same result via a full smull; the low word is discarded */
   31.56 +{
   31.57 +    int lo, hi;
   31.58 +    __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
   31.59 +    return hi;
   31.60 +}
   31.61 +#endif
   31.62 +
   31.63 +static inline av_const int64_t MUL64(int a, int b) /* full signed 64-bit product of a and b */
   31.64 +{
   31.65 +    union { uint64_t x; unsigned hl[2]; } x; /* hl[0] receives the low word, hl[1] the high word; assumes little-endian word order - TODO confirm */
   31.66 +    __asm__ ("smull %0, %1, %2, %3"
   31.67 +             : "=r"(x.hl[0]), "=r"(x.hl[1]) : "r"(a), "r"(b)); /* NOTE(review): outputs are not early-clobber here, unlike the smull-based MULH - confirm intended */
   31.68 +    return x.x;
   31.69 +}
   31.70 +#define MUL64 MUL64 /* self-define; presumably lets generic code detect this ARM version - TODO confirm */
   31.71 +
   31.72 +static inline av_const int64_t MAC64(int64_t d, int a, int b) /* returns d + (int64_t)a * b */
   31.73 +{
   31.74 +    union { uint64_t x; unsigned hl[2]; } x = { d }; /* hl[0]/hl[1] used as low/high word; assumes little-endian word order - TODO confirm */
   31.75 +    __asm__ ("smlal %0, %1, %2, %3"
   31.76 +             : "+r"(x.hl[0]), "+r"(x.hl[1]) : "r"(a), "r"(b)); /* accumulates the 64-bit product into the register pair */
   31.77 +    return x.x;
   31.78 +}
   31.79 +#define MAC64(d, a, b) ((d) = MAC64(d, a, b)) /* macro wrapper: assigns the result back to d, so d must be an lvalue */
   31.80 +#define MLS64(d, a, b) MAC64(d, -(a), b) /* d -= a * b, implemented by negating a */
   31.81 +
   31.82 +#if HAVE_ARMV5TE
   31.83 +
   31.84 +/* signed 16x16 -> 32 multiply add accumulate: rt += low half of ra * low half of rb. NOTE(review): the expansion already ends in a semicolon. */
   31.85 +#   define MAC16(rt, ra, rb)                                            \
   31.86 +    __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
   31.87 +
   31.88 +/* signed 16x16 -> 32 multiply: only the low 16 bits of each operand are used (smulbb) */
   31.89 +#   define MUL16 MUL16 /* self-define; presumably lets generic code detect this ARM version - TODO confirm */
   31.90 +static inline av_const int MUL16(int ra, int rb)
   31.91 +{
   31.92 +    int rt;
   31.93 +    __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
   31.94 +    return rt;
   31.95 +}
   31.96 +
   31.97 +#endif
   31.98 +
   31.99 +#define mid_pred mid_pred /* self-define; presumably lets generic code detect this ARM version - TODO confirm */
  31.100 +static inline av_const int mid_pred(int a, int b, int c) /* returns the median of a, b and c */
  31.101 +{
  31.102 +    int m;
  31.103 +    __asm__ volatile (
  31.104 +        "mov   %0, %2  \n\t" /* m = b */
  31.105 +        "cmp   %1, %2  \n\t"
  31.106 +        "movgt %0, %1  \n\t" /* m = max(a, b) */
  31.107 +        "movgt %1, %2  \n\t" /* a = min(a, b) */
  31.108 +        "cmp   %1, %3  \n\t"
  31.109 +        "movle %1, %3  \n\t" /* a = max(min(a, b), c) */
  31.110 +        "cmp   %0, %1  \n\t"
  31.111 +        "movgt %0, %1  \n\t" /* m = min(m, a), which is the median */
  31.112 +        : "=&r"(m), "+r"(a)
  31.113 +        : "r"(b), "r"(c) : "cc"); /* cmp modifies the condition flags, so they are declared clobbered */
  31.114 +    return m;
  31.115 +}
  31.116 +
  31.117 +#endif /* HAVE_INLINE_ASM */
  31.118 +
  31.119 +#endif /* AVCODEC_ARM_MATHOPS_H */

    32.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    32.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mdct_neon.S	Mon Aug 27 12:09:56 2012 +0200
    32.3 @@ -0,0 +1,303 @@
    32.4 +/*
    32.5 + * ARM NEON optimised MDCT
    32.6 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    32.7 + *
    32.8 + * This file is part of FFmpeg.
    32.9 + *
   32.10 + * FFmpeg is free software; you can redistribute it and/or
   32.11 + * modify it under the terms of the GNU Lesser General Public
   32.12 + * License as published by the Free Software Foundation; either
   32.13 + * version 2.1 of the License, or (at your option) any later version.
   32.14 + *
   32.15 + * FFmpeg is distributed in the hope that it will be useful,
   32.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   32.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   32.18 + * Lesser General Public License for more details.
   32.19 + *
   32.20 + * You should have received a copy of the GNU Lesser General Public
   32.21 + * License along with FFmpeg; if not, write to the Free Software
   32.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   32.23 + */
   32.24 +
   32.25 +#include "asm.S"
   32.26 +
   32.27 +        preserve8
   32.28 +
   32.29 +        .text
   32.30 +
   32.31 +#define ff_fft_calc_neon X(ff_fft_calc_neon)
   32.32 +
   32.33 +function ff_imdct_half_neon, export=1           @ r0 = context, r1 = output buffer, r2 = input buffer (as used below)
   32.34 +        push            {r4-r8,lr}
   32.35 +
   32.36 +        mov             r12, #1
   32.37 +        ldr             lr,  [r0, #28]          @ mdct_bits
   32.38 +        ldr             r4,  [r0, #32]          @ tcos
   32.39 +        ldr             r3,  [r0, #8]           @ revtab
   32.40 +        lsl             r12, r12, lr            @ n  = 1 << nbits
   32.41 +        lsr             lr,  r12, #2            @ n4 = n >> 2
   32.42 +        add             r7,  r2,  r12,  lsl #1  @ r7 = input + 2*n bytes
   32.43 +        mov             r12, #-16               @ r7 walks backwards, 16 bytes per load
   32.44 +        sub             r7,  r7,  #16           @ start at the last 16 bytes below that point
   32.45 +
   32.46 +        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
   32.47 +        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
   32.48 +        vrev64.32       d17, d17
   32.49 +        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
   32.50 +        vmul.f32        d6,  d17, d2
   32.51 +        vmul.f32        d7,  d0,  d2
   32.52 +1:                                              @ pre-rotation loop; each pass handles two values and pre-loads the next pair
   32.53 +        subs            lr,  lr,  #2
   32.54 +        ldr             r6,  [r3], #4           @ two 16-bit revtab entries in one word
   32.55 +        vmul.f32        d4,  d0,  d3
   32.56 +        vmul.f32        d5,  d17, d3
   32.57 +        vsub.f32        d4,  d6,  d4
   32.58 +        vadd.f32        d5,  d5,  d7
   32.59 +        uxth            r8,  r6,  ror #16       @ r8 = upper revtab entry
   32.60 +        uxth            r6,  r6                 @ r6 = lower revtab entry
   32.61 +        add             r8,  r1,  r8,  lsl #3   @ output address = r1 + 8*index
   32.62 +        add             r6,  r1,  r6,  lsl #3
   32.63 +        beq             1f                      @ last pass: skip the pre-load and store below
   32.64 +        vld2.32         {d16-d17},[r7,:128],r12
   32.65 +        vld2.32         {d0-d1},  [r2,:128]!
   32.66 +        vrev64.32       d17, d17
   32.67 +        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
   32.68 +        vmul.f32        d6,  d17, d2
   32.69 +        vmul.f32        d7,  d0,  d2
   32.70 +        vst2.32         {d4[0],d5[0]}, [r6,:64]
   32.71 +        vst2.32         {d4[1],d5[1]}, [r8,:64]
   32.72 +        b               1b
   32.73 +1:
   32.74 +        vst2.32         {d4[0],d5[0]}, [r6,:64]
   32.75 +        vst2.32         {d4[1],d5[1]}, [r8,:64]
   32.76 +
   32.77 +        mov             r4,  r0                 @ keep the context in r4 across the call
   32.78 +        mov             r6,  r1                 @ keep the output pointer in r6 across the call
   32.79 +        bl              ff_fft_calc_neon        @ relies on the callee preserving r4 and r6
   32.80 +
   32.81 +        mov             r12, #1
   32.82 +        ldr             lr,  [r4, #28]          @ mdct_bits
   32.83 +        ldr             r4,  [r4, #32]          @ tcos
   32.84 +        lsl             r12, r12, lr            @ n  = 1 << nbits
   32.85 +        lsr             lr,  r12, #3            @ n8 = n >> 3
   32.86 +
   32.87 +        add             r4,  r4,  lr,  lsl #3   @ r4 = tcos + 8*n8 bytes
   32.88 +        add             r6,  r6,  lr,  lsl #3   @ r6 = output + 8*n8 bytes
   32.89 +        sub             r1,  r4,  #16           @ r1 reads tcos downwards from that point
   32.90 +        sub             r3,  r6,  #16           @ r3 reads output downwards from that point
   32.91 +
   32.92 +        mov             r7,  #-16               @ step for the downward pointers
   32.93 +        mov             r8,  r6                 @ r8 = upward store pointer
   32.94 +        mov             r0,  r3                 @ r0 = downward store pointer
   32.95 +
   32.96 +        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
   32.97 +        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
   32.98 +        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
   32.99 +1:                                              @ post-rotation loop; r3/r1 move down while r6/r4 move up
  32.100 +        subs            lr,  lr,  #2
  32.101 +        vmul.f32        d7,  d0,  d18
  32.102 +        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
  32.103 +        vmul.f32        d4,  d1,  d18
  32.104 +        vmul.f32        d5,  d21, d19
  32.105 +        vmul.f32        d6,  d20, d19
  32.106 +        vmul.f32        d22, d1,  d16
  32.107 +        vmul.f32        d23, d21, d17
  32.108 +        vmul.f32        d24, d0,  d16
  32.109 +        vmul.f32        d25, d20, d17
  32.110 +        vadd.f32        d7,  d7,  d22
  32.111 +        vadd.f32        d6,  d6,  d23
  32.112 +        vsub.f32        d4,  d4,  d24
  32.113 +        vsub.f32        d5,  d5,  d25
  32.114 +        beq             1f                      @ last pass: skip the pre-load and store below
  32.115 +        vld2.32         {d0-d1},  [r3,:128], r7
  32.116 +        vld2.32         {d20-d21},[r6,:128]!
  32.117 +        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
  32.118 +        vrev64.32       q3,  q3
  32.119 +        vst2.32         {d4,d6},  [r0,:128], r7
  32.120 +        vst2.32         {d5,d7},  [r8,:128]!
  32.121 +        b               1b
  32.122 +1:
  32.123 +        vrev64.32       q3,  q3
  32.124 +        vst2.32         {d4,d6},  [r0,:128]
  32.125 +        vst2.32         {d5,d7},  [r8,:128]
  32.126 +
  32.127 +        pop             {r4-r8,pc}
  32.128 +endfunc
  32.129 +
  32.130 +function ff_imdct_calc_neon, export=1
  32.131 +        push            {r4-r6,lr}
  32.132 +        @ Calls ff_imdct_half_neon with r1 = output + n, then fills output bytes [0,n) and [3n,4n) from bytes [n,3n).
  32.133 +        ldr             r3,  [r0, #28]          @ same context offset that ff_mdct_calc_neon labels mdct_bits
  32.134 +        mov             r4,  #1
  32.135 +        mov             r5,  r1                 @ r5 = output base (r1 is repurposed below)
  32.136 +        lsl             r4,  r4,  r3            @ r4 = n = 1 << bits
  32.137 +        add             r1,  r1,  r4            @ r1 = output + n bytes, passed to the half transform
  32.138 +
  32.139 +        bl              ff_imdct_half_neon      @ r4/r5 are used afterwards, so the callee must preserve them
  32.140 +
  32.141 +        add             r0,  r5,  r4,  lsl #2
  32.142 +        add             r1,  r5,  r4,  lsl #1   @ r1 = output + 2n, read upwards
  32.143 +        sub             r0,  r0,  #8            @ r0 = output + 4n - 8, written downwards
  32.144 +        sub             r2,  r1,  #16           @ r2 = output + 2n - 16, read downwards
  32.145 +        mov             r3,  #-16               @ post-index step for r2
  32.146 +        mov             r6,  #-8                @ post-index step for r0
  32.147 +        vmov.i32        d30, #1<<31             @ bit 31 set in each 32-bit lane (sign bit of a single float)
  32.148 +1:
  32.149 +        vld1.32         {d0-d1},  [r2,:128], r3 @ four words from the descending read pointer
  32.150 +        pld             [r0, #-16]
  32.151 +        vrev64.32       q0,  q0                 @ swap the two words inside d0 and inside d1
  32.152 +        vld1.32         {d2-d3},  [r1,:128]!    @ four words from the ascending read pointer
  32.153 +        veor            d4,  d1,  d30           @ d4/d5 = d1/d0 with bit 31 flipped:
  32.154 +        pld             [r2, #-16]
  32.155 +        vrev64.32       q1,  q1                 @ swap the two words inside d2 and inside d3
  32.156 +        veor            d5,  d0,  d30           @ q2 = the four words of q0 in reverse order, sign bits toggled
  32.157 +        vst1.32         {d2},     [r0,:64], r6  @ d2 then d3 at descending addresses, so the
  32.158 +        vst1.32         {d3},     [r0,:64], r6  @ four words of q1 land in reverse order
  32.159 +        vst1.32         {d4-d5},  [r5,:128]!    @ ascending store from the output base
  32.160 +        subs            r4,  r4,  #16           @ 16 bytes handled per stream per pass
  32.161 +        bgt             1b
  32.162 +
  32.163 +        pop             {r4-r6,pc}
  32.164 +endfunc
  32.165 +
  32.166 +function ff_mdct_calc_neon, export=1
  32.167 +        push            {r4-r10,lr}
  32.168 +        @ r0 = context, r1 = output buffer, r2 = input. Loop 1 combines input samples, multiplies by tcos and scatters via revtab into r1; then ff_fft_calc_neon; loop 2 multiplies the buffer by tcos in place.
  32.169 +        mov             r12, #1
  32.170 +        ldr             lr,  [r0, #28]          @ mdct_bits
  32.171 +        ldr             r4,  [r0, #32]          @ tcos
  32.172 +        ldr             r3,  [r0, #8]           @ revtab
  32.173 +        lsl             lr,  r12, lr            @ n  = 1 << nbits
  32.174 +        add             r7,  r2,  lr            @ in4u
  32.175 +        sub             r9,  r7,  #16           @ in4d
  32.176 +        add             r2,  r7,  lr,  lsl #1   @ in3u
  32.177 +        add             r8,  r9,  lr,  lsl #1   @ in3d
  32.178 +        add             r5,  r4,  lr,  lsl #1   @ r5 = tcos + 2n bytes
  32.179 +        sub             r5,  r5,  #16           @ ... minus 16: start of the descending tcos reads
  32.180 +        sub             r3,  r3,  #4            @ bias so the pre-indexed ldr below starts at revtab
  32.181 +        mov             r12, #-16               @ post-index step for the descending loads
  32.182 +
  32.183 +        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
  32.184 +        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
  32.185 +        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
  32.186 +        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
  32.187 +        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
  32.188 +        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
  32.189 +        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
  32.190 +        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
  32.191 +        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
  32.192 +        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
  32.193 +        vsub.f32        d16, d16, d2            @ in0u-in2d      R
  32.194 +        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
  32.195 +1:
  32.196 +        vmul.f32        d7,  d0,  d21           @  I*s
  32.197 +        ldr             r10, [r3, lr, lsr #1]   @ revtab word at r3 + lr/2; this address steps down 4 bytes per pass
  32.198 +        vmul.f32        d6,  d1,  d20           @ -R*c
  32.199 +        ldr             r6,  [r3, #4]!          @ next revtab word, ascending; holds two 16-bit indices
  32.200 +        vmul.f32        d4,  d1,  d21           @ -R*s
  32.201 +        vmul.f32        d5,  d0,  d20           @  I*c
  32.202 +        vmul.f32        d24, d16, d30           @  R*c
  32.203 +        vmul.f32        d25, d17, d31           @ -I*s
  32.204 +        vmul.f32        d22, d16, d31           @  R*s
  32.205 +        vmul.f32        d23, d17, d30           @  I*c
  32.206 +        subs            lr,  lr,  #16           @ remaining byte count; flags are consumed by the beq below
  32.207 +        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
  32.208 +        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
  32.209 +        vsub.f32        d24, d25, d24           @ I*s-R*c
  32.210 +        vadd.f32        d25, d22, d23           @ R*s-I*c
  32.211 +        beq             1f                      @ last pass: store without preloading the next inputs
  32.212 +        mov             r12, #-16               @ r12 doubles as an address scratch below, so restore the step
  32.213 +        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
  32.214 +        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
  32.215 +        vneg.f32        d7,  d7                 @  R*s-I*c
  32.216 +        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
  32.217 +        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
  32.218 +        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
  32.219 +        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
  32.220 +        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
  32.221 +        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
  32.222 +        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
  32.223 +        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
  32.224 +        vsub.f32        d16, d16, d2            @ in0u-in2d      R
  32.225 +        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
  32.226 +        uxth            r12, r6,  ror #16       @ high halfword of r6: index for lane 1
  32.227 +        uxth            r6,  r6                 @ low halfword of r6: index for lane 0
  32.228 +        add             r12, r1,  r12, lsl #3   @ output entries are 8 bytes apart
  32.229 +        add             r6,  r1,  r6,  lsl #3
  32.230 +        vst2.32         {d6[0],d7[0]}, [r6,:64]
  32.231 +        vst2.32         {d6[1],d7[1]}, [r12,:64]
  32.232 +        uxth            r6,  r10, ror #16       @ high halfword of r10: index for lane 1
  32.233 +        uxth            r10, r10                @ low halfword of r10: index for lane 0
  32.234 +        add             r6 , r1,  r6,  lsl #3
  32.235 +        add             r10, r1,  r10, lsl #3
  32.236 +        vst2.32         {d24[0],d25[0]},[r10,:64]
  32.237 +        vst2.32         {d24[1],d25[1]},[r6,:64]
  32.238 +        b               1b
  32.239 +1:
  32.240 +        vneg.f32        d7,  d7                 @  R*s-I*c
  32.241 +        uxth            r12, r6,  ror #16       @ same scatter as inside the loop, for the final pass
  32.242 +        uxth            r6,  r6
  32.243 +        add             r12, r1,  r12, lsl #3
  32.244 +        add             r6,  r1,  r6,  lsl #3
  32.245 +        vst2.32         {d6[0],d7[0]}, [r6,:64]
  32.246 +        vst2.32         {d6[1],d7[1]}, [r12,:64]
  32.247 +        uxth            r6,  r10, ror #16
  32.248 +        uxth            r10, r10
  32.249 +        add             r6 , r1,  r6,  lsl #3
  32.250 +        add             r10, r1,  r10, lsl #3
  32.251 +        vst2.32         {d24[0],d25[0]},[r10,:64]
  32.252 +        vst2.32         {d24[1],d25[1]},[r6,:64]
  32.253 +
  32.254 +        mov             r4,  r0                 @ keep the context pointer; assumes the callee preserves r4 (AAPCS)
  32.255 +        mov             r6,  r1                 @ keep the buffer pointer; assumes the callee preserves r6 (AAPCS)
  32.256 +        bl              ff_fft_calc_neon
  32.257 +
  32.258 +        mov             r12, #1
  32.259 +        ldr             lr,  [r4, #28]          @ mdct_bits
  32.260 +        ldr             r4,  [r4, #32]          @ tcos
  32.261 +        lsl             r12, r12, lr            @ n  = 1 << nbits
  32.262 +        lsr             lr,  r12, #3            @ n8 = n >> 3
  32.263 +
  32.264 +        add             r4,  r4,  lr,  lsl #3   @ r4 = tcos + 8*n8, read upwards
  32.265 +        add             r6,  r6,  lr,  lsl #3   @ r6 = buffer + 8*n8, read upwards
  32.266 +        sub             r1,  r4,  #16           @ descending tcos pointer
  32.267 +        sub             r3,  r6,  #16           @ descending buffer read pointer
  32.268 +
  32.269 +        mov             r7,  #-16               @ post-index step for the descending pointers
  32.270 +        mov             r8,  r6                 @ ascending store pointer, same start as r6
  32.271 +        mov             r0,  r3                 @ descending store pointer, same start as r3
  32.272 +
  32.273 +        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
  32.274 +        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
  32.275 +        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
  32.276 +1:
  32.277 +        subs            lr,  lr,  #2            @ n8 counts down by 2; flags are consumed by the beq below
  32.278 +        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
  32.279 +        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
  32.280 +        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
  32.281 +        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
  32.282 +        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
  32.283 +        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
  32.284 +        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
  32.285 +        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
  32.286 +        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
  32.287 +        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
  32.288 +        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
  32.289 +        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
  32.290 +        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
  32.291 +        vneg.f32        q2,  q2                 @ negate the two sums held in d4 and d5
  32.292 +        beq             1f                      @ last pass: store without preloading
  32.293 +        vld2.32         {d0-d1},  [r3,:128], r7
  32.294 +        vld2.32         {d20-d21},[r6,:128]!
  32.295 +        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
  32.296 +        vrev64.32       q3,  q3                 @ swap the two words inside d6 and inside d7
  32.297 +        vst2.32         {d4,d6},  [r0,:128], r7
  32.298 +        vst2.32         {d5,d7},  [r8,:128]!
  32.299 +        b               1b
  32.300 +1:
  32.301 +        vrev64.32       q3,  q3
  32.302 +        vst2.32         {d4,d6},  [r0,:128]
  32.303 +        vst2.32         {d5,d7},  [r8,:128]
  32.304 +
  32.305 +        pop             {r4-r10,pc}
  32.306 +endfunc

    33.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    33.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.c	Mon Aug 27 12:09:56 2012 +0200
    33.3 @@ -0,0 +1,38 @@
    33.4 +/*
    33.5 + * Copyright (c) 2002 Michael Niedermayer
    33.6 + *
    33.7 + * This file is part of FFmpeg.
    33.8 + *
    33.9 + * FFmpeg is free software; you can redistribute it and/or
   33.10 + * modify it under the terms of the GNU Lesser General Public
   33.11 + * License as published by the Free Software Foundation; either
   33.12 + * version 2.1 of the License, or (at your option) any later version.
   33.13 + *
   33.14 + * FFmpeg is distributed in the hope that it will be useful,
   33.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   33.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   33.17 + * Lesser General Public License for more details.
   33.18 + *
   33.19 + * You should have received a copy of the GNU Lesser General Public
   33.20 + * License along with FFmpeg; if not, write to the Free Software
   33.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   33.22 + */
   33.23 +
   33.24 +#include "libavcodec/avcodec.h"
   33.25 +#include "libavcodec/dsputil.h"
   33.26 +#include "libavcodec/mpegvideo.h"
   33.27 +#include "mpegvideo_arm.h"
   33.28 +
   33.29 +void MPV_common_init_arm(MpegEncContext *s)
   33.30 +{
   33.31 +    /* Install the ARM-specific routines into s. IWMMXT support is a
   33.32 +     * superset of armv5te, so the armv5te init runs first and the
   33.33 +     * iwmmxt init runs second: any pointer the latter sets replaces
   33.34 +     * the armv5te one, the rest keep the armv5te version. */
   33.35 +#if HAVE_ARMV5TE
   33.36 +    MPV_common_init_armv5te(s);
   33.37 +#endif /* HAVE_ARMV5TE */
   33.38 +#if HAVE_IWMMXT
   33.39 +    MPV_common_init_iwmmxt(s);
   33.40 +#endif /* HAVE_IWMMXT */
   33.41 +}

    34.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    34.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_arm.h	Mon Aug 27 12:09:56 2012 +0200
    34.3 @@ -0,0 +1,27 @@
    34.4 +/*
    34.5 + * This file is part of FFmpeg.
    34.6 + *
    34.7 + * FFmpeg is free software; you can redistribute it and/or
    34.8 + * modify it under the terms of the GNU Lesser General Public
    34.9 + * License as published by the Free Software Foundation; either
   34.10 + * version 2.1 of the License, or (at your option) any later version.
   34.11 + *
   34.12 + * FFmpeg is distributed in the hope that it will be useful,
   34.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   34.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   34.15 + * Lesser General Public License for more details.
   34.16 + *
   34.17 + * You should have received a copy of the GNU Lesser General Public
   34.18 + * License along with FFmpeg; if not, write to the Free Software
   34.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   34.20 + */
   34.21 +
   34.22 +#ifndef AVCODEC_ARM_MPEGVIDEO_H
   34.23 +#define AVCODEC_ARM_MPEGVIDEO_H
   34.24 +
   34.25 +#include "libavcodec/mpegvideo.h"
   34.26 +/* Per-extension initialisers: each one stores its optimized H.263 dequantizer(s) into *s. */
   34.27 +void MPV_common_init_iwmmxt(MpegEncContext *s);
   34.28 +void MPV_common_init_armv5te(MpegEncContext *s);
   34.29 +
   34.30 +#endif /* AVCODEC_ARM_MPEGVIDEO_H */

    35.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    35.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te.c	Mon Aug 27 12:09:56 2012 +0200
    35.3 @@ -0,0 +1,101 @@
    35.4 +/*
    35.5 + * Optimization of some functions from mpegvideo.c for armv5te
    35.6 + * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
    35.7 + *
    35.8 + * This file is part of FFmpeg.
    35.9 + *
   35.10 + * FFmpeg is free software; you can redistribute it and/or
   35.11 + * modify it under the terms of the GNU Lesser General Public
   35.12 + * License as published by the Free Software Foundation; either
   35.13 + * version 2.1 of the License, or (at your option) any later version.
   35.14 + *
   35.15 + * FFmpeg is distributed in the hope that it will be useful,
   35.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   35.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   35.18 + * Lesser General Public License for more details.
   35.19 + *
   35.20 + * You should have received a copy of the GNU Lesser General Public
   35.21 + * License along with FFmpeg; if not, write to the Free Software
   35.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   35.23 + */
   35.24 +
   35.25 +#include "libavcodec/avcodec.h"
   35.26 +#include "libavcodec/dsputil.h"
   35.27 +#include "libavcodec/mpegvideo.h"
   35.28 +#include "mpegvideo_arm.h"
   35.29 +/* Assembly helper: dequantizes block in place; it may touch more than count elements (see mpegvideo_armv5te_s.S). */
   35.30 +void ff_dct_unquantize_h263_armv5te(DCTELEM *block, int qmul, int qadd, int count);
   35.31 +
   35.32 +#ifdef ENABLE_ARM_TESTS
   35.33 +/**
   35.34 + * Plain C H.263 dequantizer helper. Each nonzero coefficient becomes
   35.35 + * level * qmul + qadd (positive) or level * qmul - qadd (negative); zeros are
   35.36 + * left untouched. Also serves as the reference implementation in regression tests.
   35.37 + */
   35.38 +static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count)
   35.39 +{
   35.40 +    int idx;
   35.41 +
   35.42 +    for (idx = 0; idx < count; idx++) {
   35.43 +        const int coeff = block[idx];
   35.44 +        /* zero coefficients are skipped, not rewritten */
   35.45 +        if (!coeff)
   35.46 +            continue;
   35.47 +
   35.48 +        /* scale by qmul, then move away from zero by qadd */
   35.49 +        block[idx] = coeff < 0 ? coeff * qmul - qadd
   35.50 +                               : coeff * qmul + qadd;
   35.51 +    }
   35.52 +}
   35.53 +#endif
   35.54 +
   35.55 +static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
   35.56 +                                  DCTELEM *block, int n, int qscale)
   35.57 +{
   35.58 +    const int qmul = qscale << 1;
   35.59 +    int dc, qadd, last;
   35.60 +
   35.61 +    assert(s->block_last_index[n]>=0);
   35.62 +
   35.63 +    /* The DC value is computed here; the asm helper only sees qmul/qadd. */
   35.64 +    if (s->h263_aic) {
   35.65 +        dc   = block[0];
   35.66 +        qadd = 0;
   35.67 +    } else {
   35.68 +        qadd = (qscale - 1) | 1;
   35.69 +        if (n < 4)
   35.70 +            dc = block[0] * s->y_dc_scale;
   35.71 +        else
   35.72 +            dc = block[0] * s->c_dc_scale;
   35.73 +    }
   35.74 +
   35.75 +    /* With ac_pred set the whole block (index 63) is processed. */
   35.76 +    last = s->ac_pred ? 63
   35.77 +                      : s->inter_scantable.raster_end[ s->block_last_index[n] ];
   35.78 +
   35.79 +    /* The helper starts at block[0], so the DC value is written back afterwards. */
   35.80 +    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, last + 1);
   35.81 +    block[0] = dc;
   35.82 +}
   35.83 +
   35.84 +static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
   35.85 +                                  DCTELEM *block, int n, int qscale)
   35.86 +{
   35.87 +    const int qmul = qscale << 1;
   35.88 +    const int qadd = (qscale - 1) | 1;
   35.89 +    int last;
   35.90 +
   35.91 +    assert(s->block_last_index[n]>=0);
   35.92 +
   35.93 +    /* Look up the end position for block n in the inter scan table. */
   35.94 +    last = s->inter_scantable.raster_end[ s->block_last_index[n] ];
   35.95 +
   35.96 +    /* Hand the first last + 1 coefficients to the asm helper. */
   35.97 +    ff_dct_unquantize_h263_armv5te(block, qmul, qadd, last + 1);
   35.98 +}
   35.99 +
  35.100 +void MPV_common_init_armv5te(MpegEncContext *s)
  35.101 +{   /* Point both H.263 dequantizer hooks of s at the armv5te wrappers defined in this file. */
  35.102 +    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
  35.103 +    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
  35.104 +}

    36.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    36.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_armv5te_s.S	Mon Aug 27 12:09:56 2012 +0200
    36.3 @@ -0,0 +1,117 @@
    36.4 +/*
    36.5 + * Optimization of some functions from mpegvideo.c for armv5te
    36.6 + * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
    36.7 + *
    36.8 + * This file is part of FFmpeg.
    36.9 + *
   36.10 + * FFmpeg is free software; you can redistribute it and/or
   36.11 + * modify it under the terms of the GNU Lesser General Public
   36.12 + * License as published by the Free Software Foundation; either
   36.13 + * version 2.1 of the License, or (at your option) any later version.
   36.14 + *
   36.15 + * FFmpeg is distributed in the hope that it will be useful,
   36.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   36.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   36.18 + * Lesser General Public License for more details.
   36.19 + *
   36.20 + * You should have received a copy of the GNU Lesser General Public
   36.21 + * License along with FFmpeg; if not, write to the Free Software
   36.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   36.23 + */
   36.24 +
   36.25 +#include "config.h"
   36.26 +#include "asm.S"
   36.27 +
   36.28 +/*
   36.29 + * Special optimized version of dct_unquantize_h263_helper_c, it
   36.30 + * requires the block to be at least 8 bytes aligned, and may process
   36.31 + * more elements than requested.  But it is guaranteed to never
   36.32 + * process more than 64 elements provided that count argument is <= 64,
   36.33 + * so it is safe. This function is optimized for a common distribution
   36.34 + * of values for nCoeffs (they are mostly multiple of 8 plus one or
   36.35 + * two extra elements). So this function processes data as 8 elements
   36.36 + * per loop iteration and contains optional 2 elements processing in
   36.37 + * the end.
   36.38 + *
   36.39 + * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
   36.40 + */
   36.41 +function ff_dct_unquantize_h263_armv5te, export=1
   36.42 +        push            {r4-r9,lr}              /* used below as: r0 = block, r1 = qmul, r2 = qadd, r3 = count */
   36.43 +        mov             ip, #0                  /* zero operand for the rsbs sign tests */
   36.44 +        subs            r3, r3, #2              /* r3 = count - 2 */
   36.45 +        ble             2f                      /* count <= 2: only the two-element tail runs */
   36.46 +        ldrd            r4, [r0, #0]            /* r4,r5 = first four 16-bit coefficients */
   36.47 +1:
   36.48 +        ldrd            r6, [r0, #8]            /* r6,r7 = next four coefficients */
   36.49 +
   36.50 +        rsbs            r9, ip, r4, asr #16     /* r9 = top halfword of r4, sign-extended; flags give its sign */
   36.51 +        addgt           r9, r2, #0              /* positive: addend = +qadd */
   36.52 +        rsblt           r9, r2, #0              /* negative: addend = -qadd */
   36.53 +        smlatbne        r9, r4, r1, r9          /* nonzero: r9 = coeff * qmul + addend (zero stays 0) */
   36.54 +
   36.55 +        rsbs            lr, ip, r5, asr #16     /* same for the top halfword of r5 */
   36.56 +        addgt           lr, r2, #0
   36.57 +        rsblt           lr, r2, #0
   36.58 +        smlatbne        lr, r5, r1, lr
   36.59 +
   36.60 +        rsbs            r8, ip, r4, asl #16     /* flags from the bottom halfword of r4 */
   36.61 +        addgt           r8, r2, #0
   36.62 +        rsblt           r8, r2, #0
   36.63 +        smlabbne        r4, r4, r1, r8          /* result replaces r4; only its low halfword is stored */
   36.64 +
   36.65 +        rsbs            r8, ip, r5, asl #16     /* same for the bottom halfword of r5 */
   36.66 +        addgt           r8, r2, #0
   36.67 +        rsblt           r8, r2, #0
   36.68 +        smlabbne        r5, r5, r1, r8
   36.69 +
   36.70 +        strh            r4, [r0], #2            /* write back bottom/top of r4, then bottom/top of r5 */
   36.71 +        strh            r9, [r0], #2            /* (matches element order if halfwords are little-endian */
   36.72 +        strh            r5, [r0], #2            /*  within each word; TODO confirm for big-endian builds) */
   36.73 +        strh            lr, [r0], #2
   36.74 +
   36.75 +        rsbs            r9, ip, r6, asr #16     /* second group of four: same steps on r6,r7 */
   36.76 +        addgt           r9, r2, #0
   36.77 +        rsblt           r9, r2, #0
   36.78 +        smlatbne        r9, r6, r1, r9
   36.79 +
   36.80 +        rsbs            lr, ip, r7, asr #16
   36.81 +        addgt           lr, r2, #0
   36.82 +        rsblt           lr, r2, #0
   36.83 +        smlatbne        lr, r7, r1, lr
   36.84 +
   36.85 +        rsbs            r8, ip, r6, asl #16
   36.86 +        addgt           r8, r2, #0
   36.87 +        rsblt           r8, r2, #0
   36.88 +        smlabbne        r6, r6, r1, r8
   36.89 +
   36.90 +        rsbs            r8, ip, r7, asl #16
   36.91 +        addgt           r8, r2, #0
   36.92 +        rsblt           r8, r2, #0
   36.93 +        smlabbne        r7, r7, r1, r8
   36.94 +
   36.95 +        strh            r6, [r0], #2
   36.96 +        strh            r9, [r0], #2
   36.97 +        strh            r7, [r0], #2
   36.98 +        strh            lr, [r0], #2
   36.99 +
  36.100 +        subs            r3, r3, #8              /* eight elements done in this pass */
  36.101 +        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
  36.102 +        bgt             1b
  36.103 +
  36.104 +        adds            r3, r3, #2              /* undo the initial bias: r3 = count minus elements processed */
  36.105 +        pople           {r4-r9,pc}              /* nothing left: return */
  36.106 +2:
  36.107 +        ldrsh           r9, [r0, #0]            /* tail: always handles two elements */
  36.108 +        ldrsh           lr, [r0, #2]
  36.109 +        mov             r8, r2                  /* default addend = +qadd */
  36.110 +        cmp             r9, #0
  36.111 +        rsblt           r8, r2, #0              /* negative: addend = -qadd */
  36.112 +        smlabbne        r9, r9, r1, r8          /* skipped for a zero coefficient */
  36.113 +        mov             r8, r2
  36.114 +        cmp             lr, #0
  36.115 +        rsblt           r8, r2, #0
  36.116 +        smlabbne        lr, lr, r1, r8
  36.117 +        strh            r9, [r0], #2
  36.118 +        strh            lr, [r0], #2
  36.119 +        pop             {r4-r9,pc}
  36.120 +endfunc

    37.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    37.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/mpegvideo_iwmmxt.c	Mon Aug 27 12:09:56 2012 +0200
    37.3 @@ -0,0 +1,120 @@
    37.4 +/*
    37.5 + * copyright (c) 2004 AGAWA Koji
    37.6 + *
    37.7 + * This file is part of FFmpeg.
    37.8 + *
    37.9 + * FFmpeg is free software; you can redistribute it and/or
   37.10 + * modify it under the terms of the GNU Lesser General Public
   37.11 + * License as published by the Free Software Foundation; either
   37.12 + * version 2.1 of the License, or (at your option) any later version.
   37.13 + *
   37.14 + * FFmpeg is distributed in the hope that it will be useful,
   37.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   37.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   37.17 + * Lesser General Public License for more details.
   37.18 + *
   37.19 + * You should have received a copy of the GNU Lesser General Public
   37.20 + * License along with FFmpeg; if not, write to the Free Software
   37.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   37.22 + */
   37.23 +
   37.24 +#include "libavcodec/avcodec.h"
   37.25 +#include "libavcodec/dsputil.h"
   37.26 +#include "libavcodec/mpegvideo.h"
   37.27 +#include "mpegvideo_arm.h"
   37.28 +
   37.29 +static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
   37.30 +                                             DCTELEM *block, int n, int qscale)
   37.31 +{
   37.32 +    int level, qmul, qadd;
   37.33 +    int nCoeffs, iterations;
   37.34 +    DCTELEM *block_orig = block;
   37.35 +    /* Dequantizes an intra block in place with iwMMXt; the DC value is computed in C and written back last. */
   37.36 +    assert(s->block_last_index[n]>=0);
   37.37 +
   37.38 +    qmul = qscale << 1;
   37.39 +
   37.40 +    if (!s->h263_aic) {
   37.41 +        if (n < 4)
   37.42 +            level = block[0] * s->y_dc_scale;
   37.43 +        else
   37.44 +            level = block[0] * s->c_dc_scale;
   37.45 +        qadd = (qscale - 1) | 1;
   37.46 +    }else{
   37.47 +        qadd = 0;
   37.48 +        level = block[0];
   37.49 +    }
   37.50 +    nCoeffs = s->ac_pred ? 63 : s->inter_scantable.raster_end[ s->block_last_index[n] ];
   37.51 +
   37.52 +    /* Each loop pass handles 8 coefficients (two 64-bit loads). The pass count
   37.53 +     * is a read-write asm operand because the loop itself decrements it. */
   37.54 +    iterations = (nCoeffs + 8) / 8;
   37.55 +    __asm__ volatile (
   37.56 +/*      "movd %1, %%mm6                 \n\t" //qmul */
   37.57 +/*      "packssdw %%mm6, %%mm6          \n\t" */
   37.58 +/*      "packssdw %%mm6, %%mm6          \n\t" */
   37.59 +        "tbcsth wr6, %[qmul]            \n\t"
   37.60 +/*      "movd %2, %%mm5                 \n\t" //qadd */
   37.61 +/*      "packssdw %%mm5, %%mm5          \n\t" */
   37.62 +/*      "packssdw %%mm5, %%mm5          \n\t" */
   37.63 +        "tbcsth wr5, %[qadd]            \n\t"
   37.64 +        "wzero wr7                      \n\t" /* "pxor %%mm7, %%mm7             \n\t" */
   37.65 +        "wzero wr4                      \n\t" /* "pxor %%mm4, %%mm4             \n\t" */
   37.66 +        "wsubh wr7, wr5, wr7            \n\t" /* "psubw %%mm5, %%mm7            \n\t" */
   37.67 +        "1:                             \n\t"
   37.68 +        "wldrd wr2, [%[block]]          \n\t" /* "movq (%0, %3), %%mm0          \n\t" */
   37.69 +        "wldrd wr3, [%[block], #8]      \n\t" /* "movq 8(%0, %3), %%mm1         \n\t" */
   37.70 +        "wmulsl wr0, wr6, wr2           \n\t" /* "pmullw %%mm6, %%mm0           \n\t" */
   37.71 +        "wmulsl wr1, wr6, wr3           \n\t" /* "pmullw %%mm6, %%mm1           \n\t" */
   37.72 +/*      "movq (%0, %3), %%mm2           \n\t" */
   37.73 +/*      "movq 8(%0, %3), %%mm3          \n\t" */
   37.74 +        "wcmpgtsh wr2, wr4, wr2         \n\t" /* "pcmpgtw %%mm4, %%mm2          \n\t" // block[i] < 0 ? -1 : 0 */
   37.75 +        "wcmpgtsh wr3, wr4, wr3         \n\t" /* "pcmpgtw %%mm4, %%mm3          \n\t" // block[i] < 0 ? -1 : 0 */
   37.76 +        "wxor wr0, wr2, wr0             \n\t" /* "pxor %%mm2, %%mm0             \n\t" */
   37.77 +        "wxor wr1, wr3, wr1             \n\t" /* "pxor %%mm3, %%mm1             \n\t" */
   37.78 +        "waddh wr0, wr7, wr0            \n\t" /* "paddw %%mm7, %%mm0            \n\t" */
   37.79 +        "waddh wr1, wr7, wr1            \n\t" /* "paddw %%mm7, %%mm1            \n\t" */
   37.80 +        "wxor wr2, wr0, wr2             \n\t" /* "pxor %%mm0, %%mm2             \n\t" */
   37.81 +        "wxor wr3, wr1, wr3             \n\t" /* "pxor %%mm1, %%mm3             \n\t" */
   37.82 +        "wcmpeqh wr0, wr7, wr0          \n\t" /* "pcmpeqw %%mm7, %%mm0          \n\t" // block[i] == 0 ? -1 : 0 */
   37.83 +        "wcmpeqh wr1, wr7, wr1          \n\t" /* "pcmpeqw %%mm7, %%mm1          \n\t" // block[i] == 0 ? -1 : 0 */
   37.84 +        "wandn wr0, wr2, wr0            \n\t" /* "pandn %%mm2, %%mm0            \n\t" */
   37.85 +        "wandn wr1, wr3, wr1            \n\t" /* "pandn %%mm3, %%mm1            \n\t" */
   37.86 +        "wstrd wr0, [%[block]]          \n\t" /* "movq %%mm0, (%0, %3)          \n\t" */
   37.87 +        "wstrd wr1, [%[block], #8]      \n\t" /* "movq %%mm1, 8(%0, %3)         \n\t" */
   37.88 +        "add %[block], %[block], #16    \n\t" /* "addl $16, %3                  \n\t" */
   37.89 +        "subs %[i], %[i], #1            \n\t"
   37.90 +        "bne 1b                         \n\t" /* "jng 1b                                \n\t" */
   37.91 +        :[block]"+r"(block), [i]"+r"(iterations)
   37.92 +        :[qmul]"r"(qmul), [qadd]"r"(qadd)
   37.93 +        :"memory", "cc");
   37.94 +
   37.95 +    block_orig[0] = level; /* the asm loop also rewrote block[0]; store the separately computed DC value */
   37.96 +}
   37.97 +
   37.98 +#if 0 /* not compiled; NOTE(review): presumably disabled because it relies on the external ippiQuantInvInter_Compact_H263_16s_I routine - confirm */
   37.99 +static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
  37.100 +                                             DCTELEM *block, int n, int qscale)
  37.101 +{
  37.102 +    int nCoeffs;
  37.103 +
  37.104 +    assert(s->block_last_index[n]>=0);
  37.105 +
  37.106 +    if(s->ac_pred)
  37.107 +        nCoeffs=63; /* whole block */
  37.108 +    else
  37.109 +        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
  37.110 +
  37.111 +    ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); /* in-place call on nCoeffs+1 coefficients */
  37.112 +}
  37.113 +#endif
  37.114 +
  37.115 +void MPV_common_init_iwmmxt(MpegEncContext *s)
  37.116 +{   /* Installs the iwMMXt intra dequantizer into s, but only when mm_flags reports FF_MM_IWMMXT. */
  37.117 +    if (!(mm_flags & FF_MM_IWMMXT)) return;
  37.118 +
  37.119 +    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
  37.120 +#if 0 /* the inter variant is itself compiled out above, so its hook is left untouched */
  37.121 +    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
  37.122 +#endif
  37.123 +}

    38.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    38.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/rdft_neon.S	Mon Aug 27 12:09:56 2012 +0200
    38.3 @@ -0,0 +1,151 @@
    38.4 +/*
    38.5 + * ARM NEON optimised RDFT
    38.6 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
    38.7 + *
    38.8 + * This file is part of FFmpeg.
    38.9 + *
   38.10 + * FFmpeg is free software; you can redistribute it and/or
   38.11 + * modify it under the terms of the GNU Lesser General Public
   38.12 + * License as published by the Free Software Foundation; either
   38.13 + * version 2.1 of the License, or (at your option) any later version.
   38.14 + *
   38.15 + * FFmpeg is distributed in the hope that it will be useful,
   38.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   38.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   38.18 + * Lesser General Public License for more details.
   38.19 + *
   38.20 + * You should have received a copy of the GNU Lesser General Public
   38.21 + * License along with FFmpeg; if not, write to the Free Software
   38.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   38.23 + */
   38.24 +
   38.25 +#include "asm.S"
   38.26 +
   38.27 +        preserve8
   38.28 +@ ff_rdft_calc_neon(r0 = context, r1 = data): tcos/tsin combine pass over data; the ff_fft_permute_neon + ff_fft_calc_neon pair runs before it when the inverse bit is clear, after it when set.
   38.29 +function ff_rdft_calc_neon, export=1
   38.30 +        push            {r4-r8,lr}
   38.31 +        @ r4 = context, r5 = data, r6 = inverse flag; these are relied on to survive the bl calls. NOTE(review): argument types are not visible here, field names come from the offset comments.
   38.32 +        ldr             r6,  [r0, #4]           @ inverse
   38.33 +        mov             r4,  r0
   38.34 +        mov             r5,  r1
   38.35 +
   38.36 +        lsls            r6,  r6,  #31           @ bit 0 -> bit 31, so r6 is 0 or 1<<31; Z is set when bit 0 was clear
   38.37 +        bne             1f                      @ inverse bit set: skip the leading FFT
   38.38 +        add             r0,  r4,  #20           @ context+20 is what the FFT routines receive (presumably an embedded FFT context -- confirm)
   38.39 +        bl              X(ff_fft_permute_neon)
   38.40 +        add             r0,  r4,  #20
   38.41 +        mov             r1,  r5
   38.42 +        bl              X(ff_fft_calc_neon)
   38.43 +1:
   38.44 +        ldr             r12, [r4, #0]           @ nbits
   38.45 +        mov             r2,  #1
   38.46 +        lsl             r12, r2,  r12           @ r12 = 1 << nbits
   38.47 +        add             r0,  r5,  #8            @ ascending read pointer = data + 8 bytes
   38.48 +        add             r1,  r5,  r12, lsl #2   @ data + 4*(1 << nbits) bytes
   38.49 +        lsr             r12, r12, #2            @ r12 = (1 << nbits) / 4
   38.50 +        ldr             r2,  [r4, #12]          @ tcos
   38.51 +        sub             r12, r12, #2            @ loop counter; loop 2 subtracts 2 per pass
   38.52 +        ldr             r3,  [r4, #16]          @ tsin
   38.53 +        mov             r7,  r0                 @ ascending write pointer
   38.54 +        sub             r1,  r1,  #8            @ descending read pointer = data + 4*(1 << nbits) - 8
   38.55 +        mov             lr,  r1                 @ descending write pointer
   38.56 +        mov             r8,  #-8                @ post-index step for the descending pointers
   38.57 +        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
   38.58 +        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
   38.59 +        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
   38.60 +        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
   38.61 +        vmov.f32        d18, #0.5               @ k1
   38.62 +        vdup.32         d19, r6                 @ inverse sign mask (0 or 1<<31) in both lanes
   38.63 +        pld             [r0, #32]
   38.64 +        veor            d19, d18, d19           @ k2 = 0.5 with its sign flipped when inverse
   38.65 +        vmov.i32        d16, #0
   38.66 +        vmov.i32        d17, #1<<31
   38.67 +        pld             [r1, #-32]
   38.68 +        vtrn.32         d16, d17                @ build per-lane sign masks in q8 (d16:d17)
   38.69 +        pld             [r2, #32]
   38.70 +        vrev64.32       d16, d16                @ d16=1,0 d17=0,1
   38.71 +        pld             [r3, #32]
   38.72 +2:
   38.73 +        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
   38.74 +        vld1.32         {d24},    [r0,:64]!     @  d1[0,1]
   38.75 +        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
   38.76 +        vld1.32         {d25},    [r1,:64], r8  @  d2[0,1]
   38.77 +        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
   38.78 +        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
   38.79 +        pld             [r0, #32]
   38.80 +        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
   38.81 +        pld             [r1, #-32]
   38.82 +        vadd.f32        d0,  d24, d7            @  d1[0]+d2[0], d1[1]-d2[1]
   38.83 +        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
   38.84 +        vmul.f32        q11, q0,  q9            @  ev.re, ev.im, od.im, od.re
   38.85 +        veor            d7,  d21, d16           @ -od.im, od.re
   38.86 +        vrev64.32       d3,  d21                @  od.re, od.im
   38.87 +        veor            d6,  d20, d17           @  ev.re,-ev.im
   38.88 +        veor            d2,  d3,  d16           @ -od.re, od.im
   38.89 +        vmla.f32        d20, d3,  d4[1]         @ first pair: lane 1 of the tcos/tsin words loaded earlier
   38.90 +        vmla.f32        d20, d7,  d5[1]
   38.91 +        vmla.f32        d6,  d2,  d4[1]
   38.92 +        vmla.f32        d6,  d21, d5[1]
   38.93 +        vld1.32         {d4},     [r2,:64]!     @  tcos[i]
   38.94 +        veor            d7,  d23, d16           @ -od.im, od.re
   38.95 +        vld1.32         {d5},     [r3,:64]!     @  tsin[i]
   38.96 +        veor            d24, d22, d17           @  ev.re,-ev.im
   38.97 +        vrev64.32       d3,  d23                @  od.re, od.im
   38.98 +        pld             [r2, #32]
   38.99 +        veor            d2,  d3,  d16           @ -od.re, od.im
  38.100 +        pld             [r3, #32]
  38.101 +        vmla.f32        d22, d3,  d4[0]         @ second pair: lane 0 of the tcos/tsin words just reloaded
  38.102 +        vmla.f32        d22, d7,  d5[0]
  38.103 +        vmla.f32        d24, d2,  d4[0]
  38.104 +        vmla.f32        d24, d23, d5[0]
  38.105 +        vld1.32         {d0},     [r0,:64]!     @  d1[0,1]
  38.106 +        vld1.32         {d1},     [r1,:64], r8  @  d2[0,1]
  38.107 +        vst1.32         {d20},    [r7,:64]!     @ first pair result, ascending side
  38.108 +        vst1.32         {d6},     [lr,:64], r8  @ first pair result, descending side
  38.109 +        vst1.32         {d22},    [r7,:64]!     @ second pair result, ascending side
  38.110 +        vst1.32         {d24},    [lr,:64], r8  @ second pair result, descending side
  38.111 +        subs            r12, r12, #2            @ two pairs were handled in this pass
  38.112 +        bgt             2b
  38.113 +        @ tail: finish the pair already loaded in q0, then fix up the word at r0+4 and the first two words of data
  38.114 +        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
  38.115 +        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
  38.116 +        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
  38.117 +        ldr             r2,  [r4, #8]           @  sign_convention
  38.118 +        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
  38.119 +        add             r0,  r0,  #4            @ r0 -> word 4 bytes past the ascending read pointer
  38.120 +        bfc             r2,  #0,  #31           @ clear bits 0..30, keeping only bit 31 of sign_convention
  38.121 +        vld1.32         {d0[0]},  [r0,:32]
  38.122 +        veor            d7,  d21, d16           @ -od.im, od.re
  38.123 +        vrev64.32       d3,  d21                @  od.re, od.im
  38.124 +        veor            d6,  d20, d17           @  ev.re,-ev.im
  38.125 +        vld1.32         {d22},    [r5,:64]      @ first two 32-bit words of data
  38.126 +        vdup.32         d1,  r2
  38.127 +        vmov            d23, d22
  38.128 +        veor            d2,  d3,  d16           @ -od.re, od.im
  38.129 +        vtrn.32         d22, d23                @ d22 = data[0],data[0]  d23 = data[1],data[1]
  38.130 +        veor            d0,  d0,  d1            @ flip the sign bit of the word from r0 when bit 31 of sign_convention is set
  38.131 +        veor            d23, d23, d17           @ d23 = data[1],-data[1]
  38.132 +        vmla.f32        d20, d3,  d4[1]
  38.133 +        vmla.f32        d20, d7,  d5[1]
  38.134 +        vmla.f32        d6,  d2,  d4[1]
  38.135 +        vmla.f32        d6,  d21, d5[1]
  38.136 +        vadd.f32        d22, d22, d23           @ data[0]+data[1], data[0]-data[1]
  38.137 +        vst1.32         {d20},    [r7,:64]
  38.138 +        vst1.32         {d6},     [lr,:64]
  38.139 +        vst1.32         {d0[0]},  [r0,:32]
  38.140 +        vst1.32         {d22},    [r5,:64]
  38.141 +
  38.142 +        cmp             r6,  #0
  38.143 +        popeq           {r4-r8,pc}              @ inverse bit clear: return here
  38.144 +        @ inverse bit set: halve the first pair, then run permute + FFT on the combined data
  38.145 +        vmul.f32        d22, d22, d18           @ scale by k1 = 0.5
  38.146 +        vst1.32         {d22},    [r5,:64]
  38.147 +        add             r0,  r4,  #20
  38.148 +        mov             r1,  r5
  38.149 +        bl              X(ff_fft_permute_neon)
  38.150 +        add             r0,  r4,  #20
  38.151 +        mov             r1,  r5
  38.152 +        pop             {r4-r8,lr}              @ restore saved registers, including the original return address
  38.153 +        b               X(ff_fft_calc_neon)     @ tail call: ff_fft_calc_neon returns straight to our caller
  38.154 +endfunc

    39.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    39.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_arm.S	Mon Aug 27 12:09:56 2012 +0200
    39.3 @@ -0,0 +1,486 @@
    39.4 +/*
    39.5 + * simple_idct_arm.S
    39.6 + * Copyright (C) 2002 Frederic 'dilb' Boulay
    39.7 + *
    39.8 + * Author: Frederic Boulay <dilb@handhelds.org>
    39.9 + *
   39.10 + * The function defined in this file is derived from the simple_idct function
   39.11 + * from the libavcodec library part of the FFmpeg project.
   39.12 + *
   39.13 + * This file is part of FFmpeg.
   39.14 + *
   39.15 + * FFmpeg is free software; you can redistribute it and/or
   39.16 + * modify it under the terms of the GNU Lesser General Public
   39.17 + * License as published by the Free Software Foundation; either
   39.18 + * version 2.1 of the License, or (at your option) any later version.
   39.19 + *
   39.20 + * FFmpeg is distributed in the hope that it will be useful,
   39.21 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   39.22 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   39.23 + * Lesser General Public License for more details.
   39.24 + *
   39.25 + * You should have received a copy of the GNU Lesser General Public
   39.26 + * License along with FFmpeg; if not, write to the Free Software
   39.27 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   39.28 + */
   39.29 +
   39.30 +#include "asm.S"
   39.31 +
   39.32 +/* Useful constants for the algorithm; they are saved in __constant_ptr__ at */
   39.33 +/* the end of the source code. */
   39.34 +#define W1  22725 /* NOTE(review): W1..W7 look like the fixed-point cosine weights of libavcodec's simple_idct -- confirm */
   39.35 +#define W2  21407
   39.36 +#define W3  19266
   39.37 +#define W4  16383 /* weight applied to row[0]/col[0] and row[4]/col[4] in the a0..a3 terms */
   39.38 +#define W5  12873
   39.39 +#define W6  8867
   39.40 +#define W7  4520
   39.41 +#define MASK_MSHW 0xFFFF0000 /* keeps the most significant halfword of a 32-bit word */
   39.42 +
   39.43 +/* byte offsets of the constants within __constant_ptr__ (one 32-bit .word each), used as [r12, #offWn] */
   39.44 +#define offW1  0
   39.45 +#define offW2  4
   39.46 +#define offW3  8
   39.47 +#define offW4  12
   39.48 +#define offW5  16
   39.49 +#define offW6  20
   39.50 +#define offW7  24
   39.51 +#define offMASK_MSHW 28
   39.52 +
   39.53 +#define ROW_SHIFT 11 /* right shift applied to the row-pass results */
   39.54 +#define ROW_SHIFT2MSHW (16-11) /* 16 - ROW_SHIFT: left shift that, combined with MASK_MSHW, places (x >> ROW_SHIFT) in the upper halfword */
   39.55 +#define COL_SHIFT 20 /* right shift applied to the column-pass results */
   39.56 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1), rounding term added to a0 in the row pass */
   39.57 +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1), rounding term added to a0 in the column pass */
   39.58 +
   39.59 +
   39.60 +        .text
   39.61 +
   39.62 +function ff_simple_idct_arm, export=1
   39.63 +        @@ void simple_idct_arm(int16_t *block)
   39.64 +        @@ save stack for reg needed (take all of them),
   39.65 +        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
   39.66 +        @@ so it must not be overwritten, if it is not saved!!
   39.67 +        @@ R12 is another scratch register, so it should not be saved too
   39.68 +        @@ save all registers
   39.69 +        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
   39.70 +        @@ at this point, R0=block, other registers are free.
   39.71 +        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
   39.72 +        adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
   39.73 +        @@ add 2 temporary variables in the stack: R0 and R14
   39.74 +        sub sp, sp, #8          @ allow 2 local variables
   39.75 +        str r0, [sp, #0]        @ save block in sp[0]
   39.76 +        @@ stack status
   39.77 +        @@ sp+4   free
   39.78 +        @@ sp+0   R0  (block)
   39.79 +
   39.80 +
   39.81 +        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
   39.82 +
   39.83 +
   39.84 +__row_loop:
   39.85 +        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
   39.86 +        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
   39.87 +        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
   39.88 +        ldr r3, [r14, #8]        @ R3=ROWr32[2]
   39.89 +        ldr r4, [r14, #12]       @ R4=ROWr32[3]
   39.90 +        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
   39.91 +        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
   39.92 +        @@ else follow the complete algorithm.
   39.93 +        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
   39.94 +        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
   39.95 +        orr r5, r4, r3           @ R5=R4 | R3
   39.96 +        orr r5, r5, r2           @ R5=R4 | R3 | R2
   39.97 +        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
   39.98 +        beq __end_row_loop
   39.99 +        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  39.100 +        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
  39.101 +        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
  39.102 +        beq __almost_empty_row
  39.103 +
  39.104 +__b_evaluation:
  39.105 +        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
  39.106 +        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
  39.107 +        @@     R12=__const_ptr_, R14=&block[n]
  39.108 +        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
  39.109 +
  39.110 +        @@ MUL16(b0, W1, row[1]);
  39.111 +        @@ MUL16(b1, W3, row[1]);
  39.112 +        @@ MUL16(b2, W5, row[1]);
  39.113 +        @@ MUL16(b3, W7, row[1]);
  39.114 +        @@ MAC16(b0, W3, row[3]);
  39.115 +        @@ MAC16(b1, -W7, row[3]);
  39.116 +        @@ MAC16(b2, -W1, row[3]);
  39.117 +        @@ MAC16(b3, -W5, row[3]);
  39.118 +        ldr r8, [r12, #offW1]    @ R8=W1
  39.119 +        mov r2, r2, asr #16      @ R2=ROWr16[3]
  39.120 +        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.121 +        ldr r9, [r12, #offW3]    @ R9=W3
  39.122 +        ldr r10, [r12, #offW5]   @ R10=W5
  39.123 +        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.124 +        ldr r11, [r12, #offW7]   @ R11=W7
  39.125 +        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.126 +        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.127 +                teq r2, #0               @ if null avoid muls
  39.128 +                mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.129 +        rsbne r2, r2, #0         @ R2=-ROWr16[3]
  39.130 +        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.131 +        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.132 +        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.133 +
  39.134 +        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
  39.135 +        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
  39.136 +        @@     R12=__const_ptr_, R14=&block[n]
  39.137 +        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
  39.138 +        @@ if (temp != 0) {}
  39.139 +        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
  39.140 +        beq __end_b_evaluation
  39.141 +
  39.142 +        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
  39.143 +        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
  39.144 +        @@     R12=__const_ptr_, R14=&block[n]
  39.145 +        @@ MAC16(b0, W5, row[5]);
  39.146 +        @@ MAC16(b2, W7, row[5]);
  39.147 +        @@ MAC16(b3, W3, row[5]);
  39.148 +        @@ MAC16(b1, -W1, row[5]);
  39.149 +        @@ MAC16(b0, W7, row[7]);
  39.150 +        @@ MAC16(b2, W3, row[7]);
  39.151 +        @@ MAC16(b3, -W1, row[7]);
  39.152 +        @@ MAC16(b1, -W5, row[7]);
  39.153 +        mov r3, r3, asr #16      @ R3=ROWr16[5]
  39.154 +                teq r3, #0               @ if null avoid muls
  39.155 +        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
  39.156 +        mov r4, r4, asr #16      @ R4=ROWr16[7]
  39.157 +        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
  39.158 +        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
  39.159 +        rsbne r3, r3, #0         @ R3=-ROWr16[5]
  39.160 +        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
  39.161 +        @@ R3 is free now
  39.162 +                teq r4, #0               @ if null avoid muls
  39.163 +        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
  39.164 +        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
  39.165 +        rsbne r4, r4, #0         @ R4=-ROWr16[7]
  39.166 +        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
  39.167 +        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
  39.168 +        @@ R4 is free now
  39.169 +__end_b_evaluation:
  39.170 +        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
  39.171 +        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  39.172 +        @@     R12=__const_ptr_, R14=&block[n]
  39.173 +
  39.174 +__a_evaluation:
  39.175 +        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
  39.176 +        @@ a1 = a0 + W6 * row[2];
  39.177 +        @@ a2 = a0 - W6 * row[2];
  39.178 +        @@ a3 = a0 - W2 * row[2];
  39.179 +        @@ a0 = a0 + W2 * row[2];
  39.180 +        ldr r9, [r12, #offW4]    @ R9=W4
  39.181 +        mul r6, r9, r6           @ R6=W4*ROWr16[0]
  39.182 +        ldr r10, [r12, #offW6]   @ R10=W6
  39.183 +        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
  39.184 +        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
  39.185 +
  39.186 +        mul r11, r10, r4         @ R11=W6*ROWr16[2]
  39.187 +        ldr r8, [r12, #offW2]    @ R8=W2
  39.188 +        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
  39.189 +        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
  39.190 +        @@ if (temp != 0) {}
  39.191 +        teq r2, #0
  39.192 +        beq __end_bef_a_evaluation
  39.193 +
  39.194 +        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
  39.195 +        mul r11, r8, r4          @ R11=W2*ROWr16[2]
  39.196 +        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
  39.197 +        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
  39.198 +
  39.199 +
  39.200 +        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
  39.201 +        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
  39.202 +        @@     R12=__const_ptr_, R14=&block[n]
  39.203 +
  39.204 +
  39.205 +        @@ a0 += W4*row[4]
  39.206 +        @@ a1 -= W4*row[4]
  39.207 +        @@ a2 -= W4*row[4]
  39.208 +        @@ a3 += W4*row[4]
  39.209 +        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
  39.210 +                teq r11, #0              @ if null avoid muls
  39.211 +        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
  39.212 +        @@ R9 is free now
  39.213 +        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
  39.214 +        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
  39.215 +        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
  39.216 +        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
  39.217 +        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
  39.218 +        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
  39.219 +                teq r9, #0               @ if null avoid muls
  39.220 +        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
  39.221 +        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
  39.222 +        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
  39.223 +        @@ a0 += W6*row[6];
  39.224 +        @@ a3 -= W6*row[6];
  39.225 +        @@ a1 -= W2*row[6];
  39.226 +        @@ a2 += W2*row[6];
  39.227 +        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
  39.228 +        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
  39.229 +        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
  39.230 +
  39.231 +__end_a_evaluation:
  39.232 +        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
  39.233 +        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  39.234 +        @@     R12=__const_ptr_, R14=&block[n]
  39.235 +        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
  39.236 +        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
  39.237 +        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
  39.238 +        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
  39.239 +        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
  39.240 +        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
  39.241 +        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
  39.242 +        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
  39.243 +        add r8, r6, r0           @ R8=a0+b0
  39.244 +        add r9, r2, r1           @ R9=a1+b1
  39.245 +        @@ put 2 16 bits half-words in a 32bits word
  39.246 +        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
  39.247 +        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
  39.248 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
  39.249 +        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
  39.250 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
  39.251 +        orr r8, r8, r9
  39.252 +        str r8, [r14, #0]
  39.253 +
  39.254 +        add r8, r3, r5           @ R8=a2+b2
  39.255 +        add r9, r4, r7           @ R9=a3+b3
  39.256 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
  39.257 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
  39.258 +        orr r8, r8, r9
  39.259 +        str r8, [r14, #4]
  39.260 +
  39.261 +        sub r8, r4, r7           @ R8=a3-b3
  39.262 +        sub r9, r3, r5           @ R9=a2-b2
  39.263 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
  39.264 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
  39.265 +        orr r8, r8, r9
  39.266 +        str r8, [r14, #8]
  39.267 +
  39.268 +        sub r8, r2, r1           @ R8=a1-b1
  39.269 +        sub r9, r6, r0           @ R9=a0-b0
  39.270 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
  39.271 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
  39.272 +        orr r8, r8, r9
  39.273 +        str r8, [r14, #12]
  39.274 +
  39.275 +        bal __end_row_loop
  39.276 +
  39.277 +__almost_empty_row:
  39.278 +        @@ the row was empty, except ROWr16[0], now, management of this special case
  39.279 +        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  39.280 +        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
  39.281 +        @@                R8=0xFFFF (temp), R9-R11 free
  39.282 +        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
  39.283 +        sub r8, r8, #1           @ R8 is now ready.
  39.284 +        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
  39.285 +        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
  39.286 +        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
  39.287 +        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
  39.288 +        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
  39.289 +        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
  39.290 +
  39.291 +__end_row_loop:
  39.292 +        @@ at this point, R0-R11 (free)
  39.293 +        @@     R12=__const_ptr_, R14=&block[n]
  39.294 +        ldr r0, [sp, #0]         @ R0=block
  39.295 +        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
  39.296 +        sub r14, r14, #16
  39.297 +        bne __row_loop
  39.298 +
  39.299 +
  39.300 +
  39.301 +        @@ at this point, R0=block, R1-R11 (free)
  39.302 +        @@     R12=__const_ptr_, R14=&block[n]
  39.303 +        add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
  39.304 +__col_loop:
  39.305 +
  39.306 +__b_evaluation2:
  39.307 +        @@ at this point, R0=block (temp),  R1-R11 (free)
  39.308 +        @@     R12=__const_ptr_, R14=&block[n]
  39.309 +        @@ proceed with b0-b3 first, followed by a0-a3
  39.310 +        @@ MUL16(b0, W1, col[8x1]);
  39.311 +        @@ MUL16(b1, W3, col[8x1]);
  39.312 +        @@ MUL16(b2, W5, col[8x1]);
  39.313 +        @@ MUL16(b3, W7, col[8x1]);
  39.314 +        @@ MAC16(b0, W3, col[8x3]);
  39.315 +        @@ MAC16(b1, -W7, col[8x3]);
  39.316 +        @@ MAC16(b2, -W1, col[8x3]);
  39.317 +        @@ MAC16(b3, -W5, col[8x3]);
  39.318 +        ldr r8, [r12, #offW1]    @ R8=W1
  39.319 +        ldrsh r7, [r14, #16]
  39.320 +        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.321 +        ldr r9, [r12, #offW3]    @ R9=W3
  39.322 +        ldr r10, [r12, #offW5]   @ R10=W5
  39.323 +        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.324 +        ldr r11, [r12, #offW7]   @ R11=W7
  39.325 +        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.326 +        ldrsh r2, [r14, #48]
  39.327 +        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
  39.328 +        teq r2, #0               @ if 0, then avoid muls
  39.329 +        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.330 +        rsbne r2, r2, #0         @ R2=-ROWr16[3]
  39.331 +        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.332 +        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.333 +        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
  39.334 +
  39.335 +        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
  39.336 +        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
  39.337 +        @@     R12=__const_ptr_, R14=&block[n]
  39.338 +        @@ MAC16(b0, W5, col[5x8]);
  39.339 +        @@ MAC16(b2, W7, col[5x8]);
  39.340 +        @@ MAC16(b3, W3, col[5x8]);
  39.341 +        @@ MAC16(b1, -W1, col[5x8]);
  39.342 +        @@ MAC16(b0, W7, col[7x8]);
  39.343 +        @@ MAC16(b2, W3, col[7x8]);
  39.344 +        @@ MAC16(b3, -W1, col[7x8]);
  39.345 +        @@ MAC16(b1, -W5, col[7x8]);
  39.346 +        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
  39.347 +        teq r3, #0               @ if 0 then avoid muls
  39.348 +        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5x8]=b0
  39.349 +        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5x8]=b2
  39.350 +        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5x8]=b3
  39.351 +        rsbne r3, r3, #0         @ R3=-ROWr16[5x8]
  39.352 +        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
  39.353 +        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5x8]=b1
  39.354 +        @@ R3 is free now
  39.355 +        teq r4, #0               @ if 0 then avoid muls
  39.356 +        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7x8]=b0
  39.357 +        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7x8]=b2
  39.358 +        rsbne r4, r4, #0         @ R4=-ROWr16[7x8]
  39.359 +        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7x8]=b3
  39.360 +        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7x8]=b1
  39.361 +        @@ R4 is free now
  39.362 +__end_b_evaluation2:
  39.363 +        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
  39.364 +        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  39.365 +        @@     R12=__const_ptr_, R14=&block[n]
  39.366 +
  39.367 +__a_evaluation2:
  39.368 +        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
  39.369 +        @@ a1 = a0 + W6 * row[2];
  39.370 +        @@ a2 = a0 - W6 * row[2];
  39.371 +        @@ a3 = a0 - W2 * row[2];
  39.372 +        @@ a0 = a0 + W2 * row[2];
  39.373 +        ldrsh r6, [r14, #0]
  39.374 +        ldr r9, [r12, #offW4]    @ R9=W4
  39.375 +        mul r6, r9, r6           @ R6=W4*ROWr16[0]
  39.376 +        ldr r10, [r12, #offW6]   @ R10=W6
  39.377 +        ldrsh r4, [r14, #32]     @ R4=ROWr16[2] (a3 not defined yet)
  39.378 +        add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
  39.379 +        mul r11, r10, r4         @ R11=W6*ROWr16[2]
  39.380 +        ldr r8, [r12, #offW2]    @ R8=W2
  39.381 +        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
  39.382 +        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
  39.383 +        mul r11, r8, r4          @ R11=W2*ROWr16[2]
  39.384 +        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
  39.385 +        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
  39.386 +
  39.387 +        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
  39.388 +        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
  39.389 +        @@     R12=__const_ptr_, R14=&block[n]
  39.390 +        @@ a0 += W4*row[4]
  39.391 +        @@ a1 -= W4*row[4]
  39.392 +        @@ a2 -= W4*row[4]
  39.393 +        @@ a3 += W4*row[4]
  39.394 +        ldrsh r11, [r14, #64]    @ R11=ROWr16[4]
  39.395 +        teq r11, #0              @ if null avoid muls
  39.396 +        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
  39.397 +        @@ R9 is free now
  39.398 +        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
  39.399 +        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
  39.400 +        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
  39.401 +        ldrsh r9, [r14, #96]     @ R9=ROWr16[6]
  39.402 +        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
  39.403 +        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
  39.404 +        teq r9, #0               @ if null avoid muls
  39.405 +        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
  39.406 +        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
  39.407 +        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
  39.408 +        @@ a0 += W6*row[6];
  39.409 +        @@ a3 -= W6*row[6];
  39.410 +        @@ a1 -= W2*row[6];
  39.411 +        @@ a2 += W2*row[6];
  39.412 +        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
  39.413 +        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
  39.414 +        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
  39.415 +__end_a_evaluation2:
  39.416 +        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
  39.417 +        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
  39.418 +        @@     R12=__const_ptr_, R14=&block[n]
  39.419 +        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
  39.420 +        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
  39.421 +        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
  39.422 +        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
  39.423 +        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
  39.424 +        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
  39.425 +        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
  39.426 +        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
  39.427 +        @@@@@ no optimization here @@@@@
  39.428 +        add r8, r6, r0           @ R8=a0+b0
  39.429 +        add r9, r2, r1           @ R9=a1+b1
  39.430 +        mov r8, r8, asr #COL_SHIFT
  39.431 +        mov r9, r9, asr #COL_SHIFT
  39.432 +        strh r8, [r14, #0]
  39.433 +        strh r9, [r14, #16]
  39.434 +        add r8, r3, r5           @ R8=a2+b2
  39.435 +        add r9, r4, r7           @ R9=a3+b3
  39.436 +        mov r8, r8, asr #COL_SHIFT
  39.437 +        mov r9, r9, asr #COL_SHIFT
  39.438 +        strh r8, [r14, #32]
  39.439 +        strh r9, [r14, #48]
  39.440 +        sub r8, r4, r7           @ R8=a3-b3
  39.441 +        sub r9, r3, r5           @ R9=a2-b2
  39.442 +        mov r8, r8, asr #COL_SHIFT
  39.443 +        mov r9, r9, asr #COL_SHIFT
  39.444 +        strh r8, [r14, #64]
  39.445 +        strh r9, [r14, #80]
  39.446 +        sub r8, r2, r1           @ R8=a1-b1
  39.447 +        sub r9, r6, r0           @ R9=a0-b0
  39.448 +        mov r8, r8, asr #COL_SHIFT
  39.449 +        mov r9, r9, asr #COL_SHIFT
  39.450 +        strh r8, [r14, #96]
  39.451 +        strh r9, [r14, #112]
  39.452 +
  39.453 +__end_col_loop:
  39.454 +        @@ at this point, R0-R11 (free)
  39.455 +        @@     R12=__const_ptr_, R14=&block[n]
  39.456 +        ldr r0, [sp, #0]         @ R0=block
  39.457 +        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
  39.458 +        sub r14, r14, #2
  39.459 +        bne __col_loop
  39.460 +
  39.461 +
  39.462 +
  39.463 +
  39.464 +__end_simple_idct_arm:
  39.465 +        @@ restore registers to previous status!
  39.466 +        add sp, sp, #8 @@ the local variables!
  39.467 +        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
  39.468 +
  39.469 +
  39.470 +
  39.471 +@@ kind of sub-function, here not to overload the common case.
  39.472 +__end_bef_a_evaluation:
  39.473 +        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
  39.474 +        mul r11, r8, r4          @ R11=W2*ROWr16[2]
  39.475 +        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
  39.476 +        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
  39.477 +        bal __end_a_evaluation
  39.478 +
  39.479 +
  39.480 +__constant_ptr__:  @@ see #defines at the beginning of the source code for values.
  39.481 +        .align
  39.482 +        .word   W1
  39.483 +        .word   W2
  39.484 +        .word   W3
  39.485 +        .word   W4
  39.486 +        .word   W5
  39.487 +        .word   W6
  39.488 +        .word   W7
  39.489 +        .word   MASK_MSHW

    40.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    40.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv5te.S	Mon Aug 27 12:09:56 2012 +0200
    40.3 @@ -0,0 +1,703 @@
    40.4 +/*
    40.5 + * Simple IDCT
    40.6 + *
    40.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
    40.8 + * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
    40.9 + *
   40.10 + * This file is part of FFmpeg.
   40.11 + *
   40.12 + * FFmpeg is free software; you can redistribute it and/or
   40.13 + * modify it under the terms of the GNU Lesser General Public
   40.14 + * License as published by the Free Software Foundation; either
   40.15 + * version 2.1 of the License, or (at your option) any later version.
   40.16 + *
   40.17 + * FFmpeg is distributed in the hope that it will be useful,
   40.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   40.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   40.20 + * Lesser General Public License for more details.
   40.21 + *
   40.22 + * You should have received a copy of the GNU Lesser General Public
   40.23 + * License along with FFmpeg; if not, write to the Free Software
   40.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   40.25 + */
   40.26 +
   40.27 +#include "asm.S"
   40.28 +
   40.29 +#define W1  22725   /* Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, here i = 1 */
   40.30 +#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   40.31 +#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   40.32 +#define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 would be 16384; (1<<14) - 1 is used instead */
   40.33 +#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   40.34 +#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   40.35 +#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   40.36 +#define ROW_SHIFT 11   /* idct_row_armv5te adds 1<<(ROW_SHIFT-1) and shifts its results right by 11 */
   40.37 +#define COL_SHIFT 20   /* the column routines shift their results right by 20 */
   40.38 +
   40.39 +#define W13 (W1 | (W3 << 16))   /* W1 in the low halfword, W3 in the high halfword */
   40.40 +#define W26 (W2 | (W6 << 16))   /* W2 in the low halfword, W6 in the high halfword */
   40.41 +#define W57 (W5 | (W7 << 16))   /* W5 in the low halfword, W7 in the high halfword */
   40.42 +
   40.43 +        .text
   40.44 +        .align
   40.45 +w13:    .long W13               /* packed coefficient words, read by the routines below with "ldr reg, w13" etc. */
   40.46 +w26:    .long W26
   40.47 +w57:    .long W57
   40.48 +
   40.49 +function idct_row_armv5te            /* in-place 8-point row IDCT of the eight halfwords at a1; a1 is preserved, v1-v7, fp, a2-a4, ip are clobbered */
   40.50 +        str    lr, [sp, #-4]!        /* push the return address (lr is used as scratch below) */
   40.51 +
   40.52 +        ldrd   v1, [a1, #8]          /* v1 = row[5:4], v2 = row[7:6] */
   40.53 +        ldrd   a3, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
   40.54 +        orrs   v1, v1, v2            /* Z set when row[4..7] are all zero */
   40.55 +        cmpeq  v1, a4                /* ... and row[3:2] is zero */
   40.56 +        cmpeq  v1, a3, lsr #16       /* ... and row[1] is zero */
   40.57 +        beq    row_dc_only           /* only row[0] can be non-zero: take the shortcut */
   40.58 +
   40.59 +        mov    v1, #(1<<(ROW_SHIFT-1)) /* rounding term */
   40.60 +        mov    ip, #16384
   40.61 +        sub    ip, ip, #1            /* ip = W4 */
   40.62 +        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
   40.63 +        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
   40.64 +        smultb a2, ip, a4            /* a2 = W6*row[2] */
   40.65 +        smulbb lr, ip, a4            /* lr = W2*row[2] */
   40.66 +        add    v2, v1, a2            /* v2 = A1 = v1 + W6*row[2] */
   40.67 +        sub    v3, v1, a2            /* v3 = A2 = v1 - W6*row[2] */
   40.68 +        sub    v4, v1, lr            /* v4 = A3 = v1 - W2*row[2] */
   40.69 +        add    v1, v1, lr            /* v1 = A0 = v1 + W2*row[2] */
   40.70 +
   40.71 +        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
   40.72 +        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
   40.73 +        smulbt v5, ip, a3            /* v5 = W1*row[1] */
   40.74 +        smultt v6, lr, a4            /* v6 = W7*row[3] */
   40.75 +        smlatt v5, ip, a4, v5        /* v5 = B0 = W1*row[1] + W3*row[3] */
   40.76 +        smultt a2, ip, a3            /* a2 = W3*row[1] */
   40.77 +        smulbt v7, lr, a3            /* v7 = W5*row[1] */
   40.78 +        sub    v6, v6, a2            /* v6 = -B1 = W7*row[3] - W3*row[1] */
   40.79 +        smulbt a2, ip, a4            /* a2 = W1*row[3] */
   40.80 +        smultt fp, lr, a3            /* fp = W7*row[1] */
   40.81 +        sub    v7, v7, a2            /* v7 = B2 = W5*row[1] - W1*row[3] */
   40.82 +        smulbt a2, lr, a4            /* a2 = W5*row[3] */
   40.83 +        ldrd   a3, [a1, #8]          /* a3=row[5:4] a4=row[7:6] */
   40.84 +        sub    fp, fp, a2            /* fp = B3 = W7*row[1] - W5*row[3] */
   40.85 +
   40.86 +        orrs   a2, a3, a4            /* are row[4..7] all zero? */
   40.87 +        beq    1f                    /* then skip their contributions */
   40.88 +
   40.89 +        smlabt v5, lr, a3, v5        /* B0 += W5*row[5] */
   40.90 +        smlabt v6, ip, a3, v6        /* -B1 += W1*row[5] */
   40.91 +        smlatt v5, lr, a4, v5        /* B0 += W7*row[7] */
   40.92 +        smlabt v6, lr, a4, v6        /* -B1 += W5*row[7] */
   40.93 +        smlatt v7, lr, a3, v7        /* B2 += W7*row[5] */
   40.94 +        smlatt fp, ip, a3, fp        /* B3 += W3*row[5] */
   40.95 +        smulbt a2, ip, a4            /* a2 = W1*row[7] */
   40.96 +        smlatt v7, ip, a4, v7        /* B2 += W3*row[7] */
   40.97 +        sub    fp, fp, a2            /* B3 -= W1*row[7] */
   40.98 +
   40.99 +        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
  40.100 +        mov    a2, #16384
  40.101 +        sub    a2, a2, #1            /* a2 =  W4 */
  40.102 +        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
  40.103 +        smultb lr, ip, a4            /* lr =  W6*row[6] */
  40.104 +        add    v1, v1, a2            /* v1 += W4*row[4] */
  40.105 +        add    v1, v1, lr            /* v1 += W6*row[6] */
  40.106 +        add    v4, v4, a2            /* v4 += W4*row[4] */
  40.107 +        sub    v4, v4, lr            /* v4 -= W6*row[6] */
  40.108 +        smulbb lr, ip, a4            /* lr =  W2*row[6] */
  40.109 +        sub    v2, v2, a2            /* v2 -= W4*row[4] */
  40.110 +        sub    v2, v2, lr            /* v2 -= W2*row[6] */
  40.111 +        sub    v3, v3, a2            /* v3 -= W4*row[4] */
  40.112 +        add    v3, v3, lr            /* v3 += W2*row[6] */
  40.113 +
  40.114 +1:      add    a2, v1, v5            /* A0 + B0 */
  40.115 +        mov    a3, a2, lsr #11       /* shift right by 11 (the value of ROW_SHIFT) */
  40.116 +        bic    a3, a3, #0x1f0000     /* clear bits 16-20: only the low halfword remains -> new row[0] */
  40.117 +        sub    a2, v2, v6            /* A1 - (-B1) = A1 + B1 */
  40.118 +        mov    a2, a2, lsr #11
  40.119 +        add    a3, a3, a2, lsl #16   /* a3 = new row[1:0] */
  40.120 +        add    a2, v3, v7            /* A2 + B2 */
  40.121 +        mov    a4, a2, lsr #11
  40.122 +        bic    a4, a4, #0x1f0000
  40.123 +        add    a2, v4, fp            /* A3 + B3 */
  40.124 +        mov    a2, a2, lsr #11
  40.125 +        add    a4, a4, a2, lsl #16   /* a4 = new row[3:2] */
  40.126 +        strd   a3, [a1]              /* store new row[0..3] */
  40.127 +
  40.128 +        sub    a2, v4, fp            /* A3 - B3 -> new row[4] */
  40.129 +        mov    a3, a2, lsr #11
  40.130 +        bic    a3, a3, #0x1f0000
  40.131 +        sub    a2, v3, v7            /* A2 - B2 -> new row[5] */
  40.132 +        mov    a2, a2, lsr #11
  40.133 +        add    a3, a3, a2, lsl #16
  40.134 +        add    a2, v2, v6            /* A1 + (-B1) = A1 - B1 -> new row[6] */
  40.135 +        mov    a4, a2, lsr #11
  40.136 +        bic    a4, a4, #0x1f0000
  40.137 +        sub    a2, v1, v5            /* A0 - B0 -> new row[7] */
  40.138 +        mov    a2, a2, lsr #11
  40.139 +        add    a4, a4, a2, lsl #16
  40.140 +        strd   a3, [a1, #8]          /* store new row[4..7] */
  40.141 +
  40.142 +        ldr    pc, [sp], #4          /* pop the return address into pc */
  40.143 +
  40.144 +row_dc_only:                         /* reached with row[1..7] all zero, so a3 = row[0] with a zero high halfword */
  40.145 +        orr    a3, a3, a3, lsl #16   /* duplicate row[0] into both halfwords */
  40.146 +        bic    a3, a3, #0xe000       /* clear bits 13-15 so the shift below cannot spill into the high halfword */
  40.147 +        mov    a3, a3, lsl #3        /* each halfword = row[0] << 3 (truncated to 16 bits) */
  40.148 +        mov    a4, a3
  40.149 +        strd   a3, [a1]              /* new row[0..3] */
  40.150 +        strd   a3, [a1, #8]          /* new row[4..7] */
  40.151 +
  40.152 +        ldr    pc, [sp], #4          /* pop the return address into pc */
  40.153 +endfunc
  40.154 +
  40.155 +        .macro idct_col              /* two adjacent columns at a1: pushes the A0..A3 pairs (32 bytes), leaves B0,-B1,B2,B3 in v1/v2,v3/v4,v5/v6,v7/fp; clobbers a2-a4, ip, lr */
  40.156 +        ldr    a4, [a1]              /* a4 = col[1:0] */
  40.157 +        mov    ip, #16384
  40.158 +        sub    ip, ip, #1            /* ip = W4 */
  40.159 +#if 0
  40.160 +        mov    v1, #(1<<(COL_SHIFT-1))
  40.161 +        smlabt v2, ip, a4, v1        /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
  40.162 +        smlabb v1, ip, a4, v1        /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
  40.163 +        ldr    a4, [a1, #(16*4)]
  40.164 +#else
  40.165 +        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
  40.166 +        add    v2, v1, a4, asr #16   /* v2 = rounding term + row 0 of the second column (high halfword) */
  40.167 +        rsb    v2, v2, v2, lsl #14   /* v2 *= (1<<14) - 1, i.e. W4 */
  40.168 +        mov    a4, a4, lsl #16       /* move the first column's value to the top ... */
  40.169 +        add    v1, v1, a4, asr #16   /* ... so asr sign-extends it: v1 = rounding term + row 0 of the first column */
  40.170 +        ldr    a4, [a1, #(16*4)]     /* a4 = row 4 of both columns (rows are 16 bytes apart) */
  40.171 +        rsb    v1, v1, v1, lsl #14   /* v1 *= W4 */
  40.172 +#endif
  40.173 +
  40.174 +        smulbb lr, ip, a4            /* lr = W4*row4, first column */
  40.175 +        smulbt a3, ip, a4            /* a3 = W4*row4, second column */
  40.176 +        sub    v3, v1, lr            /* first column: v1, v3, v5, v7 become A0, A1, A2, A3 */
  40.177 +        sub    v5, v1, lr
  40.178 +        add    v7, v1, lr
  40.179 +        add    v1, v1, lr
  40.180 +        sub    v4, v2, a3            /* second column: v2, v4, v6, fp become A0, A1, A2, A3 */
  40.181 +        sub    v6, v2, a3
  40.182 +        add    fp, v2, a3
  40.183 +        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
  40.184 +        ldr    a4, [a1, #(16*2)]     /* a4 = row 2 of both columns */
  40.185 +        add    v2, v2, a3
  40.186 +
  40.187 +        smulbb lr, ip, a4            /* lr = W2*row2, first column */
  40.188 +        smultb a3, ip, a4            /* a3 = W6*row2, first column */
  40.189 +        add    v1, v1, lr            /* A0 += W2*row2 */
  40.190 +        sub    v7, v7, lr            /* A3 -= W2*row2 */
  40.191 +        add    v3, v3, a3            /* A1 += W6*row2 */
  40.192 +        sub    v5, v5, a3            /* A2 -= W6*row2 */
  40.193 +        smulbt lr, ip, a4            /* same two products for the second column */
  40.194 +        smultt a3, ip, a4
  40.195 +        add    v2, v2, lr
  40.196 +        sub    fp, fp, lr
  40.197 +        add    v4, v4, a3
  40.198 +        ldr    a4, [a1, #(16*6)]     /* a4 = row 6 of both columns */
  40.199 +        sub    v6, v6, a3
  40.200 +
  40.201 +        smultb lr, ip, a4            /* lr = W6*row6, first column */
  40.202 +        smulbb a3, ip, a4            /* a3 = W2*row6, first column */
  40.203 +        add    v1, v1, lr            /* A0 += W6*row6 */
  40.204 +        sub    v7, v7, lr            /* A3 -= W6*row6 */
  40.205 +        sub    v3, v3, a3            /* A1 -= W2*row6 */
  40.206 +        add    v5, v5, a3            /* A2 += W2*row6 */
  40.207 +        smultt lr, ip, a4            /* same two products for the second column */
  40.208 +        smulbt a3, ip, a4
  40.209 +        add    v2, v2, lr
  40.210 +        sub    fp, fp, lr
  40.211 +        sub    v4, v4, a3
  40.212 +        add    v6, v6, a3
  40.213 +
  40.214 +        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp} /* push the A0..A3 pairs; callers pop them two words at a time */
  40.215 +
  40.216 +        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
  40.217 +        ldr    a4, [a1, #(16*1)]     /* a4 = row 1 of both columns */
  40.218 +        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
  40.219 +        smulbb v1, ip, a4            /* v1 = W1*row1 (start of B0, first column) */
  40.220 +        smultb v3, ip, a4            /* v3 = W3*row1 (negated below: start of -B1) */
  40.221 +        smulbb v5, lr, a4            /* v5 = W5*row1 (start of B2) */
  40.222 +        smultb v7, lr, a4            /* v7 = W7*row1 (start of B3) */
  40.223 +        smulbt v2, ip, a4            /* v2, v4, v6, fp: the same products for the second column */
  40.224 +        smultt v4, ip, a4
  40.225 +        smulbt v6, lr, a4
  40.226 +        smultt fp, lr, a4
  40.227 +        rsb    v4, v4, #0            /* v4 = -W3*row1 */
  40.228 +        ldr    a4, [a1, #(16*3)]     /* a4 = row 3 of both columns */
  40.229 +        rsb    v3, v3, #0            /* v3 = -W3*row1 */
  40.230 +
  40.231 +        smlatb v1, ip, a4, v1        /* B0 += W3*row3 */
  40.232 +        smlatb v3, lr, a4, v3        /* -B1 += W7*row3 */
  40.233 +        smulbb a3, ip, a4            /* a3 = W1*row3 */
  40.234 +        smulbb a2, lr, a4            /* a2 = W5*row3 */
  40.235 +        sub    v5, v5, a3            /* B2 -= W1*row3 */
  40.236 +        sub    v7, v7, a2            /* B3 -= W5*row3 */
  40.237 +        smlatt v2, ip, a4, v2        /* same updates for the second column */
  40.238 +        smlatt v4, lr, a4, v4
  40.239 +        smulbt a3, ip, a4
  40.240 +        smulbt a2, lr, a4
  40.241 +        sub    v6, v6, a3
  40.242 +        ldr    a4, [a1, #(16*5)]     /* a4 = row 5 of both columns */
  40.243 +        sub    fp, fp, a2
  40.244 +
  40.245 +        smlabb v1, lr, a4, v1        /* B0 += W5*row5 */
  40.246 +        smlabb v3, ip, a4, v3        /* -B1 += W1*row5 */
  40.247 +        smlatb v5, lr, a4, v5        /* B2 += W7*row5 */
  40.248 +        smlatb v7, ip, a4, v7        /* B3 += W3*row5 */
  40.249 +        smlabt v2, lr, a4, v2        /* same updates for the second column */
  40.250 +        smlabt v4, ip, a4, v4
  40.251 +        smlatt v6, lr, a4, v6
  40.252 +        ldr    a3, [a1, #(16*7)]     /* a3 = row 7 of both columns */
  40.253 +        smlatt fp, ip, a4, fp
  40.254 +
  40.255 +        smlatb v1, lr, a3, v1        /* B0 += W7*row7 */
  40.256 +        smlabb v3, lr, a3, v3        /* -B1 += W5*row7 */
  40.257 +        smlatb v5, ip, a3, v5        /* B2 += W3*row7 */
  40.258 +        smulbb a4, ip, a3            /* a4 = W1*row7 */
  40.259 +        smlatt v2, lr, a3, v2        /* second column: B0 += W7*row7 */
  40.260 +        sub    v7, v7, a4            /* B3 -= W1*row7 */
  40.261 +        smlabt v4, lr, a3, v4        /* second column: -B1 += W5*row7 */
  40.262 +        smulbt a4, ip, a3            /* second column: a4 = W1*row7 */
  40.263 +        smlatt v6, ip, a3, v6        /* second column: B2 += W3*row7 */
  40.264 +        sub    fp, fp, a4            /* second column: B3 -= W1*row7 */
  40.265 +        .endm
  40.266 +
  40.267 +function idct_col_armv5te            /* column IDCT of the two adjacent columns at a1, results written back in place as halfwords */
  40.268 +        str    lr, [sp, #-4]!        /* push the return address: idct_col uses lr as scratch */
  40.269 +
  40.270 +        idct_col
  40.271 +
  40.272 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A0 of the first/second column (pushed by idct_col) */
  40.273 +        adds   a2, a3, v1            /* A0 + B0, first column; N flag kept for orrmi */
  40.274 +        mov    a2, a2, lsr #20       /* shift right by 20 (the value of COL_SHIFT), leaving bits 0-11 */
  40.275 +        orrmi  a2, a2, #0xf000       /* negative sum: set bits 12-15 so the low halfword is sign-extended */
  40.276 +        add    ip, a4, v2            /* A0 + B0, second column */
  40.277 +        mov    ip, ip, asr #20
  40.278 +        orr    a2, a2, ip, lsl #16   /* second column's result goes in the high halfword */
  40.279 +        str    a2, [a1]              /* output row 0 */
  40.280 +        subs   a3, a3, v1            /* A0 - B0 */
  40.281 +        mov    a2, a3, lsr #20
  40.282 +        orrmi  a2, a2, #0xf000
  40.283 +        sub    a4, a4, v2
  40.284 +        mov    a4, a4, asr #20
  40.285 +        orr    a2, a2, a4, lsl #16
  40.286 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A1 pair */
  40.287 +        str    a2, [a1, #(16*7)]     /* output row 7 */
  40.288 +
  40.289 +        subs   a2, a3, v3            /* A1 - (-B1) = A1 + B1 */
  40.290 +        mov    a2, a2, lsr #20
  40.291 +        orrmi  a2, a2, #0xf000
  40.292 +        sub    ip, a4, v4
  40.293 +        mov    ip, ip, asr #20
  40.294 +        orr    a2, a2, ip, lsl #16
  40.295 +        str    a2, [a1, #(16*1)]     /* output row 1 */
  40.296 +        adds   a3, a3, v3            /* A1 + (-B1) = A1 - B1 */
  40.297 +        mov    a2, a3, lsr #20
  40.298 +        orrmi  a2, a2, #0xf000
  40.299 +        add    a4, a4, v4
  40.300 +        mov    a4, a4, asr #20
  40.301 +        orr    a2, a2, a4, lsl #16
  40.302 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A2 pair */
  40.303 +        str    a2, [a1, #(16*6)]     /* output row 6 */
  40.304 +
  40.305 +        adds   a2, a3, v5            /* A2 + B2 */
  40.306 +        mov    a2, a2, lsr #20
  40.307 +        orrmi  a2, a2, #0xf000
  40.308 +        add    ip, a4, v6
  40.309 +        mov    ip, ip, asr #20
  40.310 +        orr    a2, a2, ip, lsl #16
  40.311 +        str    a2, [a1, #(16*2)]     /* output row 2 */
  40.312 +        subs   a3, a3, v5            /* A2 - B2 */
  40.313 +        mov    a2, a3, lsr #20
  40.314 +        orrmi  a2, a2, #0xf000
  40.315 +        sub    a4, a4, v6
  40.316 +        mov    a4, a4, asr #20
  40.317 +        orr    a2, a2, a4, lsl #16
  40.318 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A3 pair */
  40.319 +        str    a2, [a1, #(16*5)]     /* output row 5 */
  40.320 +
  40.321 +        adds   a2, a3, v7            /* A3 + B3 */
  40.322 +        mov    a2, a2, lsr #20
  40.323 +        orrmi  a2, a2, #0xf000
  40.324 +        add    ip, a4, fp
  40.325 +        mov    ip, ip, asr #20
  40.326 +        orr    a2, a2, ip, lsl #16
  40.327 +        str    a2, [a1, #(16*3)]     /* output row 3 */
  40.328 +        subs   a3, a3, v7            /* A3 - B3 */
  40.329 +        mov    a2, a3, lsr #20
  40.330 +        orrmi  a2, a2, #0xf000
  40.331 +        sub    a4, a4, fp
  40.332 +        mov    a4, a4, asr #20
  40.333 +        orr    a2, a2, a4, lsl #16
  40.334 +        str    a2, [a1, #(16*4)]     /* output row 4 */
  40.335 +
  40.336 +        ldr    pc, [sp], #4          /* pop the return address into pc */
  40.337 +endfunc
  40.338 +
  40.339 +function idct_col_put_armv5te        /* column IDCT of the two adjacent columns at a1; results are clamped to 0..255 and stored as byte pairs through the pointer saved on the caller's stack */
  40.340 +        str    lr, [sp, #-4]!        /* push the return address: idct_col uses lr as scratch */
  40.341 +
  40.342 +        idct_col
  40.343 +
  40.344 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A0 of the first/second column */
  40.345 +        ldr    lr, [sp, #32]         /* lr = a2 as saved by ff_simple_idct_put_armv5te; used as the output row stride */
  40.346 +        add    a2, a3, v1            /* A0 + B0, first column */
  40.347 +        movs   a2, a2, asr #20
  40.348 +        movmi  a2, #0                /* clamp below at 0 */
  40.349 +        cmp    a2, #255
  40.350 +        movgt  a2, #255              /* clamp above at 255 */
  40.351 +        add    ip, a4, v2            /* A0 + B0, second column */
  40.352 +        movs   ip, ip, asr #20
  40.353 +        movmi  ip, #0
  40.354 +        cmp    ip, #255
  40.355 +        movgt  ip, #255
  40.356 +        orr    a2, a2, ip, lsl #8    /* pack: first column in the low byte, second in the high byte */
  40.357 +        sub    a3, a3, v1            /* A0 - B0 */
  40.358 +        movs   a3, a3, asr #20
  40.359 +        movmi  a3, #0
  40.360 +        cmp    a3, #255
  40.361 +        movgt  a3, #255
  40.362 +        sub    a4, a4, v2
  40.363 +        movs   a4, a4, asr #20
  40.364 +        movmi  a4, #0
  40.365 +        cmp    a4, #255
  40.366 +        ldr    v1, [sp, #28]         /* v1 = a1 as saved by ff_simple_idct_put_armv5te: the output pointer (flags untouched) */
  40.367 +        movgt  a4, #255
  40.368 +        strh   a2, [v1]              /* output row 0 */
  40.369 +        add    a2, v1, #2
  40.370 +        str    a2, [sp, #28]         /* advance the saved output pointer by 2 bytes for the next call */
  40.371 +        orr    a2, a3, a4, lsl #8
  40.372 +        rsb    v2, lr, lr, lsl #3    /* v2 = 7 * stride */
  40.373 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A1 pair */
  40.374 +        strh   a2, [v2, v1]!         /* v2 = output + 7*stride; output row 7 */
  40.375 +
  40.376 +        sub    a2, a3, v3            /* A1 - (-B1) = A1 + B1 */
  40.377 +        movs   a2, a2, asr #20
  40.378 +        movmi  a2, #0
  40.379 +        cmp    a2, #255
  40.380 +        movgt  a2, #255
  40.381 +        sub    ip, a4, v4
  40.382 +        movs   ip, ip, asr #20
  40.383 +        movmi  ip, #0
  40.384 +        cmp    ip, #255
  40.385 +        movgt  ip, #255
  40.386 +        orr    a2, a2, ip, lsl #8
  40.387 +        strh   a2, [v1, lr]!         /* v1 += stride; output row 1 */
  40.388 +        add    a3, a3, v3            /* A1 + (-B1) = A1 - B1 */
  40.389 +        movs   a2, a3, asr #20
  40.390 +        movmi  a2, #0
  40.391 +        cmp    a2, #255
  40.392 +        movgt  a2, #255
  40.393 +        add    a4, a4, v4
  40.394 +        movs   a4, a4, asr #20
  40.395 +        movmi  a4, #0
  40.396 +        cmp    a4, #255
  40.397 +        movgt  a4, #255
  40.398 +        orr    a2, a2, a4, lsl #8
  40.399 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A2 pair */
  40.400 +        strh   a2, [v2, -lr]!        /* v2 -= stride; output row 6 */
  40.401 +
  40.402 +        add    a2, a3, v5            /* A2 + B2 */
  40.403 +        movs   a2, a2, asr #20
  40.404 +        movmi  a2, #0
  40.405 +        cmp    a2, #255
  40.406 +        movgt  a2, #255
  40.407 +        add    ip, a4, v6
  40.408 +        movs   ip, ip, asr #20
  40.409 +        movmi  ip, #0
  40.410 +        cmp    ip, #255
  40.411 +        movgt  ip, #255
  40.412 +        orr    a2, a2, ip, lsl #8
  40.413 +        strh   a2, [v1, lr]!         /* v1 += stride; output row 2 */
  40.414 +        sub    a3, a3, v5            /* A2 - B2 */
  40.415 +        movs   a2, a3, asr #20
  40.416 +        movmi  a2, #0
  40.417 +        cmp    a2, #255
  40.418 +        movgt  a2, #255
  40.419 +        sub    a4, a4, v6
  40.420 +        movs   a4, a4, asr #20
  40.421 +        movmi  a4, #0
  40.422 +        cmp    a4, #255
  40.423 +        movgt  a4, #255
  40.424 +        orr    a2, a2, a4, lsl #8
  40.425 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A3 pair */
  40.426 +        strh   a2, [v2, -lr]!        /* v2 -= stride; output row 5 */
  40.427 +
  40.428 +        add    a2, a3, v7            /* A3 + B3 */
  40.429 +        movs   a2, a2, asr #20
  40.430 +        movmi  a2, #0
  40.431 +        cmp    a2, #255
  40.432 +        movgt  a2, #255
  40.433 +        add    ip, a4, fp
  40.434 +        movs   ip, ip, asr #20
  40.435 +        movmi  ip, #0
  40.436 +        cmp    ip, #255
  40.437 +        movgt  ip, #255
  40.438 +        orr    a2, a2, ip, lsl #8
  40.439 +        strh   a2, [v1, lr]          /* output row 3 (no writeback needed) */
  40.440 +        sub    a3, a3, v7            /* A3 - B3 */
  40.441 +        movs   a2, a3, asr #20
  40.442 +        movmi  a2, #0
  40.443 +        cmp    a2, #255
  40.444 +        movgt  a2, #255
  40.445 +        sub    a4, a4, fp
  40.446 +        movs   a4, a4, asr #20
  40.447 +        movmi  a4, #0
  40.448 +        cmp    a4, #255
  40.449 +        movgt  a4, #255
  40.450 +        orr    a2, a2, a4, lsl #8
  40.451 +        strh   a2, [v2, -lr]         /* output row 4 */
  40.452 +
  40.453 +        ldr    pc, [sp], #4          /* pop the return address into pc */
  40.454 +endfunc
  40.455 +
  40.456 +function idct_col_add_armv5te        /* column IDCT of the two adjacent columns at a1; each result is added to the byte already at the destination, clamped to 0..255 and stored back */
  40.457 +        str    lr, [sp, #-4]!        /* push the return address: idct_col uses lr as scratch */
  40.458 +
  40.459 +        idct_col
  40.460 +
  40.461 +        ldr    lr, [sp, #36]         /* lr = a1 as saved by ff_simple_idct_add_armv5te: the destination pointer */
  40.462 +
  40.463 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A0 of the first/second column */
  40.464 +        ldrh   ip, [lr]              /* ip = the two existing bytes of row 0 */
  40.465 +        add    a2, a3, v1            /* A0 + B0, first column */
  40.466 +        mov    a2, a2, asr #20
  40.467 +        sub    a3, a3, v1            /* A0 - B0, kept for row 7 */
  40.468 +        and    v1, ip, #255          /* v1 = existing low byte (first column) */
  40.469 +        adds   a2, a2, v1
  40.470 +        movmi  a2, #0                /* clamp below at 0 */
  40.471 +        cmp    a2, #255
  40.472 +        movgt  a2, #255              /* clamp above at 255 */
  40.473 +        add    v1, a4, v2            /* A0 + B0, second column */
  40.474 +        mov    v1, v1, asr #20
  40.475 +        adds   v1, v1, ip, lsr #8    /* add the existing high byte (second column) */
  40.476 +        movmi  v1, #0
  40.477 +        cmp    v1, #255
  40.478 +        movgt  v1, #255
  40.479 +        orr    a2, a2, v1, lsl #8    /* pack: first column in the low byte, second in the high byte */
  40.480 +        ldr    v1, [sp, #32]         /* v1 = a2 as saved by ff_simple_idct_add_armv5te; used as the row stride */
  40.481 +        sub    a4, a4, v2            /* A0 - B0, second column */
  40.482 +        rsb    v2, v1, v1, lsl #3    /* v2 = 7 * stride */
  40.483 +        ldrh   ip, [v2, lr]!         /* v2 = destination + 7*stride; ip = existing bytes of row 7 */
  40.484 +        strh   a2, [lr]              /* store row 0 */
  40.485 +        mov    a3, a3, asr #20
  40.486 +        and    a2, ip, #255
  40.487 +        adds   a3, a3, a2
  40.488 +        movmi  a3, #0
  40.489 +        cmp    a3, #255
  40.490 +        movgt  a3, #255
  40.491 +        mov    a4, a4, asr #20
  40.492 +        adds   a4, a4, ip, lsr #8
  40.493 +        movmi  a4, #0
  40.494 +        cmp    a4, #255
  40.495 +        movgt  a4, #255
  40.496 +        add    a2, lr, #2
  40.497 +        str    a2, [sp, #28]         /* advance the saved destination pointer by 2 bytes for the next call */
  40.498 +        orr    a2, a3, a4, lsl #8
  40.499 +        strh   a2, [v2]              /* store row 7 */
  40.500 +
  40.501 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A1 pair */
  40.502 +        ldrh   ip, [lr, v1]!         /* lr += stride; ip = existing bytes of row 1 */
  40.503 +        sub    a2, a3, v3            /* A1 - (-B1) = A1 + B1 */
  40.504 +        mov    a2, a2, asr #20
  40.505 +        add    a3, a3, v3            /* A1 + (-B1) = A1 - B1, kept for row 6 */
  40.506 +        and    v3, ip, #255          /* v3 is free now and reused as scratch */
  40.507 +        adds   a2, a2, v3
  40.508 +        movmi  a2, #0
  40.509 +        cmp    a2, #255
  40.510 +        movgt  a2, #255
  40.511 +        sub    v3, a4, v4
  40.512 +        mov    v3, v3, asr #20
  40.513 +        adds   v3, v3, ip, lsr #8
  40.514 +        movmi  v3, #0
  40.515 +        cmp    v3, #255
  40.516 +        movgt  v3, #255
  40.517 +        orr    a2, a2, v3, lsl #8
  40.518 +        add    a4, a4, v4
  40.519 +        ldrh   ip, [v2, -v1]!        /* v2 -= stride; ip = existing bytes of row 6 */
  40.520 +        strh   a2, [lr]              /* store row 1 */
  40.521 +        mov    a3, a3, asr #20
  40.522 +        and    a2, ip, #255
  40.523 +        adds   a3, a3, a2
  40.524 +        movmi  a3, #0
  40.525 +        cmp    a3, #255
  40.526 +        movgt  a3, #255
  40.527 +        mov    a4, a4, asr #20
  40.528 +        adds   a4, a4, ip, lsr #8
  40.529 +        movmi  a4, #0
  40.530 +        cmp    a4, #255
  40.531 +        movgt  a4, #255
  40.532 +        orr    a2, a3, a4, lsl #8
  40.533 +        strh   a2, [v2]              /* store row 6 */
  40.534 +
  40.535 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A2 pair */
  40.536 +        ldrh   ip, [lr, v1]!         /* lr += stride; ip = existing bytes of row 2 */
  40.537 +        add    a2, a3, v5            /* A2 + B2 */
  40.538 +        mov    a2, a2, asr #20
  40.539 +        sub    a3, a3, v5            /* A2 - B2, kept for row 5 */
  40.540 +        and    v3, ip, #255
  40.541 +        adds   a2, a2, v3
  40.542 +        movmi  a2, #0
  40.543 +        cmp    a2, #255
  40.544 +        movgt  a2, #255
  40.545 +        add    v3, a4, v6
  40.546 +        mov    v3, v3, asr #20
  40.547 +        adds   v3, v3, ip, lsr #8
  40.548 +        movmi  v3, #0
  40.549 +        cmp    v3, #255
  40.550 +        movgt  v3, #255
  40.551 +        orr    a2, a2, v3, lsl #8
  40.552 +        sub    a4, a4, v6
  40.553 +        ldrh   ip, [v2, -v1]!        /* v2 -= stride; ip = existing bytes of row 5 */
  40.554 +        strh   a2, [lr]              /* store row 2 */
  40.555 +        mov    a3, a3, asr #20
  40.556 +        and    a2, ip, #255
  40.557 +        adds   a3, a3, a2
  40.558 +        movmi  a3, #0
  40.559 +        cmp    a3, #255
  40.560 +        movgt  a3, #255
  40.561 +        mov    a4, a4, asr #20
  40.562 +        adds   a4, a4, ip, lsr #8
  40.563 +        movmi  a4, #0
  40.564 +        cmp    a4, #255
  40.565 +        movgt  a4, #255
  40.566 +        orr    a2, a3, a4, lsl #8
  40.567 +        strh   a2, [v2]              /* store row 5 */
  40.568 +
  40.569 +        ldmfd  sp!, {a3, a4}         /* a3/a4 = A3 pair */
  40.570 +        ldrh   ip, [lr, v1]!         /* lr += stride; ip = existing bytes of row 3 */
  40.571 +        add    a2, a3, v7            /* A3 + B3 */
  40.572 +        mov    a2, a2, asr #20
  40.573 +        sub    a3, a3, v7            /* A3 - B3, kept for row 4 */
  40.574 +        and    v3, ip, #255
  40.575 +        adds   a2, a2, v3
  40.576 +        movmi  a2, #0
  40.577 +        cmp    a2, #255
  40.578 +        movgt  a2, #255
  40.579 +        add    v3, a4, fp
  40.580 +        mov    v3, v3, asr #20
  40.581 +        adds   v3, v3, ip, lsr #8
  40.582 +        movmi  v3, #0
  40.583 +        cmp    v3, #255
  40.584 +        movgt  v3, #255
  40.585 +        orr    a2, a2, v3, lsl #8
  40.586 +        sub    a4, a4, fp
  40.587 +        ldrh   ip, [v2, -v1]!        /* v2 -= stride; ip = existing bytes of row 4 */
  40.588 +        strh   a2, [lr]              /* store row 3 */
  40.589 +        mov    a3, a3, asr #20
  40.590 +        and    a2, ip, #255
  40.591 +        adds   a3, a3, a2
  40.592 +        movmi  a3, #0
  40.593 +        cmp    a3, #255
  40.594 +        movgt  a3, #255
  40.595 +        mov    a4, a4, asr #20
  40.596 +        adds   a4, a4, ip, lsr #8
  40.597 +        movmi  a4, #0
  40.598 +        cmp    a4, #255
  40.599 +        movgt  a4, #255
  40.600 +        orr    a2, a3, a4, lsl #8
  40.601 +        strh   a2, [v2]              /* store row 4 */
  40.602 +
  40.603 +        ldr    pc, [sp], #4          /* pop the return address into pc */
  40.604 +endfunc
  40.605 +
  40.606 +function ff_simple_idct_armv5te, export=1
  40.607 +        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} /* in-place IDCT of the block at a1: 8 row passes, then 4 column passes of 2 columns each */
  40.608 +
  40.609 +        bl     idct_row_armv5te      /* row 0 */
  40.610 +        add    a1, a1, #16           /* next row: 8 halfwords = 16 bytes */
  40.611 +        bl     idct_row_armv5te
  40.612 +        add    a1, a1, #16
  40.613 +        bl     idct_row_armv5te
  40.614 +        add    a1, a1, #16
  40.615 +        bl     idct_row_armv5te
  40.616 +        add    a1, a1, #16
  40.617 +        bl     idct_row_armv5te
  40.618 +        add    a1, a1, #16
  40.619 +        bl     idct_row_armv5te
  40.620 +        add    a1, a1, #16
  40.621 +        bl     idct_row_armv5te
  40.622 +        add    a1, a1, #16
  40.623 +        bl     idct_row_armv5te      /* row 7 */
  40.624 +
  40.625 +        sub    a1, a1, #(16*7)       /* back to the start of the block */
  40.626 +
  40.627 +        bl     idct_col_armv5te      /* columns 0 and 1 */
  40.628 +        add    a1, a1, #4            /* next pair of columns: 2 halfwords = 4 bytes */
  40.629 +        bl     idct_col_armv5te
  40.630 +        add    a1, a1, #4
  40.631 +        bl     idct_col_armv5te
  40.632 +        add    a1, a1, #4
  40.633 +        bl     idct_col_armv5te      /* columns 6 and 7 */
  40.634 +
  40.635 +        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */
  40.636 +endfunc
  40.637 +
  40.638 +function ff_simple_idct_add_armv5te, export=1
  40.639 +        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 and a2 are saved too: idct_col_add_armv5te reads (and updates) them on the stack */
  40.640 +
  40.641 +        mov    a1, a3                /* a1 = a3: the halfword block the row and column passes read */
  40.642 +
  40.643 +        bl     idct_row_armv5te      /* row 0 */
  40.644 +        add    a1, a1, #16           /* next row: 8 halfwords = 16 bytes */
  40.645 +        bl     idct_row_armv5te
  40.646 +        add    a1, a1, #16
  40.647 +        bl     idct_row_armv5te
  40.648 +        add    a1, a1, #16
  40.649 +        bl     idct_row_armv5te
  40.650 +        add    a1, a1, #16
  40.651 +        bl     idct_row_armv5te
  40.652 +        add    a1, a1, #16
  40.653 +        bl     idct_row_armv5te
  40.654 +        add    a1, a1, #16
  40.655 +        bl     idct_row_armv5te
  40.656 +        add    a1, a1, #16
  40.657 +        bl     idct_row_armv5te      /* row 7 */
  40.658 +
  40.659 +        sub    a1, a1, #(16*7)       /* back to the start of the block */
  40.660 +
  40.661 +        bl     idct_col_add_armv5te  /* columns 0 and 1 */
  40.662 +        add    a1, a1, #4            /* next pair of columns: 2 halfwords = 4 bytes */
  40.663 +        bl     idct_col_add_armv5te
  40.664 +        add    a1, a1, #4
  40.665 +        bl     idct_col_add_armv5te
  40.666 +        add    a1, a1, #4
  40.667 +        bl     idct_col_add_armv5te  /* columns 6 and 7 */
  40.668 +
  40.669 +        add    sp, sp, #8            /* drop the saved a1 and a2 */
  40.670 +        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */
  40.671 +endfunc
  40.672 +
  40.673 +function ff_simple_idct_put_armv5te, export=1
  40.674 +        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} /* a1 and a2 are saved too: idct_col_put_armv5te reads (and updates) them on the stack */
  40.675 +
  40.676 +        mov    a1, a3                /* a1 = a3: the halfword block the row and column passes read */
  40.677 +
  40.678 +        bl     idct_row_armv5te      /* row 0 */
  40.679 +        add    a1, a1, #16           /* next row: 8 halfwords = 16 bytes */
  40.680 +        bl     idct_row_armv5te
  40.681 +        add    a1, a1, #16
  40.682 +        bl     idct_row_armv5te
  40.683 +        add    a1, a1, #16
  40.684 +        bl     idct_row_armv5te
  40.685 +        add    a1, a1, #16
  40.686 +        bl     idct_row_armv5te
  40.687 +        add    a1, a1, #16
  40.688 +        bl     idct_row_armv5te
  40.689 +        add    a1, a1, #16
  40.690 +        bl     idct_row_armv5te
  40.691 +        add    a1, a1, #16
  40.692 +        bl     idct_row_armv5te      /* row 7 */
  40.693 +
  40.694 +        sub    a1, a1, #(16*7)       /* back to the start of the block */
  40.695 +
  40.696 +        bl     idct_col_put_armv5te  /* columns 0 and 1 */
  40.697 +        add    a1, a1, #4            /* next pair of columns: 2 halfwords = 4 bytes */
  40.698 +        bl     idct_col_put_armv5te
  40.699 +        add    a1, a1, #4
  40.700 +        bl     idct_col_put_armv5te
  40.701 +        add    a1, a1, #4
  40.702 +        bl     idct_col_put_armv5te  /* columns 6 and 7 */
  40.703 +
  40.704 +        add    sp, sp, #8            /* drop the saved a1 and a2 */
  40.705 +        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} /* restore registers and return */
  40.706 +endfunc

    41.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    41.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_armv6.S	Mon Aug 27 12:09:56 2012 +0200
    41.3 @@ -0,0 +1,433 @@
    41.4 +/*
    41.5 + * Simple IDCT
    41.6 + *
    41.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
    41.8 + * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
    41.9 + *
   41.10 + * This file is part of FFmpeg.
   41.11 + *
   41.12 + * FFmpeg is free software; you can redistribute it and/or
   41.13 + * modify it under the terms of the GNU Lesser General Public
   41.14 + * License as published by the Free Software Foundation; either
   41.15 + * version 2.1 of the License, or (at your option) any later version.
   41.16 + *
   41.17 + * FFmpeg is distributed in the hope that it will be useful,
   41.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   41.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   41.20 + * Lesser General Public License for more details.
   41.21 + *
   41.22 + * You should have received a copy of the GNU Lesser General Public
   41.23 + * License along with FFmpeg; if not, write to the Free Software
   41.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   41.25 + */
   41.26 +
   41.27 +#include "asm.S"
   41.28 +
   41.29 +#define W1  22725   /* Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, here i = 1 */
   41.30 +#define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   41.31 +#define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   41.32 +#define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 would be 16384; (1<<14) - 1 is used instead */
   41.33 +#define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   41.34 +#define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   41.35 +#define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
   41.36 +#define ROW_SHIFT 11
   41.37 +#define COL_SHIFT 20
   41.38 +
   41.39 +#define W13 (W1 | (W3 << 16))   /* W1 in the low halfword, W3 in the high halfword */
   41.40 +#define W26 (W2 | (W6 << 16))   /* W2 low, W6 high */
   41.41 +#define W42 (W4 | (W2 << 16))   /* W4 low, W2 high */
   41.42 +#define W42n (-W4&0xffff | (-W2 << 16))   /* -W4 (masked to 16 bits) low, -W2 high */
   41.43 +#define W46 (W4 | (W6 << 16))   /* W4 low, W6 high */
   41.44 +#define W57 (W5 | (W7 << 16))   /* W5 low, W7 high */
   41.45 +
   41.46 +        .text
   41.47 +        .align
   41.48 +w13:    .long W13               /* packed coefficient words, read by the macros below with "ldr reg, w13" etc. */
   41.49 +w26:    .long W26
   41.50 +w42:    .long W42
   41.51 +w42n:   .long W42n
   41.52 +w46:    .long W46
   41.53 +w57:    .long W57
   41.54 +
   41.55 +/*
   41.56 +  Compute partial IDCT of single row.
   41.57 +  shift = shift amount; used here only for the rounding term 1 << (shift-1) (presumably the caller shifts right by it afterwards)
   41.58 +  r0 = source address (row[7,5] is read from r0+12, row[6,4] from r0+4)
   41.59 +  r2 = row[2,0] <= 2 cycles
   41.60 +  r3 = row[3,1]
   41.61 +  ip = w42      <= 2 cycles
   41.62 +  Clobbers r1, r2, r3, ip and lr.
   41.63 +  Output in registers r4--r11: r4-r7 = A0-A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
   41.64 +*/
   41.65 +        .macro idct_row shift
   41.66 +        ldr    lr, w46               /* lr  = W4 | (W6 << 16) */
   41.67 +        mov    r1, #(1<<(\shift-1))  /* r1  = rounding term */
   41.68 +        smlad  r4, r2, ip, r1        /* r4  = A0 = W4*row[0] + W2*row[2] + rounding */
   41.69 +        smlsd  r7, r2, ip, r1        /* r7  = A3 = W4*row[0] - W2*row[2] + rounding */
   41.70 +        ldr    ip, w13               /* ip  = W1 | (W3 << 16) */
   41.71 +        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
   41.72 +        smlad  r5, r2, lr, r1        /* r5  = A1 = W4*row[0] + W6*row[2] + rounding */
   41.73 +        smlsd  r6, r2, lr, r1        /* r6  = A2 = W4*row[0] - W6*row[2] + rounding */
   41.74 +
   41.75 +        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
   41.76 +        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
   41.77 +        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
   41.78 +        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
   41.79 +        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
   41.80 +        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
   41.81 +        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
   41.82 +        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
   41.83 +
   41.84 +        ldr    r3, w42n              /* r3 =  -W4 | (-W2 << 16) */
   41.85 +        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
   41.86 +        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
   41.87 +        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
   41.88 +        ldr    ip, w46               /* ip =   W4 | (W6 << 16) */
   41.89 +        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] */
   41.90 +
   41.91 +        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
   41.92 +        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
   41.93 +        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
   41.94 +        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
   41.95 +        .endm
   41.96 +
   41.97 +/*
   41.98 +  Compute partial IDCT of half row: only row[0..3] are used, memory is not read.
   41.99 +  shift = later right-shift amount; 1<<(shift-1) is added here for rounding
  41.100 +  r2 = row[2,0]
  41.101 +  r3 = row[3,1]
  41.102 +  ip = w42
  41.103 +  Clobbers r1, r2, ip, lr.
  41.104 +  Output in registers r4--r11: r4-r7 = A0-A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
  41.105 +*/
  41.106 +        .macro idct_row4 shift
  41.107 +        ldr    lr, w46               /* lr =  W4 | (W6 << 16) */
  41.108 +        ldr    r10,w57               /* r10 = W5 | (W7 << 16) */
  41.109 +        mov    r1, #(1<<(\shift-1))  /* r1  =  rounding constant */
  41.110 +        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + round */
  41.111 +        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + round */
  41.112 +        ldr    ip, w13               /* ip =  W1 | (W3 << 16) */
  41.113 +        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + round */
  41.114 +        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + round */
  41.115 +        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
  41.116 +        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
  41.117 +        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
  41.118 +        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
  41.119 +        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
  41.120 +        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
  41.121 +        .endm
  41.122 +
  41.123 +/*
  41.124 +  Compute final part of IDCT single row without shift (sums and differences A +/- B).
  41.125 +  Input in registers r4--r11 (r4-r7 = A0-A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3)
  41.126 +  Output in registers ip, r4--r6, lr, r8--r10; r7 and r11 are left unchanged
  41.127 +*/
  41.128 +        .macro idct_finish
  41.129 +        add    ip, r4, r8            /* ip  = A0 + B0 */
  41.130 +        sub    lr, r4, r8            /* lr  = A0 - B0 */
  41.131 +        sub    r4, r5, r9            /* r4  = A1 + B1  (r9 holds -B1) */
  41.132 +        add    r8, r5, r9            /* r8  = A1 - B1 */
  41.133 +        add    r5, r6, r10           /* r5  = A2 + B2 */
  41.134 +        sub    r9, r6, r10           /* r9  = A2 - B2 */
  41.135 +        add    r6, r7, r11           /* r6  = A3 + B3 */
  41.136 +        sub    r10,r7, r11           /* r10 = A3 - B3 */
  41.137 +        .endm
  41.138 +
  41.139 +/*
  41.140 +  Compute final part of IDCT single row: (A +/- B) >> shift. Clobbers r2, r3.
  41.141 +  shift = right-shift amount (arithmetic)
  41.142 +  Input/output in registers r4--r11; on input r9 = -B1, hence the swapped sub/add
  41.143 +*/
  41.144 +        .macro idct_finish_shift shift
  41.145 +        add    r3, r4, r8            /* r3 = A0 + B0 */
  41.146 +        sub    r2, r4, r8            /* r2 = A0 - B0 */
  41.147 +        mov    r4, r3, asr #\shift   /* r4 = (A0 + B0) >> shift */
  41.148 +        mov    r8, r2, asr #\shift   /* r8 = (A0 - B0) >> shift */
  41.149 +
  41.150 +        sub    r3, r5, r9            /* r3 = A1 + B1 */
  41.151 +        add    r2, r5, r9            /* r2 = A1 - B1 */
  41.152 +        mov    r5, r3, asr #\shift   /* r5 = (A1 + B1) >> shift */
  41.153 +        mov    r9, r2, asr #\shift   /* r9 = (A1 - B1) >> shift */
  41.154 +
  41.155 +        add    r3, r6, r10           /* r3 = A2 + B2 */
  41.156 +        sub    r2, r6, r10           /* r2 = A2 - B2 */
  41.157 +        mov    r6, r3, asr #\shift   /* r6 = (A2 + B2) >> shift */
  41.158 +        mov    r10,r2, asr #\shift   /* r10 = (A2 - B2) >> shift */
  41.159 +
  41.160 +        add    r3, r7, r11           /* r3 = A3 + B3 */
  41.161 +        sub    r2, r7, r11           /* r2 = A3 - B3 */
  41.162 +        mov    r7, r3, asr #\shift   /* r7 = (A3 + B3) >> shift */
  41.163 +        mov    r11,r2, asr #\shift   /* r11 = (A3 - B3) >> shift */
  41.164 +        .endm
  41.165 +
  41.166 +/*
  41.167 +  Compute final part of IDCT single row, saturating results at 8 bits. Clobbers r3, ip.
  41.168 +  shift = right-shift amount (applied by usat before the unsigned saturation)
  41.169 +  Input/output in registers r4--r11; on input r9 = -B1, hence the swapped sub/add
  41.170 +*/
  41.171 +        .macro idct_finish_shift_sat shift
  41.172 +        add    r3, r4, r8            /* r3 = A0 + B0 */
  41.173 +        sub    ip, r4, r8            /* ip = A0 - B0 */
  41.174 +        usat   r4, #8, r3, asr #\shift   /* r4 = usat8((A0 + B0) >> shift) */
  41.175 +        usat   r8, #8, ip, asr #\shift   /* r8 = usat8((A0 - B0) >> shift) */
  41.176 +
  41.177 +        sub    r3, r5, r9            /* r3 = A1 + B1 */
  41.178 +        add    ip, r5, r9            /* ip = A1 - B1 */
  41.179 +        usat   r5, #8, r3, asr #\shift   /* r5 = usat8((A1 + B1) >> shift) */
  41.180 +        usat   r9, #8, ip, asr #\shift   /* r9 = usat8((A1 - B1) >> shift) */
  41.181 +
  41.182 +        add    r3, r6, r10           /* r3 = A2 + B2 */
  41.183 +        sub    ip, r6, r10           /* ip = A2 - B2 */
  41.184 +        usat   r6, #8, r3, asr #\shift   /* r6 = usat8((A2 + B2) >> shift) */
  41.185 +        usat   r10,#8, ip, asr #\shift   /* r10 = usat8((A2 - B2) >> shift) */
  41.186 +
  41.187 +        add    r3, r7, r11           /* r3 = A3 + B3 */
  41.188 +        sub    ip, r7, r11           /* ip = A3 - B3 */
  41.189 +        usat   r7, #8, r3, asr #\shift   /* r7 = usat8((A3 + B3) >> shift) */
  41.190 +        usat   r11,#8, ip, asr #\shift   /* r11 = usat8((A3 - B3) >> shift) */
  41.191 +        .endm
  41.192 +
  41.193 +/*
  41.194 +  Compute IDCT of single row, storing as column (one halfword per 16-byte line of dest).
  41.195 +  r0 = source (16 bytes: row[2,0], row[6,4], row[3,1], row[7,5]); not modified
  41.196 +  r1 = dest; restored before return. Clobbers r2, r3, ip and r4-r11 (not saved here).
  41.197 +*/
  41.198 +function idct_row_armv6
  41.199 +        push   {lr}
  41.200 +
  41.201 +        ldr    lr, [r0, #12]         /* lr = row[7,5] */
  41.202 +        ldr    ip, [r0, #4]          /* ip = row[6,4] */
  41.203 +        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
  41.204 +        ldr    r2, [r0]              /* r2 = row[2,0] */
  41.205 +        orrs   lr, lr, ip            /* lr = row[7,5] | row[6,4]; Z set if row[4..7] are all zero */
  41.206 +        cmpeq  lr, r3                /* if so (lr == 0): also test row[3,1] == 0 */
  41.207 +        cmpeq  lr, r2, lsr #16       /* if so: also test row[2] == 0 */
  41.208 +        beq    1f                    /* everything except row[0] is zero: shortcut at 1: */
  41.209 +        push   {r1}                  /* idct_row / idct_row4 clobber r1 */
  41.210 +        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
  41.211 +        cmp    lr, #0                /* row[4..7] all zero? */
  41.212 +        beq    2f                    /* yes: the half-row macro is enough */
  41.213 +
  41.214 +        idct_row   ROW_SHIFT
  41.215 +        b      3f
  41.216 +
  41.217 +2:      idct_row4  ROW_SHIFT
  41.218 +
  41.219 +3:      pop    {r1}
  41.220 +        idct_finish_shift ROW_SHIFT
  41.221 +
  41.222 +        strh   r4, [r1]              /* (A0 + B0) >> ROW_SHIFT -> line 0 */
  41.223 +        strh   r5, [r1, #(16*2)]     /* (A1 + B1) >> ROW_SHIFT -> line 2 */
  41.224 +        strh   r6, [r1, #(16*4)]     /* (A2 + B2) >> ROW_SHIFT -> line 4 */
  41.225 +        strh   r7, [r1, #(16*6)]     /* (A3 + B3) >> ROW_SHIFT -> line 6 */
  41.226 +        strh   r11,[r1, #(16*1)]     /* (A3 - B3) >> ROW_SHIFT -> line 1 */
  41.227 +        strh   r10,[r1, #(16*3)]     /* (A2 - B2) >> ROW_SHIFT -> line 3 */
  41.228 +        strh   r9, [r1, #(16*5)]     /* (A1 - B1) >> ROW_SHIFT -> line 5 */
  41.229 +        strh   r8, [r1, #(16*7)]     /* (A0 - B0) >> ROW_SHIFT -> line 7 */
  41.230 +
  41.231 +        pop    {pc}
  41.232 +
  41.233 +1:      mov    r2, r2, lsl #3        /* shortcut: r2 = row[0] << 3 (upper halfword of r2 was zero) */
  41.234 +        strh   r2, [r1]              /* same value written to all 8 lines */
  41.235 +        strh   r2, [r1, #(16*2)]
  41.236 +        strh   r2, [r1, #(16*4)]
  41.237 +        strh   r2, [r1, #(16*6)]
  41.238 +        strh   r2, [r1, #(16*1)]
  41.239 +        strh   r2, [r1, #(16*3)]
  41.240 +        strh   r2, [r1, #(16*5)]
  41.241 +        strh   r2, [r1, #(16*7)]
  41.242 +        pop    {pc}
  41.243 +endfunc
  41.244 +
  41.245 +/*
  41.246 +  Compute IDCT of single column, read as row; 8 halfwords are stored 16 bytes apart.
  41.247 +  r0 = source (16 bytes are read); not modified
  41.248 +  r1 = dest; restored before the stores. Clobbers r2, r3, ip and r4-r11 (not saved here).
  41.249 +*/
  41.250 +function idct_col_armv6
  41.251 +        push   {r1, lr}              /* idct_row clobbers r1 and lr */
  41.252 +
  41.253 +        ldr    r2, [r0]              /* r2 = row[2,0] */
  41.254 +        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
  41.255 +        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
  41.256 +        idct_row COL_SHIFT
  41.257 +        pop    {r1}
  41.258 +        idct_finish_shift COL_SHIFT
  41.259 +
  41.260 +        strh   r4, [r1]              /* (A0 + B0) >> COL_SHIFT -> line 0 */
  41.261 +        strh   r5, [r1, #(16*1)]     /* (A1 + B1) >> COL_SHIFT -> line 1 */
  41.262 +        strh   r6, [r1, #(16*2)]     /* (A2 + B2) >> COL_SHIFT -> line 2 */
  41.263 +        strh   r7, [r1, #(16*3)]     /* (A3 + B3) >> COL_SHIFT -> line 3 */
  41.264 +        strh   r11,[r1, #(16*4)]     /* (A3 - B3) >> COL_SHIFT -> line 4 */
  41.265 +        strh   r10,[r1, #(16*5)]     /* (A2 - B2) >> COL_SHIFT -> line 5 */
  41.266 +        strh   r9, [r1, #(16*6)]     /* (A1 - B1) >> COL_SHIFT -> line 6 */
  41.267 +        strh   r8, [r1, #(16*7)]     /* (A0 - B0) >> COL_SHIFT -> line 7 */
  41.268 +
  41.269 +        pop    {pc}
  41.270 +endfunc
  41.271 +
  41.272 +/*
  41.273 +  Compute IDCT of single column, read as row, store saturated 8-bit (one byte per dest line).
  41.274 +  r0 = source (16 bytes are read); not modified
  41.275 +  r1 = dest; same value on return
  41.276 +  r2 = line size; same value on return. Clobbers r3, ip and r4-r11 (not saved here).
  41.277 +*/
  41.278 +function idct_col_put_armv6
  41.279 +        push   {r1, r2, lr}          /* idct_row clobbers r1, r2 and lr */
  41.280 +
  41.281 +        ldr    r2, [r0]              /* r2 = row[2,0] */
  41.282 +        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
  41.283 +        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
  41.284 +        idct_row COL_SHIFT
  41.285 +        pop    {r1, r2}
  41.286 +        idct_finish_shift_sat COL_SHIFT
  41.287 +
  41.288 +        strb   r4, [r1], r2          /* line 0 = A0 + B0; each store post-increments r1 by line size */
  41.289 +        strb   r5, [r1], r2          /* line 1 = A1 + B1 */
  41.290 +        strb   r6, [r1], r2          /* line 2 = A2 + B2 */
  41.291 +        strb   r7, [r1], r2          /* line 3 = A3 + B3 */
  41.292 +        strb   r11,[r1], r2          /* line 4 = A3 - B3 */
  41.293 +        strb   r10,[r1], r2          /* line 5 = A2 - B2 */
  41.294 +        strb   r9, [r1], r2          /* line 6 = A1 - B1 */
  41.295 +        strb   r8, [r1], r2          /* line 7 = A0 - B0 */
  41.296 +
  41.297 +        sub    r1, r1, r2, lsl #3    /* undo the 8 post-increments: r1 = dest again */
  41.298 +
  41.299 +        pop    {pc}
  41.300 +endfunc
  41.301 +
  41.302 +/*
  41.303 +  Compute IDCT of single column, read as row, add/store saturated 8-bit (one byte per dest line).
  41.304 +  r0 = source (16 bytes are read); not modified
  41.305 +  r1 = dest; same value on return
  41.306 +  r2 = line size; same value on return. Clobbers r3, ip and r4-r11 (not saved here).
  41.307 +*/
  41.308 +function idct_col_add_armv6
  41.309 +        push   {r1, r2, lr}          /* idct_row clobbers r1, r2 and lr */
  41.310 +
  41.311 +        ldr    r2, [r0]              /* r2 = row[2,0] */
  41.312 +        ldr    ip, w42               /* ip = W4 | (W2 << 16) */
  41.313 +        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
  41.314 +        idct_row COL_SHIFT
  41.315 +        pop    {r1, r2}
  41.316 +        idct_finish
  41.317 +
  41.318 +        ldrb   r3, [r1]              /* r3 = dest line 0 */
  41.319 +        ldrb   r7, [r1, r2]          /* r7 = dest line 1 */
  41.320 +        ldrb   r11,[r1, r2, lsl #2]  /* dest line 4. NOTE(review): r11 is reloaded below before it is read, so this load looks redundant */
  41.321 +        add    ip, r3, ip, asr #COL_SHIFT    /* line 0 + ((A0 + B0) >> COL_SHIFT) */
  41.322 +        usat   ip, #8, ip
  41.323 +        add    r4, r7, r4, asr #COL_SHIFT    /* line 1 + ((A1 + B1) >> COL_SHIFT) */
  41.324 +        strb   ip, [r1], r2          /* store line 0; r1 -> line 1 */
  41.325 +        ldrb   ip, [r1, r2]          /* ip = dest line 2 */
  41.326 +        usat   r4, #8, r4
  41.327 +        ldrb   r11,[r1, r2, lsl #2]  /* r11 = dest line 5 */
  41.328 +        add    r5, ip, r5, asr #COL_SHIFT    /* line 2 + ((A2 + B2) >> COL_SHIFT) */
  41.329 +        usat   r5, #8, r5
  41.330 +        strb   r4, [r1], r2          /* store line 1; r1 -> line 2 */
  41.331 +        ldrb   r3, [r1, r2]          /* r3 = dest line 3 */
  41.332 +        ldrb   ip, [r1, r2, lsl #2]  /* ip = dest line 6 */
  41.333 +        strb   r5, [r1], r2          /* store line 2; r1 -> line 3 */
  41.334 +        ldrb   r7, [r1, r2]          /* r7 = dest line 4 */
  41.335 +        ldrb   r4, [r1, r2, lsl #2]  /* r4 = dest line 7 */
  41.336 +        add    r6, r3, r6, asr #COL_SHIFT    /* line 3 + ((A3 + B3) >> COL_SHIFT) */
  41.337 +        usat   r6, #8, r6
  41.338 +        add    r10,r7, r10,asr #COL_SHIFT    /* line 4 + ((A3 - B3) >> COL_SHIFT) */
  41.339 +        usat   r10,#8, r10
  41.340 +        add    r9, r11,r9, asr #COL_SHIFT    /* line 5 + ((A2 - B2) >> COL_SHIFT) */
  41.341 +        usat   r9, #8, r9
  41.342 +        add    r8, ip, r8, asr #COL_SHIFT    /* line 6 + ((A1 - B1) >> COL_SHIFT) */
  41.343 +        usat   r8, #8, r8
  41.344 +        add    lr, r4, lr, asr #COL_SHIFT    /* line 7 + ((A0 - B0) >> COL_SHIFT) */
  41.345 +        usat   lr, #8, lr
  41.346 +        strb   r6, [r1], r2          /* store lines 3..7 */
  41.347 +        strb   r10,[r1], r2
  41.348 +        strb   r9, [r1], r2
  41.349 +        strb   r8, [r1], r2
  41.350 +        strb   lr, [r1], r2
  41.351 +
  41.352 +        sub    r1, r1, r2, lsl #3    /* undo the 8 post-increments: r1 = dest again */
  41.353 +
  41.354 +        pop    {pc}
  41.355 +endfunc
  41.356 +
  41.357 +/*
  41.358 +  Compute 8 IDCT row transforms, visiting the 16-byte source lines in order 0,2,4,6,1,3,5,7.
  41.359 +  func = IDCT row->col function (called with r0 = source line, r1 = dest)
  41.360 +  width = width of columns in bytes; r1 ends 7*width past its start, r0 is restored
  41.361 +*/
  41.362 +        .macro idct_rows func width
  41.363 +        bl     \func                 /* line 0 */
  41.364 +        add    r0, r0, #(16*2)
  41.365 +        add    r1, r1, #\width
  41.366 +        bl     \func                 /* line 2 */
  41.367 +        add    r0, r0, #(16*2)
  41.368 +        add    r1, r1, #\width
  41.369 +        bl     \func                 /* line 4 */
  41.370 +        add    r0, r0, #(16*2)
  41.371 +        add    r1, r1, #\width
  41.372 +        bl     \func                 /* line 6 */
  41.373 +        sub    r0, r0, #(16*5)       /* from line 6 back to line 1 */
  41.374 +        add    r1, r1, #\width
  41.375 +        bl     \func                 /* line 1 */
  41.376 +        add    r0, r0, #(16*2)
  41.377 +        add    r1, r1, #\width
  41.378 +        bl     \func                 /* line 3 */
  41.379 +        add    r0, r0, #(16*2)
  41.380 +        add    r1, r1, #\width
  41.381 +        bl     \func                 /* line 5 */
  41.382 +        add    r0, r0, #(16*2)
  41.383 +        add    r1, r1, #\width
  41.384 +        bl     \func                 /* line 7 */
  41.385 +
  41.386 +        sub    r0, r0, #(16*7)       /* r0 back to line 0 */
  41.387 +        .endm
  41.388 +
  41.389 +/* void ff_simple_idct_armv6(DCTELEM *data); in-place: row pass into a stack buffer, column pass back into data */
  41.390 +function ff_simple_idct_armv6, export=1
  41.391 +        push   {r4-r11, lr}          /* the row/col helpers clobber r4-r11 */
  41.392 +        sub    sp, sp, #128          /* 128-byte temporary buffer on the stack */
  41.393 +
  41.394 +        mov    r1, sp                /* row pass: data (r0) -> temporary buffer */
  41.395 +        idct_rows idct_row_armv6, 2
  41.396 +        mov    r1, r0                /* column pass: temporary buffer -> data (idct_rows restored r0) */
  41.397 +        mov    r0, sp
  41.398 +        idct_rows idct_col_armv6, 2
  41.399 +
  41.400 +        add    sp, sp, #128          /* release the temporary buffer */
  41.401 +        pop    {r4-r11, pc}
  41.402 +endfunc
  41.403 +
  41.404 +/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data); IDCT of data added to dest with 8-bit saturation */
  41.405 +function ff_simple_idct_add_armv6, export=1
  41.406 +        push   {r0, r1, r4-r11, lr}  /* dest and line_size are kept on the stack for the column pass */
  41.407 +        sub    sp, sp, #128          /* 128-byte temporary buffer on the stack */
  41.408 +
  41.409 +        mov    r0, r2                /* row pass: data -> temporary buffer */
  41.410 +        mov    r1, sp
  41.411 +        idct_rows idct_row_armv6, 2
  41.412 +        mov    r0, sp                /* column pass: temporary buffer -> dest */
  41.413 +        ldr    r1, [sp, #128]        /* r1 = saved dest */
  41.414 +        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
  41.415 +        idct_rows idct_col_add_armv6, 1
  41.416 +
  41.417 +        add    sp, sp, #(128+8)      /* drop the buffer and the saved r0/r1 */
  41.418 +        pop    {r4-r11, pc}
  41.419 +endfunc
  41.420 +
  41.421 +/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data); IDCT of data stored to dest with 8-bit saturation */
  41.422 +function ff_simple_idct_put_armv6, export=1
  41.423 +        push   {r0, r1, r4-r11, lr}  /* dest and line_size are kept on the stack for the column pass */
  41.424 +        sub    sp, sp, #128          /* 128-byte temporary buffer on the stack */
  41.425 +
  41.426 +        mov    r0, r2                /* row pass: data -> temporary buffer */
  41.427 +        mov    r1, sp
  41.428 +        idct_rows idct_row_armv6, 2
  41.429 +        mov    r0, sp                /* column pass: temporary buffer -> dest */
  41.430 +        ldr    r1, [sp, #128]        /* r1 = saved dest */
  41.431 +        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
  41.432 +        idct_rows idct_col_put_armv6, 1
  41.433 +
  41.434 +        add    sp, sp, #(128+8)      /* drop the buffer and the saved r0/r1 */
  41.435 +        pop    {r4-r11, pc}
  41.436 +endfunc

    42.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    42.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/simple_idct_neon.S	Mon Aug 27 12:09:56 2012 +0200
    42.3 @@ -0,0 +1,373 @@
    42.4 +/*
    42.5 + * ARM NEON IDCT
    42.6 + *
    42.7 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
    42.8 + *
    42.9 + * Based on Simple IDCT
   42.10 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
   42.11 + *
   42.12 + * This file is part of FFmpeg.
   42.13 + *
   42.14 + * FFmpeg is free software; you can redistribute it and/or
   42.15 + * modify it under the terms of the GNU Lesser General Public
   42.16 + * License as published by the Free Software Foundation; either
   42.17 + * version 2.1 of the License, or (at your option) any later version.
   42.18 + *
   42.19 + * FFmpeg is distributed in the hope that it will be useful,
   42.20 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   42.21 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   42.22 + * Lesser General Public License for more details.
   42.23 + *
   42.24 + * You should have received a copy of the GNU Lesser General Public
   42.25 + * License along with FFmpeg; if not, write to the Free Software
   42.26 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   42.27 + */
   42.28 +
   42.29 +#include "asm.S"
   42.30 +
   42.31 +#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.32 +#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.33 +#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.34 +#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.35 +#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.36 +#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.37 +#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
   42.38 +#define W4c ((1<<(COL_SHIFT-1))/W4)  /* column rounding term divided by W4; added to col[0] before the multiply by W4 */
   42.39 +#define ROW_SHIFT 11                 /* right shift applied after the row pass */
   42.40 +#define COL_SHIFT 20                 /* total right shift applied after the column pass */
   42.41 +
   42.42 +#define w1 d0[0]                     /* w1..w4c: lanes of d0/d1, which idct_start loads from idct_coeff_neon */
   42.43 +#define w2 d0[1]
   42.44 +#define w3 d0[2]
   42.45 +#define w4 d0[3]
   42.46 +#define w5 d1[0]
   42.47 +#define w6 d1[1]
   42.48 +#define w7 d1[2]
   42.49 +#define w4c d1[3]
   42.50 +
   42.51 +        .macro idct_col4_top            /* in: q15 = W4*col[0] term incl. rounding, d4/d6/d8 = col[1]/col[2]/col[3] */
   42.52 +        vmull.s16       q7,  d6,  w2    /* q7   = W2 * col[2] */
   42.53 +        vmull.s16       q8,  d6,  w6    /* q8   = W6 * col[2] */
   42.54 +        vmull.s16       q9,  d4,  w1    /* q9   = W1 * col[1] */
   42.55 +        vadd.i32        q11, q15, q7    /* q11  = q15 + W2 * col[2] */
   42.56 +        vmull.s16       q10, d4,  w3    /* q10  = W3 * col[1] */
   42.57 +        vadd.i32        q12, q15, q8    /* q12  = q15 + W6 * col[2] */
   42.58 +        vmull.s16       q5,  d4,  w5    /* q5   = W5 * col[1] */
   42.59 +        vsub.i32        q13, q15, q8    /* q13  = q15 - W6 * col[2] */
   42.60 +        vmull.s16       q6,  d4,  w7    /* q6   = W7 * col[1] */
   42.61 +        vsub.i32        q14, q15, q7    /* q14  = q15 - W2 * col[2] */
   42.62 +
   42.63 +        vmlal.s16       q9,  d8,  w3    /* q9  += W3 * col[3] */
   42.64 +        vmlsl.s16       q10, d8,  w7    /* q10 -= W7 * col[3] */
   42.65 +        vmlsl.s16       q5,  d8,  w1    /* q5  -= W1 * col[3] */
   42.66 +        vmlsl.s16       q6,  d8,  w5    /* q6  -= W5 * col[3] */
   42.67 +        .endm
   42.68 +
   42.69 +        .text
   42.70 +        .align 6
   42.71 +
   42.72 +function idct_row4_pld_neon             /* prefetch 8 lines at r0 with stride r1, then continue into idct_row4_neon */
   42.73 +        pld             [r0]                    /* line 0 */
   42.74 +        add             r3,  r0,  r1,  lsl #2   /* r3 = line 4 */
   42.75 +        pld             [r0, r1]                /* line 1 */
   42.76 +        pld             [r0, r1, lsl #1]        /* line 2 */
   42.77 +        pld             [r3, -r1]               /* line 3 */
   42.78 +        pld             [r3]                    /* line 4 */
   42.79 +        pld             [r3, r1]                /* line 5 */
   42.80 +        add             r3,  r3,  r1,  lsl #1   /* r3 = line 6 */
   42.81 +        pld             [r3]                    /* line 6 */
   42.82 +        pld             [r3, r1]                /* line 7 */
   42.83 +endfunc                                 /* no return instruction here: control falls through into idct_row4_neon below */
   42.84 +
   42.85 +function idct_row4_neon                 /* transforms the 64 bytes at r2 in place; r2 ends 64 bytes further on; clobbers r3, r4 */
   42.86 +        vmov.i32        q15, #(1<<(ROW_SHIFT-1))        /* rounding term */
   42.87 +        vld1.64         {d2-d5},  [r2,:128]!
   42.88 +        vmlal.s16       q15, d2,  w4    /* q15  += W4 * col[0] */
   42.89 +        vld1.64         {d6,d7},  [r2,:128]!
   42.90 +        vorr            d10, d3,  d5
   42.91 +        vld1.64         {d8,d9},  [r2,:128]!
   42.92 +        add             r2,  r2,  #-64  /* back to the start of the 64 bytes just loaded */
   42.93 +
   42.94 +        vorr            d11, d7,  d9
   42.95 +        vorr            d10, d10, d11   /* d10 = d3 | d5 | d7 | d9 */
   42.96 +        vmov            r3,  r4,  d10
   42.97 +
   42.98 +        idct_col4_top
   42.99 +
  42.100 +        orrs            r3,  r3,  r4    /* Z set if d3, d5, d7 and d9 are all zero */
  42.101 +        beq             1f              /* then the terms below contribute nothing */
  42.102 +
  42.103 +        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
  42.104 +        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
  42.105 +        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
  42.106 +        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
  42.107 +        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
  42.108 +        vadd.i32        q11, q11, q7    /* q11 += W4 * col[4] */
  42.109 +        vsub.i32        q12, q12, q7    /* q12 -= W4 * col[4] */
  42.110 +        vsub.i32        q13, q13, q7    /* q13 -= W4 * col[4] */
  42.111 +        vadd.i32        q14, q14, q7    /* q14 += W4 * col[4] */
  42.112 +        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
  42.113 +        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
  42.114 +        vmlal.s16       q9,  d9,  w7    /* q9  += W7 * col[7] */
  42.115 +        vmlsl.s16       q10, d9,  w5    /* q10 -= W5 * col[7] */
  42.116 +        vmlal.s16       q5,  d9,  w3    /* q5  += W3 * col[7] */
  42.117 +        vmlsl.s16       q6,  d9,  w1    /* q6  -= W1 * col[7] */
  42.118 +        vadd.i32        q11, q11, q7    /* q11 += W6 * col[6] */
  42.119 +        vsub.i32        q12, q12, q8    /* q12 -= W2 * col[6] */
  42.120 +        vadd.i32        q13, q13, q8    /* q13 += W2 * col[6] */
  42.121 +        vsub.i32        q14, q14, q7    /* q14 -= W6 * col[6] */
  42.122 +
  42.123 +1:      vadd.i32        q3,  q11, q9    /* sums: q11..q14 + q9, q10, q5, q6 */
  42.124 +        vadd.i32        q4,  q12, q10
  42.125 +        vshrn.i32       d2,  q3,  #ROW_SHIFT    /* narrow to 16 bits with >> ROW_SHIFT */
  42.126 +        vshrn.i32       d4,  q4,  #ROW_SHIFT
  42.127 +        vadd.i32        q7,  q13, q5
  42.128 +        vadd.i32        q8,  q14, q6
  42.129 +        vtrn.16         d2,  d4         /* vtrn.16 + vtrn.32 on d2/d4/d6/d8: 4x4 halfword transpose */
  42.130 +        vshrn.i32       d6,  q7,  #ROW_SHIFT
  42.131 +        vshrn.i32       d8,  q8,  #ROW_SHIFT
  42.132 +        vsub.i32        q14, q14, q6    /* differences: q14..q11 - q6, q9, q5, q10 */
  42.133 +        vsub.i32        q11, q11, q9
  42.134 +        vtrn.16         d6,  d8
  42.135 +        vsub.i32        q13, q13, q5
  42.136 +        vshrn.i32       d3,  q14, #ROW_SHIFT
  42.137 +        vtrn.32         d2,  d6
  42.138 +        vsub.i32        q12, q12, q10
  42.139 +        vtrn.32         d4,  d8
  42.140 +        vshrn.i32       d5,  q13, #ROW_SHIFT
  42.141 +        vshrn.i32       d7,  q12, #ROW_SHIFT
  42.142 +        vshrn.i32       d9,  q11, #ROW_SHIFT
  42.143 +
  42.144 +        vtrn.16         d3,  d5         /* same 4x4 transpose for d3/d5/d7/d9 */
  42.145 +        vtrn.16         d7,  d9
  42.146 +        vtrn.32         d3,  d7
  42.147 +        vtrn.32         d5,  d9
  42.148 +
  42.149 +        vst1.64         {d2-d5},  [r2,:128]!   /* write the 64 bytes back where they were loaded from */
  42.150 +        vst1.64         {d6-d9},  [r2,:128]!
  42.151 +
  42.152 +        bx              lr
  42.153 +endfunc
  42.154 +
  42.155 +function idct_col4_neon                 /* in: r2 = first of 8 lines 16 bytes apart; out: d2-d9; r2 ends 128 bytes on; clobbers r4-r7, ip */
  42.156 +        mov             ip,  #16        /* line stride in bytes */
  42.157 +        vld1.64         {d2}, [r2,:64], ip /* d2 = col[0] */
  42.158 +        vdup.16         d30, w4c
  42.159 +        vld1.64         {d4}, [r2,:64], ip /* d4 = col[1] */
  42.160 +        vadd.i16        d30, d30, d2
  42.161 +        vld1.64         {d6}, [r2,:64], ip /* d6 = col[2] */
  42.162 +        vmull.s16       q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
  42.163 +        vld1.64         {d8}, [r2,:64], ip /* d8 = col[3] */
  42.164 +
  42.165 +        ldrd            r4,  [r2]       /* r4,r5 = the 8 bytes of col[4] */
  42.166 +        ldrd            r6,  [r2, #16]  /* r6,r7 = the 8 bytes of col[5] */
  42.167 +        orrs            r4,  r4,  r5    /* Z set if col[4] is all zero */
  42.168 +
  42.169 +        idct_col4_top                   /* only NEON instructions: the flags from orrs are still valid below */
  42.170 +        addeq           r2,  r2,  #16   /* col[4] == 0: just step over it */
  42.171 +        beq             1f
  42.172 +
  42.173 +        vld1.64         {d3}, [r2,:64], ip /* d3 = col[4] */
  42.174 +        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
  42.175 +        vadd.i32        q11, q11, q7
  42.176 +        vsub.i32        q12, q12, q7
  42.177 +        vsub.i32        q13, q13, q7
  42.178 +        vadd.i32        q14, q14, q7
  42.179 +
  42.180 +1:      orrs            r6,  r6,  r7    /* Z set if col[5] is all zero */
  42.181 +        ldrd            r4,  [r2, #16]  /* r4,r5 = the 8 bytes of col[6]; ldrd leaves the flags alone */
  42.182 +        addeq           r2,  r2,  #16   /* col[5] == 0: just step over it */
  42.183 +        beq             2f
  42.184 +
  42.185 +        vld1.64         {d5}, [r2,:64], ip /* d5 = col[5] */
  42.186 +        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
  42.187 +        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
  42.188 +        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
  42.189 +        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
  42.190 +
  42.191 +2:      orrs            r4,  r4,  r5    /* Z set if col[6] is all zero */
  42.192 +        ldrd            r4,  [r2, #16]  /* r4,r5 = the 8 bytes of col[7] */
  42.193 +        addeq           r2,  r2,  #16   /* col[6] == 0: just step over it */
  42.194 +        beq             3f
  42.195 +
  42.196 +        vld1.64         {d7}, [r2,:64], ip /* d7 = col[6] */
  42.197 +        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
  42.198 +        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
  42.199 +        vadd.i32        q11, q11, q7
  42.200 +        vsub.i32        q14, q14, q7
  42.201 +        vsub.i32        q12, q12, q8
  42.202 +        vadd.i32        q13, q13, q8
  42.203 +
  42.204 +3:      orrs            r4,  r4,  r5    /* Z set if col[7] is all zero */
  42.205 +        addeq           r2,  r2,  #16   /* col[7] == 0: just step over it */
  42.206 +        beq             4f
  42.207 +
  42.208 +        vld1.64         {d9}, [r2,:64], ip /* d9 = col[7] */
  42.209 +        vmlal.s16       q9,  d9,  w7    /* q9  += W7 * col[7] */
  42.210 +        vmlsl.s16       q10, d9,  w5    /* q10 -= W5 * col[7] */
  42.211 +        vmlal.s16       q5,  d9,  w3    /* q5  += W3 * col[7] */
  42.212 +        vmlsl.s16       q6,  d9,  w1    /* q6  -= W1 * col[7] */
  42.213 +
  42.214 +4:      vaddhn.i32      d2,  q11, q9    /* d2..d5: high halfwords of the sums (i.e. >> 16) */
  42.215 +        vaddhn.i32      d3,  q12, q10
  42.216 +        vaddhn.i32      d4,  q13, q5
  42.217 +        vaddhn.i32      d5,  q14, q6
  42.218 +        vsubhn.i32      d9,  q11, q9    /* d9..d6: high halfwords of the differences, in mirrored order */
  42.219 +        vsubhn.i32      d8,  q12, q10
  42.220 +        vsubhn.i32      d7,  q13, q5
  42.221 +        vsubhn.i32      d6,  q14, q6
  42.222 +
  42.223 +        bx              lr
  42.224 +endfunc
  42.225 +
  42.226 +        .align 6
  42.227 +
  42.228 +function idct_col4_st8_neon             /* in: d2-d9 from idct_col4_neon; stores 8 lines of 4 bytes at r0, stride r1 */
  42.229 +        vqshrun.s16     d2,  q1,  #COL_SHIFT-16 /* remaining shift, then saturate to unsigned 8 bit */
  42.230 +        vqshrun.s16     d3,  q2,  #COL_SHIFT-16
  42.231 +        vqshrun.s16     d4,  q3,  #COL_SHIFT-16
  42.232 +        vqshrun.s16     d5,  q4,  #COL_SHIFT-16
  42.233 +        vst1.32         {d2[0]}, [r0,:32], r1   /* each store advances r0 by r1; r0 ends 8 lines further on */
  42.234 +        vst1.32         {d2[1]}, [r0,:32], r1
  42.235 +        vst1.32         {d3[0]}, [r0,:32], r1
  42.236 +        vst1.32         {d3[1]}, [r0,:32], r1
  42.237 +        vst1.32         {d4[0]}, [r0,:32], r1
  42.238 +        vst1.32         {d4[1]}, [r0,:32], r1
  42.239 +        vst1.32         {d5[0]}, [r0,:32], r1
  42.240 +        vst1.32         {d5[1]}, [r0,:32], r1
  42.241 +
  42.242 +        bx              lr
  42.243 +endfunc
  42.244 +
  42.245 +        .section .rodata
  42.246 +        .align 4
  42.247 +idct_coeff_neon:                        /* 8 halfwords, loaded into d0/d1 by idct_start (lanes w1..w7, w4c) */
  42.248 +        .short W1, W2, W3, W4, W5, W6, W7, W4c
  42.249 +        .previous
  42.250 +
  42.251 +        .macro idct_start data          /* common prologue; data = register holding the coefficient block address */
  42.252 +        push            {r4-r7, lr}     /* r4-r7 are used as scratch by the helper routines */
  42.253 +        pld             [\data]
  42.254 +        pld             [\data, #64]
  42.255 +        vpush           {d8-d15}        /* q4-q7 are used as scratch by the helper routines */
  42.256 +        movrel          r3,  idct_coeff_neon    /* movrel is defined outside this file; presumably r3 = address of the table */
  42.257 +        vld1.64         {d0,d1}, [r3,:128]      /* d0/d1 = W1..W7, W4c */
  42.258 +        .endm
  42.259 +
  42.260 +        .macro idct_end                 /* common epilogue: undoes idct_start and returns */
  42.261 +        vpop            {d8-d15}
  42.262 +        pop             {r4-r7, pc}     /* the saved lr is popped into pc */
  42.263 +        .endm
  42.264 +
  42.265 +/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); IDCT of data stored to dst with 8-bit saturation */
  42.266 +function ff_simple_idct_put_neon, export=1
  42.267 +        idct_start      r2
  42.268 +
  42.269 +        bl              idct_row4_pld_neon      /* prefetch dst lines, then (by fall-through) row pass on bytes 0-63 of data */
  42.270 +        bl              idct_row4_neon          /* row pass on bytes 64-127 of data */
  42.271 +        add             r2,  r2,  #-128         /* r2 back to the start of data */
  42.272 +        bl              idct_col4_neon          /* column pass on bytes 0-7 of each 16-byte line */
  42.273 +        bl              idct_col4_st8_neon
  42.274 +        sub             r0,  r0,  r1, lsl #3    /* the store routine advanced r0 by 8 lines; rewind */
  42.275 +        add             r0,  r0,  #4            /* next 4 bytes of each dst line */
  42.276 +        add             r2,  r2,  #-120         /* r2 = data + 8: bytes 8-15 of each 16-byte line */
  42.277 +        bl              idct_col4_neon
  42.278 +        bl              idct_col4_st8_neon
  42.279 +
  42.280 +        idct_end
  42.281 +endfunc
  42.282 +
  42.283 +        .align 6
  42.284 +
  42.285 +function idct_col4_add8_neon            /* in: d2-d9 from idct_col4_neon; adds them to 8 lines of 4 bytes at r0, stride r1 */
  42.286 +        mov             ip,  r0         /* ip = write pointer; r0 is the read pointer and ends 8 lines further on */
  42.287 +
  42.288 +        vld1.32         {d10[0]}, [r0,:32], r1  /* d10 = dst lines 0,1 */
  42.289 +        vshr.s16        q1,  q1,  #COL_SHIFT-16 /* remaining part of the column shift */
  42.290 +        vld1.32         {d10[1]}, [r0,:32], r1
  42.291 +        vshr.s16        q2,  q2,  #COL_SHIFT-16
  42.292 +        vld1.32         {d11[0]}, [r0,:32], r1  /* d11 = dst lines 2,3 */
  42.293 +        vshr.s16        q3,  q3,  #COL_SHIFT-16
  42.294 +        vld1.32         {d11[1]}, [r0,:32], r1
  42.295 +        vshr.s16        q4,  q4,  #COL_SHIFT-16
  42.296 +        vld1.32         {d12[0]}, [r0,:32], r1  /* d12 = dst lines 4,5 */
  42.297 +        vaddw.u8        q1,  q1,  d10   /* widen the dst bytes and add */
  42.298 +        vld1.32         {d12[1]}, [r0,:32], r1
  42.299 +        vaddw.u8        q2,  q2,  d11
  42.300 +        vld1.32         {d13[0]}, [r0,:32], r1  /* d13 = dst lines 6,7 */
  42.301 +        vqmovun.s16     d2,  q1         /* saturate to unsigned 8 bit */
  42.302 +        vld1.32         {d13[1]}, [r0,:32], r1
  42.303 +        vaddw.u8        q3,  q3,  d12
  42.304 +        vst1.32         {d2[0]},  [ip,:32], r1  /* write back lines 0..7 through ip */
  42.305 +        vqmovun.s16     d3,  q2
  42.306 +        vst1.32         {d2[1]},  [ip,:32], r1
  42.307 +        vaddw.u8        q4,  q4,  d13
  42.308 +        vst1.32         {d3[0]},  [ip,:32], r1
  42.309 +        vqmovun.s16     d4,  q3
  42.310 +        vst1.32         {d3[1]},  [ip,:32], r1
  42.311 +        vqmovun.s16     d5,  q4
  42.312 +        vst1.32         {d4[0]},  [ip,:32], r1
  42.313 +        vst1.32         {d4[1]},  [ip,:32], r1
  42.314 +        vst1.32         {d5[0]},  [ip,:32], r1
  42.315 +        vst1.32         {d5[1]},  [ip,:32], r1
  42.316 +
  42.317 +        bx              lr
  42.318 +endfunc
  42.319 +
  42.320 +/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); IDCT of data added to dst with 8-bit saturation */
  42.321 +function ff_simple_idct_add_neon, export=1
  42.322 +        idct_start      r2
  42.323 +
  42.324 +        bl              idct_row4_pld_neon      /* prefetch dst lines, then (by fall-through) row pass on bytes 0-63 of data */
  42.325 +        bl              idct_row4_neon          /* row pass on bytes 64-127 of data */
  42.326 +        add             r2,  r2,  #-128         /* r2 back to the start of data */
  42.327 +        bl              idct_col4_neon          /* column pass on bytes 0-7 of each 16-byte line */
  42.328 +        bl              idct_col4_add8_neon
  42.329 +        sub             r0,  r0,  r1, lsl #3    /* the add routine advanced r0 by 8 lines; rewind */
  42.330 +        add             r0,  r0,  #4            /* next 4 bytes of each dst line */
  42.331 +        add             r2,  r2,  #-120         /* r2 = data + 8: bytes 8-15 of each 16-byte line */
  42.332 +        bl              idct_col4_neon
  42.333 +        bl              idct_col4_add8_neon
  42.334 +
  42.335 +        idct_end
  42.336 +endfunc
  42.337 +
  42.338 +        .align 6
  42.339 +
  42.340 +function idct_col4_st16_neon            /* in: d2-d9 from idct_col4_neon; stores 8 lines of 8 bytes at r2, 16 bytes apart */
  42.341 +        mov             ip,  #16        /* line stride in bytes */
  42.342 +
  42.343 +        vshr.s16        q1,  q1,  #COL_SHIFT-16 /* remaining part of the column shift */
  42.344 +        vshr.s16        q2,  q2,  #COL_SHIFT-16
  42.345 +        vst1.64         {d2}, [r2,:64], ip      /* each store advances r2 by 16; r2 ends 128 bytes further on */
  42.346 +        vshr.s16        q3,  q3,  #COL_SHIFT-16
  42.347 +        vst1.64         {d3}, [r2,:64], ip
  42.348 +        vshr.s16        q4,  q4,  #COL_SHIFT-16
  42.349 +        vst1.64         {d4}, [r2,:64], ip
  42.350 +        vst1.64         {d5}, [r2,:64], ip
  42.351 +        vst1.64         {d6}, [r2,:64], ip
  42.352 +        vst1.64         {d7}, [r2,:64], ip
  42.353 +        vst1.64         {d8}, [r2,:64], ip
  42.354 +        vst1.64         {d9}, [r2,:64], ip
  42.355 +
  42.356 +        bx              lr
  42.357 +endfunc
  42.358 +
  42.359 +/* void ff_simple_idct_neon(DCTELEM *data); in-place IDCT of the 128-byte block at data */
  42.360 +function ff_simple_idct_neon, export=1
  42.361 +        idct_start      r0
  42.362 +
  42.363 +        mov             r2,  r0                 /* the helper routines address the block through r2 */
  42.364 +        bl              idct_row4_neon          /* row pass on bytes 0-63 */
  42.365 +        bl              idct_row4_neon          /* row pass on bytes 64-127 */
  42.366 +        add             r2,  r2,  #-128         /* r2 back to the start of data */
  42.367 +        bl              idct_col4_neon          /* column pass on bytes 0-7 of each 16-byte line */
  42.368 +        add             r2,  r2,  #-128         /* rewind again so the results overwrite their inputs */
  42.369 +        bl              idct_col4_st16_neon
  42.370 +        add             r2,  r2,  #-120         /* r2 = data + 8: bytes 8-15 of each 16-byte line */
  42.371 +        bl              idct_col4_neon
  42.372 +        add             r2,  r2,  #-128
  42.373 +        bl              idct_col4_st16_neon
  42.374 +
  42.375 +        idct_end
  42.376 +endfunc

    43.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    43.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/synth_filter_neon.S	Mon Aug 27 12:09:56 2012 +0200
    43.3 @@ -0,0 +1,117 @@
    43.4 +/*
    43.5 + * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
    43.6 + *
    43.7 + * This file is part of FFmpeg.
    43.8 + *
    43.9 + * FFmpeg is free software; you can redistribute it and/or
   43.10 + * modify it under the terms of the GNU Lesser General Public
   43.11 + * License as published by the Free Software Foundation; either
   43.12 + * version 2.1 of the License, or (at your option) any later version.
   43.13 + *
   43.14 + * FFmpeg is distributed in the hope that it will be useful,
   43.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   43.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   43.17 + * Lesser General Public License for more details.
   43.18 + *
   43.19 + * You should have received a copy of the GNU Lesser General Public
   43.20 + * License along with FFmpeg; if not, write to the Free Software
   43.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   43.22 + */
   43.23 +
   43.24 +#include "asm.S"
   43.25 +
   43.26 +        preserve8
   43.27 +
   43.28 +function ff_synth_filter_float_neon, export=1
   43.29 +        push            {r3-r11,lr}             @ 10 words; r3 alone is popped again right after the imdct call
   43.30 +
   43.31 +        ldr             r4,  [r2]               @ synth_buf_offset
   43.32 +        add             r1,  r1,  r4,  lsl #2   @ synth_buf
   43.33 +        sub             r12, r4,  #32           @ r12 = offset - 32
   43.34 +        bfc             r12, #9,  #23           @ clear bits 9..31, i.e. r12 = (offset - 32) & 511
   43.35 +        bic             r4,  r4,  #63           @ r4 = offset & ~63, compared against the inner counter below
   43.36 +        str             r12, [r2]               @ write the updated offset back through r2
   43.37 +
   43.38 +        ldr             r2,  [sp, #12*4]        @ in
   43.39 +        mov             r9,  r1                 @ synth_buf
   43.40 +
   43.41 +VFP     vpush           {d0}                    @ NOTE(review): the line prefix macro comes from asm.S (not shown); when assembled, d0 is kept across the call
   43.42 +        bl              ff_imdct_half_neon      @ called with r0 unchanged, r1 = synth_buf + offset, r2 = in
   43.43 +VFP     vpop            {d0}
   43.44 +        pop             {r3}                    @ reload r3 as saved at entry (labelled synth_buf2 below); 9 words stay pushed
   43.45 +
   43.46 +        ldr             r5,  [sp, #9*4]         @ window
   43.47 +        ldr             r2,  [sp, #10*4]        @ out
   43.48 +NOVFP   vldr            d0,  [sp, #12*4]        @ scale, bias
   43.49 +        add             r8,  r9,  #12*4         @ r8 = synth_buf + 12 floats
   43.50 +
   43.51 +        mov             lr,  #64*4              @ post-increment step for every load in loop 2: 64 floats
   43.52 +        mov             r1,  #4                 @ outer loop runs 4 times
   43.53 +1:
   43.54 +        add             r10, r9,  #16*4         @ synth_buf
   43.55 +        add             r11, r8,  #16*4
   43.56 +        add             r0,  r5,  #16*4         @ window
   43.57 +        add             r6,  r5,  #32*4
   43.58 +        add             r7,  r5,  #48*4
   43.59 +
   43.60 +        vld1.32         {q10},    [r3,:128]     @ a
   43.61 +        add             r3,  r3,  #16*4
   43.62 +        vld1.32         {q1},     [r3,:128]     @ b
   43.63 +        vmov.f32        q2,  #0.0               @ c
   43.64 +        vmov.f32        q3,  #0.0               @ d
   43.65 +
   43.66 +        mov             r12, #512               @ inner counter, reduced by 64 per pass (at most 8 passes)
   43.67 +2:
   43.68 +        vld1.32         {q9},     [r8, :128], lr
   43.69 +        vrev64.32       q9,  q9                 @ swap the two floats inside each d register
   43.70 +        vld1.32         {q8},     [r5, :128], lr
   43.71 +        vmls.f32        d20, d16, d19           @ a -= window * reversed synth_buf values
   43.72 +        vld1.32         {q11},    [r0, :128], lr
   43.73 +        vmls.f32        d21, d17, d18
   43.74 +        vld1.32         {q12},    [r9, :128], lr
   43.75 +        vmla.f32        d2,  d22, d24           @ b += window[+16] * synth_buf
   43.76 +        vld1.32         {q8},     [r6, :128], lr
   43.77 +        vmla.f32        d3,  d23, d25
   43.78 +        vld1.32         {q9},     [r10,:128], lr
   43.79 +        vmla.f32        d4,  d16, d18           @ c += window[+32] * synth_buf[+16]
   43.80 +        vld1.32         {q12},    [r11,:128], lr
   43.81 +        vmla.f32        d5,  d17, d19
   43.82 +        vrev64.32       q12, q12
   43.83 +        vld1.32         {q11},    [r7, :128], lr
   43.84 +        vmla.f32        d6,  d22, d25           @ d += window[+48] * reversed values loaded via r11
   43.85 +        vmla.f32        d7,  d23, d24
   43.86 +        subs            r12, r12, #64
   43.87 +        beq             3f                      @ counter reached 0: accumulation done
   43.88 +        cmp             r12, r4
   43.89 +        bne             2b
   43.90 +        sub             r8,  r8,  #512*4        @ counter == r4: move all four synth_buf pointers back 512 floats
   43.91 +        sub             r9,  r9,  #512*4        @ (presumably the wrap of a 512-float circular buffer -- confirm)
   43.92 +        sub             r10, r10, #512*4
   43.93 +        sub             r11, r11, #512*4
   43.94 +        b               2b
   43.95 +3:
   43.96 +        vdup.32         q8,  d0[1]              @ q8 = q9 = d0[1] in every lane
   43.97 +        vdup.32         q9,  d0[1]
   43.98 +        vmla.f32        q8,  q10, d0[0]         @ q8 = d0[1] + a * d0[0]
   43.99 +        vmla.f32        q9,  q1,  d0[0]         @ q9 = d0[1] + b * d0[0]
  43.100 +        vst1.32         {q3},     [r3,:128]     @ d goes where b was loaded from
  43.101 +        sub             r3,  r3,  #16*4
  43.102 +        vst1.32         {q2},     [r3,:128]     @ c goes where a was loaded from
  43.103 +        vst1.32         {q8},     [r2,:128]     @ first 4 results to out
  43.104 +        add             r2,  r2,  #16*4
  43.105 +        vst1.32         {q9},     [r2,:128]     @ next 4 results to out + 16 floats
  43.106 +
  43.107 +        subs            r1,  r1,  #1
  43.108 +        popeq           {r4-r11,pc}             @ fourth pass finished: restore registers and return
  43.109 +
  43.110 +        cmp             r4,  #0
  43.111 +        subeq           r8,  r8,  #512*4        @ r4 == 0: the in-loop step back never triggered, so apply it here
  43.112 +        subeq           r9,  r9,  #512*4
  43.113 +        sub             r5,  r5,  #512*4        @ undo the 8 * 64-float advance of the window pointer
  43.114 +        sub             r2,  r2,  #12*4         @ out
  43.115 +        add             r3,  r3,  #4*4          @ synth_buf2
  43.116 +        add             r5,  r5,  #4*4          @ window
  43.117 +        add             r9,  r9,  #4*4          @ synth_buf
  43.118 +        sub             r8,  r8,  #4*4          @ synth_buf
  43.119 +        b               1b
  43.120 +endfunc

    44.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    44.2 +++ b/ffmpeg_smp/h264dec/libavcodec/arm/vp3dsp_neon.S	Mon Aug 27 12:09:56 2012 +0200
    44.3 @@ -0,0 +1,420 @@
    44.4 +/*
    44.5 + * Copyright (c) 2009 David Conrad
    44.6 + *
    44.7 + * This file is part of FFmpeg.
    44.8 + *
    44.9 + * FFmpeg is free software; you can redistribute it and/or
   44.10 + * modify it under the terms of the GNU Lesser General Public
   44.11 + * License as published by the Free Software Foundation; either
   44.12 + * version 2.1 of the License, or (at your option) any later version.
   44.13 + *
   44.14 + * FFmpeg is distributed in the hope that it will be useful,
   44.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   44.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   44.17 + * Lesser General Public License for more details.
   44.18 + *
   44.19 + * You should have received a copy of the GNU Lesser General Public
   44.20 + * License along with FFmpeg; if not, write to the Free Software
   44.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   44.22 + */
   44.23 +
   44.24 +#include "asm.S"
   44.25 +
   44.26 +.section .rodata
   44.27 +.align 4                // power-of-two form on ARM gas: 16-byte alignment, matching the :128 hint of the table load
   44.28 +
   44.29 +vp3_idct_constants:     // round(65536 * cos(k * pi / 16)) for k = 1..7
   44.30 +.short 64277, 60547, 54491, 46341, 36410, 25080, 12785  // 7 halfwords = 14 bytes; vp3_idct_start_neon loads 16 bytes from here
   44.31 +
   44.32 +#define xC1S7 d0[0]     // lane names are valid once vp3_idct_start_neon has loaded the table into d0-d1
   44.33 +#define xC2S6 d0[1]
   44.34 +#define xC3S5 d0[2]
   44.35 +#define xC4S4 d0[3]
   44.36 +#define xC5S3 d1[0]
   44.37 +#define xC6S2 d1[1]
   44.38 +#define xC7S1 d1[2]
   44.39 +
   44.40 +.text
   44.41 +
   44.42 +.macro vp3_loop_filter  // in: d16..d19 = four 8-pixel lines, r2 = limit; out: d0, d1 = replacements for the d17 and d18 lines
   44.43 +    vsubl.u8        q3,  d18, d17       // q3 = d18 - d17, widened to 16 bit
   44.44 +    vsubl.u8        q2,  d16, d19       // q2 = d16 - d19, widened to 16 bit
   44.45 +    vadd.i16        q1,  q3,  q3        // q1 = 2 * (d18 - d17)
   44.46 +    vadd.i16        q2,  q2,  q3        // q2 = (d16 - d19) + (d18 - d17)
   44.47 +    vadd.i16        q0,  q1,  q2        // q0 = (d16 - d19) + 3 * (d18 - d17)
   44.48 +    vrshr.s16       q0,  q0,  #3        // v = (q0 + 4) >> 3
   44.49 +    vmovl.u8        q9,  d18            // 16-bit copy of d18 for the final subtraction
   44.50 +    vdup.u16        q15, r2             // broadcast the limit the caller loaded into r2
   44.51 +
   44.52 +    vabs.s16        q1,  q0             // q1 = |v|
   44.53 +    vshr.s16        q0,  q0,  #15       // q0 = sign mask of v: 0 or -1
   44.54 +    vqsub.u16       q2,  q15, q1        // q2 = max(limit - |v|, 0)
   44.55 +    vqsub.u16       q3,  q2,  q1        // q3 = max(q2 - |v|, 0)
   44.56 +    vsub.i16        q1,  q2,  q3        // q1 = min(|v|, max(limit - |v|, 0))
   44.57 +    veor            q1,  q1,  q0        // these two steps give the bounded magnitude
   44.58 +    vsub.i16        q0,  q1,  q0        // the sign of v again: f = (q1 ^ mask) - mask
   44.59 +
   44.60 +    vaddw.u8        q2,  q0,  d17       // d17 + f
   44.61 +    vsub.i16        q3,  q9,  q0        // d18 - f
   44.62 +    vqmovun.s16     d0,  q2             // saturate both results back to unsigned 8 bit
   44.63 +    vqmovun.s16     d1,  q3
   44.64 +.endm
   44.65 +
   44.66 +function ff_vp3_v_loop_filter_neon, export=1
   44.67 +    sub             ip,  r0,  r1                // ip = r0 - r1: first of the two rows that get rewritten
   44.68 +    sub             r0,  r0,  r1,  lsl #1       // r0 -= 2 * r1: first of the four input rows
   44.69 +    vld1.64         {d16}, [r0,:64], r1         // four rows of 8 bytes, r1 bytes apart
   44.70 +    vld1.64         {d17}, [r0,:64], r1
   44.71 +    vld1.64         {d18}, [r0,:64], r1
   44.72 +    vld1.64         {d19}, [r0,:64], r1
   44.73 +    ldrb            r2,    [r2, #129*4]         // limit = byte at offset 516 from r2; NOTE(review): table layout not visible here
   44.74 +
   44.75 +    vp3_loop_filter
   44.76 +
   44.77 +    vst1.64         {d0},  [ip,:64], r1         // filtered replacement for the row loaded into d17
   44.78 +    vst1.64         {d1},  [ip,:64], r1         // filtered replacement for the row loaded into d18
   44.79 +    bx              lr
   44.80 +endfunc
   44.81 +
   44.82 +function ff_vp3_h_loop_filter_neon, export=1
   44.83 +    sub             ip,  r0,  #1                // ip = r0 - 1: start of the two bytes rewritten in each row
   44.84 +    sub             r0,  r0,  #2                // r0 -= 2: start of the four bytes read in each row
   44.85 +    vld1.32         {d16[]},  [r0], r1          // rows 0..3: 4 bytes each, copied into both lanes
   44.86 +    vld1.32         {d17[]},  [r0], r1
   44.87 +    vld1.32         {d18[]},  [r0], r1
   44.88 +    vld1.32         {d19[]},  [r0], r1
   44.89 +    vld1.32         {d16[1]}, [r0], r1          // rows 4..7 overwrite the upper lane
   44.90 +    vld1.32         {d17[1]}, [r0], r1
   44.91 +    vld1.32         {d18[1]}, [r0], r1
   44.92 +    vld1.32         {d19[1]}, [r0], r1
   44.93 +    ldrb            r2,  [r2, #129*4]           // limit = byte at offset 516 from r2; NOTE(review): table layout not visible here
   44.94 +
   44.95 +    vtrn.8          d16, d17                    // transpose so that d16..d19 each hold one byte column
   44.96 +    vtrn.8          d18, d19                    // (the bytes at r0 - 2, - 1, + 0, + 1) for rows 0..7
   44.97 +    vtrn.16         d16, d18
   44.98 +    vtrn.16         d17, d19
   44.99 +
  44.100 +    vp3_loop_filter
  44.101 +
  44.102 +    vtrn.8          d0,  d1                     // pair the two filtered bytes of each row into one 16-bit lane
  44.103 +
  44.104 +    vst1.16         {d0[0]}, [ip], r1           // even rows come from d0, odd rows from d1
  44.105 +    vst1.16         {d1[0]}, [ip], r1
  44.106 +    vst1.16         {d0[1]}, [ip], r1
  44.107 +    vst1.16         {d1[1]}, [ip], r1
  44.108 +    vst1.16         {d0[2]}, [ip], r1
  44.109 +    vst1.16         {d1[2]}, [ip], r1
  44.110 +    vst1.16         {d0[3]}, [ip], r1
  44.111 +    vst1.16         {d1[3]}, [ip], r1
  44.112 +    bx              lr
  44.113 +endfunc
  44.114 +
  44.115 +
  44.116 +function vp3_idct_start_neon
  44.117 +    vpush           {d8-d15}                    // popped again by each exported caller with vpop {d8-d15}
  44.118 +    movrel          r3,  vp3_idct_constants
  44.119 +    vld1.64         {d0-d1},   [r3,:128]        // constants into d0-d1 (the xC?S? lanes)
  44.120 +    vld1.64         {d16-d19}, [r2,:128]!       // 128 bytes of coefficients from r2 into q8..q15; r2 is advanced
  44.121 +    vld1.64         {d20-d23}, [r2,:128]!
  44.122 +    vld1.64         {d24-d27}, [r2,:128]!
  44.123 +    vadd.s16        q1,  q8,  q12               // q1 = ip[0] + ip[4], as vp3_idct_core_neon expects
  44.124 +    vsub.s16        q8,  q8,  q12               // q8 = ip[0] - ip[4]
  44.125 +    vld1.64         {d28-d31}, [r2,:128]!
  44.126 +endfunc                                         // no return here: appears to fall through into vp3_idct_core_neon; NOTE(review): assumes function/endfunc emit no code in between
  44.127 +
  44.128 +function vp3_idct_core_neon                     // in: q1 = ip[0] + ip[4], q8 = ip[0] - ip[4], q9..q11 = ip[1..3], q13..q15 = ip[5..7]
  44.129 +    vmull.s16       q2,  d18, xC1S7     // (ip[1] * C1) << 16
  44.130 +    vmull.s16       q3,  d19, xC1S7
  44.131 +    vmull.s16       q4,  d2,  xC4S4     // ((ip[0] + ip[4]) * C4) << 16
  44.132 +    vmull.s16       q5,  d3,  xC4S4
  44.133 +    vmull.s16       q6,  d16, xC4S4     // ((ip[0] - ip[4]) * C4) << 16
  44.134 +    vmull.s16       q7,  d17, xC4S4
  44.135 +    vshrn.s32       d4,  q2,  #16       // narrow: keep bits 16..31 of each 32-bit product
  44.136 +    vshrn.s32       d5,  q3,  #16
  44.137 +    vshrn.s32       d6,  q4,  #16
  44.138 +    vshrn.s32       d7,  q5,  #16
  44.139 +    vshrn.s32       d8,  q6,  #16
  44.140 +    vshrn.s32       d9,  q7,  #16
  44.141 +    vadd.s16        q12, q1,  q3        // E = (ip[0] + ip[4]) * C4
  44.142 +    vadd.s16        q8,  q8,  q4        // F = (ip[0] - ip[4]) * C4
  44.143 +    vadd.s16        q1,  q2,  q9        // ip[1] * C1 (constants above 32767 act as C - 65536 in vmull.s16, so the input is added back)
  44.144 +
  44.145 +    vmull.s16       q2,  d30, xC1S7     // (ip[7] * C1) << 16
  44.146 +    vmull.s16       q3,  d31, xC1S7
  44.147 +    vmull.s16       q4,  d30, xC7S1     // (ip[7] * C7) << 16
  44.148 +    vmull.s16       q5,  d31, xC7S1
  44.149 +    vmull.s16       q6,  d18, xC7S1     // (ip[1] * C7) << 16
  44.150 +    vmull.s16       q7,  d19, xC7S1
  44.151 +    vshrn.s32       d4,  q2,  #16
  44.152 +    vshrn.s32       d5,  q3,  #16
  44.153 +    vshrn.s32       d6,  q4,  #16       // ip[7] * C7
  44.154 +    vshrn.s32       d7,  q5,  #16
  44.155 +    vshrn.s32       d8,  q6,  #16       // ip[1] * C7
  44.156 +    vshrn.s32       d9,  q7,  #16
  44.157 +    vadd.s16        q2,  q2,  q15       // ip[7] * C1
  44.158 +    vadd.s16        q9,  q1,  q3        // A = ip[1] * C1 + ip[7] * C7
  44.159 +    vsub.s16        q15, q4,  q2        // B = ip[1] * C7 - ip[7] * C1
  44.160 +
  44.161 +    vmull.s16       q2,  d22, xC5S3     // (ip[3] * C5) << 16
  44.162 +    vmull.s16       q3,  d23, xC5S3
  44.163 +    vmull.s16       q4,  d22, xC3S5     // (ip[3] * C3) << 16
  44.164 +    vmull.s16       q5,  d23, xC3S5
  44.165 +    vmull.s16       q6,  d26, xC5S3     // (ip[5] * C5) << 16
  44.166 +    vmull.s16       q7,  d27, xC5S3
  44.167 +    vshrn.s32       d4,  q2,  #16
  44.168 +    vshrn.s32       d5,  q3,  #16
  44.169 +    vshrn.s32       d6,  q4,  #16
  44.170 +    vshrn.s32       d7,  q5,  #16
  44.171 +    vshrn.s32       d8,  q6,  #16
  44.172 +    vshrn.s32       d9,  q7,  #16
  44.173 +    vadd.s16        q3,  q3,  q11       // ip[3] * C3
  44.174 +    vadd.s16        q4,  q4,  q13       // ip[5] * C5
  44.175 +    vadd.s16        q1,  q2,  q11       // ip[3] * C5
  44.176 +    vadd.s16        q11, q3,  q4        // C = ip[3] * C3 + ip[5] * C5
  44.177 +
  44.178 +    vmull.s16       q2,  d26, xC3S5     // (ip[5] * C3) << 16
  44.179 +    vmull.s16       q3,  d27, xC3S5
  44.180 +    vmull.s16       q4,  d20, xC2S6     // (ip[2] * C2) << 16
  44.181 +    vmull.s16       q5,  d21, xC2S6
  44.182 +    vmull.s16       q6,  d28, xC6S2     // (ip[6] * C6) << 16
  44.183 +    vmull.s16       q7,  d29, xC6S2
  44.184 +    vshrn.s32       d4,  q2,  #16
  44.185 +    vshrn.s32       d5,  q3,  #16
  44.186 +    vshrn.s32       d6,  q4,  #16
  44.187 +    vshrn.s32       d7,  q5,  #16
  44.188 +    vshrn.s32       d8,  q6,  #16       // ip[6] * C6
  44.189 +    vshrn.s32       d9,  q7,  #16
  44.190 +    vadd.s16        q2,  q2,  q13       // ip[5] * C3
  44.191 +    vadd.s16        q3,  q3,  q10       // ip[2] * C2
  44.192 +    vsub.s16        q13, q2,  q1        // D = ip[5] * C3 - ip[3] * C5
  44.193 +    vsub.s16        q1,  q9,  q11       // (A - C)
  44.194 +    vadd.s16        q11, q9,  q11       // Cd = A + C
  44.195 +    vsub.s16        q9,  q15, q13       // (B - D)
  44.196 +    vadd.s16        q13, q15, q13       // Dd = B + D
  44.197 +    vadd.s16        q15, q3,  q4        // G = ip[2] * C2 + ip[6] * C6
  44.198 +
  44.199 +    vmull.s16       q2,  d2,  xC4S4     // ((A - C) * C4) << 16
  44.200 +    vmull.s16       q3,  d3,  xC4S4
  44.201 +    vmull.s16       q4,  d28, xC2S6     // (ip[6] * C2) << 16
  44.202 +    vmull.s16       q5,  d29, xC2S6
  44.203 +    vmull.s16       q6,  d20, xC6S2     // (ip[2] * C6) << 16
  44.204 +    vmull.s16       q7,  d21, xC6S2
  44.205 +    vshrn.s32       d4,  q2,  #16
  44.206 +    vshrn.s32       d5,  q3,  #16
  44.207 +    vshrn.s32       d6,  q4,  #16
  44.208 +    vshrn.s32       d7,  q5,  #16
  44.209 +    vshrn.s32       d8,  q6,  #16       // ip[2] * C6
  44.210 +    vmull.s16       q5,  d18, xC4S4     // ((B - D) * C4) << 16
  44.211 +    vmull.s16       q6,  d19, xC4S4     // q5/q6 stay 32 bit here; vp3_idct_end_*_neon narrows them
  44.212 +    vshrn.s32       d9,  q7,  #16
  44.213 +    vadd.s16        q3,  q3,  q14       // ip[6] * C2
  44.214 +    vadd.s16        q10, q1,  q2        // Ad = (A - C) * C4
  44.215 +    vsub.s16        q14, q4,  q3        // H = ip[2] * C6 - ip[6] * C2
  44.216 +    bx              lr                  // out: q12 = E, q8 = F, q11 = Cd, q13 = Dd, q15 = G, q10 = Ad, q14 = H, q9 = B - D
  44.217 +endfunc
  44.218 +
  44.219 +.macro VP3_IDCT_END type                        // emits vp3_idct_end_row_neon / vp3_idct_end_col_neon; consumes the register set left by vp3_idct_core_neon
  44.220 +function vp3_idct_end_\type\()_neon
  44.221 +.ifc \type, col
  44.222 +    vdup.16         q0,  r3                     // col pass only: broadcast the bias from r3 (this overwrites the constants in d0-d1)
  44.223 +    vadd.s16        q12, q12, q0                // add it to E and F; each of the 8 outputs contains exactly one of them
  44.224 +    vadd.s16        q8,  q8,  q0
  44.225 +.endif
  44.226 +
  44.227 +    vshrn.s32       d2,  q5,  #16               // narrow the (B - D) * C4 products the core left in q5/q6
  44.228 +    vshrn.s32       d3,  q6,  #16
  44.229 +    vadd.s16        q2,  q12, q15       // Gd  = E + G
  44.230 +    vadd.s16        q9,  q1,  q9        // (B - D) * C4
  44.231 +    vsub.s16        q12, q12, q15       // Ed  = E - G
  44.232 +    vsub.s16        q3,  q8,  q10       // Fd  = F - Ad
  44.233 +    vadd.s16        q10, q8,  q10       // Add = F + Ad
  44.234 +    vadd.s16        q4,  q9,  q14       // Hd  = Bd + H
  44.235 +    vsub.s16        q14, q9,  q14       // Bdd = Bd - H
  44.236 +    vadd.s16        q8,  q2,  q11       // [0] = Gd + Cd
  44.237 +    vsub.s16        q15, q2,  q11       // [7] = Gd - Cd
  44.238 +    vadd.s16        q9,  q10, q4        // [1] = Add + Hd
  44.239 +    vsub.s16        q10, q10, q4        // [2] = Add - Hd
  44.240 +    vadd.s16        q11, q12, q13       // [3] = Ed + Dd
  44.241 +    vsub.s16        q12, q12, q13       // [4] = Ed - Dd
  44.242 +.ifc \type, row
  44.243 +    vtrn.16         q8,  q9                     // first step of the transpose, placed early among the arithmetic
  44.244 +.endif
  44.245 +    vadd.s16        q13, q3,  q14       // [5] = Fd + Bdd
  44.246 +    vsub.s16        q14, q3,  q14       // [6] = Fd - Bdd
  44.247 +
  44.248 +.ifc \type, row
  44.249 +    // 8x8 transpose
  44.250 +    vtrn.16         q10, q11
  44.251 +    vtrn.16         q12, q13
  44.252 +    vtrn.16         q14, q15
  44.253 +    vtrn.32         q8,  q10
  44.254 +    vtrn.32         q9,  q11
  44.255 +    vtrn.32         q12, q14
  44.256 +    vtrn.32         q13, q15
  44.257 +    vswp            d17, d24
  44.258 +    vswp            d19, d26
  44.259 +    vadd.s16        q1,  q8,  q12               // q8 and q12 are final after the first vswp: q1 = ip[0] + ip[4] for the next core call
  44.260 +    vswp            d21, d28
  44.261 +    vsub.s16        q8,  q8,  q12               // q8 = ip[0] - ip[4] for the next core call
  44.262 +    vswp            d23, d30
  44.263 +.endif
  44.264 +    bx              lr
  44.265 +endfunc
  44.266 +.endm
  44.267 +
  44.268 +VP3_IDCT_END row
  44.269 +VP3_IDCT_END col
  44.270 +
  44.271 +function ff_vp3_idct_neon, export=1
  44.272 +    mov             ip,  lr                     // keep the return address: the bl calls below overwrite lr
  44.273 +    mov             r2,  r0                     // the start helper reads the coefficients through r2
  44.274 +    bl              vp3_idct_start_neon         // loads the data; appears to run on into the first core pass
  44.275 +    bl              vp3_idct_end_row_neon
  44.276 +    mov             r3,  #8                     // bias added in the col pass: rounding for the >> 4 below
  44.277 +    bl              vp3_idct_core_neon
  44.278 +    bl              vp3_idct_end_col_neon
  44.279 +    mov             lr,  ip
  44.280 +    vpop            {d8-d15}                    // balances the vpush in vp3_idct_start_neon
  44.281 +
  44.282 +    vshr.s16        q8,  q8,  #4
  44.283 +    vshr.s16        q9,  q9,  #4
  44.284 +    vshr.s16        q10, q10, #4
  44.285 +    vshr.s16        q11, q11, #4
  44.286 +    vshr.s16        q12, q12, #4
  44.287 +    vst1.64         {d16-d19}, [r0,:128]!       // results are written back over the input at r0 (128 bytes in total)
  44.288 +    vshr.s16        q13, q13, #4
  44.289 +    vshr.s16        q14, q14, #4
  44.290 +    vst1.64         {d20-d23}, [r0,:128]!
  44.291 +    vshr.s16        q15, q15, #4
  44.292 +    vst1.64         {d24-d27}, [r0,:128]!
  44.293 +    vst1.64         {d28-d31}, [r0,:128]!
  44.294 +    bx              lr
  44.295 +endfunc
  44.296 +
  44.297 +function ff_vp3_idct_put_neon, export=1         // r0 = destination, r1 = byte step between stored rows, r2 = coefficients (read by the start helper)
  44.298 +    mov             ip,  lr                     // keep the return address: the bl calls below overwrite lr
  44.299 +    bl              vp3_idct_start_neon
  44.300 +    bl              vp3_idct_end_row_neon
  44.301 +    mov             r3,  #8                     // rounding term for the >> 4 below
  44.302 +    add             r3,  r3,  #2048         // convert signed pixel to unsigned: 2048 >> 4 adds 128 to every output
  44.303 +    bl              vp3_idct_core_neon
  44.304 +    bl              vp3_idct_end_col_neon
  44.305 +    mov             lr,  ip
  44.306 +    vpop            {d8-d15}                    // balances the vpush in vp3_idct_start_neon
  44.307 +
  44.308 +    vqshrun.s16     d0,  q8,  #4                // >> 4, then saturate to unsigned 8 bit
  44.309 +    vqshrun.s16     d1,  q9,  #4
  44.310 +    vqshrun.s16     d2,  q10, #4
  44.311 +    vqshrun.s16     d3,  q11, #4
  44.312 +    vst1.64         {d0}, [r0,:64], r1          // 8 rows of 8 bytes; stores are interleaved with the remaining narrowing
  44.313 +    vqshrun.s16     d4,  q12, #4
  44.314 +    vst1.64         {d1}, [r0,:64], r1
  44.315 +    vqshrun.s16     d5,  q13, #4
  44.316 +    vst1.64         {d2}, [r0,:64], r1
  44.317 +    vqshrun.s16     d6,  q14, #4
  44.318 +    vst1.64         {d3}, [r0,:64], r1
  44.319 +    vqshrun.s16     d7,  q15, #4
  44.320 +    vst1.64         {d4}, [r0,:64], r1
  44.321 +    vst1.64         {d5}, [r0,:64], r1
  44.322 +    vst1.64         {d6}, [r0,:64], r1
  44.323 +    vst1.64         {d7}, [r0,:64], r1
  44.324 +    bx              lr
  44.325 +endfunc
  44.326 +
  44.327 +function ff_vp3_idct_add_neon, export=1         // r0 = pixels to update, r1 = byte step between rows, r2 = coefficients (read by the start helper)
  44.328 +    mov             ip,  lr                     // keep the return address: the bl calls below overwrite lr
  44.329 +    bl              vp3_idct_start_neon
  44.330 +    bl              vp3_idct_end_row_neon
  44.331 +    mov             r3,  #8                     // rounding term for the >> 4 below
  44.332 +    bl              vp3_idct_core_neon
  44.333 +    bl              vp3_idct_end_col_neon
  44.334 +    mov             lr,  ip
  44.335 +    vpop            {d8-d15}                    // balances the vpush in vp3_idct_start_neon
  44.336 +    mov             r2,  r0                     // r0 walks the rows for loading, r2 walks them again for storing
  44.337 +
  44.338 +    vld1.64         {d0}, [r0,:64], r1          // load the 8 existing rows while the results are shifted
  44.339 +    vshr.s16        q8,  q8,  #4
  44.340 +    vld1.64         {d1}, [r0,:64], r1
  44.341 +    vshr.s16        q9,  q9,  #4
  44.342 +    vld1.64         {d2}, [r0,:64], r1
  44.343 +    vaddw.u8        q8,  q8,  d0                // result row + existing pixels, still 16 bit
  44.344 +    vld1.64         {d3}, [r0,:64], r1
  44.345 +    vaddw.u8        q9,  q9,  d1
  44.346 +    vld1.64         {d4}, [r0,:64], r1
  44.347 +    vshr.s16        q10, q10, #4
  44.348 +    vld1.64         {d5}, [r0,:64], r1
  44.349 +    vshr.s16        q11, q11, #4
  44.350 +    vld1.64         {d6}, [r0,:64], r1
  44.351 +    vqmovun.s16     d0,  q8                     // saturate back to unsigned 8 bit
  44.352 +    vld1.64         {d7}, [r0,:64], r1
  44.353 +    vqmovun.s16     d1,  q9
  44.354 +    vaddw.u8        q10, q10, d2
  44.355 +    vaddw.u8        q11, q11, d3
  44.356 +    vshr.s16        q12, q12, #4
  44.357 +    vshr.s16        q13, q13, #4
  44.358 +    vqmovun.s16     d2,  q10
  44.359 +    vqmovun.s16     d3,  q11
  44.360 +    vaddw.u8        q12, q12, d4
  44.361 +    vaddw.u8        q13, q13, d5
  44.362 +    vshr.s16        q14, q14, #4
  44.363 +    vshr.s16        q15, q15, #4
  44.364 +    vst1.64         {d0}, [r2,:64], r1
  44.365 +    vqmovun.s16     d4,  q12
  44.366 +    vst1.64         {d1}, [r2,:64], r1
  44.367 +    vqmovun.s16     d5,  q13
  44.368 +    vst1.64         {d2}, [r2,:64], r1
  44.369 +    vaddw.u8        q14, q14, d6
  44.370 +    vst1.64         {d3}, [r2,:64], r1
  44.371 +    vaddw.u8        q15, q15, d7
  44.372 +    vst1.64         {d4}, [r2,:64], r1
  44.373 +    vqmovun.s16     d6,  q14
  44.374 +    vst1.64         {d5}, [r2,:64], r1
  44.375 +    vqmovun.s16     d7,  q15
  44.376 +    vst1.64         {d6}, [r2,:64], r1
  44.377 +    vst1.64         {d7}, [r2,:64], r1
  44.378 +    bx              lr
  44.379 +endfunc
  44.380 +
  44.381 +function ff_vp3_idct_dc_add_neon, export=1      // r0 = pixels to update, r1 = byte step between rows, r2 = pointer to the coefficient
  44.382 +    ldrsh           r2,  [r2]                   // signed 16-bit value at r2 (presumably the DC coefficient -- confirm against caller)
  44.383 +    movw            r3,  #46341                 // same value as the xC4S4 entry of vp3_idct_constants
  44.384 +    mul             r2,  r3,  r2                // r2 = 46341 * dc
  44.385 +    smulwt          r2,  r3,  r2                // r2 = (46341 * top halfword of r2) >> 16
  44.386 +    mov             r3,  r0                     // r0 walks the rows for loading, r3 walks them again for storing
  44.387 +    vdup.16         q15, r2
  44.388 +    vrshr.s16       q15, q15, #4                // (x + 8) >> 4 in every lane
  44.389 +
  44.390 +    vld1.8          {d0}, [r0,:64], r1          // 8 rows of 8 pixels; the same q15 value is added to each
  44.391 +    vld1.8          {d1}, [r0,:64], r1
  44.392 +    vld1.8          {d2}, [r0,:64], r1
  44.393 +    vaddw.u8        q8,  q15, d0
  44.394 +    vld1.8          {d3}, [r0,:64], r1
  44.395 +    vaddw.u8        q9,  q15, d1
  44.396 +    vld1.8          {d4}, [r0,:64], r1
  44.397 +    vaddw.u8        q10, q15, d2
  44.398 +    vld1.8          {d5}, [r0,:64], r1
  44.399 +    vaddw.u8        q11, q15, d3
  44.400 +    vld1.8          {d6}, [r0,:64], r1
  44.401 +    vaddw.u8        q12, q15, d4
  44.402 +    vld1.8          {d7}, [r0,:64], r1
  44.403 +    vaddw.u8        q13, q15, d5
  44.404 +    vqmovun.s16     d0,  q8                     // saturate back to unsigned 8 bit
  44.405 +    vaddw.u8        q14, q15, d6
  44.406 +    vqmovun.s16     d1,  q9
  44.407 +    vaddw.u8        q15, q15, d7                // last use of the broadcast value, so q15 can be overwritten
  44.408 +    vqmovun.s16     d2,  q10
  44.409 +    vst1.8          {d0}, [r3,:64], r1
  44.410 +    vqmovun.s16     d3,  q11
  44.411 +    vst1.8          {d1}, [r3,:64], r1
  44.412 +    vqmovun.s16     d4,  q12
  44.413 +    vst1.8          {d2}, [r3,:64], r1
  44.414 +    vqmovun.s16     d5,  q13
  44.415 +    vst1.8          {d3}, [r3,:64], r1
  44.416 +    vqmovun.s16     d6,  q14
  44.417 +    vst1.8          {d4}, [r3,:64], r1
  44.418 +    vqmovun.s16     d7,  q15
  44.419 +    vst1.8          {d5}, [r3,:64], r1
  44.420 +    vst1.8          {d6}, [r3,:64], r1
  44.421 +    vst1.8          {d7}, [r3,:64], r1
  44.422 +    bx              lr
  44.423 +endfunc

    45.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    45.2 +++ b/ffmpeg_smp/h264dec/libavcodec/avcodec.h	Mon Aug 27 12:09:56 2012 +0200
    45.3 @@ -0,0 +1,407 @@
    45.4 +#ifndef AVCODEC_AVCODEC_H
    45.5 +#define AVCODEC_AVCODEC_H
    45.6 +
    45.7 +#include <errno.h>
    45.8 +#include <stdint.h>
    45.9 +#include "config.h"
   45.10 +
   45.11 +#include "libavutil/mem.h"
   45.12 +
   45.13 +#define MAX_SPS_COUNT 32
   45.14 +#define MAX_PPS_COUNT 256
   45.15 +
   45.16 +
   45.17 +#ifndef CABAC
   45.18 +#define CABAC h->pps.cabac // textual expansion: a variable named h must be in scope wherever CABAC is used
   45.19 +#endif
   45.20 +
   45.21 +#define EXTENDED_SAR          255
   45.22 +
   45.23 +#define MB_TYPE_REF0       MB_TYPE_ACPRED //dirty but it fits in 16 bit
   45.24 +#define MB_TYPE_8x8DCT     0x01000000
   45.25 +#define IS_REF0(a)         ((a) & MB_TYPE_REF0)
   45.26 +#define IS_8x8DCT(a)       ((a) & MB_TYPE_8x8DCT)
   45.27 +
   45.28 +#define LIST_NOT_USED -1
   45.29 +#define PART_NOT_AVAILABLE -2
   45.30 +
   45.31 +/* dct code */
   45.32 +typedef short DCTELEM;
   45.33 +
   45.34 +/**
   45.35 +* Required number of additionally allocated bytes at the end of the input bitstream for decoding.
   45.36 +* This is mainly needed because some optimized bitstream readers read
   45.37 +* 32 or 64 bits at once and could read past the end.<br>
   45.38 +* Note: If the first 23 bits of the additional bytes are not 0, then damaged
   45.39 +* MPEG bitstreams could cause overread and segfault.
   45.40 +*/
   45.41 +#define FF_INPUT_BUFFER_PADDING_SIZE 8
   45.42 +
   45.43 +enum AVColorPrimaries{
   45.44 +    AVCOL_PRI_BT709      =1, ///< also ITU-R BT1361 / IEC 61966-2-4 / SMPTE RP177 Annex B
   45.45 +    AVCOL_PRI_UNSPECIFIED=2,
   45.46 +    AVCOL_PRI_BT470M     =4,
   45.47 +    AVCOL_PRI_BT470BG    =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM
   45.48 +    AVCOL_PRI_SMPTE170M  =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC
   45.49 +    AVCOL_PRI_SMPTE240M  =7, ///< functionally identical to above
   45.50 +    AVCOL_PRI_FILM       =8,
   45.51 +    AVCOL_PRI_NB           , ///< Not part of ABI
   45.52 +};
   45.53 +
   45.54 +enum AVColorTransferCharacteristic{
   45.55 +    AVCOL_TRC_BT709      =1, ///< also ITU-R BT1361
   45.56 +    AVCOL_TRC_UNSPECIFIED=2,
   45.57 +    AVCOL_TRC_GAMMA22    =4, ///< also ITU-R BT470M / ITU-R BT1700 625 PAL & SECAM
   45.58 +    AVCOL_TRC_GAMMA28    =5, ///< also ITU-R BT470BG
   45.59 +    AVCOL_TRC_NB           , ///< Not part of ABI
   45.60 +};
   45.61 +
   45.62 +enum AVColorSpace{
   45.63 +    AVCOL_SPC_RGB        =0,
   45.64 +    AVCOL_SPC_BT709      =1, ///< also ITU-R BT1361 / IEC 61966-2-4 xvYCC709 / SMPTE RP177 Annex B
   45.65 +    AVCOL_SPC_UNSPECIFIED=2,
   45.66 +    AVCOL_SPC_FCC        =4,
   45.67 +    AVCOL_SPC_BT470BG    =5, ///< also ITU-R BT601-6 625 / ITU-R BT1358 625 / ITU-R BT1700 625 PAL & SECAM / IEC 61966-2-4 xvYCC601
   45.68 +    AVCOL_SPC_SMPTE170M  =6, ///< also ITU-R BT601-6 525 / ITU-R BT1358 525 / ITU-R BT1700 NTSC / functionally identical to above
   45.69 +    AVCOL_SPC_SMPTE240M  =7,
   45.70 +    AVCOL_SPC_NB           , ///< Not part of ABI
   45.71 +};
   45.72 +
   45.73 +enum AVColorRange{
   45.74 +    AVCOL_RANGE_UNSPECIFIED=0,
   45.75 +    AVCOL_RANGE_MPEG       =1, ///< the normal 219*2^(n-8) "MPEG" YUV ranges
   45.76 +    AVCOL_RANGE_JPEG       =2, ///< the normal     2^n-1   "JPEG" YUV ranges
   45.77 +    AVCOL_RANGE_NB           , ///< Not part of ABI
   45.78 +};
   45.79 +
   45.80 +#define MAX_MMCO_COUNT 66
   45.81 +/**
   45.82 +* Memory management control operation opcode.
   45.83 +* Only MMCO_END has an explicit value; the others count up from it.
   45.84 +*/
   45.85 +typedef enum MMCOOpcode{
   45.86 +    MMCO_END=0,
   45.87 +    MMCO_SHORT2UNUSED,  // = 1
   45.88 +    MMCO_LONG2UNUSED,   // = 2
   45.89 +    MMCO_SHORT2LONG,    // = 3
   45.90 +    MMCO_SET_MAX_LONG,  // = 4
   45.91 +    MMCO_RESET,         // = 5
   45.92 +    MMCO_LONG,          // = 6
   45.93 +} MMCOOpcode;
   45.93 +
   45.94 +/* NAL unit types; values count up from NAL_SLICE except where set explicitly */
   45.95 +enum {
   45.96 +    NAL_SLICE=1,
   45.97 +    NAL_DPA,            // = 2
   45.98 +    NAL_DPB,            // = 3
   45.99 +    NAL_DPC,            // = 4
  45.100 +    NAL_IDR_SLICE,      // = 5
  45.101 +    NAL_SEI,            // = 6
  45.102 +    NAL_SPS,            // = 7
  45.103 +    NAL_PPS,            // = 8
  45.104 +    NAL_AUD,            // = 9
  45.105 +    NAL_END_SEQUENCE,   // = 10
  45.106 +    NAL_END_STREAM,     // = 11
  45.107 +    NAL_FILLER_DATA,    // = 12
  45.108 +    NAL_SPS_EXT,        // = 13
  45.109 +    NAL_AUXILIARY_SLICE=19
  45.110 +};
  45.111 +
  45.112 +/**
  45.113 +* SEI message types
  45.114 +*/
  45.115 +typedef enum {
  45.116 +    SEI_BUFFERING_PERIOD             =  0, ///< buffering period (H.264, D.1.1)
  45.117 +    SEI_TYPE_PIC_TIMING              =  1, ///< picture timing
  45.118 +    SEI_TYPE_USER_DATA_UNREGISTERED  =  5, ///< unregistered user data
  45.119 +    SEI_TYPE_RECOVERY_POINT          =  6  ///< recovery point (frame # to decoder sync)
  45.120 +} SEI_Type;
  45.121 +
  45.122 +/**
  45.123 +* pic_struct in picture timing SEI message
  45.124 +*/
  45.125 +typedef enum {
  45.126 +    SEI_PIC_STRUCT_FRAME             = 0, ///<  0: %frame
  45.127 +    SEI_PIC_STRUCT_TOP_FIELD         = 1, ///<  1: top field
  45.128 +    SEI_PIC_STRUCT_BOTTOM_FIELD      = 2, ///<  2: bottom field
  45.129 +    SEI_PIC_STRUCT_TOP_BOTTOM        = 3, ///<  3: top field, bottom field, in that order
  45.130 +    SEI_PIC_STRUCT_BOTTOM_TOP        = 4, ///<  4: bottom field, top field, in that order
  45.131 +    SEI_PIC_STRUCT_TOP_BOTTOM_TOP    = 5, ///<  5: top field, bottom field, top field repeated, in that order
  45.132 +    SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM = 6, ///<  6: bottom field, top field, bottom field repeated, in that order
  45.133 +    SEI_PIC_STRUCT_FRAME_DOUBLING    = 7, ///<  7: %frame doubling
  45.134 +    SEI_PIC_STRUCT_FRAME_TRIPLING    = 8  ///<  8: %frame tripling
  45.135 +} SEI_PicStructType;
  45.136 +
  45.137 +#define FF_MAX_B_FRAMES 16
  45.138 +
  45.139 +
  45.140 +//The following defines may change, don't expect compatibility if you use them.
  45.141 +#define MB_TYPE_INTRA4x4   0x0001
  45.142 +#define MB_TYPE_INTRA16x16 0x0002 //FIXME H.264-specific
  45.143 +#define MB_TYPE_INTRA_PCM  0x0004 //FIXME H.264-specific
  45.144 +#define MB_TYPE_16x16      0x0008
  45.145 +#define MB_TYPE_16x8       0x0010
  45.146 +#define MB_TYPE_8x16       0x0020
  45.147 +#define MB_TYPE_8x8        0x0040
  45.148 +#define MB_TYPE_INTERLACED 0x0080
  45.149 +#define MB_TYPE_DIRECT2    0x0100 //FIXME
  45.150 +#define MB_TYPE_ACPRED     0x0200
  45.151 +#define MB_TYPE_GMC        0x0400
  45.152 +#define MB_TYPE_SKIP       0x0800
  45.153 +#define MB_TYPE_P0L0       0x1000
  45.154 +#define MB_TYPE_P1L0       0x2000
  45.155 +#define MB_TYPE_P0L1       0x4000
  45.156 +#define MB_TYPE_P1L1       0x8000
  45.157 +#define MB_TYPE_L0         (MB_TYPE_P0L0 | MB_TYPE_P1L0)
  45.158 +#define MB_TYPE_L1         (MB_TYPE_P0L1 | MB_TYPE_P1L1)
  45.159 +#define MB_TYPE_L0L1       (MB_TYPE_L0   | MB_TYPE_L1)
  45.160 +#define MB_TYPE_QUANT      0x00010000
  45.161 +#define MB_TYPE_CBP        0x00020000
  45.162 +//Note bits 24-31 are reserved for codec specific use (h264 ref0, mpeg1 0mv, ...)
  45.163 +
  45.164 +#define FF_BUFFER_TYPE_INTERNAL 1
  45.165 +#define FF_BUFFER_TYPE_USER     2 ///< direct rendering buffers (image is (de)allocated by user)
  45.166 +#define FF_BUFFER_TYPE_SHARED   4 ///< Buffer from somewhere else; don't deallocate image (data/base), all other tables are not shared.
  45.167 +#define FF_BUFFER_TYPE_COPY     8 ///< Just a (modified) copy of some other buffer, don't deallocate anything.
  45.168 +
  45.169 +
  45.170 +#define FF_I_TYPE  1 ///< Intra
  45.171 +#define FF_P_TYPE  2 ///< Predicted
  45.172 +#define FF_B_TYPE  3 ///< Bi-dir predicted
  45.173 +#define FF_S_TYPE  4 ///< S(GMC)-VOP MPEG4
  45.174 +#define FF_SI_TYPE 5 ///< Switching Intra
  45.175 +#define FF_SP_TYPE 6 ///< Switching Predicted
  45.176 +#define FF_BI_TYPE 7
  45.177 +
  45.178 +#define MB_TYPE_INTRA MB_TYPE_INTRA4x4 //default mb_type if there is just one type
  45.179 +#define IS_INTRA4x4(a)   ((a)&MB_TYPE_INTRA4x4)
  45.180 +#define IS_INTRA16x16(a) ((a)&MB_TYPE_INTRA16x16)
  45.181 +#define IS_PCM(a)        ((a)&MB_TYPE_INTRA_PCM)
  45.182 +#define IS_INTRA(a)      ((a)&7) // 7 == MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM
  45.183 +#define IS_INTER(a)      ((a)&(MB_TYPE_16x16|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8))
  45.184 +#define IS_SKIP(a)       ((a)&MB_TYPE_SKIP)
  45.185 +#define IS_INTRA_PCM(a)  ((a)&MB_TYPE_INTRA_PCM) // same test as IS_PCM
  45.186 +#define IS_INTERLACED(a) ((a)&MB_TYPE_INTERLACED)
  45.187 +#define IS_DIRECT(a)     ((a)&MB_TYPE_DIRECT2)
  45.188 +#define IS_GMC(a)        ((a)&MB_TYPE_GMC)
  45.189 +#define IS_16X16(a)      ((a)&MB_TYPE_16x16)
  45.190 +#define IS_16X8(a)       ((a)&MB_TYPE_16x8)
  45.191 +#define IS_8X16(a)       ((a)&MB_TYPE_8x16)
  45.192 +#define IS_8X8(a)        ((a)&MB_TYPE_8x8)
  45.193 +#define IS_SUB_8X8(a)    ((a)&MB_TYPE_16x16) //note reused
  45.194 +#define IS_SUB_8X4(a)    ((a)&MB_TYPE_16x8)  //note reused
  45.195 +#define IS_SUB_4X8(a)    ((a)&MB_TYPE_8x16)  //note reused
  45.196 +#define IS_SUB_4X4(a)    ((a)&MB_TYPE_8x8)   //note reused
  45.197 +#define IS_ACPRED(a)     ((a)&MB_TYPE_ACPRED)
  45.198 +#define IS_QUANT(a)      ((a)&MB_TYPE_QUANT)
  45.199 +#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list)))) // shift 0..3 selects P0L0, P1L0, P0L1 or P1L1
  45.200 +#define USES_LIST(a, list) ((a) & ((MB_TYPE_P0L0|MB_TYPE_P1L0)<<(2*(list)))) ///< does this mb use listX, note does not work if subMBs
  45.201 +#define HAS_CBP(a)        ((a)&MB_TYPE_CBP)
  45.202 +
  45.203 +
  45.204 +#define FF_MM_FORCE    0x80000000 /* Force usage of selected flags (OR) */
  45.205 +    /* lower 16 bits - CPU features */
  45.206 +#define FF_MM_MMX      0x0001 ///< standard MMX
  45.207 +#define FF_MM_3DNOW    0x0004 ///< AMD 3DNOW
  45.208 +#define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
  45.209 +#define FF_MM_SSE      0x0008 ///< SSE functions
  45.210 +#define FF_MM_SSE2     0x0010 ///< PIV SSE2 functions
  45.211 +#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
  45.212 +#define FF_MM_SSE3     0x0040 ///< Prescott SSE3 functions
  45.213 +#define FF_MM_SSSE3    0x0080 ///< Conroe SSSE3 functions
  45.214 +#define FF_MM_SSE4     0x0100 ///< Penryn SSE4.1 functions
  45.215 +#define FF_MM_SSE42    0x0200 ///< Nehalem SSE4.2 functions
  45.216 +#define FF_MM_IWMMXT   0x0100 ///< XScale IWMMXT
  45.217 +#define FF_MM_ALTIVEC  0x0001 ///< standard AltiVec
  45.218 +
  45.219 +
  45.220 +/**
  45.221 +* Sequence parameter set
  45.222 +*/
  45.223 +typedef struct SPS{
  45.224 +
  45.225 +    int profile_idc;
  45.226 +    int level_idc;
  45.227 +    int chroma_format_idc;
  45.228 +    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  45.229 +    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  45.230 +    int poc_type;                      ///< pic_order_cnt_type
  45.231 +    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
  45.232 +    int delta_pic_order_always_zero_flag;
  45.233 +    int offset_for_non_ref_pic;
  45.234 +    int offset_for_top_to_bottom_field;
  45.235 +    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  45.236 +    int ref_frame_count;               ///< num_ref_frames
  45.237 +    int gaps_in_frame_num_allowed_flag;
  45.238 +    int mb_width;                      ///< pic_width_in_mbs_minus1 + 1
  45.239 +    int mb_height;                     ///< pic_height_in_map_units_minus1 + 1
  45.240 +    int frame_mbs_only_flag;
  45.241 +    int mb_aff;                        ///< mb_adaptive_frame_field_flag
  45.242 +    int direct_8x8_inference_flag;
  45.243 +    int crop;                   ///< frame_cropping_flag
  45.244 +    unsigned int crop_left;            ///< frame_cropping_rect_left_offset
  45.245 +    unsigned int crop_right;           ///< frame_cropping_rect_right_offset
  45.246 +    unsigned int crop_top;             ///< frame_cropping_rect_top_offset
  45.247 +    unsigned int crop_bottom;          ///< frame_cropping_rect_bottom_offset
  45.248 +    int vui_parameters_present_flag;
  45.249 +    int num,den;
  45.250 +
  45.251 +    int video_signal_type_present_flag;
  45.252 +    int full_range;
  45.253 +    int colour_description_present_flag;
  45.254 +    enum AVColorPrimaries color_primaries;
  45.255 +    enum AVColorTransferCharacteristic color_trc;
  45.256 +    enum AVColorSpace colorspace;
  45.257 +    int timing_info_present_flag;
  45.258 +    uint32_t num_units_in_tick;
  45.259 +    uint32_t time_scale;
  45.260 +    int fixed_frame_rate_flag;
  45.261 +    short offset_for_ref_frame[256]; //FIXME dyn alloc?
  45.262 +    int bitstream_restriction_flag;
  45.263 +    int num_reorder_frames;
  45.264 +    int scaling_matrix_present;
  45.265 +    uint8_t scaling_matrix4[6][16];
  45.266 +    uint8_t scaling_matrix8[2][64];
  45.267 +    int nal_hrd_parameters_present_flag;
  45.268 +    int vcl_hrd_parameters_present_flag;
  45.269 +    int pic_struct_present_flag;
  45.270 +    int time_offset_length;
  45.271 +    int cpb_cnt;                       ///< See H.264 E.1.2
  45.272 +    int initial_cpb_removal_delay_length; ///< initial_cpb_removal_delay_length_minus1 + 1
  45.273 +    int cpb_removal_delay_length;      ///< cpb_removal_delay_length_minus1 + 1
  45.274 +    int dpb_output_delay_length;       ///< dpb_output_delay_length_minus1 + 1
  45.275 +    int bit_depth_luma;                ///< bit_depth_luma_minus8 + 8
  45.276 +    int bit_depth_chroma;              ///< bit_depth_chroma_minus8 + 8
  45.277 +    int residual_color_transform_flag; ///< residual_colour_transform_flag
  45.278 +}SPS;
  45.279 +
  45.280 +/**
  45.281 +* Picture parameter set
  45.282 +*/
  45.283 +typedef struct PPS{
  45.284 +    unsigned int sps_id;
  45.285 +    int cabac;                  ///< entropy_coding_mode_flag
  45.286 +    int pic_order_present;      ///< pic_order_present_flag
  45.287 +    int slice_group_count;      ///< num_slice_groups_minus1 + 1
  45.288 +    int mb_slice_group_map_type;
  45.289 +    unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
  45.290 +    int weighted_pred;          ///< weighted_pred_flag
  45.291 +    int weighted_bipred_idc;
  45.292 +    int init_qp;                ///< pic_init_qp_minus26 + 26
  45.293 +    int init_qs;                ///< pic_init_qs_minus26 + 26
  45.294 +    int chroma_qp_index_offset[2];
  45.295 +    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
  45.296 +    int constrained_intra_pred; ///< constrained_intra_pred_flag
  45.297 +    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
  45.298 +    int transform_8x8_mode;     ///< transform_8x8_mode_flag
  45.299 +    uint8_t scaling_matrix4[6][16];
  45.300 +    uint8_t scaling_matrix8[2][64];
  45.301 +    uint8_t chroma_qp_table[2][64];  ///< pre-scaled (with chroma_qp_index_offset) version of qp_table
  45.302 +    int chroma_qp_diff;
  45.303 +}PPS;
  45.304 +
  45.305 +typedef struct TopBorder{
  45.306 +    uint8_t unfiltered_y[16];
  45.307 +    uint8_t unfiltered_cb[8];
  45.308 +    uint8_t unfiltered_cr[8];
  45.309 +
  45.310 +    uint8_t top_borders_y[16*4];
  45.311 +    uint8_t top_borders_cb[8*2];
  45.312 +    uint8_t top_borders_cr[8*2];
  45.313 +}TopBorder;
  45.314 +
  45.315 +typedef struct LeftBorder{
  45.316 +    uint8_t unfiltered_y[17];
  45.317 +    uint8_t unfiltered_cb[9];
  45.318 +    uint8_t unfiltered_cr[9];
  45.319 +}LeftBorder;
  45.320 +
  45.321 +typedef struct H264Mb {
  45.322 +    //variables copied in after cabac decoding
  45.323 +    int16_t mb_x, mb_y;
  45.324 +    int32_t mb_type;
  45.325 +
  45.326 +    uint16_t cbp;                                               // coded block pattern, idct, deblock
  45.327 +    int8_t qscale_mb_xy;                                        // qp, deblock
  45.328 +    int8_t qscale_left_mb_xy; //not required
  45.329 +    int8_t qscale_top_mb_xy;
  45.330 +
  45.331 +    DECLARE_ALIGNED(8, uint16_t, sub_mb_type[4]);
  45.332 +    DECLARE_ALIGNED(8, uint8_t, non_zero_count[24]);            //idct deblock
  45.333 +    DECLARE_ALIGNED(16, int16_t, mb[16*24]);                    //coeffs, idct
  45.334 +
  45.335 +    union{
  45.336 +        struct {
  45.337 +        DECLARE_ALIGNED(8, int8_t, ref_index[2][4]);            //mc, deblock
  45.338 +        DECLARE_ALIGNED(16, int16_t, mvd[2][16][2]);            //mc, deblock
  45.339 +        };
  45.340 +        struct {
  45.341 +        DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode[16]);     //intra, deblock
  45.342 +        int8_t chroma_pred_mode;                                //intra
  45.343 +        int8_t intra16x16_pred_mode;                            //intra, deblock
  45.344 +        };
  45.345 +    };
  45.346 +
  45.347 +#if OMPSS
  45.348 +    DECLARE_ALIGNED(8, uint8_t, top_border[16+ 2*8]);
  45.349 +    DECLARE_ALIGNED(8, uint8_t, top_border_next[8]);
  45.350 +    DECLARE_ALIGNED(8, uint8_t, left_border[17+2*9]);
  45.351 +    int8_t intra4x4_pred_mode_left[4];
  45.352 +#endif
  45.353 +
  45.354 +} H264Mb;
  45.355 +
  45.356 +typedef struct RawFrame {
  45.357 +    uint8_t *data;
  45.358 +    int size;
  45.359 +    unsigned int data_size;
  45.360 +    int64_t pos;                            ///< byte position in stream, -1 if unknown
  45.361 +    int state;
  45.362 +} RawFrame;
  45.363 +
  45.364 +typedef struct PictureInfo{
  45.365 +    int ref_poc[2][16];      ///< h264 POCs of the frames used as reference
  45.366 +    int ref_count[2];        ///< number of entries in ref_poc
  45.367 +    int poc;                    ///< h264 frame POC
  45.368 +    int frame_num;              ///< h264 frame_num (raw frame_num from slice header)
  45.369 +    int pic_id;
  45.370 +    int long_ref;
  45.371 +    int cpn;                    ///< coded picture number
  45.372 +    int slice_type_nos;
  45.373 +//     int key_frame;
  45.374 +//     int mmco_reset;             ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET.
  45.375 +
  45.376 +    int reference;  //Set to 4 for delayed, non-reference frames. 1-3 for reference. FIXME
  45.377 +
  45.378 +}PictureInfo;
  45.379 +
  45.380 +typedef struct DecodedPicture{
  45.381 +    int16_t (*motion_val[2])[2];
  45.382 +    int16_t (*motion_val_base[2])[2];
  45.383 +
  45.384 +    /**
  45.385 +    * motion reference frame index
  45.386 +    * the order in which these are stored can depend on the codec.
  45.387 +    * - encoding: Set by user.
  45.388 +    * - decoding: Set by libavcodec.
  45.389 +    */
  45.390 +    int8_t *ref_index[2];
  45.391 +    uint32_t *mb_type;          //mb_type_base + mb_width + 2
  45.392 +    uint32_t *mb_type_base;
  45.393 +
  45.394 +    int8_t *intra4x4_pred_mode;
  45.395 +    int8_t *non_zero_count;
  45.396 +
  45.397 +    uint8_t *data[3]; //point to first pixel in the frame
  45.398 +    int linesize[3];
  45.399 +    uint8_t *base[3]; //base of picture planes
  45.400 +
  45.401 +    int cpn;                ///< coded picture number
  45.402 +    int poc;                    ///< h264 frame POC
  45.403 +    int reference;  // 0 -> free, 1 -> needs to be displayed, 2 -> needed for reference, 3 -> 1 && 2
  45.404 +    int key_frame;
  45.405 +    int mmco_reset;             ///< h264 MMCO_RESET set this 1. Reordering code must not mix pictures before and after MMCO_RESET.
  45.406 +
  45.407 +} DecodedPicture;
  45.408 +
  45.409 +
  45.410 +#endif /* AVCODEC_AVCODEC_H */

    46.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    46.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cabac.c	Mon Aug 27 12:09:56 2012 +0200
    46.3 @@ -0,0 +1,242 @@
    46.4 +/*
    46.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    46.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    46.7 + *
    46.8 + * This file is part of FFmpeg.
    46.9 + *
   46.10 + * FFmpeg is free software; you can redistribute it and/or
   46.11 + * modify it under the terms of the GNU Lesser General Public
   46.12 + * License as published by the Free Software Foundation; either
   46.13 + * version 2.1 of the License, or (at your option) any later version.
   46.14 + *
   46.15 + * FFmpeg is distributed in the hope that it will be useful,
   46.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   46.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   46.18 + * Lesser General Public License for more details.
   46.19 + *
   46.20 + * You should have received a copy of the GNU Lesser General Public
   46.21 + * License along with FFmpeg; if not, write to the Free Software
   46.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   46.23 + */
   46.24 +
   46.25 +/**
   46.26 + * @file
   46.27 + * Context Adaptive Binary Arithmetic Coder.
   46.28 + */
   46.29 +
   46.30 +#include <string.h>
   46.31 +
   46.32 +#include "libavutil/common.h"
   46.33 +//#include "get_bits.h"
   46.34 +#include "cabac.h"
   46.35 +
   46.36 +static const uint8_t lps_range[64][4]= {
   46.37 +{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
   46.38 +{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
   46.39 +{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
   46.40 +{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
   46.41 +{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
   46.42 +{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
   46.43 +{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
   46.44 +{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
   46.45 +{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
   46.46 +{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
   46.47 +{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
   46.48 +{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
   46.49 +{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
   46.50 +{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
   46.51 +{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
   46.52 +{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
   46.53 +};
   46.54 +
   46.55 +uint8_t ff_h264_mlps_state[4*64];
   46.56 +uint8_t ff_h264_lps_range[4*2*64];
   46.57 +uint8_t ff_h264_lps_state[2*64];
   46.58 +uint8_t ff_h264_mps_state[2*64];
   46.59 +
   46.60 +static const uint8_t mps_state[64]= {
   46.61 +  1, 2, 3, 4, 5, 6, 7, 8,
   46.62 +  9,10,11,12,13,14,15,16,
   46.63 + 17,18,19,20,21,22,23,24,
   46.64 + 25,26,27,28,29,30,31,32,
   46.65 + 33,34,35,36,37,38,39,40,
   46.66 + 41,42,43,44,45,46,47,48,
   46.67 + 49,50,51,52,53,54,55,56,
   46.68 + 57,58,59,60,61,62,62,63,
   46.69 +};
   46.70 +
   46.71 +static const uint8_t lps_state[64]= {
   46.72 +  0, 0, 1, 2, 2, 4, 4, 5,
   46.73 +  6, 7, 8, 9, 9,11,11,12,
   46.74 + 13,13,15,15,16,16,18,18,
   46.75 + 19,19,21,21,22,22,23,24,
   46.76 + 24,25,26,26,27,27,28,29,
   46.77 + 29,30,30,30,31,32,32,33,
   46.78 + 33,33,34,34,35,35,35,36,
   46.79 + 36,36,37,37,37,38,38,63,
   46.80 +};
   46.81 +
   46.82 +const uint8_t ff_h264_norm_shift[512]= {
   46.83 + 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
   46.84 + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
   46.85 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   46.86 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   46.87 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   46.88 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   46.89 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   46.90 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   46.91 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   46.92 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   46.93 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   46.94 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   46.95 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   46.96 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   46.97 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   46.98 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   46.99 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46.100 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46.101 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46.102 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46.103 +};
  46.104 +
  46.105 +/**
  46.106 + *
  46.107 + * @param buf_size size of buf in bytes
  46.108 + */
  46.109 +void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size){
  46.110 +    c->bytestream_start=
  46.111 +    c->bytestream= buf;
  46.112 +    c->bytestream_end= buf + buf_size;
  46.113 +
  46.114 +#if CABAC_BITS == 16
  46.115 +    c->low =  (*c->bytestream++)<<18;
  46.116 +    c->low+=  (*c->bytestream++)<<10;
  46.117 +#else
  46.118 +    c->low =  (*c->bytestream++)<<10;
  46.119 +#endif
  46.120 +    c->low+= ((*c->bytestream++)<<2) + 2;
  46.121 +    c->range= 0x1FE;
  46.122 +}
  46.123 +
  46.124 +void ff_init_cabac_states(){
  46.125 +    int i, j;
  46.126 +
  46.127 +    for(i=0; i<64; i++){
  46.128 +        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
  46.129 +            ff_h264_lps_range[j*2*64+2*i+0]=
  46.130 +            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j];
  46.131 +        }
  46.132 +
  46.133 +        ff_h264_mlps_state[128+2*i+0]=
  46.134 +        ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0;
  46.135 +        ff_h264_mlps_state[128+2*i+1]=
  46.136 +        ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1;
  46.137 +
  46.138 +        if( i ){
  46.139 +#ifdef BRANCHLESS_CABAC_DECODER
  46.140 +            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0;
  46.141 +            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1;
  46.142 +        }else{
  46.143 +            ff_h264_mlps_state[128-2*i-1]= 1;
  46.144 +            ff_h264_mlps_state[128-2*i-2]= 0;
  46.145 +#else
  46.146 +            ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0;
  46.147 +            ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1;
  46.148 +        }else{
  46.149 +            ff_h264_lps_state[2*i+0]= 1;
  46.150 +            ff_h264_lps_state[2*i+1]= 0;
  46.151 +#endif
  46.152 +        }
  46.153 +    }
  46.154 +}
  46.155 +
  46.156 +#ifdef TEST
  46.157 +#define SIZE 10240
  46.158 +#define START_TIMER
  46.159 +#define STOP_TIMER(...)
  46.160 +#define av_log(...)
  46.161 +// #include "libavutil/lfg.h"
  46.162 +#include "avcodec.h"
  46.163 +#include "cabac.h"
  46.164 +
  46.165 +int main(void){
  46.166 +    CABACContext c;
  46.167 +    uint8_t b[9*SIZE];
  46.168 +    uint8_t r[9*SIZE];
  46.169 +    int i;
  46.170 +    uint8_t state[10]= {0};
  46.171 +//    AVLFG prng;
  46.172 +
  46.173 +// //     av_lfg_init(&prng, 1);
  46.174 +//     ff_init_cabac_encoder(&c, b, SIZE);
  46.175 +//     ff_init_cabac_states();
  46.176 +//
  46.177 +//     for(i=0; i<SIZE; i++){
  46.178 +//         r[i] = i%7; //av_lfg_get(&prng) % 7;
  46.179 +//     }
  46.180 +//
  46.181 +//     for(i=0; i<SIZE; i++){
  46.182 +// START_TIMER
  46.183 +//         put_cabac_bypass(&c, r[i]&1);
  46.184 +// STOP_TIMER("put_cabac_bypass")
  46.185 +//     }
  46.186 +//
  46.187 +//     for(i=0; i<SIZE; i++){
  46.188 +// START_TIMER
  46.189 +//         put_cabac(&c, state, r[i]&1);
  46.190 +// STOP_TIMER("put_cabac")
  46.191 +//     }
  46.192 +//
  46.193 +//     for(i=0; i<SIZE; i++){
  46.194 +// START_TIMER
  46.195 +//         put_cabac_u(&c, state, r[i], 6, 3, i&1);
  46.196 +// STOP_TIMER("put_cabac_u")
  46.197 +//     }
  46.198 +//
  46.199 +//     for(i=0; i<SIZE; i++){
  46.200 +// START_TIMER
  46.201 +//         put_cabac_ueg(&c, state, r[i], 3, 0, 1, 2);
  46.202 +// STOP_TIMER("put_cabac_ueg")
  46.203 +//     }
  46.204 +//
  46.205 +//     put_cabac_terminate(&c, 1);
  46.206 +
  46.207 +    ff_init_cabac_decoder(&c, b, SIZE);
  46.208 +
  46.209 +    memset(state, 0, sizeof(state));
  46.210 +
  46.211 +    for(i=0; i<SIZE; i++){
  46.212 +START_TIMER
  46.213 +        if( (r[i]&1) != get_cabac_bypass(&c) )
  46.214 +            av_log(NULL, AV_LOG_ERROR, "CABAC bypass failure at %d\n", i);
  46.215 +STOP_TIMER("get_cabac_bypass")
  46.216 +    }
  46.217 +
  46.218 +    for(i=0; i<SIZE; i++){
  46.219 +START_TIMER
  46.220 +        if( (r[i]&1) != get_cabac(&c, state) )
  46.221 +            av_log(NULL, AV_LOG_ERROR, "CABAC failure at %d\n", i);
  46.222 +STOP_TIMER("get_cabac")
  46.223 +    }
  46.224 +#if 0
  46.225 +    for(i=0; i<SIZE; i++){
  46.226 +START_TIMER
  46.227 +        if( r[i] != get_cabac_u(&c, state, (i&1) ? 6 : 7, 3, i&1) )
  46.228 +            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
  46.229 +STOP_TIMER("get_cabac_u")
  46.230 +    }
  46.231 +
  46.232 +    for(i=0; i<SIZE; i++){
  46.233 +START_TIMER
  46.234 +        if( r[i] != get_cabac_ueg(&c, state, 3, 0, 1, 2))
  46.235 +            av_log(NULL, AV_LOG_ERROR, "CABAC unary (truncated) binarization failure at %d\n", i);
  46.236 +STOP_TIMER("get_cabac_ueg")
  46.237 +    }
  46.238 +#endif
  46.239 +    if(!get_cabac_terminate(&c))
  46.240 +        av_log(NULL, AV_LOG_ERROR, "where's the Terminator?\n");
  46.241 +
  46.242 +    return 0;
  46.243 +}
  46.244 +
  46.245 +#endif /* TEST */

    47.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    47.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cabac.h	Mon Aug 27 12:09:56 2012 +0200
    47.3 @@ -0,0 +1,206 @@
    47.4 +/*
    47.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    47.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    47.7 + *
    47.8 + * This file is part of FFmpeg.
    47.9 + *
   47.10 + * FFmpeg is free software; you can redistribute it and/or
   47.11 + * modify it under the terms of the GNU Lesser General Public
   47.12 + * License as published by the Free Software Foundation; either
   47.13 + * version 2.1 of the License, or (at your option) any later version.
   47.14 + *
   47.15 + * FFmpeg is distributed in the hope that it will be useful,
   47.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   47.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   47.18 + * Lesser General Public License for more details.
   47.19 + *
   47.20 + * You should have received a copy of the GNU Lesser General Public
   47.21 + * License along with FFmpeg; if not, write to the Free Software
   47.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   47.23 + */
   47.24 +
   47.25 +/**
   47.26 + * @file
   47.27 + * Context Adaptive Binary Arithmetic Coder.
   47.28 + */
   47.29 +
   47.30 +#ifndef AVCODEC_CABAC_H
   47.31 +#define AVCODEC_CABAC_H
   47.32 +
   47.33 +//#undef NDEBUG
   47.34 +#include <assert.h>
   47.35 +#include "libavutil/x86_cpu.h"
   47.36 +#include "libavutil/attributes.h"
   47.37 +
   47.38 +#define CABAC_BITS 16
   47.39 +#define CABAC_MASK ((1<<CABAC_BITS)-1)
   47.40 +#define BRANCHLESS_CABAC_DECODER 1
   47.41 +
   47.42 +typedef struct CABACContext{
   47.43 +    int low;
   47.44 +    int range;
   47.45 +    int outstanding_count;
   47.46 +#ifdef STRICT_LIMITS
   47.47 +    int symCount;
   47.48 +#endif
   47.49 +    const uint8_t *bytestream_start;
   47.50 +    const uint8_t *bytestream;
   47.51 +    const uint8_t *bytestream_end;
   47.52 +    uint8_t  cabac_state[460];
   47.53 +}CABACContext;
   47.54 +
   47.55 +extern uint8_t ff_h264_mlps_state[4*64];
   47.56 +extern uint8_t ff_h264_lps_range[4*2*64];  ///< rangeTabLPS
   47.57 +extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
   47.58 +extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
   47.59 +extern const uint8_t ff_h264_norm_shift[512];
   47.60 +
   47.61 +void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
   47.62 +void ff_init_cabac_states(void);
   47.63 +
   47.64 +static void refill(CABACContext *c){
   47.65 +#if CABAC_BITS == 16
   47.66 +        c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
   47.67 +#else
   47.68 +        c->low+= c->bytestream[0]<<1;
   47.69 +#endif
   47.70 +    c->low -= CABAC_MASK;
   47.71 +    c->bytestream+= CABAC_BITS/8;
   47.72 +}
   47.73 +
   47.74 +static void refill2(CABACContext *c){
   47.75 +    int i, x;
   47.76 +
   47.77 +    x= c->low ^ (c->low-1);
   47.78 +    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)];
   47.79 +
   47.80 +    x= -CABAC_MASK;
   47.81 +
   47.82 +#if CABAC_BITS == 16
   47.83 +        x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
   47.84 +#else
   47.85 +        x+= c->bytestream[0]<<1;
   47.86 +#endif
   47.87 +
   47.88 +    c->low += x<<i;
   47.89 +    c->bytestream+= CABAC_BITS/8;
   47.90 +}
   47.91 +
   47.92 +static inline void renorm_cabac_decoder(CABACContext *c){
   47.93 +    while(c->range < 0x100){
   47.94 +        c->range+= c->range;
   47.95 +        c->low+= c->low;
   47.96 +        if(!(c->low & CABAC_MASK))
   47.97 +            refill(c);
   47.98 +    }
   47.99 +}
  47.100 +
  47.101 +static inline void renorm_cabac_decoder_once(CABACContext *c){
  47.102 +
  47.103 +    int shift= (uint32_t)(c->range - 0x100)>>31;
  47.104 +    c->range<<= shift;
  47.105 +    c->low  <<= shift;
  47.106 +
  47.107 +    if(!(c->low & CABAC_MASK))
  47.108 +        refill(c);
  47.109 +}
  47.110 +
  47.111 +static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
  47.112 +
  47.113 +    int s = *state;
  47.114 +    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s];
  47.115 +    int bit, lps_mask av_unused;
  47.116 +
  47.117 +    c->range -= RangeLPS;
  47.118 +#ifndef BRANCHLESS_CABAC_DECODER
  47.119 +    if(c->low < (c->range<<(CABAC_BITS+1))){
  47.120 +        bit= s&1;
  47.121 +        *state= ff_h264_mps_state[s];
  47.122 +        renorm_cabac_decoder_once(c);
  47.123 +    }else{
  47.124 +        bit= ff_h264_norm_shift[RangeLPS];
  47.125 +        c->low -= (c->range<<(CABAC_BITS+1));
  47.126 +        *state= ff_h264_lps_state[s];
  47.127 +        c->range = RangeLPS<<bit;
  47.128 +        c->low <<= bit;
  47.129 +        bit= (s&1)^1;
  47.130 +
  47.131 +        if(!(c->low & CABAC_MASK)){
  47.132 +            refill2(c);
  47.133 +        }
  47.134 +    }
  47.135 +#else /* BRANCHLESS_CABAC_DECODER */
  47.136 +    lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31;
  47.137 +
  47.138 +    c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask;
  47.139 +    c->range += (RangeLPS - c->range) & lps_mask;
  47.140 +
  47.141 +    s^=lps_mask;
  47.142 +    *state= (ff_h264_mlps_state+128)[s];
  47.143 +    bit= s&1;
  47.144 +
  47.145 +    lps_mask= ff_h264_norm_shift[c->range];
  47.146 +    c->range<<= lps_mask;
  47.147 +    c->low  <<= lps_mask;
  47.148 +    if(!(c->low & CABAC_MASK))
  47.149 +        refill2(c);
  47.150 +#endif /* BRANCHLESS_CABAC_DECODER */
  47.151 +
  47.152 +    return bit;
  47.153 +}
  47.154 +
  47.155 +static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){
  47.156 +    return get_cabac_inline(c, state);
  47.157 +}
  47.158 +
  47.159 +static int av_unused get_cabac(CABACContext *c, uint8_t * const state){
  47.160 +    return get_cabac_inline(c, state);
  47.161 +}
  47.162 +
  47.163 +static int av_unused get_cabac_bypass(CABACContext *c){
  47.164 +
  47.165 +    int range;
  47.166 +    c->low += c->low;
  47.167 +
  47.168 +    if(!(c->low & CABAC_MASK))
  47.169 +        refill(c);
  47.170 +
  47.171 +    range= c->range<<(CABAC_BITS+1);
  47.172 +    if(c->low < range){
  47.173 +        return 0;
  47.174 +    }else{
  47.175 +        c->low -= range;
  47.176 +        return 1;
  47.177 +    }
  47.178 +}
  47.179 +
  47.180 +static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
  47.181 +    int range, mask;
  47.182 +    c->low += c->low;
  47.183 +
  47.184 +    if(!(c->low & CABAC_MASK))
  47.185 +        refill(c);
  47.186 +
  47.187 +    range= c->range<<(CABAC_BITS+1);
  47.188 +    c->low -= range;
  47.189 +    mask= c->low >> 31;
  47.190 +    range &= mask;
  47.191 +    c->low += range;
  47.192 +    return (val^mask)-mask;
  47.193 +}
  47.194 +
  47.195 +/**
  47.196 + *
  47.197 + * @return the number of bytes read or 0 if no end
  47.198 + */
  47.199 +static int av_unused get_cabac_terminate(CABACContext *c){
  47.200 +    c->range -= 2;
  47.201 +    if(c->low < c->range<<(CABAC_BITS+1)){
  47.202 +        renorm_cabac_decoder_once(c);
  47.203 +        return 0;
  47.204 +    }else{
  47.205 +        return c->bytestream - c->bytestream_start;
  47.206 +    }
  47.207 +}
  47.208 +
  47.209 +#endif /* AVCODEC_CABAC_H */

    48.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    48.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.c	Mon Aug 27 12:09:56 2012 +0200
    48.3 @@ -0,0 +1,140 @@
    48.4 +/*
    48.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    48.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    48.7 + *
    48.8 + * This file is part of FFmpeg.
    48.9 + *
   48.10 + * FFmpeg is free software; you can redistribute it and/or
   48.11 + * modify it under the terms of the GNU Lesser General Public
   48.12 + * License as published by the Free Software Foundation; either
   48.13 + * version 2.1 of the License, or (at your option) any later version.
   48.14 + *
   48.15 + * FFmpeg is distributed in the hope that it will be useful,
   48.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   48.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   48.18 + * Lesser General Public License for more details.
   48.19 + *
   48.20 + * You should have received a copy of the GNU Lesser General Public
   48.21 + * License along with FFmpeg; if not, write to the Free Software
   48.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   48.23 + */
   48.24 +
   48.25 +/**
   48.26 + * @file
   48.27 + * Context Adaptive Binary Arithmetic Coder.
   48.28 + */
   48.29 +
   48.30 +#include <string.h>
   48.31 +
   48.32 +#include "libavutil/common.h"
   48.33 +//#include "get_bits.h"
   48.34 +#include "cabac_spu.h"
   48.35 +#define av_log(...)
   48.36 +
   48.37 +int bytecount =0;
   48.38 +static const uint8_t lps_range[64][4]= {
   48.39 +{128,176,208,240}, {128,167,197,227}, {128,158,187,216}, {123,150,178,205},
   48.40 +{116,142,169,195}, {111,135,160,185}, {105,128,152,175}, {100,122,144,166},
   48.41 +{ 95,116,137,158}, { 90,110,130,150}, { 85,104,123,142}, { 81, 99,117,135},
   48.42 +{ 77, 94,111,128}, { 73, 89,105,122}, { 69, 85,100,116}, { 66, 80, 95,110},
   48.43 +{ 62, 76, 90,104}, { 59, 72, 86, 99}, { 56, 69, 81, 94}, { 53, 65, 77, 89},
   48.44 +{ 51, 62, 73, 85}, { 48, 59, 69, 80}, { 46, 56, 66, 76}, { 43, 53, 63, 72},
   48.45 +{ 41, 50, 59, 69}, { 39, 48, 56, 65}, { 37, 45, 54, 62}, { 35, 43, 51, 59},
   48.46 +{ 33, 41, 48, 56}, { 32, 39, 46, 53}, { 30, 37, 43, 50}, { 29, 35, 41, 48},
   48.47 +{ 27, 33, 39, 45}, { 26, 31, 37, 43}, { 24, 30, 35, 41}, { 23, 28, 33, 39},
   48.48 +{ 22, 27, 32, 37}, { 21, 26, 30, 35}, { 20, 24, 29, 33}, { 19, 23, 27, 31},
   48.49 +{ 18, 22, 26, 30}, { 17, 21, 25, 28}, { 16, 20, 23, 27}, { 15, 19, 22, 25},
   48.50 +{ 14, 18, 21, 24}, { 14, 17, 20, 23}, { 13, 16, 19, 22}, { 12, 15, 18, 21},
   48.51 +{ 12, 14, 17, 20}, { 11, 14, 16, 19}, { 11, 13, 15, 18}, { 10, 12, 15, 17},
   48.52 +{ 10, 12, 14, 16}, {  9, 11, 13, 15}, {  9, 11, 12, 14}, {  8, 10, 12, 14},
   48.53 +{  8,  9, 11, 13}, {  7,  9, 11, 12}, {  7,  9, 10, 12}, {  7,  8, 10, 11},
   48.54 +{  6,  8,  9, 11}, {  6,  7,  9, 10}, {  6,  7,  8,  9}, {  2,  2,  2,  2},
   48.55 +};
   48.56 +
   48.57 +uint8_t ff_h264_mlps_state[4*64];
   48.58 +uint8_t ff_h264_lps_range[4*2*64];
   48.59 +uint8_t ff_h264_lps_state[2*64];
   48.60 +uint8_t ff_h264_mps_state[2*64];
   48.61 +
   48.62 +static const uint8_t mps_state[64]= { /* next state index after an MPS decode; 0..61 advance by one, 62 and 63 map to themselves */
   48.63 +  1, 2, 3, 4, 5, 6, 7, 8,
   48.64 +  9,10,11,12,13,14,15,16,
   48.65 + 17,18,19,20,21,22,23,24,
   48.66 + 25,26,27,28,29,30,31,32,
   48.67 + 33,34,35,36,37,38,39,40,
   48.68 + 41,42,43,44,45,46,47,48,
   48.69 + 49,50,51,52,53,54,55,56,
   48.70 + 57,58,59,60,61,62,62,63,
   48.71 +};
   48.72 +
   48.73 +static const uint8_t lps_state[64]= { /* next state index after an LPS decode; entry 0 is not read by ff_init_cabac_states(), which special-cases i == 0 */
   48.74 +  0, 0, 1, 2, 2, 4, 4, 5,
   48.75 +  6, 7, 8, 9, 9,11,11,12,
   48.76 + 13,13,15,15,16,16,18,18,
   48.77 + 19,19,21,21,22,22,23,24,
   48.78 + 24,25,26,26,27,27,28,29,
   48.79 + 29,30,30,30,31,32,32,33,
   48.80 + 33,33,34,34,35,35,35,36,
   48.81 + 36,36,37,37,37,38,38,63,
   48.82 +};
   48.83 +
   48.84 +const uint8_t ff_h264_norm_shift[512]= { /* shift-count lookup: for x >= 1, entry x is the left shift that brings x into 256..511 (8 - floor(log2(x))); entry 0 is 9 */
   48.85 + 9,8,7,7,6,6,6,6,5,5,5,5,5,5,5,5,
   48.86 + 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
   48.87 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   48.88 + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   48.89 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   48.90 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   48.91 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   48.92 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   48.93 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   48.94 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   48.95 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   48.96 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   48.97 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   48.98 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   48.99 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48.100 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48.101 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48.102 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48.103 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48.104 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48.105 +};
  48.106 +
  48.107 +/**
  48.108 + * Populate the runtime CABAC lookup tables (ff_h264_lps_range and the
  48.109 + * state-transition tables) from the static tables above; takes no arguments.
  48.110 + */
  48.111 +
  48.112 +void ff_init_cabac_states(){
  48.113 +    int i, j;
  48.114 +
  48.115 +    for(i=0; i<64; i++){
  48.116 +        for(j=0; j<4; j++){ //FIXME check if this is worth the 1 shift we save
  48.117 +            ff_h264_lps_range[j*2*64+2*i+0]=
  48.118 +            ff_h264_lps_range[j*2*64+2*i+1]= lps_range[i][j]; // same value for both low-bit variants of state i
  48.119 +        }
  48.120 +
  48.121 +        ff_h264_mlps_state[128+2*i+0]=
  48.122 +        ff_h264_mps_state[2*i+0]= 2*mps_state[i]+0;
  48.123 +        ff_h264_mlps_state[128+2*i+1]=
  48.124 +        ff_h264_mps_state[2*i+1]= 2*mps_state[i]+1; // an MPS transition keeps the low bit of the packed state
  48.125 +
  48.126 +        if( i ){
  48.127 +#ifdef BRANCHLESS_CABAC_DECODER
  48.128 +            ff_h264_mlps_state[128-2*i-1]= 2*lps_state[i]+0; // LPS entries are mirrored below index 128; get_cabac_inline() reaches them with the bit-inverted state
  48.129 +            ff_h264_mlps_state[128-2*i-2]= 2*lps_state[i]+1;
  48.130 +        }else{
  48.131 +            ff_h264_mlps_state[128-2*i-1]= 1; // i == 0: state index stays 0, only the low bit is swapped
  48.132 +            ff_h264_mlps_state[128-2*i-2]= 0;
  48.133 +#else
  48.134 +            ff_h264_lps_state[2*i+0]= 2*lps_state[i]+0;
  48.135 +            ff_h264_lps_state[2*i+1]= 2*lps_state[i]+1;
  48.136 +        }else{
  48.137 +            ff_h264_lps_state[2*i+0]= 1; // i == 0: state index stays 0, only the low bit is swapped
  48.138 +            ff_h264_lps_state[2*i+1]= 0;
  48.139 +#endif
  48.140 +        }
  48.141 +    }
  48.142 +}
  48.143 +

    49.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    49.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/cabac_spu.h	Mon Aug 27 12:09:56 2012 +0200
    49.3 @@ -0,0 +1,233 @@
    49.4 +/*
    49.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    49.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    49.7 + *
    49.8 + * This file is part of FFmpeg.
    49.9 + *
   49.10 + * FFmpeg is free software; you can redistribute it and/or
   49.11 + * modify it under the terms of the GNU Lesser General Public
   49.12 + * License as published by the Free Software Foundation; either
   49.13 + * version 2.1 of the License, or (at your option) any later version.
   49.14 + *
   49.15 + * FFmpeg is distributed in the hope that it will be useful,
   49.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   49.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   49.18 + * Lesser General Public License for more details.
   49.19 + *
   49.20 + * You should have received a copy of the GNU Lesser General Public
   49.21 + * License along with FFmpeg; if not, write to the Free Software
   49.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   49.23 + */
   49.24 +
   49.25 +/**
   49.26 + * @file
   49.27 + * Context Adaptive Binary Arithmetic Coder.
   49.28 + */
   49.29 +
   49.30 +#ifndef AVCODEC_CABAC_H
   49.31 +#define AVCODEC_CABAC_H
   49.32 +
   49.33 +//#undef NDEBUG
   49.34 +#include <assert.h>
   49.35 +#include "h264_dma.h"
   49.36 +#include "libavutil/x86_cpu.h"
   49.37 +#include "libavutil/attributes.h"
   49.38 +
   49.39 +#define CABAC_BITS 16
   49.40 +#define CABAC_MASK ((1<<CABAC_BITS)-1)
   49.41 +#define BRANCHLESS_CABAC_DECODER 1
   49.42 +
   49.43 +typedef struct CABACContext{ /* CABAC decoder state plus the bookkeeping for streaming the bitstream through a local buffer */
   49.44 +    int low; /* scaled decoder offset; compared against range<<(CABAC_BITS+1) */
   49.45 +    int range; /* current interval width; the renorm helpers double it while it is below 0x100 */
   49.46 +    int outstanding_count;
   49.47 +#ifdef STRICT_LIMITS
   49.48 +    int symCount;
   49.49 +#endif
   49.50 +	const uint8_t *bytestream_ea_start;
   49.51 +    const uint8_t *bytestream_ea; /* source address handed to spu_dma_get() for the next chunk */
   49.52 +	const uint8_t *bytestream_ea_end;
   49.53 +	int slot;
   49.54 +	int bufsize; /* bytes still to be fetched into bytestream_ls */
   49.55 +
   49.56 +	uint8_t *bytestream_start; /* reference point for the byte count returned by get_cabac_terminate() */
   49.57 +    uint8_t *bytestream; /* current read position */
   49.58 +    uint8_t *bytestream_end; /* read position at which dma_cabac() fetches more data */
   49.59 +    uint8_t  cabac_state[460]; /* presumably the packed context states passed as `state` to get_cabac(); not referenced in this header - TODO confirm */
   49.60 +}CABACContext;
   49.61 +
   49.62 +extern uint8_t ff_h264_mlps_state[4*64];
   49.63 +extern uint8_t ff_h264_lps_range[4*2*64];  ///< rangeTabLPS
   49.64 +extern uint8_t ff_h264_mps_state[2*64];     ///< transIdxMPS
   49.65 +extern uint8_t ff_h264_lps_state[2*64];     ///< transIdxLPS
   49.66 +extern const uint8_t ff_h264_norm_shift[512];
   49.67 +
   49.68 +void ff_init_cabac_states(void);
   49.69 +
   49.70 +extern DECLARE_ALIGNED(128,uint8_t, bytestream_ls[4096]);
   49.71 +extern int bytecount;
   49.72 +static inline void dma_cabac(CABACContext *c){ /* Refill bytestream_ls from c->bytestream_ea once the read pointer has reached bytestream_end. */
   49.73 +	bytecount++;
   49.74 +	if (c->bytestream == c->bytestream_end){
   49.75 +		if (c->bufsize>0){
   49.76 +			int size = (c->bufsize > sizeof(bytestream_ls)) ?  sizeof(bytestream_ls) : c->bufsize; /* min(bufsize, buffer capacity) */
   49.77 +			int align = size &0xF;
   49.78 +			int dma_size = size + (align? 16-align : 0); /* transfer length rounded up to a multiple of 16 bytes */
   49.79 +
   49.80 +			spu_dma_get(bytestream_ls, (unsigned) c->bytestream_ea, dma_size, ED_raw);
   49.81 +			wait_dma_id(ED_raw); /* presumably blocks until the transfer tagged ED_raw has completed - TODO confirm */
   49.82 +			c->bytestream = bytestream_ls;
   49.83 +			c->bytestream_end = &bytestream_ls[size];
   49.84 +			c->bytestream_ea += dma_size; /* dma_size differs from size only when size < sizeof(bytestream_ls), i.e. on the chunk that brings bufsize to 0 */
   49.85 +			c->bufsize -= size;
   49.86 +		} 
   49.87 +		bytecount =0;
   49.88 +	}else if((unsigned)c->bytestream > (unsigned)c->bytestream_end +2){		
   49.89 +		//fprintf(stderr, "Read beyond end of frame %d\n", c->bufsize);
   49.90 +		bytecount =0; /* read pointer is more than 2 bytes past the buffer end; with the diagnostic disabled this only resets the counter */
   49.91 +	}
   49.92 +}
   49.93 +/* Add the next two bitstream bytes to c->low (after letting dma_cabac() refill the local buffer) and advance the read pointer. */
   49.94 +static void refill(CABACContext *c){
   49.95 +	dma_cabac(c); 
   49.96 +
   49.97 +	c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
   49.98 +
   49.99 +    c->low -= CABAC_MASK;
  49.100 +    c->bytestream+= CABAC_BITS/8; /* CABAC_BITS is 16, so two bytes are consumed */
  49.101 +}
  49.102 +/* Like refill(), but the two new bytes are shifted left by i bits before being added to c->low. */
  49.103 +static void refill2(CABACContext *c){
  49.104 +    int i, x;
  49.105 +
  49.106 +	dma_cabac(c);
  49.107 +
  49.108 +    x= c->low ^ (c->low-1); /* sets every bit up to and including the lowest set bit of low */
  49.109 +    i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; /* with the low CABAC_BITS bits of low zero (the condition checked by the callers here), i = index of that lowest set bit minus 16 */
  49.110 +
  49.111 +    x= -CABAC_MASK;
  49.112 +
  49.113 +	x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1);
  49.114 +
  49.115 +    c->low += x<<i;
  49.116 +    c->bytestream+= CABAC_BITS/8;
  49.117 +}
  49.118 +/* Double range and low until range >= 0x100, calling refill() whenever the low CABAC_BITS bits of low are all zero. */
  49.119 +static inline void renorm_cabac_decoder(CABACContext *c){
  49.120 +    while(c->range < 0x100){
  49.121 +        c->range+= c->range;
  49.122 +        c->low+= c->low;
  49.123 +        if(!(c->low & CABAC_MASK))
  49.124 +            refill(c);
  49.125 +    }
  49.126 +}
  49.127 +/* Single-step renormalisation: shifts range and low left by one bit if range < 0x100, by zero bits otherwise. */
  49.128 +static inline void renorm_cabac_decoder_once(CABACContext *c){
  49.129 +
  49.130 +    int shift= (uint32_t)(c->range - 0x100)>>31; /* 1 when range - 0x100 is negative, else 0 */
  49.131 +    c->range<<= shift;
  49.132 +    c->low  <<= shift;
  49.133 +
  49.134 +    if(!(c->low & CABAC_MASK))
  49.135 +        refill(c);
  49.136 +}
  49.137 +/* Decode one context-coded bit. *state holds the packed context state and is updated in place; returns the decoded bit (0 or 1). */
  49.138 +static av_always_inline int get_cabac_inline(CABACContext *c, uint8_t * const state){
  49.139 +
  49.140 +    int s = *state;
  49.141 +    int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; /* bits 6-7 of range select one of four 128-entry sub-tables, s selects the entry */
  49.142 +    int bit, lps_mask av_unused;
  49.143 +
  49.144 +    c->range -= RangeLPS;
  49.145 +#ifndef BRANCHLESS_CABAC_DECODER
  49.146 +    if(c->low < (c->range<<(CABAC_BITS+1))){ /* MPS path */
  49.147 +        bit= s&1;
  49.148 +        *state= ff_h264_mps_state[s];
  49.149 +        renorm_cabac_decoder_once(c);
  49.150 +    }else{ /* LPS path */
  49.151 +        bit= ff_h264_norm_shift[RangeLPS]; /* bit temporarily holds the renormalisation shift */
  49.152 +        c->low -= (c->range<<(CABAC_BITS+1));
  49.153 +        *state= ff_h264_lps_state[s];
  49.154 +        c->range = RangeLPS<<bit;
  49.155 +        c->low <<= bit;
  49.156 +        bit= (s&1)^1;
  49.157 +
  49.158 +        if(!(c->low & CABAC_MASK)){
  49.159 +            refill2(c);
  49.160 +        }
  49.161 +    }
  49.162 +#else /* BRANCHLESS_CABAC_DECODER */
  49.163 +    lps_mask= ((c->range<<(CABAC_BITS+1)) - c->low)>>31; /* all ones when low exceeds the scaled range (LPS), else 0; relies on arithmetic right shift of a negative int */
  49.164 +
  49.165 +    c->low -= (c->range<<(CABAC_BITS+1)) & lps_mask;
  49.166 +    c->range += (RangeLPS - c->range) & lps_mask; /* range becomes RangeLPS on the LPS path, unchanged otherwise */
  49.167 +
  49.168 +    s^=lps_mask; /* on the LPS path s is bit-inverted, which indexes the mirrored lower half of ff_h264_mlps_state */
  49.169 +    *state= (ff_h264_mlps_state+128)[s];
  49.170 +    bit= s&1;
  49.171 +
  49.172 +    lps_mask= ff_h264_norm_shift[c->range]; /* lps_mask is reused as the renormalisation shift count */
  49.173 +    c->range<<= lps_mask;
  49.174 +    c->low  <<= lps_mask;
  49.175 +    if(!(c->low & CABAC_MASK))
  49.176 +        refill2(c);
  49.177 +#endif /* BRANCHLESS_CABAC_DECODER */
  49.178 +
  49.179 +    return bit;
  49.180 +}
  49.181 +/* Wrapper around get_cabac_inline() marked av_noinline. */
  49.182 +static int av_noinline av_unused get_cabac_noinline(CABACContext *c, uint8_t * const state){
  49.183 +    return get_cabac_inline(c, state);
  49.184 +}
  49.185 +/* Wrapper around get_cabac_inline() without an explicit inlining attribute; same arguments and return value. */
  49.186 +static int av_unused get_cabac(CABACContext *c, uint8_t * const state){
  49.187 +    return get_cabac_inline(c, state);
  49.188 +}
  49.189 +/* Decode one bit without a context state: low is doubled and compared against the scaled range. Returns 0 or 1. */
  49.190 +static int av_unused get_cabac_bypass(CABACContext *c){
  49.191 +
  49.192 +    int range;
  49.193 +    c->low += c->low;
  49.194 +
  49.195 +    if(!(c->low & CABAC_MASK))
  49.196 +        refill(c);
  49.197 +
  49.198 +    range= c->range<<(CABAC_BITS+1);
  49.199 +    if(c->low < range){
  49.200 +        return 0;
  49.201 +    }else{
  49.202 +        c->low -= range; /* c->range itself is left unchanged */
  49.203 +        return 1;
  49.204 +    }
  49.205 +}
  49.206 +/* Branch-free bypass decode applied as a sign: returns -val when the doubled low is below the scaled range, val otherwise. */
  49.207 +static av_always_inline int get_cabac_bypass_sign(CABACContext *c, int val){
  49.208 +    int range, mask;
  49.209 +    c->low += c->low;
  49.210 +
  49.211 +    if(!(c->low & CABAC_MASK))
  49.212 +        refill(c);
  49.213 +
  49.214 +    range= c->range<<(CABAC_BITS+1);
  49.215 +    c->low -= range;
  49.216 +    mask= c->low >> 31; /* all ones if the subtraction went negative, else 0 */
  49.217 +    range &= mask;
  49.218 +    c->low += range; /* undo the subtraction when it went negative */
  49.219 +    return (val^mask)-mask; /* two's-complement negate when mask is all ones */
  49.220 +}
  49.221 +
  49.222 +/**
  49.223 + * Decode the terminate bin: range is reduced by 2 and low compared against it.
  49.224 + * @return the number of bytes read or 0 if no end
  49.225 + */
  49.226 +static int av_unused get_cabac_terminate(CABACContext *c){
  49.227 +    c->range -= 2;
  49.228 +    if(c->low < c->range<<(CABAC_BITS+1)){
  49.229 +        renorm_cabac_decoder_once(c);
  49.230 +        return 0;
  49.231 +    }else{
  49.232 +        return c->bytestream - c->bytestream_start; /* NOTE(review): dma_cabac() repoints bytestream into bytestream_ls; confirm bytestream_start is kept consistent by the caller */
  49.233 +    }
  49.234 +}
  49.235 +
  49.236 +#endif /* AVCODEC_CABAC_H */

    50.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    50.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.c	Mon Aug 27 12:09:56 2012 +0200
    50.3 @@ -0,0 +1,1147 @@
    50.4 +/*
    50.5 + * Copyright (c) 2009 TUDelft 
    50.6 + * 
    50.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
    50.8 + */
    50.9 +
   50.10 +/**
   50.11 + * @file libavcodec/cell/dsputil_spu.c
   50.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   50.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   50.14 + * 
   50.15 + * SIMD SPU kernels 
   50.16 + * H.264/AVC motion compensation
   50.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   50.18 + * @author Albert Paradis <apar7632@hotmail.com>
   50.19 + */ 
   50.20 +
   50.21 +
   50.22 +#include "dsputil_spu.h"
   50.23 +#include "h264_idct_spu.h"
   50.24 +#include "h264_deblock_spu.h"
   50.25 +#include "types_spu.h"
   50.26 +#include "libavutil/intreadwrite.h"
   50.27 +
   50.28 +#include <stdio.h>
   50.29 +#include <spu_intrinsics.h>
   50.30 +#include <spu_mfcio.h>
   50.31 +#include <assert.h>
   50.32 +
   50.33 +//Luma interpolation: h264_luma_template_spu.c is included twice below, first with put_ names and store op, then with avg_ ones
   50.34 +#define PUT_OP_U8_SPU(d, s, dst) (void) dst; d = s /* plain store; NOTE(review): expands to two statements, so unsafe as the sole body of an unbraced if/else */
   50.35 +#define AVG_OP_U8_SPU(d, s, dst) d = spu_avg(dst, s) /* store the average of the new value and dst */
   50.36 +
   50.37 +#define OP_U8_SPU                          PUT_OP_U8_SPU
   50.38 +#define PREFIX_h264_qpel16_h_lowpass_spu   put_h264_qpel16_h_lowpass_spu
   50.39 +#define PREFIX_h264_qpel16_v_lowpass_spu   put_h264_qpel16_v_lowpass_spu
   50.40 +#define PREFIX_h264_qpel16_hv_lowpass_spu  put_h264_qpel16_hv_lowpass_spu
   50.41 +#define PREFIX_h264_qpel8_h_lowpass_spu    put_h264_qpel8_h_lowpass_spu
   50.42 +#define PREFIX_h264_qpel8_v_lowpass_spu    put_h264_qpel8_v_lowpass_spu
   50.43 +#define PREFIX_h264_qpel8_hv_lowpass_spu   put_h264_qpel8_hv_lowpass_spu
   50.44 +#define PREFIX_h264_qpel4_h_lowpass_spu    put_h264_qpel4_h_lowpass_spu
   50.45 +#define PREFIX_h264_qpel4_v_lowpass_spu    put_h264_qpel4_v_lowpass_spu
   50.46 +#define PREFIX_h264_qpel4_hv_lowpass_spu   put_h264_qpel4_hv_lowpass_spu
   50.47 +#include "h264_luma_template_spu.c"
   50.48 +#undef OP_U8_SPU                          
   50.49 +#undef PREFIX_h264_qpel16_h_lowpass_spu
   50.50 +#undef PREFIX_h264_qpel16_v_lowpass_spu
   50.51 +#undef PREFIX_h264_qpel16_hv_lowpass_spu
   50.52 +#undef PREFIX_h264_qpel8_h_lowpass_spu
   50.53 +#undef PREFIX_h264_qpel8_v_lowpass_spu
   50.54 +#undef PREFIX_h264_qpel8_hv_lowpass_spu
   50.55 +#undef PREFIX_h264_qpel4_h_lowpass_spu
   50.56 +#undef PREFIX_h264_qpel4_v_lowpass_spu
   50.57 +#undef PREFIX_h264_qpel4_hv_lowpass_spu
   50.58 +/* second instantiation of the template: avg_ variants */
   50.59 +#define OP_U8_SPU                          AVG_OP_U8_SPU
   50.60 +#define PREFIX_h264_qpel16_h_lowpass_spu   avg_h264_qpel16_h_lowpass_spu
   50.61 +#define PREFIX_h264_qpel16_v_lowpass_spu   avg_h264_qpel16_v_lowpass_spu
   50.62 +#define PREFIX_h264_qpel16_hv_lowpass_spu  avg_h264_qpel16_hv_lowpass_spu
   50.63 +#define PREFIX_h264_qpel8_h_lowpass_spu    avg_h264_qpel8_h_lowpass_spu
   50.64 +#define PREFIX_h264_qpel8_v_lowpass_spu    avg_h264_qpel8_v_lowpass_spu
   50.65 +#define PREFIX_h264_qpel8_hv_lowpass_spu   avg_h264_qpel8_hv_lowpass_spu
   50.66 +#define PREFIX_h264_qpel4_h_lowpass_spu    avg_h264_qpel4_h_lowpass_spu
   50.67 +#define PREFIX_h264_qpel4_v_lowpass_spu    avg_h264_qpel4_v_lowpass_spu
   50.68 +#define PREFIX_h264_qpel4_hv_lowpass_spu   avg_h264_qpel4_hv_lowpass_spu
   50.69 +#include "h264_luma_template_spu.c"
   50.70 +#undef OP_U8_SPU                          
   50.71 +#undef PREFIX_h264_qpel16_h_lowpass_spu
   50.72 +#undef PREFIX_h264_qpel16_v_lowpass_spu
   50.73 +#undef PREFIX_h264_qpel16_hv_lowpass_spu
   50.74 +#undef PREFIX_h264_qpel8_h_lowpass_spu
   50.75 +#undef PREFIX_h264_qpel8_v_lowpass_spu
   50.76 +#undef PREFIX_h264_qpel8_hv_lowpass_spu
   50.77 +#undef PREFIX_h264_qpel4_h_lowpass_spu
   50.78 +#undef PREFIX_h264_qpel4_v_lowpass_spu
   50.79 +#undef PREFIX_h264_qpel4_hv_lowpass_spu
   50.80 +/* H264_MC(OPNAME, SIZE, CODETYPE) generates the 16 static OPNAME##h264_qpel##SIZE##_mcXY_##CODETYPE functions (X, Y in 0..3). mc00 copies, mc20/mc02/mc22 apply the h/v/hv lowpass directly, and every other position averages two planes through OPNAME##pixels##SIZE##_l2_. Temporary planes use a row stride of 16. */
   50.81 +#define H264_MC(OPNAME, SIZE, CODETYPE) \
   50.82 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   50.83 +    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, dst_stride, STRIDE_Y, h);\
   50.84 +}\
   50.85 +\
   50.86 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){ \
   50.87 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
   50.88 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
   50.89 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
   50.90 +}\
   50.91 +\
   50.92 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   50.93 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
   50.94 +}\
   50.95 +\
   50.96 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
   50.97 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
   50.98 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, 16, h);\
   50.99 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, dst_stride, STRIDE_Y, h);\
  50.100 +}\
  50.101 +\
  50.102 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.103 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
  50.104 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
  50.105 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, dst_stride, STRIDE_Y, h);\
  50.106 +}\
  50.107 +\
  50.108 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.109 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, dst_stride, h);\
  50.110 +}\
  50.111 +\
  50.112 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.113 +    DECLARE_ALIGNED_16(uint8_t, half[16*16]);\
  50.114 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, 16, h);\
  50.115 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+STRIDE_Y, half, dst_stride, STRIDE_Y, h);\
  50.116 +}\
  50.117 +\
  50.118 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.119 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
  50.120 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
  50.121 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
  50.122 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
  50.123 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
  50.124 +}\
  50.125 +\
  50.126 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.127 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
  50.128 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
  50.129 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
  50.130 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
  50.131 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
  50.132 +}\
  50.133 +\
  50.134 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.135 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
  50.136 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
  50.137 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
  50.138 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
  50.139 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
  50.140 +}\
  50.141 +\
  50.142 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.143 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
  50.144 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
  50.145 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
  50.146 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
  50.147 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, dst_stride, 16, h);\
  50.148 +}\
  50.149 +\
  50.150 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.151 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
  50.152 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, dst_stride, 16, h);\
  50.153 +}\
  50.154 +\
  50.155 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.156 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
  50.157 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
  50.158 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
  50.159 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, 16, h);\
  50.160 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
  50.161 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
  50.162 +}\
  50.163 +\
  50.164 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.165 +    DECLARE_ALIGNED_16(uint8_t, halfH[16*16]);\
  50.166 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
  50.167 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
  50.168 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + STRIDE_Y, 16, h);\
  50.169 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
  50.170 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, dst_stride, 16, h);\
  50.171 +}\
  50.172 +\
  50.173 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.174 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
  50.175 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
  50.176 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
  50.177 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, 16, h);\
  50.178 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
  50.179 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
  50.180 +}\
  50.181 +\
  50.182 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int dst_stride, int h){\
  50.183 +    DECLARE_ALIGNED_16(uint8_t, halfV[16*16]);\
  50.184 +    DECLARE_ALIGNED_16(uint8_t, halfHV[16*16]);\
  50.185 +    DECLARE_ALIGNED_16(int16_t, tmp[16*(16+8)]);\
  50.186 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, 16, h);\
  50.187 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, 16, 16, h);\
  50.188 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, dst_stride, 16, h);\
  50.189 +}\
  50.191 +
  50.192 +/**************************/
  50.193 +/* put pixels functions   */
  50.194 +/*************************/
  50.195 +/* For each of h rows, store to dst the spu_avg() of 16 bytes from src1 (any alignment, row stride src_stride1) and 16 bytes from src2 (read as an aligned vector, row stride 16). */
  50.196 +static void put_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
  50.197 +                                    const uint8_t * src2, int dst_stride,
  50.198 +                                    int src_stride1, int h)
  50.199 +{
  50.200 +  int i;
  50.201 +
  50.202 +  const int perm_src1 = (unsigned int) src1 & 15; // byte offset of src1 inside its 16-byte quadword; computed once, so src_stride1 is presumably a multiple of 16 - TODO confirm
  50.203 +
  50.204 +  for (i=0; i<h; i++){
  50.205 +      //unaligned load of src1: the two quadwords covering it are shifted towards each other and OR-ed together
  50.206 +      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
  50.207 +      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
  50.208 +      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
  50.209 +
  50.210 +      //aligned load of src2
  50.211 +      const vuint8_t srcb = *(vuint8_t *)(src2);
  50.212 +
  50.213 +      //average and rounding
  50.214 +      const vuint8_t avgc = spu_avg(srca,srcb);
  50.215 +
  50.216 +      // 16x16 dest luma blocks are always aligned
  50.217 +      *(vuint8_t *)dst=avgc;
  50.218 +
  50.219 +      src1 +=src_stride1;
  50.220 +      src2 +=16;
  50.221 +      dst  +=dst_stride;
  50.222 +  }
  50.223 +}
  50.224 +/* Same as put_pixels16_l2_spu(), except that the src1/src2 average is averaged once more with the existing dst row before being stored. */
  50.225 +static void avg_pixels16_l2_spu( uint8_t * dst, const uint8_t * src1,
  50.226 +                                    const uint8_t * src2, int dst_stride,
  50.227 +                                    int src_stride1, int h)
  50.228 +{
  50.229 +  int i;
  50.230 +
  50.231 +  const int perm_src1 = (unsigned int) src1 & 15; // byte offset of src1 inside its 16-byte quadword
  50.232 +
  50.233 +  for (i=0; i<h; i++){
  50.234 +      //unaligned load of src1: the two quadwords covering it are shifted towards each other and OR-ed together
  50.235 +      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
  50.236 +      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
  50.237 +      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
  50.238 +
  50.239 +      //aligned load of src2
  50.240 +      const vuint8_t srcb = *(vuint8_t *)(src2);
  50.241 +
  50.242 +      //average and rounding, applied twice: (src1, src2) first, then the result with dst
  50.243 +      const vuint8_t avgc = spu_avg(spu_avg(srca,srcb), *(vuint8_t *)dst);
  50.244 +
  50.245 +      // 16x16 dest luma blocks are always aligned
  50.246 +      *(vuint8_t *)dst=avgc;
  50.247 +
  50.248 +      src1 +=src_stride1;
  50.249 +      src2 +=16;
  50.250 +      dst  +=dst_stride;
  50.251 +  }
  50.252 +}
  50.253 +
  50.254 +// Copy a 16-byte-wide block from src (any alignment) to dst, four rows per loop iteration; assumes that ((line_size % 16) == 0)
  50.255 +void put_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
  50.256 +{
  50.257 +    register vector unsigned char pixelsv1, pixelsv2;
  50.258 +    register vector unsigned char pixelsv1B, pixelsv2B;
  50.259 +    register vector unsigned char pixelsv1C, pixelsv2C;
  50.260 +    register vector unsigned char pixelsv1D, pixelsv2D;
  50.261 +
  50.262 +    const int perm = (unsigned int) src & 15; // byte offset of src inside its 16-byte quadword
  50.263 +    int i;
  50.264 +	register int line_size   = src_stride;
  50.265 +    register int line_size_2 = line_size << 1;
  50.266 +    register int line_size_3 = line_size + line_size_2;
  50.267 +    register int line_size_4 = line_size << 2;
  50.268 +
  50.269 +    register int dst_stride_2 = dst_stride << 1;
  50.270 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
  50.271 +    register int dst_stride_4 = dst_stride << 2;
  50.272 +
  50.273 +    for(i=0; i<h; i+=4) { // rows are handled in groups of four, so an h that is not a multiple of 4 is effectively rounded up
  50.274 +      pixelsv1 = *(vuint8_t *)(src);
  50.275 +      pixelsv2 = *(vuint8_t *)(src+16);
  50.276 +      pixelsv1B = *(vuint8_t *)(src + line_size);
  50.277 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
  50.278 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
  50.279 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
  50.280 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
  50.281 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
  50.282 +
  50.283 +      *(vuint8_t *) dst                 = spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16));
  50.284 +      *(vuint8_t *)(dst +   dst_stride) = spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16));
  50.285 +      *(vuint8_t *)(dst + dst_stride_2) = spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16));
  50.286 +      *(vuint8_t *)(dst + dst_stride_3) = spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16));
  50.287 +
  50.288 +      src+= line_size_4;
  50.289 +      dst+= dst_stride_4;
  50.290 +    }
  50.291 +}
  50.292 +
  50.293 +// Like put_pixels16_spu(), but each realigned source row is averaged (spu_avg) with the existing dst row; assumes that ((line_size % 16) == 0)
  50.294 +void avg_pixels16_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
  50.295 +{
  50.296 +    register vector unsigned char pixelsv1, pixelsv2;
  50.297 +    register vector unsigned char pixelsv1B, pixelsv2B;
  50.298 +    register vector unsigned char pixelsv1C, pixelsv2C;
  50.299 +    register vector unsigned char pixelsv1D, pixelsv2D;
  50.300 +
  50.301 +    const int perm = (unsigned int) src & 15; // byte offset of src inside its 16-byte quadword
  50.302 +    int i;
  50.303 +	register int line_size   = src_stride;
  50.304 +    register int line_size_2 = line_size << 1;
  50.305 +    register int line_size_3 = line_size + line_size_2;
  50.306 +    register int line_size_4 = line_size << 2;
  50.307 +
  50.308 +    register int dst_stride_2 = dst_stride << 1;
  50.309 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
  50.310 +    register int dst_stride_4 = dst_stride << 2;
  50.311 +
  50.312 +
  50.313 +    for(i=0; i<h; i+=4) { // rows are handled in groups of four, so an h that is not a multiple of 4 is effectively rounded up
  50.314 +      pixelsv1 = *(vuint8_t *)(src);
  50.315 +      pixelsv2 = *(vuint8_t *)(src+16);
  50.316 +      pixelsv1B = *(vuint8_t *)(src + line_size);
  50.317 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
  50.318 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
  50.319 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
  50.320 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
  50.321 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
  50.322 +
  50.323 +      *(vuint8_t *)dst = spu_avg(spu_or(spu_slqwbyte(pixelsv1, perm), spu_rlmaskqwbyte(pixelsv2, perm-16)), *(vuint8_t *)dst);
  50.324 +      *(vuint8_t *)(dst + dst_stride) = spu_avg(spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), *(vuint8_t *)(dst+dst_stride));
  50.325 +      *(vuint8_t *)(dst + dst_stride_2) = spu_avg(spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), *(vuint8_t *)(dst+dst_stride_2));
  50.326 +      *(vuint8_t *)(dst + dst_stride_3) = spu_avg(spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), *(vuint8_t *)(dst+dst_stride_3));
  50.327 +
  50.328 +      src+= line_size_4;
  50.329 +      dst+= dst_stride_4;
  50.330 +    }
  50.331 +}
  50.332 +/* 8-byte-wide variant of the l2 average: the first 8 bytes of spu_avg(src1 row, src2 row) are merged into one half of the dst quadword, and the other half of that quadword is written back unchanged. */
  50.333 +void put_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
  50.334 +				   int dst_stride, int src_stride1, int h)
  50.335 +{
  50.336 +  int i;
  50.337 +
  50.338 +  const int perm_src1 = (unsigned int) src1 & 15;
  50.339 +  const int shift_dst = (unsigned int) dst & 15;
  50.340 +
  50.341 +  // 8x dest luma blocks are aligned or misaligned by 8
  50.342 +  vuint8_t dstmask;
  50.343 +  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; // new pixels into bytes 0-7, bytes 8-15 kept from dst
  50.344 +  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; // bytes 0-7 kept from dst, new pixels into bytes 8-15
  50.345 +
  50.346 +  if(shift_dst==0){
  50.347 +    dstmask = dst8mask1;
  50.348 +  }
  50.349 +  else{
  50.350 +    dstmask = dst8mask2;
  50.351 +  }
  50.352 +
  50.353 +  for (i=0; i<h; i++){
  50.354 +      //unaligned load of src1
  50.355 +      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
  50.356 +      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
  50.357 +      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
  50.358 +
  50.359 +      //aligned load of src2
  50.360 +      const vuint8_t srcb = *(vuint8_t *)(src2);
  50.361 +
  50.362 +      //average and rounding
  50.363 +      const vuint8_t avgc = spu_avg(srca,srcb);
  50.364 +
  50.365 +      const vuint8_t dst1 = *(vuint8_t *)dst;
  50.366 +
  50.367 +      const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask); // pattern bytes 0x00-0x0F pick from dst1, 0x10-0x17 pick the first 8 bytes of avgc
  50.368 +
  50.369 +      *(vuint8_t *)dst=davgc;
  50.370 +
  50.371 +      src1 +=src_stride1;
  50.372 +      src2 +=16;
  50.373 +      dst  +=dst_stride;
  50.374 +  }
  50.375 +}
  50.376 +/* Like put_pixels8_l2_spu(), but the merged quadword is averaged with the existing dst quadword before the store, so the 8 target bytes become avg(dst, avg(src1, src2)) and the other 8 bytes keep their value. */
  50.377 +void avg_pixels8_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
  50.378 +				   int dst_stride, int src_stride1, int h)
  50.379 +{
  50.380 +  int i;
  50.381 +
  50.382 +  const int perm_src1 = (unsigned int) src1 & 15;
  50.383 +  const int shift_dst = (unsigned int) dst & 15;
  50.384 +
  50.385 +  // 8x dest luma blocks are aligned or misaligned by 8
  50.386 +  vuint8_t dstmask;
  50.387 +  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F}; // new pixels into bytes 0-7, bytes 8-15 kept from dst
  50.388 +  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17}; // bytes 0-7 kept from dst, new pixels into bytes 8-15
  50.389 +
  50.390 +  if(shift_dst==0){
  50.391 +    dstmask = dst8mask1;
  50.392 +  }
  50.393 +  else{
  50.394 +    dstmask = dst8mask2;
  50.395 +  }
  50.396 +
  50.397 +  for (i=0; i<h; i++){
  50.398 +      //unaligned load of src1
  50.399 +      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
  50.400 +      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
  50.401 +      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
  50.402 +
  50.403 +      //aligned load of src2
  50.404 +      const vuint8_t srcb = *(vuint8_t *)(src2);
  50.405 +
  50.406 +      //average and rounding
  50.407 +      const vuint8_t avgc = spu_avg(srca,srcb);
  50.408 +
  50.409 +      const vuint8_t dst1 = *(vuint8_t *)dst;
  50.410 +
  50.411 +      const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);
  50.412 +
  50.413 +      const vuint8_t davgc = spu_avg(dst1,davgc1); // bytes taken from dst1 average with themselves and so stay unchanged
  50.414 +
  50.415 +      *(vuint8_t *)dst=davgc;
  50.416 +
  50.417 +      src1 +=src_stride1;
  50.418 +      src2 +=16;
  50.419 +      dst  +=dst_stride;
  50.420 +  }
  50.421 +}
  50.422 +
  50.423 +// put_pixels8_spu: copy 8 bytes per row from src to dst, four rows per loop pass; assumes that ((line_size % 16) == 0)
  50.424 +void put_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
  50.425 +{
  50.426 +    register vector unsigned char pixelsv1A, pixelsv2A;
  50.427 +    register vector unsigned char pixelsv1B, pixelsv2B;
  50.428 +    register vector unsigned char pixelsv1C, pixelsv2C;
  50.429 +    register vector unsigned char pixelsv1D, pixelsv2D;
  50.430 +    // Byte offsets of src and dst inside a 16-byte quadword.
  50.431 +    const int perm = (unsigned int) src & 15;
  50.432 +    const int shift_dst = (unsigned int) dst & 15;
  50.433 +
  50.434 +    // 8-pixel-wide dest luma blocks are either aligned or misaligned by 8; any non-zero offset is handled as 8
  50.435 +    vuint8_t dstmask;
  50.436 +    const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.437 +    const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
  50.438 +    // dst8mask1 puts the new bytes in result bytes 0-7, dst8mask2 in bytes 8-15 (assuming spu_shuffle indices 0x10-0x1F select the second operand -- TODO confirm).
  50.439 +    if(shift_dst==0){
  50.440 +      dstmask = dst8mask1;
  50.441 +    }
  50.442 +    else{
  50.443 +      dstmask = dst8mask2;
  50.444 +    }
  50.445 +    // Precomputed 2x/3x/4x multiples of both strides for the 4-row unrolled loop.
  50.446 +    int i;
  50.447 +	register int line_size   = src_stride;
  50.448 +    register int line_size_2 = line_size << 1;
  50.449 +    register int line_size_3 = line_size + line_size_2;
  50.450 +    register int line_size_4 = line_size << 2;
  50.451 +
  50.452 +    register int dst_stride_2 = dst_stride << 1;
  50.453 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
  50.454 +    register int dst_stride_4 = dst_stride << 2;
  50.455 +    // Each pass handles rows A..D, so rows are always processed in groups of four.
  50.456 +    for(i=0; i<h; i+=4) {
  50.457 +      pixelsv1A = *(vuint8_t *)(src);
  50.458 +      pixelsv2A = *(vuint8_t *)(src+16);
  50.459 +      pixelsv1B = *(vuint8_t *)(src + line_size);
  50.460 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
  50.461 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
  50.462 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
  50.463 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
  50.464 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
  50.465 +      // Per row: splice the two source quadwords using perm, then merge 8 bytes of the result into the quadword read back from dst.
  50.466 +      const vuint8_t block1 = *(vuint8_t *)dst;
  50.467 +      const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
  50.468 +      const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
  50.469 +      const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
  50.470 +      const vuint8_t block3 = *(vuint8_t *)(dst+2*dst_stride);
  50.471 +      const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
  50.472 +      const vuint8_t block4 = *(vuint8_t *)(dst+3*dst_stride);
  50.473 +      const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
  50.474 +      // Write the four merged quadwords back to their rows.
  50.475 +      *(vuint8_t *) dst = put1;
  50.476 +      *(vuint8_t *)(dst + dst_stride) = put2;
  50.477 +      *(vuint8_t *)(dst + dst_stride_2) = put3;
  50.478 +      *(vuint8_t *)(dst + dst_stride_3) = put4;
  50.479 +
  50.480 +      src += line_size_4;
  50.481 +      dst += dst_stride_4;
  50.482 +    }
  50.483 +}
  50.484 +
  50.485 +// avg_pixels8_spu: average 8 bytes per row of src into dst, four rows per loop pass; assumes that ((line_size % 16) == 0)
  50.486 +void avg_pixels8_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
  50.487 +{
  50.488 +    register vector unsigned char pixelsv1A, pixelsv2A;
  50.489 +    register vector unsigned char pixelsv1B, pixelsv2B;
  50.490 +    register vector unsigned char pixelsv1C, pixelsv2C;
  50.491 +    register vector unsigned char pixelsv1D, pixelsv2D;
  50.492 +    // Byte offsets of src and dst inside a 16-byte quadword.
  50.493 +    const int perm = (unsigned int) src & 15;
  50.494 +    const int shift_dst = (unsigned int) dst & 15;
  50.495 +
  50.496 +    // 8-pixel-wide dest luma blocks are either aligned or misaligned by 8; any non-zero offset is handled as 8
  50.497 +    vuint8_t dstmask;
  50.498 +    const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.499 +    const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
  50.500 +    // dst8mask1 puts the new bytes in result bytes 0-7, dst8mask2 in bytes 8-15 (assuming spu_shuffle indices 0x10-0x1F select the second operand -- TODO confirm).
  50.501 +    if(shift_dst==0){
  50.502 +      dstmask = dst8mask1;
  50.503 +    }
  50.504 +    else{
  50.505 +      dstmask = dst8mask2;
  50.506 +    }
  50.507 +    // Precomputed 2x/3x/4x multiples of both strides for the 4-row unrolled loop.
  50.508 +    int i;
  50.509 +	register int line_size   = src_stride;
  50.510 +    register int line_size_2 = line_size << 1;
  50.511 +    register int line_size_3 = line_size + line_size_2;
  50.512 +    register int line_size_4 = line_size << 2;
  50.513 +
  50.514 +	register int dst_stride_2 = dst_stride << 1;
  50.515 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
  50.516 +    register int dst_stride_4 = dst_stride << 2;
  50.517 +    // Each pass handles rows A..D, so rows are always processed in groups of four.
  50.518 +    for(i=0; i<h; i+=4) {
  50.519 +      pixelsv1A = *(vuint8_t *)(src);
  50.520 +      pixelsv2A = *(vuint8_t *)(src+16);
  50.521 +      pixelsv1B = *(vuint8_t *)(src + line_size);
  50.522 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
  50.523 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
  50.524 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
  50.525 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
  50.526 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
  50.527 +      // Per row: merge the spliced source bytes into the dst quadword (putNa), then average that with the old dst quadword.
  50.528 +      const vuint8_t block1 = *(vuint8_t *) dst;
  50.529 +      const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
  50.530 +      const vuint8_t put1 = spu_avg(block1,put1a);
  50.531 +
  50.532 +      const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
  50.533 +      const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
  50.534 +      const vuint8_t put2 = spu_avg(block2,put2a);
  50.535 +
  50.536 +      const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
  50.537 +      const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
  50.538 +      const vuint8_t put3 = spu_avg(block3,put3a);
  50.539 +
  50.540 +      const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
  50.541 +      const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
  50.542 +      const vuint8_t put4 = spu_avg(block4,put4a);
  50.543 +      // Write the four averaged quadwords back to their rows.
  50.544 +      *(vuint8_t *) dst = put1;
  50.545 +      *(vuint8_t *)(dst + dst_stride) = put2;
  50.546 +      *(vuint8_t *)(dst + dst_stride_2) = put3;
  50.547 +      *(vuint8_t *)(dst + dst_stride_3) = put4;
  50.548 +
  50.549 +      src+= line_size_4;
  50.550 +      dst+= dst_stride_4;
  50.551 +    }
  50.552 +}
  50.553 +// put_pixels4_l2_spu: per row, average 4 bytes of src1 with 4 bytes of src2 and store the result in the 4 bytes at dst.
  50.554 +void put_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
  50.555 +				   int dst_stride, int src_stride1, int h)
  50.556 +{
  50.557 +  int i;
  50.558 +  // Byte offsets of src1 and dst inside a 16-byte quadword.
  50.559 +  const int perm_src1 = (unsigned int) src1 & 15;
  50.560 +  const int shift_dst = (unsigned int) dst & 15;
  50.561 +  // dstmaskN places 4 bytes of the second shuffle operand at result bytes N..N+3 and keeps the first operand elsewhere (assuming spu_shuffle indices 0x10-0x1F select the second operand -- TODO confirm).
  50.562 +  // 4-pixel-wide dest luma blocks are misaligned by 0, 4, 8, or 12
  50.563 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  50.564 +  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.565 +  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.566 +  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
  50.567 +  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
  50.568 +  // NOTE(review): there is no default case, so any other dst offset leaves dstmask all zeros.
  50.569 +  switch(shift_dst){
  50.570 +    case 0:  dstmask = dstmask0;
  50.571 +             break;
  50.572 +    case 4:  dstmask = dstmask4;
  50.573 +             break;
  50.574 +    case 8:  dstmask = dstmask8;
  50.575 +             break;
  50.576 +    case 12: dstmask = dstmask12;
  50.577 +             break;
  50.578 +  }
  50.579 +  // One output row per iteration.
  50.580 +  for (i=0; i<h; i++){
  50.581 +      //unaligned load of src1: fetch two consecutive quadwords and splice them using the perm_src1 byte offset
  50.582 +      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
  50.583 +      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
  50.584 +      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
  50.585 +
  50.586 +      //aligned load of src2 (one quadword per row; src2 advances by 16 bytes below)
  50.587 +      const vuint8_t srcb = *(vuint8_t *)(src2);
  50.588 +
  50.589 +      //average and rounding
  50.590 +      const vuint8_t avgc = spu_avg(srca,srcb);
  50.591 +      // Read back the destination quadword so the bytes outside the 4-pixel block can be written back.
  50.592 +      const vuint8_t dst1 = *(vuint8_t *)dst;
  50.593 +      // Take the 4 target bytes from avgc and every other byte from dst1.
  50.594 +      const vuint8_t davgc = spu_shuffle(dst1, avgc, dstmask);
  50.595 +
  50.596 +      *(vuint8_t *)dst=davgc;
  50.597 +
  50.598 +      src1 +=src_stride1;
  50.599 +      src2 +=16;
  50.600 +      dst  +=dst_stride;
  50.601 +  }
  50.602 +}
  50.603 +// avg_pixels4_l2_spu: per row, average 4 bytes of src1 with 4 bytes of src2, then average that result into the 4 bytes at dst.
  50.604 +void avg_pixels4_l2_spu(uint8_t * dst, const uint8_t * src1, const uint8_t * src2,
  50.605 +				   int dst_stride, int src_stride1, int h)
  50.606 +{
  50.607 +  int i;
  50.608 +  // Byte offsets of src1 and dst inside a 16-byte quadword.
  50.609 +  const int perm_src1 = (unsigned int) src1 & 15;
  50.610 +  const int shift_dst = (unsigned int) dst & 15;
  50.611 +  // dstmaskN places 4 bytes of the second shuffle operand at result bytes N..N+3 and keeps the first operand elsewhere (assuming spu_shuffle indices 0x10-0x1F select the second operand -- TODO confirm).
  50.612 +  // 4-pixel-wide dest luma blocks are misaligned by 0, 4, 8, or 12
  50.613 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  50.614 +  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.615 +  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.616 +  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
  50.617 +  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
  50.618 +  // NOTE(review): there is no default case, so any other dst offset leaves dstmask all zeros.
  50.619 +  switch(shift_dst){
  50.620 +    case 0:  dstmask = dstmask0;
  50.621 +             break;
  50.622 +    case 4:  dstmask = dstmask4;
  50.623 +             break;
  50.624 +    case 8:  dstmask = dstmask8;
  50.625 +             break;
  50.626 +    case 12: dstmask = dstmask12;
  50.627 +             break;
  50.628 +  }
  50.629 +  // One output row per iteration.
  50.630 +  for (i=0; i<h; i++){
  50.631 +      //unaligned load of src1: fetch two consecutive quadwords and splice them using the perm_src1 byte offset
  50.632 +      const vuint8_t srctmpa1 = *(vuint8_t *)(src1);
  50.633 +      const vuint8_t srctmpa2 = *(vuint8_t *)(src1+16);
  50.634 +      const vuint8_t srca= spu_or(spu_slqwbyte(srctmpa1, perm_src1), spu_rlmaskqwbyte(srctmpa2, perm_src1-16));
  50.635 +
  50.636 +      //aligned load of src2 (one quadword per row; src2 advances by 16 bytes below)
  50.637 +      const vuint8_t srcb = *(vuint8_t *)(src2);
  50.638 +
  50.639 +      //average and rounding
  50.640 +      const vuint8_t avgc = spu_avg(srca,srcb);
  50.641 +      // Read back the destination quadword so the bytes outside the 4-pixel block can be written back.
  50.642 +      const vuint8_t dst1 = *(vuint8_t *)dst;
  50.643 +      // Take the 4 target bytes from avgc and every other byte from dst1.
  50.644 +      const vuint8_t davgc1 = spu_shuffle(dst1, avgc, dstmask);
  50.645 +      // Average with the old destination; bytes that still equal dst1 are presumably left unchanged by spu_avg.
  50.646 +      const vuint8_t davgc = spu_avg(dst1,davgc1);
  50.647 +
  50.648 +      *(vuint8_t *)dst=davgc;
  50.649 +
  50.650 +      src1 +=src_stride1;
  50.651 +      src2 +=16;
  50.652 +      dst  +=dst_stride;
  50.653 +  }
  50.654 +}
  50.655 +
  50.656 +// put_pixels4_spu: copy 4 bytes per row from src to dst, four rows per loop pass; assumes that ((line_size % 16) == 0)
  50.657 +void put_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
  50.658 +{
  50.659 +    register vector unsigned char pixelsv1A, pixelsv2A;
  50.660 +    register vector unsigned char pixelsv1B, pixelsv2B;
  50.661 +    register vector unsigned char pixelsv1C, pixelsv2C;
  50.662 +    register vector unsigned char pixelsv1D, pixelsv2D;
  50.663 +    // Byte offsets of src and dst inside a 16-byte quadword.
  50.664 +    const int perm = (unsigned int) src & 15;
  50.665 +    const int shift_dst = (unsigned int) dst & 15;
  50.666 +    // dstmaskN places 4 bytes of the second shuffle operand at result bytes N..N+3 and keeps the first operand elsewhere (assuming spu_shuffle indices 0x10-0x1F select the second operand -- TODO confirm).
  50.667 +    // 4-pixel-wide dest luma blocks are misaligned by 0, 4, 8, or 12
  50.668 +    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  50.669 +    const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.670 +    const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.671 +    const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
  50.672 +    const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
  50.673 +    // NOTE(review): there is no default case, so any other dst offset leaves dstmask all zeros.
  50.674 +    switch(shift_dst){
  50.675 +      case 0:  dstmask = dstmask0;
  50.676 +               break;
  50.677 +      case 4:  dstmask = dstmask4;
  50.678 +               break;
  50.679 +      case 8:  dstmask = dstmask8;
  50.680 +               break;
  50.681 +      case 12: dstmask = dstmask12;
  50.682 +               break;
  50.683 +    }
  50.684 +    // Precomputed 2x/3x/4x multiples of both strides for the 4-row unrolled loop.
  50.685 +    int i;
  50.686 +	register int line_size   = src_stride;
  50.687 +    register int line_size_2 = line_size << 1;
  50.688 +    register int line_size_3 = line_size + line_size_2;
  50.689 +    register int line_size_4 = line_size << 2;
  50.690 +
  50.691 +	register int dst_stride_2 = dst_stride << 1;
  50.692 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
  50.693 +    register int dst_stride_4 = dst_stride << 2;
  50.694 +    // Each pass handles rows A..D, so rows are always processed in groups of four.
  50.695 +    for(i=0; i<h; i+=4) {
  50.696 +	  pixelsv1A = *(vuint8_t *)(src);
  50.697 +      pixelsv2A = *(vuint8_t *)(src+16);
  50.698 +      pixelsv1B = *(vuint8_t *)(src + line_size);
  50.699 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
  50.700 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
  50.701 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
  50.702 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
  50.703 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
  50.704 +      // Per row: splice the two source quadwords using perm, then merge 4 bytes of the result into the quadword read back from dst.
  50.705 +      const vuint8_t block1 = *(vuint8_t *)dst;
  50.706 +      const vuint8_t put1 = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
  50.707 +      const vuint8_t block2 = *(vuint8_t *)(dst+dst_stride);
  50.708 +      const vuint8_t put2 = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
  50.709 +      const vuint8_t block3 = *(vuint8_t *)(dst+dst_stride_2);
  50.710 +      const vuint8_t put3 = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
  50.711 +      const vuint8_t block4 = *(vuint8_t *)(dst+dst_stride_3);
  50.712 +      const vuint8_t put4 = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
  50.713 +      // Write the four merged quadwords back to their rows.
  50.714 +      *(vuint8_t *) dst = put1;
  50.715 +      *(vuint8_t *)(dst + dst_stride) = put2;
  50.716 +      *(vuint8_t *)(dst + dst_stride_2) = put3;
  50.717 +      *(vuint8_t *)(dst + dst_stride_3) = put4;
  50.718 +
  50.719 +      src += line_size_4;
  50.720 +      dst += dst_stride_4;
  50.721 +    }
  50.722 +}
  50.723 +
  50.724 +// avg_pixels4_spu: average 4 bytes per row of src into dst, four rows per loop pass; assumes that ((line_size % 16) == 0)
  50.725 +void avg_pixels4_spu(uint8_t *dst, const uint8_t *src, int dst_stride, int src_stride, int h)
  50.726 +{
  50.727 +    register vector unsigned char pixelsv1A, pixelsv2A;
  50.728 +    register vector unsigned char pixelsv1B, pixelsv2B;
  50.729 +    register vector unsigned char pixelsv1C, pixelsv2C;
  50.730 +    register vector unsigned char pixelsv1D, pixelsv2D;
  50.731 +    // Byte offsets of src and dst inside a 16-byte quadword.
  50.732 +    const int perm = (unsigned int) src & 15;
  50.733 +    const int shift_dst = (unsigned int) dst & 15;
  50.734 +    // dstmaskN places 4 bytes of the second shuffle operand at result bytes N..N+3 and keeps the first operand elsewhere (assuming spu_shuffle indices 0x10-0x1F select the second operand -- TODO confirm).
  50.735 +    // 4-pixel-wide dest luma blocks are misaligned by 0, 4, 8, or 12
  50.736 +    vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  50.737 +    const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.738 +    const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  50.739 +    const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
  50.740 +    const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
  50.741 +    // NOTE(review): there is no default case, so any other dst offset leaves dstmask all zeros.
  50.742 +    switch(shift_dst){
  50.743 +      case 0:  dstmask = dstmask0;
  50.744 +               break;
  50.745 +      case 4:  dstmask = dstmask4;
  50.746 +               break;
  50.747 +      case 8:  dstmask = dstmask8;
  50.748 +               break;
  50.749 +      case 12: dstmask = dstmask12;
  50.750 +               break;
  50.751 +    }
  50.752 +    // Precomputed 2x/3x/4x multiples of both strides for the 4-row unrolled loop.
  50.753 +    int i;
  50.754 +	register int line_size   = src_stride;
  50.755 +    register int line_size_2 = line_size << 1;
  50.756 +    register int line_size_3 = line_size + line_size_2;
  50.757 +    register int line_size_4 = line_size << 2;
  50.758 +
  50.759 +	register int dst_stride_2 = dst_stride << 1;
  50.760 +    register int dst_stride_3 = dst_stride_2 + dst_stride;
  50.761 +    register int dst_stride_4 = dst_stride << 2;
  50.762 +    // Each pass handles rows A..D, so rows are always processed in groups of four.
  50.763 +    for(i=0; i<h; i+=4) {
  50.764 +	  pixelsv1A = *(vuint8_t *)(src);
  50.765 +      pixelsv2A = *(vuint8_t *)(src+16);
  50.766 +      pixelsv1B = *(vuint8_t *)(src + line_size);
  50.767 +      pixelsv2B = *(vuint8_t *)(src+16 + line_size);
  50.768 +      pixelsv1C = *(vuint8_t *)(src + line_size_2);
  50.769 +      pixelsv2C = *(vuint8_t *)(src+16 + line_size_2);
  50.770 +      pixelsv1D = *(vuint8_t *)(src + line_size_3);
  50.771 +      pixelsv2D = *(vuint8_t *)(src+16 + line_size_3);
  50.772 +      // Per row: merge the spliced source bytes into the dst quadword (putNa), then average that with the old dst quadword.
  50.773 +      const vuint8_t block1 = *(vuint8_t *) dst;
  50.774 +      const vuint8_t put1a = spu_shuffle(block1, spu_or(spu_slqwbyte(pixelsv1A, perm), spu_rlmaskqwbyte(pixelsv2A, perm-16)), dstmask);
  50.775 +      const vuint8_t put1 = spu_avg(block1,put1a);
  50.776 +
  50.777 +      const vuint8_t block2 = *(vuint8_t *)(dst + dst_stride);
  50.778 +      const vuint8_t put2a = spu_shuffle(block2, spu_or(spu_slqwbyte(pixelsv1B, perm), spu_rlmaskqwbyte(pixelsv2B, perm-16)), dstmask);
  50.779 +      const vuint8_t put2 = spu_avg(block2,put2a);
  50.780 +
  50.781 +      const vuint8_t block3 = *(vuint8_t *)(dst + dst_stride_2);
  50.782 +      const vuint8_t put3a = spu_shuffle(block3, spu_or(spu_slqwbyte(pixelsv1C, perm), spu_rlmaskqwbyte(pixelsv2C, perm-16)), dstmask);
  50.783 +      const vuint8_t put3 = spu_avg(block3,put3a);
  50.784 +
  50.785 +      const vuint8_t block4 = *(vuint8_t *)(dst + dst_stride_3);
  50.786 +      const vuint8_t put4a = spu_shuffle(block4, spu_or(spu_slqwbyte(pixelsv1D, perm), spu_rlmaskqwbyte(pixelsv2D, perm-16)), dstmask);
  50.787 +      const vuint8_t put4 = spu_avg(block4,put4a);
  50.788 +      // Write the four averaged quadwords back to their rows.
  50.789 +      *(vuint8_t *) dst = put1;
  50.790 +      *(vuint8_t *)(dst + dst_stride) = put2;
  50.791 +      *(vuint8_t *)(dst + dst_stride_2) = put3;
  50.792 +      *(vuint8_t *)(dst + dst_stride_3) = put4;
  50.793 +
  50.794 +      src+= line_size_4;
  50.795 +      dst+= dst_stride_4;
  50.796 +    }
  50.797 +}
  50.798 +
  50.799 +/* Instantiate all the luma interpolation modes of the H.264 motion compensation stage (H264_MC is defined outside this section) */
  50.800 +  H264_MC(put_, 16, spu)
  50.801 +  H264_MC(put_, 8, spu)
  50.802 +  H264_MC(put_, 4, spu)
  50.803 +  /* Same three block sizes for the avg_ variants. */
  50.804 +  H264_MC(avg_, 16, spu)
  50.805 +  H264_MC(avg_, 8, spu)
  50.806 +  H264_MC(avg_, 4, spu)
  50.807 +
  50.808 +
  50.809 +//Chroma interpolation: the template file is included twice, once per store operation.
  50.810 +// First pass: OP_U8_SPU is the put operation and the PREFIX_* macros give the functions their put_ names.
  50.811 +#define OP_U8_SPU                          PUT_OP_U8_SPU
  50.812 +#define PREFIX_h264_chroma_mc8_spu         put_h264_chroma_mc8_spu
  50.813 +#define PREFIX_h264_chroma_mc4_spu         put_h264_chroma_mc4_spu
  50.814 +#define PREFIX_h264_chroma_mc2_spu         put_h264_chroma_mc2_spu
  50.815 +#include "h264_chroma_template_spu.c"
  50.816 +#undef OP_U8_SPU
  50.817 +#undef PREFIX_h264_chroma_mc8_spu
  50.818 +#undef PREFIX_h264_chroma_mc4_spu
  50.819 +#undef PREFIX_h264_chroma_mc2_spu
  50.820 +// Second pass: same template with the avg operation and avg_ names.
  50.821 +#define OP_U8_SPU                          AVG_OP_U8_SPU
  50.822 +#define PREFIX_h264_chroma_mc8_spu         avg_h264_chroma_mc8_spu
  50.823 +#define PREFIX_h264_chroma_mc4_spu         avg_h264_chroma_mc4_spu
  50.824 +#define PREFIX_h264_chroma_mc2_spu         avg_h264_chroma_mc2_spu
  50.825 +#include "h264_chroma_template_spu.c"
  50.826 +#undef OP_U8_SPU
  50.827 +#undef PREFIX_h264_chroma_mc8_spu
  50.828 +#undef PREFIX_h264_chroma_mc4_spu
  50.829 +#undef PREFIX_h264_chroma_mc2_spu
  50.830 +
  50.831 +// Weight and Biweight functions
  50.832 +// op_scale1 rescales dst[x] in place; op_scale2 blends src[x] and dst[x] into dst[x]. Both add the pre-scaled offset, shift right and clip to 8 bits. H264_WEIGHT(W,H) emits one weight and one biweight function for a WxH block; the `if(W==n) continue` checks end each row after W columns.
  50.833 +#define op_scale1(x)  dst[x] = av_clip_uint8( (dst[x]*weight + offset) >> log2_denom )
  50.834 +#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
  50.835 +#define H264_WEIGHT(W,H) \
  50.836 +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
  50.837 +    int y; \
  50.838 +    offset <<= log2_denom; \
  50.839 +    if(log2_denom) offset += 1<<(log2_denom-1); \
  50.840 +    for(y=0; y<H; y++, dst += stride){ \
  50.841 +        op_scale1(0); \
  50.842 +        op_scale1(1); \
  50.843 +        if(W==2) continue; \
  50.844 +        op_scale1(2); \
  50.845 +        op_scale1(3); \
  50.846 +        if(W==4) continue; \
  50.847 +        op_scale1(4); \
  50.848 +        op_scale1(5); \
  50.849 +        op_scale1(6); \
  50.850 +        op_scale1(7); \
  50.851 +        if(W==8) continue; \
  50.852 +        op_scale1(8); \
  50.853 +        op_scale1(9); \
  50.854 +        op_scale1(10); \
  50.855 +        op_scale1(11); \
  50.856 +        op_scale1(12); \
  50.857 +        op_scale1(13); \
  50.858 +        op_scale1(14); \
  50.859 +        op_scale1(15); \
  50.860 +    } \
  50.861 +} \
  50.862 +static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd, int weights, int offset){ \
  50.863 +    int y; \
  50.864 +    offset = ((offset + 1) | 1) << log2_denom; \
  50.865 +    for(y=0; y<H; y++, dst += dst_stride, src += src_stride){ \
  50.866 +        op_scale2(0); \
  50.867 +        op_scale2(1); \
  50.868 +        if(W==2) continue; \
  50.869 +        op_scale2(2); \
  50.870 +        op_scale2(3); \
  50.871 +        if(W==4) continue; \
  50.872 +        op_scale2(4); \
  50.873 +        op_scale2(5); \
  50.874 +        op_scale2(6); \
  50.875 +        op_scale2(7); \
  50.876 +        if(W==8) continue; \
  50.877 +        op_scale2(8); \
  50.878 +        op_scale2(9); \
  50.879 +        op_scale2(10); \
  50.880 +        op_scale2(11); \
  50.881 +        op_scale2(12); \
  50.882 +        op_scale2(13); \
  50.883 +        op_scale2(14); \
  50.884 +        op_scale2(15); \
  50.885 +    } \
  50.886 +}
  50.887 +// Instantiate the weight/biweight pair for every WxH block size used below.
  50.888 +H264_WEIGHT(16,16)
  50.889 +H264_WEIGHT(16,8)
  50.890 +H264_WEIGHT(8,16)
  50.891 +H264_WEIGHT(8,8)
  50.892 +H264_WEIGHT(8,4)
  50.893 +H264_WEIGHT(4,8)
  50.894 +H264_WEIGHT(4,4)
  50.895 +H264_WEIGHT(4,2)
  50.896 +H264_WEIGHT(2,4)
  50.897 +H264_WEIGHT(2,2)
  50.898 +// The helper macros are only needed for the instantiations above.
  50.899 +#undef op_scale1
  50.900 +#undef op_scale2
  50.901 +#undef H264_WEIGHT
  50.902 +
  50.903 +/////////////////////////////////////////////////////////////////////////////////////////
  50.904 +// Luma edge filter: walks 4 groups of 4 positions along the edge (step ystride); samples across the edge are xstride apart. tc0[i] is the clipping strength of group i; a negative value skips the group.
  50.905 +static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
  50.906 +{
  50.907 +    int i, d;
  50.908 +    for( i = 0; i < 4; i++ ) {
  50.909 +        if( tc0[i] < 0 ) {
  50.910 +            pix += 4*ystride;
  50.911 +            continue;
  50.912 +        }
  50.913 +        for( d = 0; d < 4; d++ ) {
  50.914 +            const int p0 = pix[-1*xstride];
  50.915 +            const int p1 = pix[-2*xstride];
  50.916 +            const int p2 = pix[-3*xstride];
  50.917 +            const int q0 = pix[0];
  50.918 +            const int q1 = pix[1*xstride];
  50.919 +            const int q2 = pix[2*xstride];
  50.920 +            // Filter only where the step across the edge and the gradients on both sides are below the thresholds.
  50.921 +            if( FFABS( p0 - q0 ) < alpha &&
  50.922 +                FFABS( p1 - p0 ) < beta &&
  50.923 +                FFABS( q1 - q0 ) < beta ) {
  50.924 +
  50.925 +                int tc = tc0[i];
  50.926 +                int i_delta;
  50.927 +                // Each side that passes its p2/q2 test gets its second sample adjusted (clipped to +-tc0[i]) and raises tc by one.
  50.928 +                if( FFABS( p2 - p0 ) < beta ) {
  50.929 +                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
  50.930 +                    tc++;
  50.931 +                }
  50.932 +                if( FFABS( q2 - q0 ) < beta ) {
  50.933 +                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
  50.934 +                    tc++;
  50.935 +                }
  50.936 +                // Correction for the two samples next to the edge, clipped to +-tc and applied with opposite signs.
  50.937 +                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  50.938 +                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
  50.939 +                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
  50.940 +            }
  50.941 +            pix += ystride;
  50.942 +        }
  50.943 +    }
  50.944 +}
  50.945 +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  50.946 +{ // v variant: samples across the edge are `stride` bytes apart, the walk along the edge steps 1 byte
  50.947 +    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
  50.948 +}
  50.949 +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  50.950 +{ // h variant: samples across the edge are 1 byte apart, the walk along the edge steps `stride` bytes
  50.951 +    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
  50.952 +}
  50.953 +// Luma intra edge filter: visits 16 positions along the edge (step ystride); samples across the edge are xstride apart. No tc0 table: filtered values are written without clipping.
  50.954 +static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
  50.955 +{
  50.956 +    int d;
  50.957 +    for( d = 0; d < 16; d++ ) {
  50.958 +        const int p2 = pix[-3*xstride];
  50.959 +        const int p1 = pix[-2*xstride];
  50.960 +        const int p0 = pix[-1*xstride];
  50.961 +
  50.962 +        const int q0 = pix[ 0*xstride];
  50.963 +        const int q1 = pix[ 1*xstride];
  50.964 +        const int q2 = pix[ 2*xstride];
  50.965 +        // Filter only where the step across the edge and the gradients on both sides are below the thresholds.
  50.966 +        if( FFABS( p0 - q0 ) < alpha &&
  50.967 +            FFABS( p1 - p0 ) < beta &&
  50.968 +            FFABS( q1 - q0 ) < beta ) {
  50.969 +            // A small step across the edge allows the longer per-side filters; otherwise only p0 and q0 change.
  50.970 +            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
  50.971 +                if( FFABS( p2 - p0 ) < beta)
  50.972 +                {
  50.973 +                    const int p3 = pix[-4*xstride];
  50.974 +                    /* p0', p1', p2' */
  50.975 +                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
  50.976 +                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
  50.977 +                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
  50.978 +                } else {
  50.979 +                    /* p0' */
  50.980 +                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
  50.981 +                }
  50.982 +                if( FFABS( q2 - q0 ) < beta)
  50.983 +                {
  50.984 +                    const int q3 = pix[3*xstride];
  50.985 +                    /* q0', q1', q2' */
  50.986 +                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
  50.987 +                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
  50.988 +                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
  50.989 +                } else {
  50.990 +                    /* q0' */
  50.991 +                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
  50.992 +                }
  50.993 +            }else{
  50.994 +                /* p0', q0' */
  50.995 +                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
  50.996 +                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
  50.997 +            }
  50.998 +        }
  50.999 +        pix += ystride;
 50.1000 +    }
 50.1001 +}
 50.1002 +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
 50.1003 +{ // v variant: samples across the edge are `stride` bytes apart, the walk along the edge steps 1 byte
 50.1004 +    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
 50.1005 +}
 50.1006 +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
 50.1007 +{ // h variant: samples across the edge are 1 byte apart, the walk along the edge steps `stride` bytes
 50.1008 +    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
 50.1009 +}
 50.1010 +// Chroma edge filter: walks 4 groups of 2 positions along the edge (step ystride); samples across the edge are xstride apart. tc0[i] is the clipping strength of group i; a value <= 0 skips the group.
 50.1011 +static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
 50.1012 +{
 50.1013 +    int i, d;
 50.1014 +    for( i = 0; i < 4; i++ ) {
 50.1015 +        const int tc = tc0[i];
 50.1016 +        if( tc <= 0 ) {
 50.1017 +            pix += 2*ystride;
 50.1018 +            continue;
 50.1019 +        }
 50.1020 +        for( d = 0; d < 2; d++ ) {
 50.1021 +            const int p0 = pix[-1*xstride];
 50.1022 +            const int p1 = pix[-2*xstride];
 50.1023 +            const int q0 = pix[0];
 50.1024 +            const int q1 = pix[1*xstride];
 50.1025 +            // Filter only where the step across the edge and the gradients on both sides are below the thresholds.
 50.1026 +            if( FFABS( p0 - q0 ) < alpha &&
 50.1027 +                FFABS( p1 - p0 ) < beta &&
 50.1028 +                FFABS( q1 - q0 ) < beta ) {
 50.1029 +                // Correction clipped to +-tc; only the two samples next to the edge are modified.
 50.1030 +                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
 50.1031 +
 50.1032 +                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
 50.1033 +                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
 50.1034 +            }
 50.1035 +            pix += ystride;
 50.1036 +        }
 50.1037 +    }
 50.1038 +}
 50.1039 +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 50.1040 +{ // v variant: samples across the edge are `stride` bytes apart, the walk along the edge steps 1 byte
 50.1041 +    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
 50.1042 +}
 50.1043 +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 50.1044 +{ // h variant: samples across the edge are 1 byte apart, the walk along the edge steps `stride` bytes
 50.1045 +    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
 50.1046 +}
 50.1047 +// Chroma intra edge filter: visits 8 positions along the edge (step ystride); samples across the edge are xstride apart. Only p0 and q0 are rewritten, without clipping.
 50.1048 +static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
 50.1049 +{
 50.1050 +    int d;
 50.1051 +    for( d = 0; d < 8; d++ ) {
 50.1052 +        const int p0 = pix[-1*xstride];
 50.1053 +        const int p1 = pix[-2*xstride];
 50.1054 +        const int q0 = pix[0];
 50.1055 +        const int q1 = pix[1*xstride];
 50.1056 +        // Filter only where the step across the edge and the gradients on both sides are below the thresholds.
 50.1057 +        if( FFABS( p0 - q0 ) < alpha &&
 50.1058 +            FFABS( p1 - p0 ) < beta &&
 50.1059 +            FFABS( q1 - q0 ) < beta ) {
 50.1060 +            // Both outputs are computed from the values read above, before either store.
 50.1061 +            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
 50.1062 +            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
 50.1063 +        }
 50.1064 +        pix += ystride;
 50.1065 +    }
 50.1066 +}
 50.1067 +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) /* "v" intra chroma filter: wrapper around h264_loop_filter_chroma_intra_c. */
 50.1068 +{
 50.1069 +    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta); /* xstride = stride, ystride = 1: samples are read stride apart, positions advance by 1 */
 50.1070 +}
 50.1071 +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta) /* "h" intra chroma filter: wrapper around h264_loop_filter_chroma_intra_c. */
 50.1072 +{
 50.1073 +    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta); /* xstride = 1, ystride = stride: samples are read 1 apart, positions advance by stride */
 50.1074 +}
 50.1075 +
 50.1076 +
 50.1077 +void dsputil_h264_init_cell(DSPContext_spu* c) { /* Populates the function-pointer slots of *c with the H.264 routines listed below; c must point to writable storage. */
 50.1078 +
 50.1079 +	c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c; /* loop (deblocking) filters: the plain C (_c) implementations */
 50.1080 +    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
 50.1081 +    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
 50.1082 +    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
 50.1083 +    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
 50.1084 +    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
 50.1085 +    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
 50.1086 +    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
 50.1087 +
 50.1088 +    c->h264_idct_add[0] = h264_idct8_add_spu; /* note the order: slot 0 is the idct8 routine, slot 1 the idct4 routine */
 50.1089 +    c->h264_idct_add[1] = h264_idct4_add_spu;
 50.1090 +
 50.1091 +
 50.1092 +    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_spu; /* chroma MC: slots 0/1/2 hold the mc8/mc4/mc2 routines, for put and avg */
 50.1093 +    c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_spu;
 50.1094 +    c->put_h264_chroma_pixels_tab[2] = put_h264_chroma_mc2_spu;
 50.1095 +    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_spu;
 50.1096 +    c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_spu;
 50.1097 +    c->avg_h264_chroma_pixels_tab[2] = avg_h264_chroma_mc2_spu;
 50.1098 +
 50.1099 +    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; /* weighted prediction: slots 0..9 ordered 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, 4x2, 2x4, 2x2 */
 50.1100 +    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
 50.1101 +    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
 50.1102 +    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
 50.1103 +    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
 50.1104 +    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
 50.1105 +    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
 50.1106 +    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
 50.1107 +    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
 50.1108 +    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
 50.1109 +    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c; /* bi-weighted variants, same slot order as weight_h264_pixels_tab */
 50.1110 +    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
 50.1111 +    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
 50.1112 +    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
 50.1113 +    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
 50.1114 +    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
 50.1115 +    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
 50.1116 +    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
 50.1117 +    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
 50.1118 +    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
 50.1119 +
 50.1120 +
 50.1121 +#define dspfunc(PFX, IDX, NUM) /* fills c->PFX_pixels_tab[IDX][0..15] with PFX<NUM>_mcXY_spu; slot index is X + 4*Y */ \
 50.1122 +    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_spu; \
 50.1123 +    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_spu; \
 50.1124 +    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_spu; \
 50.1125 +    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_spu; \
 50.1126 +    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_spu; \
 50.1127 +    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_spu; \
 50.1128 +    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_spu; \
 50.1129 +    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_spu; \
 50.1130 +    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_spu; \
 50.1131 +    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_spu; \
 50.1132 +    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_spu; \
 50.1133 +    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_spu; \
 50.1134 +    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_spu; \
 50.1135 +    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_spu; \
 50.1136 +    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_spu; \
 50.1137 +    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_spu
 50.1138 +
 50.1139 +    dspfunc(put_h264_qpel, 0, 16); /* qpel tables: row 0 = the 16 routines, row 1 = 8, row 2 = 4 */
 50.1140 +    dspfunc(put_h264_qpel, 1, 8);
 50.1141 +    dspfunc(put_h264_qpel, 2, 4);
 50.1142 +
 50.1143 +    dspfunc(avg_h264_qpel, 0, 16);
 50.1144 +    dspfunc(avg_h264_qpel, 1, 8);
 50.1145 +    dspfunc(avg_h264_qpel, 2, 4);
 50.1146 +
 50.1147 +#undef dspfunc
 50.1148 +
 50.1149 +
 50.1150 +}

    51.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    51.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/dsputil_spu.h	Mon Aug 27 12:09:56 2012 +0200
    51.3 @@ -0,0 +1,34 @@
    51.4 +#ifndef DSPUTIL_CELL_H
    51.5 +#define DSPUTIL_CELL_H
    51.6 +
    51.7 +#include "types_spu.h"
    51.8 +
    51.9 +typedef struct DSPContext_spu {
   51.10 +	/* Table of function pointers for the H.264 DSP routines; filled in by dsputil_h264_init_cell(). */
   51.11 +	void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
   51.12 +    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
   51.13 +    /* v/h_loop_filter_luma_intra: align 16 */
   51.14 +    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta); /* the intra variants take no tc0 argument */
   51.15 +    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
   51.16 +    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
   51.17 +    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
   51.18 +    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
   51.19 +    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
   51.20 +	
   51.21 +	qpel_mc_func put_h264_qpel_pixels_tab[3][16]; /* quarter-pel MC, "put" flavour: 3 rows of 16 routines each */
   51.22 +	qpel_mc_func avg_h264_qpel_pixels_tab[3][16]; /* quarter-pel MC, "avg" flavour, same layout */
   51.23 +
   51.24 +	h264_chroma_mc_func put_h264_chroma_pixels_tab[3]; /* chroma MC, "put" flavour: 3 routines */
   51.25 +	h264_chroma_mc_func avg_h264_chroma_pixels_tab[3]; /* chroma MC, "avg" flavour: 3 routines */
   51.26 +
   51.27 +	h264_idct_func h264_idct_add[2]; /* two IDCT-and-add routines */
   51.28 +
   51.29 +	h264_weight_func weight_h264_pixels_tab[10]; /* weighted prediction: 10 routines */
   51.30 +	h264_biweight_func biweight_h264_pixels_tab[10]; /* bi-weighted prediction: 10 routines */
   51.31 +
   51.32 +} DSPContext_spu;
   51.33 +
   51.34 +
   51.35 +void dsputil_h264_init_cell(DSPContext_spu* c); /* Fills the function-pointer slots of *c. */
   51.36 + 
   51.37 +#endif

    52.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    52.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.c	Mon Aug 27 12:09:56 2012 +0200
    52.3 @@ -0,0 +1,2633 @@
    52.4 +/*
    52.5 + * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
    52.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    52.7 + *
    52.8 + * This file is part of FFmpeg.
    52.9 + *
   52.10 + * FFmpeg is free software; you can redistribute it and/or
   52.11 + * modify it under the terms of the GNU Lesser General Public
   52.12 + * License as published by the Free Software Foundation; either
   52.13 + * version 2.1 of the License, or (at your option) any later version.
   52.14 + *
   52.15 + * FFmpeg is distributed in the hope that it will be useful,
   52.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   52.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   52.18 + * Lesser General Public License for more details.
   52.19 + *
   52.20 + * You should have received a copy of the GNU Lesser General Public
   52.21 + * License along with FFmpeg; if not, write to the Free Software
   52.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   52.23 + */
   52.24 +
   52.25 +/**
   52.26 + * @file
   52.27 + * H.264 / AVC / MPEG4 part10 cabac decoding.
   52.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   52.29 + */
   52.30 +#define CELL_SPE
   52.31 +#include <limits.h>
   52.32 +#include <stdlib.h>
   52.33 +#include "libavutil/intreadwrite.h"
   52.34 +#include "libavutil/mem.h"
   52.35 +#include "libavcodec/avcodec.h"
   52.36 +#include "h264_deblock_spu.h"
   52.37 +#include "h264_pred_spu.h"
   52.38 +#include "h264_direct_spu.h"
   52.39 +#include "h264_tables.h"
   52.40 +#include "mathops_spu.h"
   52.41 +//#include "libavcodec/h264_data.h"
   52.42 +#include "cabac_spu.h"
   52.43 +#include "rectangle_spu.h"
   52.44 +#include "libavutil/log.h"
   52.45 +
   52.46 +//#undef NDEBUG
   52.47 +#include <assert.h>
   52.48 +#define INT_BIT (sizeof(int) * 8)
   52.49 +/* Cabac pre state table */
   52.50 +typedef struct IMbInfo{ /* One row of the intra macroblock-type table i_mb_type_info. */
   52.51 +    uint16_t type; /* MB_TYPE_* value */
   52.52 +    uint8_t pred_mode; /* NOTE(review): i_mb_type_info stores -1 here for two rows; in a uint8_t that reads back as 255 */
   52.53 +    uint8_t cbp; /* same -1 -> 255 caveat as pred_mode */
   52.54 +} IMbInfo;
   52.55 +
   52.56 +extern int bytecount;
   52.57 +
   52.58 +static const IMbInfo i_mb_type_info[26]={ /* rows are {type, pred_mode, cbp}: 1 INTRA4x4 row, 24 INTRA16x16 rows, 1 INTRA_PCM row */
   52.59 +{MB_TYPE_INTRA4x4  , -1, -1}, /* -1 is converted to 255 by the uint8_t fields */
   52.60 +{MB_TYPE_INTRA16x16,  2,  0}, /* INTRA16x16 rows: pred_mode repeats 2,1,0,3 while cbp steps through 0, 16, 32, 15, 31, 47 */
   52.61 +{MB_TYPE_INTRA16x16,  1,  0},
   52.62 +{MB_TYPE_INTRA16x16,  0,  0},
   52.63 +{MB_TYPE_INTRA16x16,  3,  0},
   52.64 +{MB_TYPE_INTRA16x16,  2,  16},
   52.65 +{MB_TYPE_INTRA16x16,  1,  16},
   52.66 +{MB_TYPE_INTRA16x16,  0,  16},
   52.67 +{MB_TYPE_INTRA16x16,  3,  16},
   52.68 +{MB_TYPE_INTRA16x16,  2,  32},
   52.69 +{MB_TYPE_INTRA16x16,  1,  32},
   52.70 +{MB_TYPE_INTRA16x16,  0,  32},
   52.71 +{MB_TYPE_INTRA16x16,  3,  32},
   52.72 +{MB_TYPE_INTRA16x16,  2,  15+0},
   52.73 +{MB_TYPE_INTRA16x16,  1,  15+0},
   52.74 +{MB_TYPE_INTRA16x16,  0,  15+0},
   52.75 +{MB_TYPE_INTRA16x16,  3,  15+0},
   52.76 +{MB_TYPE_INTRA16x16,  2,  15+16},
   52.77 +{MB_TYPE_INTRA16x16,  1,  15+16},
   52.78 +{MB_TYPE_INTRA16x16,  0,  15+16},
   52.79 +{MB_TYPE_INTRA16x16,  3,  15+16},
   52.80 +{MB_TYPE_INTRA16x16,  2,  15+32},
   52.81 +{MB_TYPE_INTRA16x16,  1,  15+32},
   52.82 +{MB_TYPE_INTRA16x16,  0,  15+32},
   52.83 +{MB_TYPE_INTRA16x16,  3,  15+32},
   52.84 +{MB_TYPE_INTRA_PCM , -1, -1}, /* -1 is converted to 255 by the uint8_t fields */
   52.85 +};
   52.86 +
   52.87 +typedef struct PMbInfo{ /* Row type shared by the P/B macroblock and sub-macroblock type tables below. */
   52.88 +    uint16_t type; /* OR of MB_TYPE_* flags */
   52.89 +    uint8_t partition_count; /* 1, 2 or 4 in the tables that use this struct */
   52.90 +} PMbInfo;
   52.91 +
   52.92 +static const PMbInfo p_mb_type_info[5]={ /* rows are {type flags, partition_count}; presumably indexed by the decoded P mb_type - TODO confirm against the users of this table */
   52.93 +{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
   52.94 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
   52.95 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
   52.96 +{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
   52.97 +{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4}, /* same as the previous row plus MB_TYPE_REF0 */
   52.98 +};
   52.99 +
  52.100 +static const PMbInfo p_sub_mb_type_info[4]={ /* rows are {type flags, partition_count}; every row uses only the P0L0 flag */
  52.101 +{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
  52.102 +{MB_TYPE_16x8 |MB_TYPE_P0L0             , 2},
  52.103 +{MB_TYPE_8x16 |MB_TYPE_P0L0             , 2},
  52.104 +{MB_TYPE_8x8  |MB_TYPE_P0L0             , 4},
  52.105 +};
  52.106 +
  52.107 +static const PMbInfo b_mb_type_info[23]={ /* rows are {type flags, partition_count}; the columns of the initializers line up one MB_TYPE_PxLy flag per column */
  52.108 +{MB_TYPE_DIRECT2|MB_TYPE_L0L1                                      , 1, }, /* the only row carrying MB_TYPE_DIRECT2 */
  52.109 +{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
  52.110 +{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
  52.111 +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
  52.112 +{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, }, /* from here to the last-but-one row: alternating 16x8 / 8x16 pairs with the same flag set */
  52.113 +{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
  52.114 +{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
  52.115 +{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
  52.116 +{MB_TYPE_16x8 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
  52.117 +{MB_TYPE_8x16 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
  52.118 +{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
  52.119 +{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
  52.120 +{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.121 +{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.122 +{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.123 +{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.124 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
  52.125 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
  52.126 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
  52.127 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
  52.128 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.129 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.130 +{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, }, /* the only 8x8 row: all four flags, 4 partitions */
  52.131 +};
  52.132 +
  52.133 +static const PMbInfo b_sub_mb_type_info[13]={ /* rows are {type flags, partition_count}, same column layout as b_mb_type_info */
  52.134 +{MB_TYPE_DIRECT2                                                   , 1, }, /* direct row: no PxLy flags set */
  52.135 +{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
  52.136 +{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
  52.137 +{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
  52.138 +{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
  52.139 +{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
  52.140 +{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
  52.141 +{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
  52.142 +{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.143 +{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
  52.144 +{MB_TYPE_8x8  |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 4, }, /* last three rows: 8x8 with 4 partitions */
  52.145 +{MB_TYPE_8x8               |MB_TYPE_P0L1             |MB_TYPE_P1L1, 4, },
  52.146 +{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
  52.147 +};
  52.148 +
  52.149 +static const int8_t cabac_context_init_I[460][2] =
  52.150 +{
  52.151 +    /* 0 - 10 */
  52.152 +    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
  52.153 +    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
  52.154 +    { -6,  53 }, { -1, 54 },  {  7,  51 },
  52.155 +
  52.156 +    /* 11 - 23 unused for I */
  52.157 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.158 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.159 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.160 +    { 0, 0 },
  52.161 +
  52.162 +    /* 24- 39 */
  52.163 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.164 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.165 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.166 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.167 +
  52.168 +    /* 40 - 53 */
  52.169 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.170 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.171 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.172 +    { 0, 0 },    { 0, 0 },
  52.173 +
  52.174 +    /* 54 - 59 */
  52.175 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
  52.176 +    { 0, 0 },    { 0, 0 },
  52.177 +
  52.178 +    /* 60 - 69 */
  52.179 +    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
  52.180 +    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
  52.181 +    { 13, 41 },  { 3, 62 },
  52.182 +
  52.183 +    /* 70 -> 87 */
  52.184 +    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
  52.185 +    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
  52.186 +    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
  52.187 +    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
  52.188 +    { -12, 115 },{ -16, 122 },
  52.189 +
  52.190 +    /* 88 -> 104 */
  52.191 +    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
  52.192 +    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
  52.193 +    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
  52.194 +    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
  52.195 +    { -22, 125 },
  52.196 +
  52.197 +    /* 105 -> 135 */
  52.198 +    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
  52.199 +    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
  52.200 +    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
  52.201 +    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
  52.202 +    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
  52.203 +    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
  52.204 +    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
  52.205 +    { 14, 62 },  { -13, 108 },{ -15, 100 },
  52.206 +
  52.207 +    /* 136 -> 165 */
  52.208 +    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
  52.209 +    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
  52.210 +    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
  52.211 +    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
  52.212 +    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
  52.213 +    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
  52.214 +    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
  52.215 +    { 0, 62 },   { 12, 72 },
  52.216 +
  52.217 +    /* 166 -> 196 */
  52.218 +    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
  52.219 +    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
  52.220 +    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
  52.221 +    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
  52.222 +    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
  52.223 +    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
  52.224 +    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
  52.225 +    { 0, 89 },   { 26, -19 }, { 22, -17 },
  52.226 +
  52.227 +    /* 197 -> 226 */
  52.228 +    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
  52.229 +    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
  52.230 +    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
  52.231 +    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
  52.232 +    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
  52.233 +    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
  52.234 +    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
  52.235 +    { 12, 68 },  { 2, 97 },
  52.236 +
  52.237 +    /* 227 -> 251 */
  52.238 +    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
  52.239 +    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
  52.240 +    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
  52.241 +    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
  52.242 +    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
  52.243 +    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
  52.244 +    { -4, 65 },
  52.245 +
  52.246 +    /* 252 -> 275 */
  52.247 +    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
  52.248 +    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
  52.249 +    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
  52.250 +    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
  52.251 +    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
  52.252 +    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
  52.253 +
  52.254 +    /* 276 a bit special (not used, bypass is used instead) */
  52.255 +    { 0, 0 },
  52.256 +
  52.257 +    /* 277 -> 307 */
  52.258 +    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
  52.259 +    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
  52.260 +    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
  52.261 +    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
  52.262 +    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
  52.263 +    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
  52.264 +    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
  52.265 +    { 9, 64 },   { -12, 104 },{ -11, 97 },
  52.266 +
  52.267 +    /* 308 -> 337 */
  52.268 +    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
  52.269 +    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
  52.270 +    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
  52.271 +    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
  52.272 +    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
  52.273 +    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
  52.274 +    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
  52.275 +    { 5, 64 },   { 12, 70 },
  52.276 +
  52.277 +    /* 338 -> 368 */
  52.278 +    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
  52.279 +    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
  52.280 +    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
  52.281 +    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
  52.282 +    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
  52.283 +    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
  52.284 +    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
  52.285 +    { -12, 109 },{ 36, -35 }, { 36, -34 },
  52.286 +
  52.287 +    /* 369 -> 398 */
  52.288 +    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
  52.289 +    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
  52.290 +    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
  52.291 +    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
  52.292 +    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
  52.293 +    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
  52.294 +    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
  52.295 +    { 29, 39 },  { 19, 66 },
  52.296 +
  52.297 +    /* 399 -> 435 */
  52.298 +    {  31,  21 }, {  31,  31 }, {  25,  50 },
  52.299 +    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
  52.300 +    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
  52.301 +    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
  52.302 +    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
  52.303 +    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
  52.304 +    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
  52.305 +    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
  52.306 +    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
  52.307 +    {   0,  68 }, {  -9,  92 },
  52.308 +
  52.309 +    /* 436 -> 459 */
  52.310 +    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
  52.311 +    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
  52.312 +    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
  52.313 +    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
  52.314 +    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
  52.315 +    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
  52.316 +};
  52.317 +
  52.318 +static const int8_t cabac_context_init_PB[3][460][2] =
  52.319 +{
  52.320 +    /* i_cabac_init_idc == 0 */
  52.321 +    {
  52.322 +        /* 0 - 10 */
  52.323 +        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
  52.324 +        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
  52.325 +        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
  52.326 +
  52.327 +        /* 11 - 23 */
  52.328 +        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
  52.329 +        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
  52.330 +        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
  52.331 +        {  17,  50 },
  52.332 +
  52.333 +        /* 24 - 39 */
  52.334 +        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
  52.335 +        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
  52.336 +        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
  52.337 +        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
  52.338 +
  52.339 +        /* 40 - 53 */
  52.340 +        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
  52.341 +        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
  52.342 +        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
  52.343 +        {  -3,  81 }, {   0,  88 },
  52.344 +
  52.345 +        /* 54 - 59 */
  52.346 +        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
  52.347 +        {  -7,  72 }, {   1,  58 },
  52.348 +
  52.349 +        /* 60 - 69 */
  52.350 +        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
  52.351 +        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
  52.352 +        {  13,  41 }, {   3,  62 },
  52.353 +
  52.354 +        /* 70 - 104 */
  52.355 +        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
  52.356 +        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
  52.357 +        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
  52.358 +        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
  52.359 +        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
  52.360 +        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
  52.361 +        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
  52.362 +        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
  52.363 +        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
  52.364 +
  52.365 +        /* 105 -> 165 */
  52.366 +        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
  52.367 +        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
  52.368 +        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
  52.369 +        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
  52.370 +        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
  52.371 +        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
  52.372 +        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
  52.373 +        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
  52.374 +        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
  52.375 +        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
  52.376 +        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
  52.377 +        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
  52.378 +        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
  52.379 +        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
  52.380 +        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
  52.381 +        {   9,  69 },
  52.382 +
  52.383 +        /* 166 - 226 */
  52.384 +        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
  52.385 +        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
  52.386 +        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
  52.387 +        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
  52.388 +        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
  52.389 +        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
  52.390 +        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
  52.391 +        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
  52.392 +        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
  52.393 +        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
  52.394 +        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
  52.395 +        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
  52.396 +        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
  52.397 +        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
  52.398 +        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
  52.399 +        {  -9, 108 },
  52.400 +
  52.401 +        /* 227 - 275 */
  52.402 +        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
  52.403 +        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
  52.404 +        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
  52.405 +        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
  52.406 +        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
  52.407 +        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
  52.408 +        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
  52.409 +        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
  52.410 +        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
  52.411 +        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
  52.412 +        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
  52.413 +        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
  52.414 +        {  -8,  85 },
  52.415 +
  52.416 +        /* 276 a bit special (not used, bypass is used instead) */
  52.417 +        { 0, 0 },
  52.418 +
  52.419 +        /* 277 - 337 */
  52.420 +        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
  52.421 +        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
  52.422 +        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
  52.423 +        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
  52.424 +        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
  52.425 +        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
  52.426 +        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
  52.427 +        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
  52.428 +        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
  52.429 +        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
  52.430 +        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
  52.431 +        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
  52.432 +        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
  52.433 +        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
  52.434 +        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
  52.435 +        {  26,  43 },
  52.436 +
  52.437 +        /* 338 - 398 */
  52.438 +        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
  52.439 +        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
  52.440 +        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
  52.441 +        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
  52.442 +        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
  52.443 +        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
  52.444 +        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
  52.445 +        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
  52.446 +        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
  52.447 +        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
  52.448 +        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
  52.449 +        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
  52.450 +        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
  52.451 +        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
  52.452 +        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
  52.453 +        {  11,  86 },
  52.454 +
  52.455 +        /* 399 - 435 */
  52.456 +        {  12,  40 }, {  11,  51 }, {  14,  59 },
  52.457 +        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
  52.458 +        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
  52.459 +        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
  52.460 +        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
  52.461 +        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
  52.462 +        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
  52.463 +        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
  52.464 +        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
  52.465 +        {  -8,  66 }, {  -8,  76 },
  52.466 +
  52.467 +        /* 436 - 459 */
  52.468 +        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
  52.469 +        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
  52.470 +        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
  52.471 +        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
  52.472 +        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
  52.473 +        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
  52.474 +    },
  52.475 +
  52.476 +    /* i_cabac_init_idc == 1 */
  52.477 +    {
  52.478 +        /* 0 - 10 */
  52.479 +        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
  52.480 +        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
  52.481 +        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
  52.482 +
  52.483 +        /* 11 - 23 */
  52.484 +        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
  52.485 +        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
  52.486 +        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
  52.487 +        {  10,  54 },
  52.488 +
  52.489 +        /* 24 - 39 */
  52.490 +        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
  52.491 +        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
  52.492 +        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
  52.493 +        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
  52.494 +
  52.495 +        /* 40 - 53 */
  52.496 +        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
  52.497 +        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
  52.498 +        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
  52.499 +        {  -7,  86 },{  -5,  95 },
  52.500 +
  52.501 +        /* 54 - 59 */
  52.502 +        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
  52.503 +        {  -5,  72 },{   0,  61 },
  52.504 +
  52.505 +        /* 60 - 69 */
  52.506 +        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
  52.507 +        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
  52.508 +        { 13, 41 },  { 3, 62 },
  52.509 +
  52.510 +        /* 70 - 104 */
  52.511 +        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
  52.512 +        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
  52.513 +        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
  52.514 +        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
  52.515 +        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
  52.516 +        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
  52.517 +        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
  52.518 +        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
  52.519 +        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
  52.520 +
  52.521 +        /* 105 -> 165 */
  52.522 +        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
  52.523 +        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
  52.524 +        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
  52.525 +        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
  52.526 +        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
  52.527 +        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
  52.528 +        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
  52.529 +        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
  52.530 +        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
  52.531 +        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
  52.532 +        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
  52.533 +        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
  52.534 +        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
  52.535 +        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
  52.536 +        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
  52.537 +        {   0,  89 },
  52.538 +
  52.539 +        /* 166 - 226 */
  52.540 +        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
  52.541 +        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
  52.542 +        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
  52.543 +        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
  52.544 +        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
  52.545 +        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
  52.546 +        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
  52.547 +        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
  52.548 +        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
  52.549 +        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
  52.550 +        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
  52.551 +        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
  52.552 +        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
  52.553 +        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
  52.554 +        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
  52.555 +        { -10, 116 },
  52.556 +
  52.557 +        /* 227 - 275 */
  52.558 +        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
  52.559 +        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
  52.560 +        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
  52.561 +        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
  52.562 +        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
  52.563 +        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
  52.564 +        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
  52.565 +        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
  52.566 +        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
  52.567 +        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
  52.568 +        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
  52.569 +        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
  52.570 +        {  -4,  78 },
  52.571 +
  52.572 +        /* 276 a bit special (not used, bypass is used instead) */
  52.573 +        { 0, 0 },
  52.574 +
  52.575 +        /* 277 - 337 */
  52.576 +        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
  52.577 +        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
  52.578 +        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
  52.579 +        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
  52.580 +        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
  52.581 +        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
  52.582 +        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
  52.583 +        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
  52.584 +        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
  52.585 +        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
  52.586 +        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
  52.587 +        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
  52.588 +        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
  52.589 +        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
  52.590 +        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
  52.591 +        {  18,  50 },
  52.592 +
  52.593 +        /* 338 - 398 */
  52.594 +        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
  52.595 +        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
  52.596 +        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
  52.597 +        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
  52.598 +        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
  52.599 +        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
  52.600 +        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
  52.601 +        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
  52.602 +        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
  52.603 +        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
  52.604 +        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
  52.605 +        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
  52.606 +        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
  52.607 +        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
  52.608 +        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
  52.609 +        {  11,  83 },
  52.610 +
  52.611 +        /* 399 - 435 */
  52.612 +        {  25,  32 }, {  21,  49 }, {  21,  54 },
  52.613 +        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
  52.614 +        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
  52.615 +        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
  52.616 +        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
  52.617 +        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
  52.618 +        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
  52.619 +        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
  52.620 +        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
  52.621 +        {  -4,  67 }, {  -7,  82 },
  52.622 +
  52.623 +        /* 436 - 459 */
  52.624 +        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
  52.625 +        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
  52.626 +        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
  52.627 +        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
  52.628 +        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
  52.629 +        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
  52.630 +    },
  52.631 +
  52.632 +    /* i_cabac_init_idc == 2 */
  52.633 +    {
  52.634 +        /* 0 - 10 */
  52.635 +        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
  52.636 +        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
  52.637 +        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
  52.638 +
  52.639 +        /* 11 - 23 */
  52.640 +        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
  52.641 +        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
  52.642 +        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
  52.643 +        {  14,  57 },
  52.644 +
  52.645 +        /* 24 - 39 */
  52.646 +        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
  52.647 +        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
  52.648 +        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
  52.649 +        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
  52.650 +
  52.651 +        /* 40 - 53 */
  52.652 +        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
  52.653 +        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
  52.654 +        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
  52.655 +        {  -3,  90 },{  -1,  101 },
  52.656 +
  52.657 +        /* 54 - 59 */
  52.658 +        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
  52.659 +        {  -7,  50 },{   1,  60 },
  52.660 +
  52.661 +        /* 60 - 69 */
  52.662 +        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
  52.663 +        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
  52.664 +        { 13, 41 },  { 3, 62 },
  52.665 +
  52.666 +        /* 70 - 104 */
  52.667 +        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
  52.668 +        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
  52.669 +        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
  52.670 +        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
  52.671 +        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
  52.672 +        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
  52.673 +        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
  52.674 +        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
  52.675 +        {   3,  68 }, {  -8,  71 }, { -13,  98 },
  52.676 +
  52.677 +        /* 105 -> 165 */
  52.678 +        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
  52.679 +        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
  52.680 +        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
  52.681 +        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
  52.682 +        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
  52.683 +        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
  52.684 +        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
  52.685 +        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
  52.686 +        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
  52.687 +        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
  52.688 +        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
  52.689 +        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
  52.690 +        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
  52.691 +        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
  52.692 +        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
  52.693 +        { -22, 127 },
  52.694 +
  52.695 +        /* 166 - 226 */
  52.696 +        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
  52.697 +        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
  52.698 +        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
  52.699 +        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
  52.700 +        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
  52.701 +        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
  52.702 +        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
  52.703 +        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
  52.704 +        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
  52.705 +        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
  52.706 +        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
  52.707 +        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
  52.708 +        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
  52.709 +        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
  52.710 +        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
  52.711 +        { -24, 127 },
  52.712 +
  52.713 +        /* 227 - 275 */
  52.714 +        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
  52.715 +        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
  52.716 +        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
  52.717 +        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
  52.718 +        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
  52.719 +        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
  52.720 +        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
  52.721 +        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
  52.722 +        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
  52.723 +        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
  52.724 +        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
  52.725 +        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
  52.726 +        { -10,  87 },
  52.727 +
  52.728 +        /* 276 a bit special (not used, bypass is used instead) */
  52.729 +        { 0, 0 },
  52.730 +
  52.731 +        /* 277 - 337 */
  52.732 +        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
  52.733 +        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
  52.734 +        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
  52.735 +        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
  52.736 +        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
  52.737 +        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
  52.738 +        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
  52.739 +        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
  52.740 +        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
  52.741 +        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
  52.742 +        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
  52.743 +        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
  52.744 +        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
  52.745 +        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
  52.746 +        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
  52.747 +        {  25,  42 },
  52.748 +
  52.749 +        /* 338 - 398 */
  52.750 +        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
  52.751 +        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
  52.752 +        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
  52.753 +        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
  52.754 +        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
  52.755 +        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
  52.756 +        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
  52.757 +        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
  52.758 +        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
  52.759 +        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
  52.760 +        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
  52.761 +        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
  52.762 +        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
  52.763 +        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
  52.764 +        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
  52.765 +        {  25,  61 },
  52.766 +
  52.767 +        /* 399 - 435 */
  52.768 +        {  21,  33 }, {  19,  50 }, {  17,  61 },
  52.769 +        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
  52.770 +        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
  52.771 +        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
  52.772 +        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
  52.773 +        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
  52.774 +        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
  52.775 +        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
  52.776 +        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
  52.777 +        {  -6,  68 }, { -10,  79 },
  52.778 +
  52.779 +        /* 436 - 459 */
  52.780 +        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
  52.781 +        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
  52.782 +        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
  52.783 +        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
  52.784 +        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
  52.785 +        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
  52.786 +    }
  52.787 +};
  52.788 +
  52.789 +static const uint8_t left_block_options[4][16]={ /* Index tables used when copying data from the left neighbour macroblock. In the code visible here, entries [0..3] are used as row numbers (scaled by b_stride, or subtracted from 6) and entries [8..15] as indices into non_zero_count[]; entries [4..7] are not referenced in this chunk. */
  52.790 +    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8}, /* row 0: the only row selected by fill_decode_caches() in this chunk */
  52.791 +    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, /* NOTE(review): rows 1-3 are not referenced in the visible code; presumably alternative neighbour layouts (e.g. field/frame pairs) - confirm or remove */
  52.792 +    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
  52.793 +    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
  52.794 +};
  52.795 +
  52.796 +void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c) { /* Fill c->cabac_state[0..459] from the per-context init pairs, scaled by the slice quantiser s->qscale. Reads s->slice_type_nos, s->cabac_init_idc and s->qscale; writes only c->cabac_state. */
  52.797 +    int i;
  52.798 +    const int8_t (*tab)[2]; /* selected init table: one { tab[i][0], tab[i][1] } pair per context */
  52.799 +
  52.800 +    if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I; /* I slices have a single dedicated table */
  52.801 +    else                                 tab = cabac_context_init_PB[s->cabac_init_idc]; /* every other slice type picks one of the P/B tables by cabac_init_idc; NOTE(review): cabac_init_idc is not range-checked here - presumably validated by the slice-header parser */
  52.802 +
  52.803 +    /* calculate pre-state */
  52.804 +    for( i= 0; i < 460; i++ ) { /* 460 contexts, matching the 0 - 459 ranges annotated in the init tables */
  52.805 +        int pre = 2*(((tab[i][0] * s->qscale) >>4 ) + tab[i][1]) - 127; /* linear model in qscale: slope tab[i][0] (in 1/16 units), offset tab[i][1], then doubled and re-centred around 0 */
  52.806 +
  52.807 +        pre^= pre>>31; /* fold negatives: if pre < 0 this yields ~pre (= -pre - 1), otherwise pre is unchanged; relies on int being 32 bits and >> of a negative int being an arithmetic shift */
  52.808 +        if(pre > 124) /* clamp the folded value to at most 125 while preserving its lowest bit */
  52.809 +            pre= 124 + (pre&1);
  52.810 +
  52.811 +        c->cabac_state[i] =  pre; /* NOTE(review): presumably bit 0 carries the most-probable-symbol and the upper bits the probability state - confirm against the CABAC decoder core */
  52.812 +    }
  52.813 +}
  52.814 +
  52.815 +static void fill_decode_neighbors(H264Cabac_spu *hc, EDSlice_spu *s){ /* Store the macroblock types of the top and left neighbours of the current macroblock (s->m) into m->top_type and m->left_type. */
  52.816 +    H264Mb *m = s->m;
  52.817 +	const int mb_x = m->mb_x;
  52.818 +	const int mb_y = m->mb_y; /* NOTE(review): mb_y is never used in this function */
  52.819 +
  52.820 +    m->top_type     = hc->mb_type_top[mb_x]; /* entry for the same column in hc->mb_type_top */
  52.821 +    m->left_type    = hc->mb_type[mb_x-1] ; /* entry for the previous column in hc->mb_type; for mb_x == 0 this reads index -1 - NOTE(review): confirm the array is padded with a leading element */
  52.822 +
  52.823 +}
  52.824 +
  52.825 +static void fill_decode_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
  52.826 +    H264Mb *m = s->m;
  52.827 +    int topleft_xy, top_xy, topright_xy, left_xy;
  52.828 +    int topleft_type, top_type, topright_type, left_type;
  52.829 +    const uint8_t * left_block= left_block_options[0];
  52.830 +	const int mb_x = m->mb_x;
  52.831 +	const int mb_y = m->mb_y;
  52.832 +	const int b_stride = hc->b_stride;
  52.833 +    int i;
  52.834 +
  52.835 +    topleft_type = hc->mb_type_top[mb_x-1] ;
  52.836 +    top_type     = m->top_type      ;
  52.837 +	topright_type= hc->mb_type_top[mb_x+1] ;
  52.838 +    left_type    = m->left_type     ;
  52.839 +	
  52.840 +	if (s->slice_type_nos == FF_B_TYPE){
  52.841 +		get_list = get_list_buf;
  52.842 +		for(int i=0; i<2; i++){
  52.843 +			get_dma_list(hc->list1_motion_val[i], s->list1.motion_val[i][4*mb_x + 4*mb_y*b_stride], 16, 4, b_stride*2*sizeof(int16_t), ED_get_mv, 0);
  52.844 +		}
  52.845 +		if (hc->blocking) wait_dma_id(ED_get_mv);
  52.846 +	}
  52.847 +	
  52.848 +    if(!IS_SKIP(mb_type)){
  52.849 +        if(IS_INTRA(mb_type)){
  52.850 +            int type_mask= s->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
  52.851 +            m->topleft_samples_available=
  52.852 +            m->top_samples_available=
  52.853 +            m->left_samples_available= 0xFFFF;
  52.854 +            m->topright_samples_available= 0xEEEA;
  52.855 +
  52.856 +            if(!(top_type & type_mask)){
  52.857 +                m->topleft_samples_available= 0xB3FF;
  52.858 +                m->top_samples_available= 0x33FF;
  52.859 +                m->topright_samples_available= 0x26EA;
  52.860 +            }
  52.861 +            if(!(left_type & type_mask)){
  52.862 +                m->topleft_samples_available&= 0xDF5F;
  52.863 +                m->left_samples_available&= 0x5F5F;
  52.864 +            }
  52.865 +
  52.866 +            if(!(topleft_type & type_mask))
  52.867 +                m->topleft_samples_available&= 0x7FFF;
  52.868 +
  52.869 +            if(!(topright_type & type_mask))
  52.870 +                m->topright_samples_available&= 0xFBFF;
  52.871 +
  52.872 +            if(IS_INTRA4x4(mb_type)){
  52.873 +                if(IS_INTRA4x4(top_type)){
  52.874 +                    AV_COPY32(m->intra4x4_pred_mode_cache+4+8*0, &hc->intra4x4_pred_mode_top[8*mb_x]);
  52.875 +                }else{
  52.876 +                    m->intra4x4_pred_mode_cache[4+8*0]=
  52.877 +                    m->intra4x4_pred_mode_cache[5+8*0]=
  52.878 +                    m->intra4x4_pred_mode_cache[6+8*0]=
  52.879 +                    m->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask);
  52.880 +                }
  52.881 +                for(i=0; i<2; i++){
  52.882 +                    if(IS_INTRA4x4(left_type)){
  52.883 +                        int8_t *mode= &hc->intra4x4_pred_mode[8*(mb_x-1)];
  52.884 +                        m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= mode[6-left_block[0+2*i]];
  52.885 +                        m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= mode[6-left_block[1+2*i]];
  52.886 +                    }else{
  52.887 +                        m->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
  52.888 +                        m->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= 2 - 3*!(left_type & type_mask);
  52.889 +                    }
  52.890 +                }
  52.891 +            }
  52.892 +        }
  52.893 +        if(top_type){
  52.894 +			AV_COPY32(&m->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]);
  52.895 +            m->non_zero_count_cache[1+8*0]= hc->non_zero_count_top[mb_x][1+1*8];
  52.896 +			m->non_zero_count_cache[2+8*0]= hc->non_zero_count_top[mb_x][2+1*8];
  52.897 +			m->non_zero_count_cache[1+8*3]= hc->non_zero_count_top[mb_x][1+2*8];
  52.898 +			m->non_zero_count_cache[2+8*3]= hc->non_zero_count_top[mb_x][2+2*8];
  52.899 +        }else {
  52.900 +            m->non_zero_count_cache[1+8*0]=
  52.901 +            m->non_zero_count_cache[2+8*0]=
  52.902 +            m->non_zero_count_cache[1+8*3]=
  52.903 +            m->non_zero_count_cache[2+8*3]=
  52.904 +            AV_WN32A(&m->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040);
  52.905 +        }
  52.906 +
  52.907 +        for (i=0; i<2; i++) {
  52.908 +            if(left_type){
  52.909 +                m->non_zero_count_cache[3+8*1 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+0+2*i]];
  52.910 +				m->non_zero_count_cache[3+8*2 + 2*8*i]= hc->non_zero_count[mb_x-1][left_block[8+1+2*i]];
  52.911 +				m->non_zero_count_cache[0+8*1 +   8*i]= hc->non_zero_count[mb_x-1][left_block[8+4+2*i]];
  52.912 +				m->non_zero_count_cache[0+8*4 +   8*i]= hc->non_zero_count[mb_x-1][left_block[8+5+2*i]];
  52.913 +            }else{
  52.914 +                    m->non_zero_count_cache[3+8*1 + 2*8*i]=
  52.915 +                    m->non_zero_count_cache[3+8*2 + 2*8*i]=
  52.916 +                    m->non_zero_count_cache[0+8*1 +   8*i]=
  52.917 +                    m->non_zero_count_cache[0+8*4 +   8*i]= !IS_INTRA(mb_type) ? 0 : 64;
  52.918 +            }
  52.919 +        }
  52.920 +
  52.921 +
  52.922 +		// top_cbp
  52.923 +		if(top_type) {
  52.924 +			hc->top_cbp = hc->cbp_top[mb_x];
  52.925 +		} else {
  52.926 +			hc->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
  52.927 +		}
  52.928 +		// left_cbp
  52.929 +		if (left_type) {
  52.930 +			hc->left_cbp = (hc->cbp[mb_x-1] & 0x1f0)
  52.931 +			|  ((hc->cbp[mb_x-1]>>(left_block[0]&(~1)))&2)
  52.932 +			| (((hc->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2);
  52.933 +		} else {
  52.934 +			hc->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
  52.935 +		}
  52.936 +    }
  52.937 +
  52.938 +    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
  52.939 +        int list;
  52.940 +
  52.941 +        m->ref_cache[0][scan8[5 ]+1] = m->ref_cache[0][scan8[7 ]+1] = m->ref_cache[0][scan8[13]+1] =
  52.942 +        m->ref_cache[1][scan8[5 ]+1] = m->ref_cache[1][scan8[7 ]+1] = m->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
  52.943 +
  52.944 +        for(list=0; list<s->list_count; list++){
  52.945 +            if(!USES_LIST(mb_type, list)){
  52.946 +                continue;
  52.947 +            }
  52.948 +            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
  52.949 +
  52.950 +            if(USES_LIST(top_type, list)){
  52.951 +                const int b_xy= 4*mb_x + 3*hc->b_stride;
  52.952 +                AV_COPY128(m->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]);
  52.953 +                    m->ref_cache[list][scan8[0] + 0 - 1*8]=
  52.954 +                    m->ref_cache[list][scan8[0] + 1 - 1*8]= hc->ref_index_top[list][4*mb_x + 2];
  52.955 +                    m->ref_cache[list][scan8[0] + 2 - 1*8]=
  52.956 +					m->ref_cache[list][scan8[0] + 3 - 1*8]= hc->ref_index_top[list][4*mb_x + 3];
  52.957 +            }else{
  52.958 +                AV_ZERO128(m->mv_cache[list][scan8[0] + 0 - 1*8]);
  52.959 +                AV_WN32A(&m->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
  52.960 +            }
  52.961 +
  52.962 +            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
  52.963 +                for(i=0; i<2; i++){
  52.964 +                    int cache_idx = scan8[0] - 1 + i*2*8;
  52.965 +                    if(USES_LIST(left_type, list)){
  52.966 +                        const int b_xy= 4*(mb_x-1) + 3;
  52.967 +                        const int b8_x= 4*(mb_x-1) + 1;
  52.968 +                        AV_COPY32(m->mv_cache[list][cache_idx  ], hc->motion_val[list][b_xy + hc->b_stride*left_block[0+i*2]]);
  52.969 +                        AV_COPY32(m->mv_cache[list][cache_idx+8], hc->motion_val[list][b_xy + hc->b_stride*left_block[1+i*2]]);
  52.970 +                        m->ref_cache[list][cache_idx  ]= hc->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
  52.971 +                        m->ref_cache[list][cache_idx+8]= hc->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
  52.972 +                    }else{
  52.973 +                        AV_ZERO32(m->mv_cache [list][cache_idx  ]);
  52.974 +                        AV_ZERO32(m->mv_cache [list][cache_idx+8]);
  52.975 +                        m->ref_cache[list][cache_idx  ]=
  52.976 +                        m->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
  52.977 +                    }
  52.978 +                }
  52.979 +            }else{
  52.980 +                if(USES_LIST(left_type, list)){
  52.981 +					const int b_x = 4*(mb_x-1) + 3;
  52.982 +                    const int b8_x= 4*(mb_x-1) + 1;
  52.983 +                    AV_COPY32(m->mv_cache[list][scan8[0] - 1], hc->motion_val[list][b_x + hc->b_stride*left_block[0]]);
  52.984 +                    m->ref_cache[list][scan8[0] - 1]= hc->ref_index[list][b8_x + (left_block[0]&~1)];
  52.985 +                }else{
  52.986 +                    AV_ZERO32(m->mv_cache [list][scan8[0] - 1]);
  52.987 +                    m->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  52.988 +                }
  52.989 +            }
  52.990 +
  52.991 +            if(USES_LIST(topright_type, list)){
  52.992 +                const int b_xy= 4*(mb_x+1) + 3*hc->b_stride;
  52.993 +                AV_COPY32(m->mv_cache[list][scan8[0] + 4 - 1*8], hc->motion_val_top[list][b_xy]);
  52.994 +                m->ref_cache[list][scan8[0] + 4 - 1*8]= hc->ref_index_top[list][4*(mb_x+1) + 2];
  52.995 +            }else{
  52.996 +                AV_ZERO32(m->mv_cache [list][scan8[0] + 4 - 1*8]);
  52.997 +                m->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  52.998 +            }
  52.999 +            if(m->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
 52.1000 +                int topleft_partition= -1;
 52.1001 +                if(USES_LIST(topleft_type, list)){
 52.1002 +                    const int b_xy = 4*(mb_x-1) + 3 + hc->b_stride + (topleft_partition & 2*hc->b_stride);
 52.1003 +                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
 52.1004 +                    AV_COPY32(m->mv_cache[list][scan8[0] - 1 - 1*8], hc->motion_val_top[list][b_xy]);
 52.1005 +                    m->ref_cache[list][scan8[0] - 1 - 1*8]= hc->ref_index_top[list][b8_x];
 52.1006 +                }else{
 52.1007 +                    AV_ZERO32(m->mv_cache[list][scan8[0] - 1 - 1*8]);
 52.1008 +                    m->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 52.1009 +                }
 52.1010 +            }
 52.1011 +
 52.1012 +            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
 52.1013 +                continue;
 52.1014 +
 52.1015 +            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
 52.1016 +                m->ref_cache[list][scan8[4 ]] =
 52.1017 +                m->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 52.1018 +                AV_ZERO32(m->mv_cache [list][scan8[4 ]]);
 52.1019 +                AV_ZERO32(m->mv_cache [list][scan8[12]]);
 52.1020 +
 52.1021 +
 52.1022 +				/* XXX beurk, Load mvd */
 52.1023 +				if(USES_LIST(top_type, list)){
 52.1024 +// 					const int b_xy= hc->mb2br_top_xy;
 52.1025 +					AV_COPY64(hc->mvd_cache[list][scan8[0] + 0 - 1*8], hc->mvd_top[list][8*mb_x + 0]);
 52.1026 +				}else{
 52.1027 +					AV_ZERO64(hc->mvd_cache[list][scan8[0] + 0 - 1*8]);
 52.1028 +				}
 52.1029 +				if(USES_LIST(left_type, list)){
 52.1030 +// 					const int b_xy= hc->mb2br_left_xy + 6;
 52.1031 +					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 0*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[0]]);
 52.1032 +					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 1*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[1]]);
 52.1033 +				}else{
 52.1034 +					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 0*8]);
 52.1035 +					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 1*8]);
 52.1036 +				}
 52.1037 +				if(USES_LIST(left_type, list)){
 52.1038 +// 					const int b_xy= hc->mb2br_left_xy + 6;
 52.1039 +					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 2*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[2]]);
 52.1040 +					AV_COPY16(hc->mvd_cache[list][scan8[0] - 1 + 3*8], hc->mvd[list][8*(mb_x-1) + 6 - left_block[3]]);
 52.1041 +				}else{
 52.1042 +					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 2*8]);
 52.1043 +					AV_ZERO16(hc->mvd_cache [list][scan8[0] - 1 + 3*8]);
 52.1044 +				}
 52.1045 +				AV_ZERO16(hc->mvd_cache [list][scan8[4 ]]);
 52.1046 +				AV_ZERO16(hc->mvd_cache [list][scan8[12]]);
 52.1047 +				if(s->slice_type_nos == FF_B_TYPE){
 52.1048 +					fill_rectangle(&hc->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
 52.1049 +
 52.1050 +					if(IS_DIRECT(top_type)){
 52.1051 +						AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1));
 52.1052 +					}else if(IS_8X8(top_type)){
 52.1053 +						int b8_x = 4*mb_x;
 52.1054 +						hc->direct_cache[scan8[0] + 0 - 1*8]= hc->direct_top[b8_x + 2];
 52.1055 +						hc->direct_cache[scan8[0] + 2 - 1*8]= hc->direct_top[b8_x + 3];
 52.1056 +					}else{
 52.1057 +						AV_WN32A(&hc->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1));
 52.1058 +					}
 52.1059 +
 52.1060 +					if(IS_DIRECT(left_type))
 52.1061 +						hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1;
 52.1062 +					else if(IS_8X8(left_type))
 52.1063 +						hc->direct_cache[scan8[0] - 1 + 0*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)];
 52.1064 +					else
 52.1065 +						hc->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1;
 52.1066 +
 52.1067 +					if(IS_DIRECT(left_type))
 52.1068 +						hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1;
 52.1069 +					else if(IS_8X8(left_type))
 52.1070 +						hc->direct_cache[scan8[0] - 1 + 2*8]= hc->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)];
 52.1071 +					else
 52.1072 +						hc->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1;
 52.1073 +				}
 52.1074 +            }
 52.1075 +        }
 52.1076 +    }
 52.1077 +    hc->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type);
 52.1078 +
 52.1079 +	if (s->slice_type_nos == FF_B_TYPE){
 52.1080 +		wait_dma_id(ED_get_mv);
 52.1081 +	}
 52.1082 +}
 52.1083 +
 52.1084 +static int check_mv(H264Cabac_spu *hc, EDSlice_spu *s, long b_idx, long bn_idx, int mvy_limit){ // returns nonzero when cache entries b_idx and bn_idx differ in reference or by a large enough motion vector delta
 52.1085 +	int v;
 52.1086 +	// list 0: a reference mismatch is flagged first
 52.1087 +	v= hc->ref_cache[0][b_idx] != hc->ref_cache[0][bn_idx];
 52.1088 +	if(!v && hc->ref_cache[0][b_idx]!=-1)
 52.1089 +		// (unsigned)(dx + 3) >= 7 is true exactly when |dx| >= 4; OR-ed with |dy| >= mvy_limit
 52.1090 +		v= ((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) |
 52.1091 +		((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit);
 52.1092 +	// with two lists, list 1 is compared as well
 52.1093 +	if(s->list_count==2){
 52.1094 +		if(!v)
 52.1095 +			v = (hc->ref_cache[1][b_idx] != hc->ref_cache[1][bn_idx]) |
 52.1096 +			((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) |
 52.1097 +			((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit);
 52.1098 +		// same-list comparison differed: retry with the lists crossed (list 0 of b_idx against list 1 of bn_idx and vice versa)
 52.1099 +		if(v){
 52.1100 +			if((hc->ref_cache[0][b_idx] != hc->ref_cache[1][bn_idx]) |
 52.1101 +				(hc->ref_cache[1][b_idx] != hc->ref_cache[0][bn_idx]))
 52.1102 +				return 1;
 52.1103 +			return
 52.1104 +			((unsigned) (hc->mv_cache[0][b_idx][0] - hc->mv_cache[1][bn_idx][0] + 3) >= 7U) |
 52.1105 +			((FFABS( hc->mv_cache[0][b_idx][1] - hc->mv_cache[1][bn_idx][1] )) >= mvy_limit) |
 52.1106 +			((unsigned) (hc->mv_cache[1][b_idx][0] - hc->mv_cache[0][bn_idx][0] + 3) >= 7U) |
 52.1107 +			((FFABS( hc->mv_cache[1][b_idx][1] - hc->mv_cache[0][bn_idx][1] )) >= mvy_limit);
 52.1108 +		}
 52.1109 +	}
 52.1110 +	// single list, or the two-list comparison found no difference
 52.1111 +	return v;
 52.1112 +}
 52.1113 +
 52.1114 +static void calc_bS_values(H264Cabac_spu *hc, EDSlice_spu *s, int mvy_limit, int dir) { // writes m->edges[dir] and the m->bS[dir][edge][0..3] entries for the current macroblock; dir 0 looks at the left neighbour, dir 1 at the top neighbour
 52.1115 +	H264Mb *m = s->m;
 52.1116 +	int mb_type = m->mb_type;
 52.1117 +	int edge;
 52.1118 +	const int mbm_type = dir == 0 ? m->left_type : m->top_type;
 52.1119 +
 52.1120 +	// how often to recheck mv-based bS when iterating between edges
 52.1121 +	static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
 52.1122 +	{0,3,1,1,3,3,3,3}};
 52.1123 +	const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
 52.1124 +	const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4; // only edge 0 is visited when mask_edge is 3 and the low four cbp bits are clear
 52.1125 +	// how often to recheck mv-based bS when iterating along each edge
 52.1126 +	const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
 52.1127 +
 52.1128 +	m->edges[dir]= edges;
 52.1129 +	// edge 0 (towards the neighbouring macroblock); left untouched when mbm_type is 0
 52.1130 +	if(mbm_type){
 52.1131 +		int16_t* bS=m->bS[dir][0];
 52.1132 +		if( IS_INTRA(mb_type|mbm_type)) {
 52.1133 +			AV_WN64A(bS, 0x0004000400040004ULL); // value 4 in each 16-bit lane
 52.1134 +		} else {
 52.1135 +			int i;
 52.1136 +			int mv_done;
 52.1137 +			if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) { // one check_mv result is shared by all four entries
 52.1138 +				int b_idx= 8 + 4;
 52.1139 +				int bn_idx= b_idx - (dir ? 8:1);
 52.1140 +
 52.1141 +				bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, 8 + 4, bn_idx, mvy_limit);
 52.1142 +				mv_done = 1;
 52.1143 +			}
 52.1144 +			else
 52.1145 +				mv_done = 0;
 52.1146 +
 52.1147 +			for( i = 0; i < 4; i++ ) {
 52.1148 +				int x = dir == 0 ? 0 : i;
 52.1149 +				int y = dir == 0 ? i    : 0;
 52.1150 +				int b_idx= 8 + 4 + x + 8*y;
 52.1151 +				int bn_idx= b_idx - (dir ? 8:1);
 52.1152 +
 52.1153 +				if( hc->non_zero_count_cache[b_idx] |
 52.1154 +					hc->non_zero_count_cache[bn_idx] ) {
 52.1155 +					bS[i] = 2; // either side has a nonzero non_zero_count_cache entry; overrides the shared value
 52.1156 +				}
 52.1157 +				else if(!mv_done)
 52.1158 +				{
 52.1159 +					bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
 52.1160 +				}
 52.1161 +			}
 52.1162 +		}
 52.1163 +	}
 52.1164 +
 52.1165 +	/* Calculate bS for the remaining edges 1..edges-1 */
 52.1166 +	for( edge = 1; edge < edges; edge++ ) {
 52.1167 +		int16_t* bS=m->bS[dir][edge];
 52.1168 +
 52.1169 +		if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
 52.1170 +			continue;
 52.1171 +
 52.1172 +		if( IS_INTRA(mb_type)) {
 52.1173 +			AV_WN64A(bS, 0x0003000300030003ULL); // value 3 in each 16-bit lane
 52.1174 +		} else {
 52.1175 +			int i;
 52.1176 +			int mv_done;
 52.1177 +
 52.1178 +			if( edge & mask_edge ) {
 52.1179 +				AV_ZERO64(bS); // no mv check on this edge; entries stay 0 unless the loop below raises them to 2
 52.1180 +				mv_done = 1;
 52.1181 +			}
 52.1182 +			else if( mask_par0 ) {
 52.1183 +				int b_idx= 8 + 4 + edge * (dir ? 8:1);
 52.1184 +				int bn_idx= b_idx - (dir ? 8:1);
 52.1185 +
 52.1186 +				bS[0] = bS[1] = bS[2] = bS[3] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
 52.1187 +				mv_done = 1;
 52.1188 +			}
 52.1189 +			else
 52.1190 +				mv_done = 0;
 52.1191 +
 52.1192 +			for( i = 0; i < 4; i++ ) {
 52.1193 +				int x = dir == 0 ? edge : i;
 52.1194 +				int y = dir == 0 ? i    : edge;
 52.1195 +				int b_idx= 8 + 4 + x + 8*y;
 52.1196 +				int bn_idx= b_idx - (dir ? 8:1);
 52.1197 +
 52.1198 +				if( hc->non_zero_count_cache[b_idx] |
 52.1199 +					hc->non_zero_count_cache[bn_idx] ) {
 52.1200 +					bS[i] = 2;
 52.1201 +				}
 52.1202 +				else if(!mv_done)
 52.1203 +				{
 52.1204 +					bS[i] = check_mv(hc, s, b_idx, bn_idx, mvy_limit);
 52.1205 +				}
 52.1206 +			}
 52.1207 +
 52.1208 +			if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
 52.1209 +				continue; // NOTE(review): nothing follows in the loop body, so this continue has no effect
 52.1210 +		}
 52.1211 +
 52.1212 +	}
 52.1213 +}
 52.1214 +
 52.1215 +/**
 52.1216 +* Prepares the loop-filter inputs for the current macroblock: stores the dequant4_coeff selections and the qscale of this MB and of its left/top neighbours in m, and, unless filtering is skipped, the neighbour types, the hc caches and the bS values.
 52.1217 +* @return zero if the loop filter can be skipped
 52.1218 +*/
 52.1219 +static int fill_filter_caches(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){
 52.1220 +    H264Mb *m = s->m;
 52.1221 +	const int mb_x = m->mb_x;
 52.1222 +    const int mb_y = m->mb_y;
 52.1223 +    int top_type, left_type;
 52.1224 +    int qp, top_qp, left_qp;
 52.1225 +    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
 52.1226 +    // dequant4_coeff selection for luma / cb / cr; the first index depends on intra vs. non-intra for the chroma planes
 52.1227 +    m->dequant4_coeff_y  = hc->dequant4_coeff[0][s->qscale][0];
 52.1228 +    m->dequant4_coeff_cb = hc->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][s->chroma_qp[0]][0];
 52.1229 +    m->dequant4_coeff_cr = hc->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][s->chroma_qp[1]][0];
 52.1230 +    // NOTE(review): index mb_x-1 is read even when mb_x is 0 (here and for hc->mb_type below) - presumably the arrays carry a leading entry; confirm
 52.1231 +    m->qscale_mb_xy = qp = hc->qscale[mb_x];
 52.1232 +    m->qscale_left_mb_xy = left_qp = hc->qscale[mb_x-1];
 52.1233 +    m->qscale_top_mb_xy = top_qp = hc->qscale_top[mb_x];
 52.1234 +
 52.1235 +    //for sufficiently low qp, filtering wouldn't do anything
 52.1236 +    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
 52.1237 +	if(qp <= qp_thresh
 52.1238 +		&& (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
 52.1239 +		&& ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
 52.1240 +		m->deblock_mb = 0;
 52.1241 +		return 0;
 52.1242 +	}
 52.1243 +    // from here on the macroblock is marked for deblocking
 52.1244 +
 52.1245 +    m->deblock_mb = 1;
 52.1246 +
 52.1247 +	top_type     = hc->mb_type_top[mb_x] ;
 52.1248 +	left_type    = hc->mb_type[mb_x -1];
 52.1249 +
 52.1250 +    m->top_type     = top_type ;
 52.1251 +    m->left_type    = left_type;
 52.1252 +    // intra macroblock: bS is computed right away, none of the caches below are filled
 52.1253 +    if(IS_INTRA(mb_type)){
 52.1254 +        calc_bS_values(hc, s, 4, 0);
 52.1255 +        calc_bS_values(hc, s, 4, 1);
 52.1256 +        return 1;
 52.1257 +    }
 52.1258 +    // copy this macroblock's stored non_zero_count values into the cache
 52.1259 +    AV_COPY64(&hc->non_zero_count_cache[0+8*1], &hc->non_zero_count[mb_x][ 0]);
 52.1260 +    AV_COPY64(&hc->non_zero_count_cache[0+8*2], &hc->non_zero_count[mb_x][ 8]);
 52.1261 +    AV_COPY32(&hc->non_zero_count_cache[0+8*5], &hc->non_zero_count[mb_x][16]);
 52.1262 +    AV_COPY32(&hc->non_zero_count_cache[4+8*3], &hc->non_zero_count[mb_x][20]);
 52.1263 +    AV_COPY64(&hc->non_zero_count_cache[0+8*4], &hc->non_zero_count[mb_x][24]);
 52.1264 +
 52.1265 +    m->cbp= hc->cbp[mb_x];
 52.1266 +    // per list: load this macroblock's references (mapped through ref2frm) and motion vectors into the hc caches
 52.1267 +    {
 52.1268 +        int list;
 52.1269 +        for(list=0; list<s->list_count; list++){
 52.1270 +            int8_t *ref;
 52.1271 +            int y, b_stride;
 52.1272 +            int16_t (*mv_dst)[2];
 52.1273 +            int16_t (*mv_src)[2];
 52.1274 +
 52.1275 +            if(!USES_LIST(mb_type, list)){
 52.1276 +                fill_rectangle( hc->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
 52.1277 +                AV_WN32A(&hc->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
 52.1278 +                AV_WN32A(&hc->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
 52.1279 +                AV_WN32A(&hc->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
 52.1280 +                AV_WN32A(&hc->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
 52.1281 +                continue;
 52.1282 +            }
 52.1283 +
 52.1284 +            ref = &hc->ref_index[list][4*mb_x];
 52.1285 +            {
 52.1286 +                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2); // view of s->ref2frm starting two entries in, indexed [list][ref]
 52.1287 +                AV_WN32A(&hc->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
 52.1288 +                AV_WN32A(&hc->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
 52.1289 +                ref += 2;
 52.1290 +                AV_WN32A(&hc->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
 52.1291 +                AV_WN32A(&hc->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
 52.1292 +            }
 52.1293 +            b_stride = hc->b_stride;
 52.1294 +            mv_dst   = &hc->mv_cache[list][scan8[0]];
 52.1295 +            mv_src   = &hc->motion_val[list][4*mb_x];
 52.1296 +            for(y=0; y<4; y++){
 52.1297 +                AV_COPY128(mv_dst + 8*y, mv_src + y*b_stride);
 52.1298 +            }
 52.1299 +
 52.1300 +        }
 52.1301 +    }
 52.1302 +
 52.1303 +    /*
 52.1304 +    0 . T T. T T T T
 52.1305 +    1 L . .L . . . .
 52.1306 +    2 L . .L . . . .
 52.1307 +    3 . T TL . . . .
 52.1308 +    4 L . .L . . . .
 52.1309 +    5 L . .. . . . .
 52.1310 +    */
 52.1311 +    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 52.1312 +    if(top_type){
 52.1313 +        AV_COPY32(&hc->non_zero_count_cache[4+8*0], &hc->non_zero_count_top[mb_x][4+3*8]);
 52.1314 +    }
 52.1315 +
 52.1316 +    if(left_type){
 52.1317 +        hc->non_zero_count_cache[3+8*1]= hc->non_zero_count[mb_x-1][7+0*8];
 52.1318 +		hc->non_zero_count_cache[3+8*2]= hc->non_zero_count[mb_x-1][7+1*8];
 52.1319 +		hc->non_zero_count_cache[3+8*3]= hc->non_zero_count[mb_x-1][7+2*8];
 52.1320 +		hc->non_zero_count_cache[3+8*4]= hc->non_zero_count[mb_x-1][7+3*8];
 52.1321 +    }
 52.1322 +    // neighbour motion vectors and references: the cache row above scan8[0] from the top MB, the column left of scan8[0] from the left MB
 52.1323 +    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 52.1324 +        int list;
 52.1325 +        for(list=0; list<s->list_count; list++){
 52.1326 +            if(USES_LIST(top_type, list)){
 52.1327 +                const int b_xy= 4*mb_x + 3*hc->b_stride;
 52.1328 +                const int b8_x= 4*mb_x + 2;
 52.1329 +                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
 52.1330 +                AV_COPY128(hc->mv_cache[list][scan8[0] + 0 - 1*8], hc->motion_val_top[list][b_xy + 0]);
 52.1331 +                hc->ref_cache[list][scan8[0] + 0 - 1*8]=
 52.1332 +                hc->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 0]];
 52.1333 +                hc->ref_cache[list][scan8[0] + 2 - 1*8]=
 52.1334 +                hc->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][hc->ref_index_top[list][b8_x + 1]];
 52.1335 +            }else{
 52.1336 +                AV_ZERO128(hc->mv_cache[list][scan8[0] + 0 - 1*8]);
 52.1337 +                AV_WN32A(&hc->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
 52.1338 +            }
 52.1339 +
 52.1340 +            if(USES_LIST(left_type, list)){
 52.1341 +				const int b_x = 4*(mb_x-1) + 3;
 52.1342 +                const int b8_x= 4*(mb_x-1) + 1;
 52.1343 +                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
 52.1344 +                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 0 ], hc->motion_val[list][b_x + hc->b_stride*0]);
 52.1345 +                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 + 8 ], hc->motion_val[list][b_x + hc->b_stride*1]);
 52.1346 +                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +16 ], hc->motion_val[list][b_x + hc->b_stride*2]);
 52.1347 +                AV_COPY32(hc->mv_cache[list][scan8[0] - 1 +24 ], hc->motion_val[list][b_x + hc->b_stride*3]);
 52.1348 +                hc->ref_cache[list][scan8[0] - 1 + 0 ]=
 52.1349 +                hc->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*0]];
 52.1350 +                hc->ref_cache[list][scan8[0] - 1 +16 ]=
 52.1351 +                hc->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][hc->ref_index[list][b8_x + 2*1]];
 52.1352 +            }else{
 52.1353 +                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 0 ]);
 52.1354 +                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 + 8 ]);
 52.1355 +                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +16 ]);
 52.1356 +                AV_ZERO32(hc->mv_cache [list][scan8[0] - 1 +24 ]);
 52.1357 +                hc->ref_cache[list][scan8[0] - 1 + 0  ]=
 52.1358 +                hc->ref_cache[list][scan8[0] - 1 + 8  ]=
 52.1359 +                hc->ref_cache[list][scan8[0] - 1 + 16 ]=
 52.1360 +                hc->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
 52.1361 +            }
 52.1362 +        }
 52.1363 +    }
 52.1364 +    calc_bS_values(hc, s, 4, 0);
 52.1365 +    calc_bS_values(hc, s, 4, 1);
 52.1366 +    return 1;
 52.1367 +}
 52.1368 +
 52.1369 +
 52.1370 +/**
 52.1371 +* Checks that the top & left samples needed by the cached intra4x4 modes are available and replaces DC modes so they only use the available samples. Returns 0 on success, -1 (after a message on stderr) when a mode needs unavailable samples.
 52.1372 +*/
 52.1373 +static int check_intra4x4_pred_mode(EDSlice_spu *s){
 52.1374 +    H264Mb *m = s->m;
 52.1375 +    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; // indexed by mode: -1 = not usable, 0 = keep, otherwise the replacement mode; unlisted entries are 0
 52.1376 +    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 52.1377 +    int i;
 52.1378 +    // top samples missing: check the four cache entries of the first block row
 52.1379 +    if(!(m->top_samples_available&0x8000)){
 52.1380 +        for(i=0; i<4; i++){
 52.1381 +            int status= top[ m->intra4x4_pred_mode_cache[scan8[0] + i] ];
 52.1382 +            if(status<0){
 52.1383 +                fprintf(stderr, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, m->mb_x, m->mb_y); // NOTE(review): prints the table entry (status), not the requested mode
 52.1384 +                return -1;
 52.1385 +            } else if(status){
 52.1386 +                m->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 52.1387 +            }
 52.1388 +        }
 52.1389 +    }
 52.1390 +    // some left samples missing: check the first block column, one availability bit per row
 52.1391 +    if((m->left_samples_available&0x8888)!=0x8888){
 52.1392 +        static const int mask[4]={0x8000,0x2000,0x80,0x20};
 52.1393 +        for(i=0; i<4; i++){
 52.1394 +            if(!(m->left_samples_available&mask[i])){
 52.1395 +                int status= left[ m->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 52.1396 +                if(status<0){
 52.1397 +                    fprintf(stderr, "left block unavailable for requested intra4x4 mode %d at %d %d, %x\n", status, m->mb_x, m->mb_y, m->left_samples_available);
 52.1398 +                    return -1;
 52.1399 +                } else if(status){
 52.1400 +                    m->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 52.1401 +                }
 52.1402 +            }
 52.1403 +        }
 52.1404 +    }
 52.1405 +    return 0;
 52.1406 +}
 52.1407 +
 52.1408 +/**
 52.1409 +* Checks that the top & left samples needed by an intra prediction mode are available and replaces the mode by a DC variant that only uses the available samples.
 52.1410 +* @param mode requested mode, valid range 0..6  @return the (possibly replaced) mode, or -1 after a message on stderr */
 52.1411 +static int check_intra_pred_mode(EDSlice_spu *s, int mode){
 52.1412 +    H264Mb *m = s->m;
 52.1413 +    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1}; // indexed by mode: replacement when the top samples are missing, -1 = not usable; unlisted entries are 0
 52.1414 +    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 52.1415 +    // both tables have 7 entries; a negative mode must be rejected as well or it would index in front of them
 52.1416 +    if(mode < 0 || mode > 6) {
 52.1417 +        fprintf(stderr, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
 52.1418 +        return -1;
 52.1419 +    }
 52.1420 +
 52.1421 +    if(!(m->top_samples_available&0x8000)){
 52.1422 +        mode= top[ mode ];
 52.1423 +        if(mode<0){
 52.1424 +            fprintf(stderr, "top block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
 52.1425 +            return -1;
 52.1426 +        }
 52.1427 +    }
 52.1428 +
 52.1429 +    if((m->left_samples_available&0x8080) != 0x8080){
 52.1430 +        mode= left[ mode ];
 52.1431 +        if(m->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 52.1432 +            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(m->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 52.1433 +        }
 52.1434 +        if(mode<0){
 52.1435 +            fprintf(stderr, "left block unavailable for requested intra mode %d at %d %d\n", mode, m->mb_x, m->mb_y);
 52.1436 +            return -1;
 52.1437 +        }
 52.1438 +    }
 52.1439 +    return mode;
 52.1440 +}
 52.1441 +
 52.1442 +/**
 52.1443 + * Predicts the intra4x4 mode of block n from the cached modes of its left and top neighbours.
 52.1444 + * @return the smaller of the two neighbour modes, or DC_PRED when that minimum is negative
 52.1445 + */
 52.1446 +static inline int pred_intra_mode(EDSlice_spu *s, int n){
 52.1447 +    H264Mb *mb = s->m;
 52.1448 +    const int pos = scan8[n];
 52.1449 +    const int left_mode = mb->intra4x4_pred_mode_cache[pos - 1];
 52.1450 +    const int top_mode  = mb->intra4x4_pred_mode_cache[pos - 8];
 52.1451 +    const int pred = FFMIN(left_mode, top_mode);
 52.1452 +
 52.1453 +    return pred < 0 ? DC_PRED : pred;
 52.1454 +}
 52.1455 +
 52.1456 +static void write_back_intra_pred_mode(H264Cabac_spu *hc, EDSlice_spu *s){ // saves seven cached intra4x4 modes of the current macroblock into hc->intra4x4_pred_mode at 8*mb_x
 52.1457 +    H264Mb *mb = s->m;
 52.1458 +    int8_t *dst = &hc->intra4x4_pred_mode[8*mb->mb_x];
 52.1459 +    int row;
 52.1460 +    // dst[0..3]: the 32 bits starting at cache index 4 + 8*4
 52.1461 +    AV_COPY32(dst, mb->intra4x4_pred_mode_cache + 4 + 8*4);
 52.1462 +    // dst[4..6]: cache column 7, taken from row 3 down to row 1
 52.1463 +    for(row = 3; row >= 1; row--)
 52.1464 +        dst[7 - row]= mb->intra4x4_pred_mode_cache[7 + 8*row];
 52.1465 +}
 52.1466 +
 52.1467 +static inline void write_back_non_zero_count(H264Cabac_spu *hc, EDSlice_spu *s){ // stores the cached counts of the current macroblock back into hc->non_zero_count[mb_x]
 52.1468 +    H264Mb *mb = s->m;
 52.1469 +    const int col= mb->mb_x;
 52.1470 +    // cache offset -> table offset: 8*1 -> 0, 8*2 -> 8, 8*5 -> 16, 4+8*3 -> 20, 8*4 -> 24
 52.1471 +    AV_COPY64(&hc->non_zero_count[col][ 0], &mb->non_zero_count_cache[0+8*1]);
 52.1472 +    AV_COPY64(&hc->non_zero_count[col][ 8], &mb->non_zero_count_cache[0+8*2]);
 52.1473 +    AV_COPY32(&hc->non_zero_count[col][16], &mb->non_zero_count_cache[0+8*5]);
 52.1474 +    AV_COPY32(&hc->non_zero_count[col][20], &mb->non_zero_count_cache[4+8*3]);
 52.1475 +    AV_COPY64(&hc->non_zero_count[col][24], &mb->non_zero_count_cache[0+8*4]);
 52.1476 +}
 52.1477 +
 52.1478 +static inline void write_back_motion(H264Cabac_spu *hc, EDSlice_spu *s, int mb_type){ // stores motion vectors, mvd values, reference indices and (B slices) direct flags of the current macroblock from the caches into the hc arrays
 52.1479 +    H264Mb *m = s->m;
 52.1480 +	const int mb_x = m->mb_x;
 52.1481 +    int b_stride = hc->b_stride;
 52.1482 +    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
 52.1483 +    const int b8_x= 4*m->mb_x;
 52.1484 +    int list;
 52.1485 +    // list 0 unused: its four reference indices are filled with LIST_NOT_USED
 52.1486 +    if(!USES_LIST(mb_type, 0))
 52.1487 +        fill_rectangle(&hc->ref_index[0][b8_x], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
 52.1488 +
 52.1489 +    for(list=0; list<s->list_count; list++){
 52.1490 +        int y;
 52.1491 +        int16_t (*mv_dst)[2];
 52.1492 +        int16_t (*mv_src)[2];
 52.1493 +
 52.1494 +        if(!USES_LIST(mb_type, list))
 52.1495 +            continue;
 52.1496 +        // four rows of motion vectors: cache rows are 8 entries apart, destination rows b_stride apart
 52.1497 +        mv_dst   = &hc->motion_val[list][b_x];
 52.1498 +        mv_src   = &m->mv_cache[list][scan8[0]];
 52.1499 +        for(y=0; y<4; y++){
 52.1500 +            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
 52.1501 +        }
 52.1502 +        { // mvd: cache row 3 goes to entries 0..3, cache column 3 of rows 0,1,2 goes to entries 6,5,4; all zero for skipped macroblocks
 52.1503 +            uint8_t (*mvd_dst)[2] = (void *) hc->mvd[list][8*mb_x];
 52.1504 +            uint8_t (*mvd_src)[2] = &hc->mvd_cache[list][scan8[0]];
 52.1505 +            if(IS_SKIP(mb_type))
 52.1506 +                AV_ZERO128(mvd_dst);
 52.1507 +            else{
 52.1508 +				AV_COPY64(mvd_dst, mvd_src + 8*3);
 52.1509 +                AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
 52.1510 +                AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
 52.1511 +                AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
 52.1512 +            }
 52.1513 +        }
 52.1514 +
 52.1515 +        { // one reference index per group of four blocks, read at scan8[0], [4], [8] and [12]
 52.1516 +            int8_t *ref_index = &hc->ref_index[list][b8_x];
 52.1517 +            ref_index[0+0*2]= m->ref_cache[list][scan8[0]];
 52.1518 +            ref_index[1+0*2]= m->ref_cache[list][scan8[4]];
 52.1519 +            ref_index[0+1*2]= m->ref_cache[list][scan8[8]];
 52.1520 +            ref_index[1+1*2]= m->ref_cache[list][scan8[12]];
 52.1521 +        }
 52.1522 +    }
 52.1523 +    // B slices with 8x8 partitioning: remember sub_mb_type>>1 for entries 1..3 (entry 0 is not written here)
 52.1524 +    if(s->slice_type_nos == FF_B_TYPE){
 52.1525 +        if(IS_8X8(mb_type)){
 52.1526 +            uint8_t *direct = &hc->direct[4*mb_x];
 52.1527 +            direct[1] = m->sub_mb_type[1]>>1;
 52.1528 +            direct[2] = m->sub_mb_type[2]>>1;
 52.1529 +            direct[3] = m->sub_mb_type[3]>>1;
 52.1530 +        }
 52.1531 +    }
 52.1532 +}
 52.1533 +
 52.1534 +static inline int get_dct8x8_allowed(EDSlice_spu *s){ // nonzero when no sub_mb_type lane carries one of the flags tested below
 52.1535 +    unsigned long long forbidden = MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8;
 52.1536 +    if(!s->direct_8x8_inference_flag)
 52.1537 +        forbidden |= MB_TYPE_DIRECT2; // the direct flag is only tested when direct_8x8_inference_flag is clear
 52.1538 +    // the mask is replicated into all four 16-bit lanes and tested against one 64-bit read of sub_mb_type
 52.1539 +    return !(AV_RN64A(s->m->sub_mb_type) & (forbidden*0x0001000100010001ULL));
 52.1540 +}
 52.1541 +
 52.1542 +static inline int fetch_diagonal_mv(EDSlice_spu *s, const int16_t **C, int i, int list, int part_width){
 52.1543 +    // Stores the diagonal neighbour's motion vector in *C and returns its reference.
 52.1544 +    H264Mb *mb = s->m;
 52.1545 +    int diag = i - 8 + part_width; // cache entry one row up, part_width entries to the right
 52.1546 +
 52.1547 +    // not available there: use the entry one row up and one to the left instead
 52.1548 +    if(mb->ref_cache[list][diag] == PART_NOT_AVAILABLE)
 52.1549 +        diag = i - 8 - 1;
 52.1550 +
 52.1551 +    *C= mb->mv_cache[list][diag];
 52.1552 +    return mb->ref_cache[list][diag];
 52.1553 +}
 52.1554 +
 52.1555 +/**
 52.1556 + * Computes the predicted motion vector of a partition from its left (A), top (B) and diagonal (C) neighbours in the caches.
 52.1557 + * @param n the block index
 52.1558 + * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 52.1559 + * @param list the reference list whose caches are read
 52.1560 + * @param ref the reference the neighbours are matched against
 52.1561 + * @param mx the x component of the predicted motion vector
 52.1562 + * @param my the y component of the predicted motion vector
 52.1563 + */
 52.1564 +static inline void pred_motion(EDSlice_spu *s, int n, int part_width, int list, int ref, int * const mx, int * const my){
 52.1565 +    H264Mb *mb = s->m;
 52.1566 +    const int pos = scan8[n];
 52.1567 +    const int ref_b = mb->ref_cache[list][ pos - 8 ];
 52.1568 +    const int ref_a = mb->ref_cache[list][ pos - 1 ];
 52.1569 +    const int16_t * const mv_a = mb->mv_cache[list][ pos - 1 ];
 52.1570 +    const int16_t * const mv_b = mb->mv_cache[list][ pos - 8 ];
 52.1571 +    const int16_t * mv_c;
 52.1572 +    int ref_c, hits;
 52.1573 +
 52.1574 +    assert(part_width==1 || part_width==2 || part_width==4);
 52.1575 +
 52.1576 +/* mv_cache
 52.1577 +  B . . A T T T T
 52.1578 +  U . . L . . , .
 52.1579 +  U . . L . . . .
 52.1580 +  U . . L . . , .
 52.1581 +  . . . L . . . .
 52.1582 +*/
 52.1583 +
 52.1584 +    ref_c= fetch_diagonal_mv(s, &mv_c, pos, list, part_width);
 52.1585 +    hits= (ref_c==ref) + (ref_b==ref) + (ref_a==ref);
 52.1586 +
 52.1587 +    if(hits == 1){
 52.1588 +        // exactly one neighbour uses ref: take its vector (left first, then top, then diagonal)
 52.1589 +        const int16_t *only = ref_a==ref ? mv_a : ref_b==ref ? mv_b : mv_c;
 52.1590 +        *mx= only[0];
 52.1591 +        *my= only[1];
 52.1592 +    }else if(hits == 0 && ref_b == PART_NOT_AVAILABLE && ref_c == PART_NOT_AVAILABLE && ref_a != PART_NOT_AVAILABLE){
 52.1593 +        // no match and only the left neighbour is available
 52.1594 +        *mx= mv_a[0];
 52.1595 +        *my= mv_a[1];
 52.1596 +    }else{
 52.1597 +        // several matches (most common) or none: component-wise median
 52.1598 +        *mx= mid_pred(mv_a[0], mv_b[0], mv_c[0]);
 52.1599 +        *my= mid_pred(mv_a[1], mv_b[1], mv_c[1]);
 52.1600 +    }
 52.1601 +}
 52.1610 +
 52.1611 +/**
 52.1612 + * Directional motion vector prediction for a 16x8 partition.
 52.1613 + * For n==0 the entry above block 0 is tried, otherwise the entry left of block 8; if it does not use ref, pred_motion() decides.
 52.1614 + * @param n the block index
 52.1615 + * @param mx the x component of the predicted motion vector
 52.1616 + * @param my the y component of the predicted motion vector
 52.1617 + */
 52.1618 +static inline void pred_16x8_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){
 52.1619 +    H264Mb *mb = s->m;
 52.1620 +    const int nb = n==0 ? scan8[0] - 8 : scan8[8] - 1;
 52.1621 +
 52.1622 +    if(mb->ref_cache[list][nb] == ref){
 52.1623 +        const int16_t * const cand = mb->mv_cache[list][nb];
 52.1624 +        *mx= cand[0];
 52.1625 +        *my= cand[1];
 52.1626 +        return;
 52.1627 +    }
 52.1628 +
 52.1629 +    //RARE
 52.1630 +    pred_motion(s, n, 4, list, ref, mx, my);
 52.1631 +}
 52.1642 +
 52.1643 +/**
 52.1644 + * Directional motion vector prediction for an 8x16 partition.
 52.1645 + * For n==0 the entry left of block 0 is tried, otherwise the diagonal neighbour of block 4; if it does not use ref, pred_motion() decides.
 52.1646 + * @param n the block index
 52.1647 + * @param mx the x component of the predicted motion vector
 52.1648 + * @param my the y component of the predicted motion vector
 52.1649 + */
 52.1650 +static inline void pred_8x16_motion(EDSlice_spu *s, int n, int list, int ref, int * const mx, int * const my){
 52.1651 +    H264Mb *mb = s->m;
 52.1652 +    const int16_t * cand;
 52.1653 +    int cand_ref;
 52.1654 +
 52.1655 +    if(n==0){
 52.1656 +        cand_ref= mb->ref_cache[list][ scan8[0] - 1 ];
 52.1657 +        cand    = mb->mv_cache [list][ scan8[0] - 1 ];
 52.1658 +    }else
 52.1659 +        cand_ref= fetch_diagonal_mv(s, &cand, scan8[4], list, 2);
 52.1660 +
 52.1661 +    if(cand_ref == ref){
 52.1662 +        *mx= cand[0];
 52.1663 +        *my= cand[1];
 52.1664 +        return;
 52.1665 +    }
 52.1666 +
 52.1667 +    //RARE
 52.1668 +    pred_motion(s, n, 2, list, ref, mx, my);
 52.1669 +}
 52.1675 +
 52.1676 +static inline void pred_pskip_motion(EDSlice_spu *s, int * const mx, int * const my){ // motion vector for a P_SKIP macroblock: zero in the cases below, otherwise pred_motion() for block 0, width 4, list 0, ref 0
 52.1677 +    H264Mb *mb = s->m;
 52.1678 +    const int above = scan8[0] - 8;
 52.1679 +    const int beside = scan8[0] - 1;
 52.1680 +    const int top_ref = mb->ref_cache[0][above];
 52.1681 +    const int left_ref= mb->ref_cache[0][beside];
 52.1682 +    // zero vector when a neighbour is not available ...
 52.1683 +    int use_zero = top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE;
 52.1684 +    // ... or when a neighbour has reference 0 together with an all-zero motion vector
 52.1685 +    if(!use_zero) use_zero = !( top_ref | AV_RN32A(mb->mv_cache[0][above]));
 52.1686 +    if(!use_zero) use_zero = !(left_ref | AV_RN32A(mb->mv_cache[0][beside]));
 52.1687 +    if(use_zero){
 52.1688 +        *my = 0;
 52.1689 +        *mx = 0;
 52.1690 +    }else
 52.1691 +        pred_motion(s, 0, 4, 0, 0, mx, my);
 52.1692 +}
 52.1693 +
 52.1694 +/**
 52.1695 + * decodes a P_SKIP or B_SKIP macroblock: clears its non-zero counts, derives the motion data, then writes back motion, mb_type and qscale and fills the filter caches
 52.1696 + */
 52.1697 +static void decode_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s){
 52.1698 +    H264Mb *m = s->m;
 52.1699 +	const int mb_x = m->mb_x;    // index used for the hc->non_zero_count, hc->mb_type and hc->qscale entries below
 52.1700 +    int mb_type=0;
 52.1701 +    // a skipped macroblock has no coefficients: clear the stored counts and the cache rows
 52.1702 +    memset(hc->non_zero_count[mb_x], 0, 32);
 52.1703 +    memset(m->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
 52.1704 +
 52.1705 +    if( s->slice_type_nos == FF_B_TYPE )
 52.1706 +    {
 52.1707 +        // just for fill_caches. pred_direct_motion will set the real mb_type
 52.1708 +        mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
 52.1709 +		fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ...
 52.1710 +
 52.1711 +        ff_h264_pred_direct_motion(hc, s, &mb_type);
 52.1712 +        mb_type|= MB_TYPE_SKIP;
 52.1713 +    }
 52.1714 +    else
 52.1715 +    {
 52.1716 +        int mx, my;
 52.1717 +        mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
 52.1718 +
 52.1719 +        fill_decode_caches(hc, s, mb_type); //FIXME check what is needed and what not ...
 52.1720 +        pred_pskip_motion(s, &mx, &my);
 52.1721 +        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
 52.1722 +        fill_rectangle(  m->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
 52.1723 +    }
 52.1724 +    // publish the result for this macroblock column; fill_filter_caches() reads hc->qscale[mb_x] and m->mb_type set here
 52.1725 +    write_back_motion(hc, s, mb_type);
 52.1726 +	hc->mb_type[mb_x]= mb_type;
 52.1727 +    m->mb_type = mb_type;
 52.1728 +    hc->qscale[mb_x]= s->qscale;
 52.1729 +    fill_filter_caches(hc, s, mb_type);
 52.1730 +}
 52.1731 +/* Decodes an intra mb_type index: 0 = I4x4, 25 = PCM, 1..24 = I16x16 variants. ctx_base selects the CABAC state group; a non-zero intra_slice adds neighbour-based context selection for the first bin. */
 52.1732 +static int decode_cabac_intra_mb_type(EDSlice_spu *s, CABACContext *c, int ctx_base, int intra_slice) {
 52.1733 +    H264Mb *m =s->m;
 52.1734 +    uint8_t *state= &c->cabac_state[ctx_base];
 52.1735 +    int mb_type;
 52.1736 +
 52.1737 +    if(intra_slice){
 52.1738 +        int ctx=0; /* counts left/top neighbours that are INTRA16x16 or INTRA_PCM */
 52.1739 +        if( m->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
 52.1740 +            ctx++;
 52.1741 +        if( m->top_type     & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
 52.1742 +            ctx++;
 52.1743 +        if( get_cabac_noinline( c, &state[ctx] ) == 0 )
 52.1744 +            return 0;   /* I4x4 */
 52.1745 +        state += 2; /* the first bin used state[0..2]; later bins index from the shifted base */
 52.1746 +    }else{
 52.1747 +        if( get_cabac_noinline( c, state ) == 0 )
 52.1748 +            return 0;   /* I4x4 */
 52.1749 +    }
 52.1750 +
 52.1751 +    if( get_cabac_terminate( c ) )
 52.1752 +        return 25;  /* PCM */
 52.1753 +
 52.1754 +    mb_type = 1; /* I16x16 */
 52.1755 +    mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */
 52.1756 +    if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */
 52.1757 +        mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] );
 52.1758 +    mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] ); /* the last two bins supply the low two bits of the index */
 52.1759 +    mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] );
 52.1760 +    return mb_type;
 52.1761 +}
 52.1762 +/* Reads the mb_skip flag bin and returns it. The context counts left/top neighbours that are not skipped (consulted only when mb_x>0 / mb_y>0) and is offset by 13 in B slices. */
 52.1763 +static int decode_cabac_mb_skip(H264Cabac_spu *hc, EDSlice_spu *s, H264Mb *m, CABACContext *c) {
 52.1764 +    int ctx = 0;
 52.1765 +    const int mb_x = m->mb_x;
 52.1766 +
 52.1767 +	if( m->mb_x>0 && !IS_SKIP( hc->mb_type[mb_x-1] )) /* left neighbour: previous column of the current row */
 52.1768 +        ctx++;
 52.1769 +	if( m->mb_y>0 && !IS_SKIP( hc->mb_type_top[mb_x] )) /* top neighbour: same column in mb_type_top */
 52.1770 +        ctx++;
 52.1771 +
 52.1772 +    if( s->slice_type_nos == FF_B_TYPE )
 52.1773 +        ctx += 13;
 52.1774 +    return get_cabac_noinline(c, &c->cabac_state[11+ctx] ); /* states 11..13, or 24..26 in B slices */
 52.1775 +}
 52.1776 +/* Decodes one intra 4x4 prediction mode. If the first bin (state 68) is set, pred_mode is returned unchanged; otherwise a 3-bit value is read LSB-first (state 69) and values >= pred_mode are shifted up by one. */
 52.1777 +static int decode_cabac_mb_intra4x4_pred_mode( CABACContext *c, int pred_mode ) {
 52.1778 +    int mode = 0;
 52.1779 +
 52.1780 +    if( get_cabac(c, &c->cabac_state[68] ) )
 52.1781 +        return pred_mode;
 52.1782 +
 52.1783 +    mode += 1 * get_cabac(c, &c->cabac_state[69] );
 52.1784 +    mode += 2 * get_cabac(c, &c->cabac_state[69] );
 52.1785 +    mode += 4 * get_cabac(c, &c->cabac_state[69] );
 52.1786 +
 52.1787 +    return mode + ( mode >= pred_mode ); /* the 3-bit code never names pred_mode itself, so skip over it */
 52.1788 +}
 52.1789 +/* Decodes the chroma prediction mode and returns it (0..3). The first-bin context is the number of left/top neighbours with a non-zero type whose stored chroma_pred_mode is non-zero; the remaining bins share state 64+3. */
 52.1790 +static int decode_cabac_mb_chroma_pre_mode(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) {
 52.1791 +    H264Mb *m = s->m;
 52.1792 +	const int mb_x = m->mb_x;
 52.1793 +
 52.1794 +    int ctx = 0;
 52.1795 +
 52.1796 +    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */
 52.1797 +    if( m->left_type && hc->chroma_pred_mode[mb_x-1] != 0 ) /* left_type == 0 short-circuits before mb_x-1 is indexed; presumably 0 means no left neighbour - confirm */
 52.1798 +        ctx++;
 52.1799 +
 52.1800 +    if( m->top_type     && hc->chroma_pred_mode_top[mb_x] != 0 )
 52.1801 +        ctx++;
 52.1802 +
 52.1803 +    if( get_cabac_noinline(c, &c->cabac_state[64+ctx] ) == 0 )
 52.1804 +        return 0;
 52.1805 +
 52.1806 +    if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 ) /* the rest is a unary code truncated at 3 */
 52.1807 +        return 1;
 52.1808 +    if( get_cabac_noinline(c, &c->cabac_state[64+3] ) == 0 )
 52.1809 +        return 2;
 52.1810 +    else
 52.1811 +        return 3;
 52.1812 +}
 52.1813 +/* Decodes the 4-bit luma coded block pattern, one bin per bit starting with bit 0. Each bin uses state 73 + ctx (ctx in 0..3), built from bits of hc->left_cbp / hc->top_cbp or from bits decoded earlier in this call. */
 52.1814 +static int decode_cabac_mb_cbp_luma(H264Cabac_spu *hc, CABACContext *c) {
 52.1815 +    int cbp_b, cbp_a, ctx, cbp = 0;
 52.1816 +
 52.1817 +    cbp_a = hc->left_cbp;
 52.1818 +    cbp_b = hc->top_cbp;
 52.1819 +
 52.1820 +    ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04); /* bit 0: left MB bit 1, top MB bit 2 */
 52.1821 +    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]);
 52.1822 +    ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08); /* bit 1: own bit 0, top MB bit 3 */
 52.1823 +    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 1;
 52.1824 +    ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01); /* bit 2: left MB bit 3, own bit 0 */
 52.1825 +    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 2;
 52.1826 +    ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02); /* bit 3: own bits 2 and 1 */
 52.1827 +    cbp += get_cabac_noinline(c, &c->cabac_state[73 + ctx]) << 3;
 52.1828 +    return cbp;
 52.1829 +}
 52.1830 +static int decode_cabac_mb_cbp_chroma(H264Cabac_spu *hc, CABACContext *c) { /* Decodes the chroma coded block pattern and returns 0, 1 or 2; contexts come from bits 4-5 of the left/top cbp. */
 52.1831 +    int ctx;
 52.1832 +    int cbp_a, cbp_b;
 52.1833 +
 52.1834 +    cbp_a = (hc->left_cbp>>4)&0x03;
 52.1835 +    cbp_b = (hc-> top_cbp>>4)&0x03;
 52.1836 +
 52.1837 +    ctx = 0; /* first bin: states 77..80, chosen by which neighbours have a non-zero chroma cbp */
 52.1838 +    if( cbp_a > 0 ) ctx++;
 52.1839 +    if( cbp_b > 0 ) ctx += 2;
 52.1840 +    if( get_cabac_noinline(c, &c->cabac_state[77 + ctx] ) == 0 )
 52.1841 +        return 0;
 52.1842 +
 52.1843 +    ctx = 4; /* second bin: states 81..84, chosen by which neighbours have chroma cbp == 2 */
 52.1844 +    if( cbp_a == 2 ) ctx++;
 52.1845 +    if( cbp_b == 2 ) ctx += 2;
 52.1846 +    return 1 + get_cabac_noinline(c, &c->cabac_state[77 + ctx] );
 52.1847 +}
 52.1848 +/* Decodes a P sub-macroblock type index (0..3) from up to three bins, using states 21, 22 and 23 in that order. */
 52.1849 +static int decode_cabac_p_mb_sub_type( CABACContext *c) {
 52.1850 +    if( get_cabac(c, &c->cabac_state[21] ) )
 52.1851 +        return 0;   /* 8x8 */
 52.1852 +    if( !get_cabac(c, &c->cabac_state[22] ) )
 52.1853 +        return 1;   /* 8x4 */
 52.1854 +    if( get_cabac(c, &c->cabac_state[23] ) )
 52.1855 +        return 2;   /* 4x8 */
 52.1856 +    return 3;       /* 4x4 */
 52.1857 +}
 52.1858 +static int decode_cabac_b_mb_sub_type(CABACContext *c) { /* Decodes a B sub-macroblock type index in 0..12 using states 36..39. */
 52.1859 +    int type;
 52.1860 +    if( !get_cabac(c, &c->cabac_state[36] ) )
 52.1861 +        return 0;   /* B_Direct_8x8 */
 52.1862 +    if( !get_cabac(c, &c->cabac_state[37] ) )
 52.1863 +        return 1 + get_cabac(c, &c->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
 52.1864 +    type = 3;
 52.1865 +    if( get_cabac(c, &c->cabac_state[38] ) ) {
 52.1866 +        if( get_cabac(c, &c->cabac_state[39] ) )
 52.1867 +            return 11 + get_cabac(c, &c->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
 52.1868 +        type += 4;
 52.1869 +    }
 52.1870 +    type += 2*get_cabac(c, &c->cabac_state[39] ); /* two more bins select within 3..6 or 7..10 */
 52.1871 +    type +=   get_cabac(c, &c->cabac_state[39] );
 52.1872 +    return type;
 52.1873 +}
 52.1874 +/* Decodes a reference index for block n of the given list as a unary code. Returns the index, or -1 (after printing to stderr) once 32 set bins have been read. */
 52.1875 +static int decode_cabac_mb_ref(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, int list, int n ) {
 52.1876 +    H264Mb *m = s->m;
 52.1877 +    int refa = m->ref_cache[list][scan8[n] - 1]; /* cache entry one position before scan8[n] (used as the left neighbour) */
 52.1878 +    int refb = m->ref_cache[list][scan8[n] - 8]; /* cache entry eight positions before scan8[n] (used as the top neighbour) */
 52.1879 +    int ref  = 0;
 52.1880 +    int ctx  = 0;
 52.1881 +
 52.1882 +    if( s->slice_type_nos == FF_B_TYPE) {
 52.1883 +        if( refa > 0 && !(hc->direct_cache[scan8[n] - 1]&(MB_TYPE_DIRECT2>>1)) ) /* in B slices a neighbour flagged in direct_cache does not count */
 52.1884 +            ctx++;
 52.1885 +        if( refb > 0 && !(hc->direct_cache[scan8[n] - 8]&(MB_TYPE_DIRECT2>>1)) )
 52.1886 +            ctx += 2;
 52.1887 +    } else {
 52.1888 +        if( refa > 0 )
 52.1889 +            ctx++;
 52.1890 +        if( refb > 0 )
 52.1891 +            ctx += 2;
 52.1892 +    }
 52.1893 +
 52.1894 +    while( get_cabac(c, &c->cabac_state[54+ctx] ) ) {
 52.1895 +        ref++;
 52.1896 +        ctx = (ctx>>2)+4; /* becomes 4 after the first bin and 5 for every later bin */
 52.1897 +        if(ref >= 32 /*h->ref_list[list]*/){
 52.1898 +			fprintf(stderr, "refcount %d\n", ref);
 52.1899 +            return -1;
 52.1900 +        }
 52.1901 +    }
 52.1902 +    return ref;
 52.1903 +}
 52.1904 +/* Decodes one motion vector difference component. amvd selects the first-bin context (DECODE_CABAC_MB_MVD passes the sum of two neighbouring mvd_cache entries); *mvda receives the decoded magnitude capped at 70. Returns 0 for a zero MVD, otherwise the result of get_cabac_bypass_sign(c, -mvd), or INT_MIN on overflow - in which case *mvda is NOT written. */
 52.1905 +static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) {
 52.1906 +    int mvd;
 52.1907 +
 52.1908 +    if(!get_cabac(c, &c->cabac_state[ctxbase+((amvd-3)>>(INT_BIT-1))+((amvd-33)>>(INT_BIT-1))+2])){
 52.1909 +//    if(!get_cabac(&h->cabac, &c->cabac_state[ctxbase+(amvd>2)+(amvd>32)])){
 52.1910 +        *mvda= 0;
 52.1911 +        return 0;
 52.1912 +    }
 52.1913 +
 52.1914 +    mvd= 1;
 52.1915 +    ctxbase+= 3; /* contexts for the unary prefix start three slots after the first-bin contexts */
 52.1916 +    while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) {
 52.1917 +        if( mvd < 4 )
 52.1918 +            ctxbase++; /* context advances for the first three prefix bins only */
 52.1919 +        mvd++;
 52.1920 +    }
 52.1921 +
 52.1922 +    if( mvd >= 9 ) { /* prefix saturated: read an Exp-Golomb suffix of order k (starting at 3) from bypass bins */
 52.1923 +        int k = 3;
 52.1924 +        while( get_cabac_bypass(c ) ) {
 52.1925 +            mvd += 1 << k;
 52.1926 +            k++;
 52.1927 +            if(k>24){
 52.1928 +                fprintf(stderr, "overflow in decode_cabac_mb_mvd\n");
 52.1929 +                return INT_MIN; /* NOTE(review): *mvda is left unset on this path; callers must check for INT_MIN */
 52.1930 +            }
 52.1931 +        }
 52.1932 +        while( k-- ) {
 52.1933 +            mvd += get_cabac_bypass(c )<<k;
 52.1934 +        }
 52.1935 +        *mvda=mvd < 70 ? mvd : 70;
 52.1936 +    }else
 52.1937 +        *mvda=mvd;
 52.1938 +    return get_cabac_bypass_sign(c, -mvd ); /* presumably reads the sign bin and returns +mvd or -mvd - confirm in cabac.h */
 52.1939 +}
 52.1940 +/* DECODE_CABAC_MB_MVD: decodes both MVD components for block n of the given list and adds them to the mx/my variables of the expansion site; mpx/mpy there receive the capped magnitudes. The context for each component is the sum of the two neighbouring mvd_cache entries. Expands to a return of -1 when decode_cabac_mb_mvd reports overflow, so it may only be used inside a function returning int. */
 52.1941 +#define DECODE_CABAC_MB_MVD( hc, c, list,  n )\
 52.1942 +{\
 52.1943 +    int amvd0 = hc->mvd_cache[list][scan8[n] - 1][0] +\
 52.1944 +                hc->mvd_cache[list][scan8[n] - 8][0];\
 52.1945 +    int amvd1 = hc->mvd_cache[list][scan8[n] - 1][1] +\
 52.1946 +                hc->mvd_cache[list][scan8[n] - 8][1];\
 52.1947 +    int mxd = decode_cabac_mb_mvd( c, 40, amvd0, &mpx );\
 52.1948 +    int myd = decode_cabac_mb_mvd( c, 47, amvd1, &mpy );\
 52.1949 +    if( mxd == INT_MIN || myd == INT_MIN ) return -1; /* INT_MIN is the overflow sentinel: mpx/mpy may be unset and adding it to mx/my would overflow */\
 52.1950 +    mx += mxd; my += myd; }
 52.1951 +/* Returns the coded_block_flag context offset ctx + 4*cat, where ctx gets +1 when the left neighbour value is > 0 and +2 when the top one is. DC blocks consult hc->left_cbp / hc->top_cbp (bit 0x100 for cat 0, bit 6+idx otherwise); all other blocks consult m->non_zero_count_cache. */
 52.1952 +static av_always_inline int get_cabac_cbf_ctx(H264Cabac_spu *hc, EDSlice_spu *s, int cat, int idx, int is_dc ) {
 52.1953 +    H264Mb *m = s->m;
 52.1954 +    int nza, nzb;
 52.1955 +    int ctx = 0;
 52.1956 +
 52.1957 +    if( is_dc ) {
 52.1958 +        if( cat == 0 ) {
 52.1959 +            nza = hc->left_cbp&0x100; /* same 0x100 bit that decode_cabac_residual_internal sets for a coded cat 0 DC block */
 52.1960 +            nzb = hc-> top_cbp&0x100;
 52.1961 +        } else {
 52.1962 +            nza = (hc->left_cbp>>(6+idx))&0x01; /* matches the 0x40 << n bit set for other coded DC blocks */
 52.1963 +            nzb = (hc-> top_cbp>>(6+idx))&0x01;
 52.1964 +        }
 52.1965 +    } else {
 52.1966 +        assert(cat == 1 || cat == 2 || cat == 4);
 52.1967 +        nza = m->non_zero_count_cache[scan8[idx] - 1];
 52.1968 +        nzb = m->non_zero_count_cache[scan8[idx] - 8];
 52.1969 +    }
 52.1970 +
 52.1971 +    if( nza > 0 )
 52.1972 +        ctx++;
 52.1973 +
 52.1974 +    if( nzb > 0 )
 52.1975 +        ctx += 2;
 52.1976 +
 52.1977 +    return ctx + 4 * cat;
 52.1978 +}
 52.1979 +/* Lookup tables used by decode_cabac_residual_internal to locate CABAC states for the significance map and the coefficient levels. */
 52.1980 + uint8_t last_coeff_flag_offset_8x8[63] = { /* last-flag context offset per coefficient position for cat 5 blocks. NOTE(review): neither static nor const, unlike the tables below - confirm external linkage is intended */
 52.1981 +    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 52.1982 +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 52.1983 +    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 52.1984 +    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
 52.1985 +};
 52.1986 +
 52.1987 +static const int significant_coeff_flag_offset[2][6] = { /* [row][cat] index into cabac_state; decode_cabac_residual_internal reads row 0 only */
 52.1988 +    { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
 52.1989 +    { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
 52.1990 +};
 52.1991 +static const int last_coeff_flag_offset[2][6] = { /* [row][cat] index into cabac_state; decode_cabac_residual_internal reads row 0 only */
 52.1992 +    { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
 52.1993 +    { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
 52.1994 +};
 52.1995 +static const int coeff_abs_level_m1_offset[6] = { /* [cat] index into cabac_state for the level contexts */
 52.1996 +    227+0, 227+10, 227+20, 227+30, 227+39, 426
 52.1997 +};
 52.1998 +static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { /* significance context offset per coefficient position for cat 5 blocks; decode_cabac_residual_internal reads row 0 only */
 52.1999 +    { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
 52.2000 +    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
 52.2001 +    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
 52.2002 +    12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
 52.2003 +    { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
 52.2004 +    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
 52.2005 +    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
 52.2006 +    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
 52.2007 +};
 52.2008 +/* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
 52.2009 +* 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
 52.2010 +* map node ctx => cabac ctx for level=1 */
 52.2011 +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
 52.2012 +/* map node ctx => cabac ctx for level>1 */
 52.2013 +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
 52.2014 +static const uint8_t coeff_abs_level_transition[2][8] = {
 52.2015 +    /* update node ctx after decoding a level=1 */
 52.2016 +    { 1, 2, 3, 3, 4, 5, 6, 7 },
 52.2017 +    /* update node ctx after decoding a level>1 */
 52.2018 +    { 4, 4, 4, 4, 5, 6, 7, 7 }
 52.2019 +};
 52.2020 +/* Decodes one residual block. Reads the coded_block_flag (skipped for non-DC cat 5), then the significance map into index[], records the result in hc->cbp or m->non_zero_count_cache, and finally decodes the levels from the last significant coefficient back to the first, storing them at block[scantable[pos]]. Non-DC levels are scaled by qmul and rounded with (x + 32) >> 6. Returns early, writing nothing to block, when the coded_block_flag is 0. */
 52.2021 +static av_always_inline void decode_cabac_residual_internal(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
 52.2022 +    H264Mb *m = s->m;
 52.2023 +	const int mb_x = m->mb_x;
 52.2024 +    int index[64]; /* positions of the significant coefficients, in the order they were found */
 52.2025 +
 52.2026 +    int av_unused last;
 52.2027 +    int coeff_count = 0;
 52.2028 +    int node_ctx = 0;
 52.2029 +
 52.2030 +    uint8_t *significant_coeff_ctx_base;
 52.2031 +    uint8_t *last_coeff_ctx_base;
 52.2032 +    uint8_t *abs_level_m1_ctx_base;
 52.2033 +
 52.2034 +    /* read coded block flag */
 52.2035 +    if( is_dc || cat != 5 ) {
 52.2036 +        if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( hc, s, cat, n, is_dc ) ] ) == 0 ) {
 52.2037 +            if( !is_dc )
 52.2038 +                m->non_zero_count_cache[scan8[n]] = 0;
 52.2039 +            return;
 52.2040 +        }
 52.2041 +    }
 52.2042 +
 52.2043 +    significant_coeff_ctx_base = c->cabac_state
 52.2044 +        + significant_coeff_flag_offset[0][cat];
 52.2045 +    last_coeff_ctx_base = c->cabac_state
 52.2046 +        + last_coeff_flag_offset[0][cat];
 52.2047 +    abs_level_m1_ctx_base = c->cabac_state
 52.2048 +        + coeff_abs_level_m1_offset[cat];
 52.2049 +
 52.2050 +    if( !is_dc && cat == 5 ) {
 52.2051 +#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
 52.2052 +        for(last= 0; last < coefs; last++) { \
 52.2053 +            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
 52.2054 +            if( get_cabac( c, sig_ctx )) { \
 52.2055 +                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
 52.2056 +                index[coeff_count++] = last; \
 52.2057 +                if( get_cabac( c, last_ctx ) ) { \
 52.2058 +                    last= max_coeff; \
 52.2059 +                    break; \
 52.2060 +                } \
 52.2061 +            } \
 52.2062 +        }\
 52.2063 +        if( last == max_coeff -1 ) {\
 52.2064 +            index[coeff_count++] = last;\
 52.2065 +        }\
 52.2066 +		
 52.2067 +        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0];
 52.2068 +        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] ); /* cat 5: per-position context offsets come from the 8x8 tables */
 52.2069 +    } else {
 52.2070 +        DECODE_SIGNIFICANCE( max_coeff - 1, last, last ); /* other categories: the context offset is the coefficient position itself */
 52.2071 +    }
 52.2072 +    assert(coeff_count > 0);
 52.2073 +
 52.2074 +    if( is_dc ) {
 52.2075 +        if( cat == 0 )
 52.2076 +            hc->cbp[mb_x] |= 0x100; /* bit read back by get_cabac_cbf_ctx for cat 0 DC blocks */
 52.2077 +        else
 52.2078 +            hc->cbp[mb_x] |= 0x40 << n; /* bit 6+n, read back by get_cabac_cbf_ctx for other DC blocks */
 52.2079 +    } else {
 52.2080 +        if( cat == 5 )
 52.2081 +            fill_rectangle(&m->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
 52.2082 +        else {
 52.2083 +            assert( cat == 1 || cat == 2 || cat == 4 );
 52.2084 +            m->non_zero_count_cache[scan8[n]] = coeff_count;
 52.2085 +        }
 52.2086 +    }
 52.2087 +
 52.2088 +    do { /* levels are decoded in reverse: index[] is consumed from its end */
 52.2089 +        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
 52.2090 +        int j= scantable[index[--coeff_count]];
 52.2091 +
 52.2092 +        if( get_cabac( c, ctx ) == 0 ) { /* magnitude is exactly 1 */
 52.2093 +            node_ctx = coeff_abs_level_transition[0][node_ctx];
 52.2094 +            if( is_dc ) {
 52.2095 +                block[j] = get_cabac_bypass_sign( c, -1);
 52.2096 +            }else{
 52.2097 +                block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6;
 52.2098 +            }
 52.2099 +        } else {
 52.2100 +            int coeff_abs = 2;
 52.2101 +            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
 52.2102 +            node_ctx = coeff_abs_level_transition[1][node_ctx];
 52.2103 +
 52.2104 +            while( coeff_abs < 15 && get_cabac( c, ctx ) ) {
 52.2105 +                coeff_abs++;
 52.2106 +            }
 52.2107 +
 52.2108 +            if( coeff_abs >= 15 ) {
 52.2109 +                int j = 0; /* shadows the outer j inside this block only; counts the set bypass bins of the escape prefix */
 52.2110 +                while( get_cabac_bypass( c ) ) { /* NOTE(review): no upper bound on the prefix length here, unlike decode_cabac_mb_mvd */
 52.2111 +                    j++;
 52.2112 +                }
 52.2113 +
 52.2114 +                coeff_abs=1;
 52.2115 +                while( j-- ) {
 52.2116 +                    coeff_abs += coeff_abs + get_cabac_bypass( c ); /* shift left by one and append the next bypass bin */
 52.2117 +                }
 52.2118 +                coeff_abs+= 14;
 52.2119 +            }
 52.2120 +
 52.2121 +            if( is_dc ) {
 52.2122 +                block[j] = get_cabac_bypass_sign( c, -coeff_abs );
 52.2123 +            }else{
 52.2124 +                block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6;
 52.2125 +            }
 52.2126 +        }
 52.2127 +    } while( coeff_count );
 52.2128 +
 52.2129 +}
 52.2130 +/* DC entry point: forwards to decode_cabac_residual_internal with qmul = NULL and is_dc = 1, so levels are stored without scaling. */
 52.2131 +static void decode_cabac_residual_dc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
 52.2132 +    decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, NULL, max_coeff, 1);
 52.2133 +}
 52.2134 +/* Non-DC entry point: forwards to decode_cabac_residual_internal with the caller's qmul table and is_dc = 0, so levels are scaled by qmul and rounded. */
 52.2135 +static void decode_cabac_residual_nondc( H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
 52.2136 +    decode_cabac_residual_internal( hc, s, c, block, cat, n, scantable, qmul, max_coeff, 0);
 52.2137 +}
 52.2138 +
 52.2139 +/**
 52.2140 + * decodes a macroblock
 52.2141 + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
 52.2142 + */
 52.2143 +int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c) {
 52.2144 +    H264Mb *m = s->m;
 52.2145 +	int mb_x = m->mb_x;
 52.2146 +    int mb_type, partition_count, cbp = 0;
 52.2147 +    int dct8x8_allowed= s->pps.transform_8x8_mode;
 52.2148 +
 52.2149 +    fill_decode_neighbors(hc, s);
 52.2150 +	memset(m->mb, 0 , sizeof(m->mb));
 52.2151 +
 52.2152 +    if( s->slice_type_nos != FF_I_TYPE ) {
 52.2153 +        int skip;
 52.2154 +        /* a skipped mb needs the aff flag from the following mb */
 52.2155 +        skip = decode_cabac_mb_skip( hc, s, m, c);
 52.2156 +		
 52.2157 +        /* read skip flags */
 52.2158 +        if( skip ) {
 52.2159 +            decode_mb_skip(hc, s);
 52.2160 +            hc->cbp[mb_x] = m->cbp = 0;
 52.2161 +            hc->chroma_pred_mode[mb_x] = 0;
 52.2162 +            s->last_qscale_diff = 0;
 52.2163 +            return 0;
 52.2164 +        }
 52.2165 +    }
 52.2166 +
 52.2167 +    if( s->slice_type_nos == FF_B_TYPE ) {
 52.2168 +        int ctx = 0;
 52.2169 +
 52.2170 +        if( !IS_DIRECT( m->left_type-1 ) )
 52.2171 +            ctx++;
 52.2172 +        if( !IS_DIRECT( m->top_type-1 ) )
 52.2173 +            ctx++;
 52.2174 +
 52.2175 +        if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){
 52.2176 +            mb_type= 0; /* B_Direct_16x16 */
 52.2177 +        }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) {
 52.2178 +            mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */
 52.2179 +        }else{
 52.2180 +            int bits;
 52.2181 +            bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3;
 52.2182 +            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2;
 52.2183 +            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1;
 52.2184 +            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] );
 52.2185 +            if( bits < 8 ){
 52.2186 +                mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
 52.2187 +            }else if( bits == 13 ){
 52.2188 +                mb_type= decode_cabac_intra_mb_type(s, c, 32, 0);
 52.2189 +                goto decode_intra_mb;
 52.2190 +            }else if( bits == 14 ){
 52.2191 +                mb_type= 11; /* B_L1_L0_8x16 */
 52.2192 +            }else if( bits == 15 ){
 52.2193 +                mb_type= 22; /* B_8x8 */
 52.2194 +            }else{
 52.2195 +                bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] );
 52.2196 +                mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
 52.2197 +            }
 52.2198 +        }
 52.2199 +            partition_count= b_mb_type_info[mb_type].partition_count;
 52.2200 +            mb_type=         b_mb_type_info[mb_type].type;
 52.2201 +    } else if( s->slice_type_nos == FF_P_TYPE ) {
 52.2202 +        if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) {
 52.2203 +            /* P-type */
 52.2204 +            if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) {
 52.2205 +                /* P_L0_D16x16, P_8x8 */
 52.2206 +                mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] );
 52.2207 +            } else {
 52.2208 +                /* P_L0_D8x16, P_L0_D16x8 */
 52.2209 +                mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] );
 52.2210 +            }
 52.2211 +            partition_count= p_mb_type_info[mb_type].partition_count;
 52.2212 +            mb_type=         p_mb_type_info[mb_type].type;
 52.2213 +        } else {
 52.2214 +            mb_type= decode_cabac_intra_mb_type(s, c, 17, 0);
 52.2215 +            goto decode_intra_mb;
 52.2216 +        }
 52.2217 +    } else {
 52.2218 +        mb_type= decode_cabac_intra_mb_type(s ,c, 3, 1);
 52.2219 +        if(s->slice_type == FF_SI_TYPE && mb_type)
 52.2220 +            mb_type--;
 52.2221 +        assert(s->slice_type_nos == FF_I_TYPE);
 52.2222 +decode_intra_mb:
 52.2223 +        partition_count = 0;
 52.2224 +        cbp= i_mb_type_info[mb_type].cbp;
 52.2225 +        m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
 52.2226 +        mb_type= i_mb_type_info[mb_type].type;
 52.2227 +    }
 52.2228 +	
 52.2229 +    if(IS_INTRA_PCM(mb_type)) {
 52.2230 +        uint8_t *ptr;
 52.2231 +        // We assume these blocks are very rare so we do not optimize it.
 52.2232 +        // FIXME The two following lines get the bitstream position in the cabac
 52.2233 +        // decode, I think it should be done by a function in cabac.h (or cabac.c).
 52.2234 +        ptr=c->bytestream;
 52.2235 +        if(c->low&0x1) ptr--;
 52.2236 +        if(CABAC_BITS==16){
 52.2237 +            if(c->low&0x1FF) ptr--;
 52.2238 +        }
 52.2239 +		if ((unsigned) (ptr + 384) >= (unsigned) c->bytestream_end){
 52.2240 +			fprintf(stderr, "Intra PCM mb crossed bytestream buffer\n Known issue.");
 52.2241 +		}		
 52.2242 +		
 52.2243 +        // The pixels are stored in the same order as levels in h->mb array.
 52.2244 +        memcpy(m->mb, ptr, 256); ptr+=256;        
 52.2245 +		memcpy(m->mb+128, ptr, 128); ptr+=128;
 52.2246 +        
 52.2247 +		c->bytestream = ptr;
 52.2248 +		#if CABAC_BITS == 16
 52.2249 +		c->low =  (*c->bytestream++)<<18;
 52.2250 +		c->low+=  (*c->bytestream++)<<10;
 52.2251 +		#else
 52.2252 +		c->low =  (*c->bytestream++)<<10;
 52.2253 +		#endif
 52.2254 +		c->low+= ((*c->bytestream++)<<2) + 2;
 52.2255 +		c->range= 0x1FE;
 52.2256 +
 52.2257 +        // All blocks are present
 52.2258 +        hc->cbp[mb_x] = 0x1ef;
 52.2259 +        hc->chroma_pred_mode[mb_x] = 0;
 52.2260 +        // In deblocking, the quantizer is 0
 52.2261 +        hc->qscale[mb_x]= 0;
 52.2262 +        // All coeffs are present
 52.2263 +        memset(hc->non_zero_count[mb_x], 16, 32);
 52.2264 +		hc->mb_type[mb_x]= m->mb_type = mb_type;
 52.2265 +        s->last_qscale_diff = 0;
 52.2266 +        fill_filter_caches(hc, s, mb_type);
 52.2267 +        return 0;
 52.2268 +    }
 52.2269 +    fill_decode_caches(hc, s, mb_type);
 52.2270 +
 52.2271 +    if( IS_INTRA( mb_type ) ) {
 52.2272 +        int i, pred_mode;
 52.2273 +        if( IS_INTRA4x4( mb_type ) ) {
 52.2274 +            if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] ) ) {
 52.2275 +                mb_type |= MB_TYPE_8x8DCT;
 52.2276 +                for( i = 0; i < 16; i+=4 ) {
 52.2277 +                    int pred = pred_intra_mode( s, i );
 52.2278 +                    int mode = decode_cabac_mb_intra4x4_pred_mode(c, pred );
 52.2279 +                    fill_rectangle( &m->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
 52.2280 +                }
 52.2281 +            } else {
 52.2282 +                for( i = 0; i < 16; i++ ) {
 52.2283 +                    int pred = pred_intra_mode( s, i );
 52.2284 +                    m->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode(c, pred );
 52.2285 +                }
 52.2286 +            }
 52.2287 +            write_back_intra_pred_mode(hc, s);
 52.2288 +            if( check_intra4x4_pred_mode(s) < 0 ) return -1;
 52.2289 +        } else {
 52.2290 +            m->intra16x16_pred_mode= check_intra_pred_mode(s, m->intra16x16_pred_mode );
 52.2291 +            if( m->intra16x16_pred_mode < 0 ) return -1;
 52.2292 +        }
 52.2293 +
 52.2294 +		hc->chroma_pred_mode[mb_x] =
 52.2295 +		pred_mode                        = decode_cabac_mb_chroma_pre_mode( hc, s, c );
 52.2296 +
 52.2297 +		pred_mode= check_intra_pred_mode( s, pred_mode );
 52.2298 +		if( pred_mode < 0 ) return -1;
 52.2299 +		m->chroma_pred_mode= pred_mode;
 52.2300 +	
 52.2301 +    } else if( partition_count == 4 ) {
 52.2302 +        int i, j, sub_partition_count[4], list, ref[2][4];
 52.2303 +
 52.2304 +        if( s->slice_type_nos == FF_B_TYPE ) {
 52.2305 +            for( i = 0; i < 4; i++ ) {
 52.2306 +                m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c );
 52.2307 +                sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
 52.2308 +                m->sub_mb_type[i]=      b_sub_mb_type_info[ m->sub_mb_type[i] ].type;
 52.2309 +            }
 52.2310 +            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
 52.2311 +                          m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
 52.2312 +                ff_h264_pred_direct_motion(hc, s, &mb_type);
 52.2313 +                m->ref_cache[0][scan8[4]] =
 52.2314 +                m->ref_cache[1][scan8[4]] =
 52.2315 +                m->ref_cache[0][scan8[12]] =
 52.2316 +                m->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
 52.2317 +                    for( i = 0; i < 4; i++ )
 52.2318 +                        fill_rectangle( &hc->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 );
 52.2319 +            }
 52.2320 +        } else {
 52.2321 +            for( i = 0; i < 4; i++ ) {
 52.2322 +                m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c );
 52.2323 +                sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
 52.2324 +                m->sub_mb_type[i]=      p_sub_mb_type_info[ m->sub_mb_type[i] ].type;
 52.2325 +            }
 52.2326 +        }
 52.2327 +
 52.2328 +        for( list = 0; list < s->list_count; list++ ) {
 52.2329 +            for( i = 0; i < 4; i++ ) {
 52.2330 +                if(IS_DIRECT(m->sub_mb_type[i])) continue;
 52.2331 +                if(IS_DIR(m->sub_mb_type[i], 0, list)){
 52.2332 +                    if( s->ref_count[list] > 1 ){
 52.2333 +                        ref[list][i] = decode_cabac_mb_ref(hc, s, c, list, 4*i );
 52.2334 +                        if(ref[list][i] >= s->ref_count[list]){
 52.2335 +                            fprintf(stderr, "Reference %d >= %d\n", ref[list][i], s->ref_count[list]);
 52.2336 +                            return -1;
 52.2337 +                        }
 52.2338 +                    }else
 52.2339 +                        ref[list][i] = 0;
 52.2340 +                } else {
 52.2341 +                    ref[list][i] = -1;
 52.2342 +                }
 52.2343 +                                                    m->ref_cache[list][ scan8[4*i]+1 ]=
 52.2344 +                m->ref_cache[list][ scan8[4*i]+8 ]=m->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
 52.2345 +            }
 52.2346 +        }
 52.2347 +
 52.2348 +        if(dct8x8_allowed)
 52.2349 +            dct8x8_allowed = get_dct8x8_allowed(s);
 52.2350 +
 52.2351 +        for(list=0; list<s->list_count; list++){
 52.2352 +            for(i=0; i<4; i++){
 52.2353 +                m->ref_cache[list][ scan8[4*i]   ]=m->ref_cache[list][ scan8[4*i]+1 ];
 52.2354 +                if(IS_DIRECT(m->sub_mb_type[i])){
 52.2355 +                    fill_rectangle(hc->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
 52.2356 +                    continue;
 52.2357 +                }
 52.2358 +
 52.2359 +                if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){
 52.2360 +                    const int sub_mb_type= m->sub_mb_type[i];
 52.2361 +                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
 52.2362 +                    for(j=0; j<sub_partition_count[i]; j++){
 52.2363 +                        int mpx, mpy;
 52.2364 +                        int mx, my;
 52.2365 +                        const int index= 4*i + block_width*j;
 52.2366 +                        int16_t (* mv_cache)[2]= &m->mv_cache[list][ scan8[index]];
 52.2367 +                        uint8_t (* mvd_cache)[2]= &hc->mvd_cache[list][ scan8[index]];
 52.2368 +                        pred_motion(s, index, block_width, list, m->ref_cache[list][ scan8[index] ], &mx, &my);
 52.2369 +                        DECODE_CABAC_MB_MVD( hc, c, list, index)
 52.2370 +
 52.2371 +                        if(IS_SUB_8X8(sub_mb_type)){
 52.2372 +                            mv_cache[ 1 ][0]=
 52.2373 +                            mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
 52.2374 +                            mv_cache[ 1 ][1]=
 52.2375 +                            mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
 52.2376 +
 52.2377 +                            mvd_cache[ 1 ][0]=
 52.2378 +                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx;
 52.2379 +                            mvd_cache[ 1 ][1]=
 52.2380 +                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy;
 52.2381 +                        }else if(IS_SUB_8X4(sub_mb_type)){
 52.2382 +                            mv_cache[ 1 ][0]= mx;
 52.2383 +                            mv_cache[ 1 ][1]= my;
 52.2384 +
 52.2385 +                            mvd_cache[ 1 ][0]=  mpx;
 52.2386 +                            mvd_cache[ 1 ][1]= mpy;
 52.2387 +                        }else if(IS_SUB_4X8(sub_mb_type)){
 52.2388 +                            mv_cache[ 8 ][0]= mx;
 52.2389 +                            mv_cache[ 8 ][1]= my;
 52.2390 +
 52.2391 +                            mvd_cache[ 8 ][0]= mpx;
 52.2392 +                            mvd_cache[ 8 ][1]= mpy;
 52.2393 +                        }
 52.2394 +                        mv_cache[ 0 ][0]= mx;
 52.2395 +                        mv_cache[ 0 ][1]= my;
 52.2396 +
 52.2397 +                        mvd_cache[ 0 ][0]= mpx;
 52.2398 +                        mvd_cache[ 0 ][1]= mpy;
 52.2399 +                    }
 52.2400 +                }else{
 52.2401 +                    fill_rectangle(m->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4);
 52.2402 +                    fill_rectangle(hc->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
 52.2403 +                }
 52.2404 +            }
 52.2405 +        }
 52.2406 +    } else if( IS_DIRECT(mb_type) ) {
 52.2407 +		ff_h264_pred_direct_motion(hc, s, &mb_type);
 52.2408 +        fill_rectangle(hc->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
 52.2409 +        fill_rectangle(hc->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
 52.2410 +        dct8x8_allowed &= s->direct_8x8_inference_flag;
 52.2411 +    } else {
 52.2412 +        int list, i;
 52.2413 +        if(IS_16X16(mb_type)){
 52.2414 +            for(list=0; list<s->list_count; list++){
 52.2415 +                if(IS_DIR(mb_type, 0, list)){
 52.2416 +                    int ref;
 52.2417 +                    if(s->ref_count[list] > 1){
 52.2418 +                        ref= decode_cabac_mb_ref(hc, s, c, list, 0);
 52.2419 +                        if(ref >= s->ref_count[list]){
 52.2420 +                            fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
 52.2421 +                            return -1;
 52.2422 +                        }
 52.2423 +                    }else
 52.2424 +                        ref=0;
 52.2425 +                        fill_rectangle(&m->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
 52.2426 +                }
 52.2427 +            }
 52.2428 +            for(list=0; list<s->list_count; list++){
 52.2429 +                if(IS_DIR(mb_type, 0, list)){
 52.2430 +                    int mx,my,mpx,mpy;
 52.2431 +                    pred_motion(s, 0, 4, list, m->ref_cache[list][ scan8[0] ], &mx, &my);
 52.2432 +                    DECODE_CABAC_MB_MVD( hc, c, list, 0)
 52.2433 +
 52.2434 +                    fill_rectangle(hc->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
 52.2435 +                    fill_rectangle(m->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
 52.2436 +                }
 52.2437 +
 52.2438 +            }
 52.2439 +        }
 52.2440 +        else if(IS_16X8(mb_type)){
 52.2441 +            for(list=0; list<s->list_count; list++){
 52.2442 +                    for(i=0; i<2; i++){
 52.2443 +                        if(IS_DIR(mb_type, i, list)){
 52.2444 +                            int ref;
 52.2445 +                            if(s->ref_count[list] > 1){
 52.2446 +                                ref= decode_cabac_mb_ref(hc, s, c, list, 8*i );
 52.2447 +                                if(ref >= s->ref_count[list]){
 52.2448 +                                    fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
 52.2449 +                                    return -1;
 52.2450 +                                }
 52.2451 +                            }else
 52.2452 +                                ref=0;
 52.2453 +                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
 52.2454 +                        }else
 52.2455 +                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
 52.2456 +                    }
 52.2457 +            }
 52.2458 +            for(list=0; list<s->list_count; list++){
 52.2459 +                for(i=0; i<2; i++){
 52.2460 +                    if(IS_DIR(mb_type, i, list)){
 52.2461 +                        int mx,my,mpx,mpy;
 52.2462 +                        pred_16x8_motion(s, 8*i, list, m->ref_cache[list][scan8[0] + 16*i], &mx, &my);
 52.2463 +                        DECODE_CABAC_MB_MVD( hc, c, list, 8*i)
 52.2464 +
 52.2465 +                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
 52.2466 +                        fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
 52.2467 +                    }else{
 52.2468 +                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
 52.2469 +                        fill_rectangle(m->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
 52.2470 +                    }
 52.2471 +                }
 52.2472 +            }
 52.2473 +        }else{
 52.2474 +            assert(IS_8X16(mb_type));
 52.2475 +            for(list=0; list<s->list_count; list++){
 52.2476 +                    for(i=0; i<2; i++){
 52.2477 +                        if(IS_DIR(mb_type, i, list)){ //FIXME optimize
 52.2478 +                            int ref;
 52.2479 +                            if(s->ref_count[list] > 1){
 52.2480 +                                ref= decode_cabac_mb_ref(hc, s, c, list, 4*i );
 52.2481 +                                if(ref >= s->ref_count[list]){
 52.2482 +                                    fprintf(stderr, "Reference %d >= %d\n", ref, s->ref_count[list]);
 52.2483 +                                    return -1;
 52.2484 +                                }
 52.2485 +                            }else
 52.2486 +                                ref=0;
 52.2487 +                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
 52.2488 +                        }else
 52.2489 +                            fill_rectangle(&m->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
 52.2490 +                    }
 52.2491 +            }
 52.2492 +            for(list=0; list<s->list_count; list++){
 52.2493 +                for(i=0; i<2; i++){
 52.2494 +                    if(IS_DIR(mb_type, i, list)){
 52.2495 +                        int mx,my,mpx,mpy;
 52.2496 +                        pred_8x16_motion( s, i*4, list, m->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
 52.2497 +                        DECODE_CABAC_MB_MVD( hc, c, list, 4*i)
 52.2498 +
 52.2499 +                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
 52.2500 +                        fill_rectangle(m->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
 52.2501 +                    }else{
 52.2502 +                        fill_rectangle(hc->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
 52.2503 +                        fill_rectangle(m-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
 52.2504 +                    }
 52.2505 +                }
 52.2506 +            }
 52.2507 +        }
 52.2508 +    }
 52.2509 +	
 52.2510 +	if( IS_INTER( mb_type ) ) {
 52.2511 +			hc->chroma_pred_mode[mb_x] = 0;
 52.2512 +			write_back_motion( hc, s, mb_type );
 52.2513 +	}
 52.2514 +
 52.2515 +    if( !IS_INTRA16x16( mb_type ) ) {
 52.2516 +        cbp  = decode_cabac_mb_cbp_luma( hc, c);
 52.2517 +		cbp |= decode_cabac_mb_cbp_chroma( hc, c ) << 4;
 52.2518 +    }
 52.2519 +	
 52.2520 +    hc->cbp[mb_x] = m->cbp = cbp;
 52.2521 +    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
 52.2522 +        mb_type |= MB_TYPE_8x8DCT * get_cabac_noinline(c, &c->cabac_state[399 + hc->neighbor_transform_size] );
 52.2523 +    }
 52.2524 +
 52.2525 +    if( cbp || IS_INTRA16x16( mb_type ) ) {
 52.2526 +        const uint8_t *scan, *scan8x8, *dc_scan;
 52.2527 +        const uint32_t *qmul;
 52.2528 +
 52.2529 +        if (s->transform_bypass && s->qscale){
 52.2530 +            scan8x8= ff_zigzag_direct;
 52.2531 +            scan= zigzag_scan;
 52.2532 +        }else{
 52.2533 +            scan8x8= hc->zigzag_scan8x8;
 52.2534 +            scan= hc->zigzag_scan;
 52.2535 +        }
 52.2536 +        dc_scan= luma_dc_zigzag_scan;
 52.2537 +
 52.2538 +        // decode_cabac_mb_dqp
 52.2539 +        if(get_cabac_noinline(c, &c->cabac_state[60 + (s->last_qscale_diff != 0)])){
 52.2540 +            int val = 1;
 52.2541 +            int ctx= 2;
 52.2542 +
 52.2543 +            while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) {
 52.2544 +                ctx= 3;
 52.2545 +                val++;
 52.2546 +                if(val > 102){ //prevent infinite loop
 52.2547 +                    fprintf(stderr, "cabac decode of qscale diff failed at %d %d (%d)\n", m->mb_x, m->mb_y, val);
 52.2548 +                    return -1;
 52.2549 +                }
 52.2550 +            }
 52.2551 +
 52.2552 +            if( val&0x01 )
 52.2553 +                val=   (val + 1)>>1 ;
 52.2554 +            else
 52.2555 +                val= -((val + 1)>>1);
 52.2556 +            s->last_qscale_diff = val;
 52.2557 +            s->qscale += val;
 52.2558 +            if(((unsigned)s->qscale) > 51){
 52.2559 +                if(s->qscale<0) s->qscale+= 52;
 52.2560 +                else            s->qscale-= 52;
 52.2561 +            }
 52.2562 +            s->chroma_qp[0] = s->pps.chroma_qp_table[0][s->qscale];
 52.2563 +            s->chroma_qp[1] = s->pps.chroma_qp_table[1][s->qscale];
 52.2564 +        }else
 52.2565 +            s->last_qscale_diff=0;
 52.2566 +
 52.2567 +        if( IS_INTRA16x16( mb_type ) ) {
 52.2568 +            int i;            
 52.2569 +            decode_cabac_residual_dc( hc, s, c, m->mb, 0, 0, dc_scan, 16);
 52.2570 +
 52.2571 +            if( cbp&15 ) {
 52.2572 +                qmul = hc->dequant4_coeff[0][s->qscale];
 52.2573 +                for( i = 0; i < 16; i++ ) {                    
 52.2574 +                    decode_cabac_residual_nondc( hc, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15);
 52.2575 +                }
 52.2576 +            } else {
 52.2577 +                fill_rectangle(&m->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
 52.2578 +            }
 52.2579 +        } else {
 52.2580 +            int i8x8, i4x4;
 52.2581 +            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
 52.2582 +                if( cbp & (1<<i8x8) ) {
 52.2583 +                    if( IS_8x8DCT(mb_type) ) {
 52.2584 +                        decode_cabac_residual_nondc(hc, s, c, m->mb + 64*i8x8, 5, 4*i8x8,
 52.2585 +                            scan8x8, hc->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
 52.2586 +                    } else {
 52.2587 +                        qmul = hc->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
 52.2588 +                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
 52.2589 +                            const int index = 4*i8x8 + i4x4;                            
 52.2590 +//START_TIMER
 52.2591 +                            decode_cabac_residual_nondc(hc, s, c, m->mb + 16*index, 2, index, scan, qmul, 16);
 52.2592 +//STOP_TIMER("decode_residual")
 52.2593 +                        }
 52.2594 +                    }
 52.2595 +                } else {
 52.2596 +                    uint8_t * const nnz= &m->non_zero_count_cache[ scan8[4*i8x8] ];
 52.2597 +                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
 52.2598 +                }
 52.2599 +            }
 52.2600 +        }
 52.2601 +
 52.2602 +        if( cbp&0x30 ){
 52.2603 +            int i;
 52.2604 +            for( i = 0; i < 2; i++ ) {                
 52.2605 +                decode_cabac_residual_dc(hc, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4);
 52.2606 +            }
 52.2607 +        }
 52.2608 +
 52.2609 +        if( cbp&0x20 ) {
 52.2610 +            int i, j;
 52.2611 +            for( i = 0; i < 2; i++ ) {
 52.2612 +                qmul = hc->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][s->chroma_qp[i]];
 52.2613 +                for( j = 0; j < 4; j++ ) {
 52.2614 +                    const int index = 16 + 4 * i + j;                    
 52.2615 +                    decode_cabac_residual_nondc( hc, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15);
 52.2616 +                }
 52.2617 +            }
 52.2618 +        } else {
 52.2619 +            uint8_t * const nnz= &m->non_zero_count_cache[0];
 52.2620 +            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
 52.2621 +            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
 52.2622 +        }
 52.2623 +    } else {
 52.2624 +        uint8_t * const nnz= &m->non_zero_count_cache[0];
 52.2625 +        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
 52.2626 +        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
 52.2627 +        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
 52.2628 +        s->last_qscale_diff = 0;
 52.2629 +    }
 52.2630 +	hc->mb_type[mb_x]= m->mb_type = mb_type;
 52.2631 +    hc->qscale[mb_x]= s->qscale;	
 52.2632 +    write_back_non_zero_count(hc, s);
 52.2633 +    fill_filter_caches(hc, s, mb_type);
 52.2634 +
 52.2635 +    return 0;
 52.2636 +}

    53.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    53.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_cabac_spu.h	Mon Aug 27 12:09:56 2012 +0200
    53.3 @@ -0,0 +1,17 @@
    53.4 +#ifndef H264_CABAC_H
    53.5 +#define H264_CABAC_H
    53.6 +/* NOTE(review): the guard name has no _SPU suffix although this file is h264_cabac_spu.h; it would clash with any other header guarded by H264_CABAC_H - confirm. */
    53.7 +#define CELL_SPE
    53.8 +#include "libavcodec/avcodec.h"
    53.9 +#include "h264_types_spu.h"
   53.10 +#include "cabac_spu.h"
   53.11 +
   53.12 +
   53.13 +/**
   53.14 + * decodes a CABAC coded macroblock
   53.15 + * @return 0 if OK, -1 if an error is noticed
   53.16 + */
   53.17 +int ff_h264_decode_mb_cabac(H264Cabac_spu *hc, EDSlice_spu *s, CABACContext *c);
   53.18 +void ff_h264_init_cabac_states(EDSlice_spu *s, CABACContext *c);
   53.19 +
   53.20 +#endif

    54.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    54.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_chroma_template_spu.c	Mon Aug 27 12:09:56 2012 +0200
    54.3 @@ -0,0 +1,355 @@
    54.4 +static void PREFIX_h264_chroma_mc8_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
    54.5 +  /* 8-pixel-wide chroma interpolation over h rows: out = (A*s[0] + B*s[1] + C*s[STRIDE_C] + D*s[STRIDE_C+1] + 32) >> 6, clamped to 0..255. */
    54.6 +  register int i;
    54.7 +  /* dst_stride is the byte step between destination rows; source rows always advance by the STRIDE_C macro. x and y only enter through the four weights below. */
    54.8 +  const int16_t i32ss= 32;
    54.9 +  const int16_t imax = 255;
   54.10 +  const int16_t iABCD1 = ((8 - x) * (8 - y));
   54.11 +  const int16_t iABCD2 = ((x) * (8 - y));
   54.12 +  const int16_t iABCD3 = ((8 - x) * (y));
   54.13 +  const int16_t iABCD4 = ((x) * (y));
   54.14 +  /* The four weights always sum to 64, which is why the result is rounded with +32 and shifted right by 6. */
   54.15 +  const vsint16_t vA = spu_splats(iABCD1);
   54.16 +  const vsint16_t vB = spu_splats(iABCD2);
   54.17 +  const vsint16_t vC = spu_splats(iABCD3);
   54.18 +  const vsint16_t vD = spu_splats(iABCD4);
   54.19 +  const vsint32_t vzero = spu_splats(0);
   54.20 +  const vsint16_t v32ss = spu_splats(i32ss);
   54.21 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
   54.22 +  vuint16_t sat;
   54.23 +  /* Byte offsets of src and dst inside their 16-byte quadwords. */
   54.24 +  const int shift_src =(unsigned int) src & 15;
   54.25 +  const int shift_dst =(unsigned int) dst & 15;
   54.26 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
   54.27 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
   54.28 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
   54.29 +  const vuint8_t dstmask0= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
   54.30 +  const vuint8_t dstmask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
   54.31 +  vuint8_t dstmask;
   54.32 +  /* Offset 0 puts the 8 result bytes into bytes 0..7 of the quadword; every other offset is treated as 8 (bytes 8..15). NOTE(review): assumes dst is at least 8-byte aligned - confirm. */
   54.33 +  if(shift_dst==0)
   54.34 +    dstmask=dstmask0;
   54.35 +  else
   54.36 +    dstmask=dstmask8;
   54.37 +  /* Load the two quadwords covering the first source row and realign them so the vector starts at src; the second vector is the same row one byte further on. */
   54.38 +  vuint8_t vsrc0uc1;
   54.39 +  vuint8_t vsrc0uc2;
   54.40 +  vuint8_t vsrc0uc;
   54.41 +  vuint8_t vsrc1uc;
   54.42 +  vsrc0uc1 = *(vuint8_t *)(src);
   54.43 +  vsrc0uc2 = *(vuint8_t *)(src+16);
   54.44 +  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
   54.45 +  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
   54.46 +  /* mergeh widens bytes 0..7 to 16-bit lanes (a 0x80 shuffle control byte yields 0x00 as the high byte). */
   54.47 +  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
   54.48 +  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
   54.49 +
   54.50 +  for (i = 0 ; i < h ; i++) {
   54.51 +    /* Same load/realign/widen sequence for the row STRIDE_C bytes below the current one. */
   54.52 +    vuint8_t vsrc2uc1;
   54.53 +    vuint8_t vsrc2uc2;
   54.54 +    vuint8_t vsrc2uc;
   54.55 +    vuint8_t vsrc3uc;
   54.56 +    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
   54.57 +    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
   54.58 +    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
   54.59 +    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
   54.60 +        
   54.61 +    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
   54.62 +    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
   54.63 +        
   54.64 +    vsint16_t psum;
   54.65 +    /* spu_mule/spu_mulo give 32-bit products of the even/odd 16-bit lanes; mez re-interleaves the low halves of those products back into lane order. */
   54.66 +    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
   54.67 +    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
   54.68 +    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
   54.69 +
   54.70 +    psum1 = spu_mule(vsrc1ssH, vB);
   54.71 +    psum2 = spu_mulo(vsrc1ssH, vB);
   54.72 +    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
   54.73 +    psum = spu_add(psum3, psum);
   54.74 +
   54.75 +    psum1 = spu_mule(vsrc2ssH, vC);
   54.76 +    psum2 = spu_mulo(vsrc2ssH, vC);
   54.77 +    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
   54.78 +    psum = spu_add(psum3, psum);
   54.79 +
   54.80 +    psum1 = spu_mule(vsrc3ssH, vD);
   54.81 +    psum2 = spu_mulo(vsrc3ssH, vD);
   54.82 +    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
   54.83 +    psum = spu_add(psum3, psum);
   54.84 +    /* Rounding: (sum + 32) >> 6. */
   54.85 +    psum = spu_add(v32ss, psum);
   54.86 +    psum = spu_rlmask(psum, -6);
   54.87 +
   54.88 +    //Saturation from 0 to 255: lanes not greater than 0 are cleared, lanes greater than 255 are replaced by 255
   54.89 +    sat = spu_cmpgt(psum,(vsint16_t)vzero);
   54.90 +    psum = spu_and(psum,(vsint16_t)sat);
   54.91 +    sat = spu_cmpgt(psum,vmax);
   54.92 +    psum = spu_sel(psum,vmax,sat);
   54.93 +    /* packsu keeps the low byte of every 16-bit lane; the upper 8 bytes of ppsum come from vzero. */
   54.94 +    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
   54.95 +
   54.96 +    const vuint8_t dst1 = *(vuint8_t *)dst;
   54.97 +    /* Merge the 8 result bytes into the existing destination quadword, leaving its other 8 bytes untouched. */
   54.98 +    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
   54.99 +    vuint8_t fsum;
  54.100 +    OP_U8_SPU(fsum, dsum, dst1); // macro supplied by the file that includes this template; presumably selects put vs. avg - confirm
  54.101 +
  54.102 +    *(vuint8_t *)dst=fsum;
  54.103 +    /* The lower row of this iteration becomes the upper row of the next one. */
  54.104 +    vsrc0ssH = vsrc2ssH;
  54.105 +    vsrc1ssH = vsrc3ssH;
  54.106 +        
  54.107 +    dst += dst_stride;
  54.108 +    //src += src_stride;
  54.109 +	src += STRIDE_C;
  54.110 +  }
  54.111 +}
  54.112 +
  54.113 +static void PREFIX_h264_chroma_mc4_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
  54.114 +  /* 4-pixel-wide chroma interpolation over h rows: out = (A*s[0] + B*s[1] + C*s[STRIDE_C] + D*s[STRIDE_C+1] + 32) >> 6, clamped to 0..255. */
  54.115 +  register int i;
  54.116 +  /* dst_stride is the byte step between destination rows; source rows always advance by the STRIDE_C macro. x and y only enter through the four weights below. */
  54.117 +  const int16_t i32ss= 32;
  54.118 +  const int16_t imax = 255;
  54.119 +  const int16_t iABCD1 = ((8 - x) * (8 - y));
  54.120 +  const int16_t iABCD2 = ((x) * (8 - y));
  54.121 +  const int16_t iABCD3 = ((8 - x) * (y));
  54.122 +  const int16_t iABCD4 = ((x) * (y));
  54.123 +  /* The four weights always sum to 64, which is why the result is rounded with +32 and shifted right by 6. */
  54.124 +  const vsint16_t vA = spu_splats(iABCD1);
  54.125 +  const vsint16_t vB = spu_splats(iABCD2);
  54.126 +  const vsint16_t vC = spu_splats(iABCD3);
  54.127 +  const vsint16_t vD = spu_splats(iABCD4);
  54.128 +  const vsint32_t vzero = spu_splats(0);
  54.129 +  const vsint16_t v32ss = spu_splats(i32ss);
  54.130 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  54.131 +  vuint16_t sat;
  54.132 +    
  54.133 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
  54.134 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  54.135 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  54.136 +  /* Byte offsets of src and dst inside their 16-byte quadwords; each dstmaskN below drops 4 result bytes at offset N of the destination quadword. */
  54.137 +  const int shift_src = (unsigned int) src & 15;
  54.138 +  const int shift_dst = (unsigned int) dst & 15;
  54.139 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  54.140 +  const vuint8_t dstmask0=  {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.141 +  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.142 +  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
  54.143 +  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
  54.144 +  /* NOTE(review): no default case - for an offset that is not a multiple of 4 dstmask stays all-zero, which would replicate byte 0 of the destination quadword; presumably unreachable - confirm. */
  54.145 +  switch(shift_dst){
  54.146 +    case 0:  dstmask = dstmask0;
  54.147 +             break;
  54.148 +    case 4:  dstmask = dstmask4;
  54.149 +             break;
  54.150 +    case 8:  dstmask = dstmask8;
  54.151 +             break;
  54.152 +    case 12: dstmask = dstmask12;
  54.153 +             break;
  54.154 +  }
  54.155 +  /* Load the two quadwords covering the first source row and realign them so the vector starts at src; the second vector is the same row one byte further on. */
  54.156 +  vuint8_t vsrc0uc1;
  54.157 +  vuint8_t vsrc0uc2;
  54.158 +  vuint8_t vsrc0uc;
  54.159 +  vuint8_t vsrc1uc;
  54.160 +  vsrc0uc1 = *(vuint8_t *)(src);
  54.161 +  vsrc0uc2 = *(vuint8_t *)(src+16);
  54.162 +  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
  54.163 +  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
  54.164 +  /* mergeh widens bytes 0..7 to 16-bit lanes (a 0x80 shuffle control byte yields 0x00 as the high byte). */
  54.165 +  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
  54.166 +  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
  54.167 +
  54.168 +  for (i = 0 ; i < h ; i++) {
  54.169 +    /* Same load/realign/widen sequence for the row STRIDE_C bytes below the current one. */
  54.170 +    vuint8_t vsrc2uc1;
  54.171 +    vuint8_t vsrc2uc2;
  54.172 +    vuint8_t vsrc2uc;
  54.173 +    vuint8_t vsrc3uc;
  54.174 +    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
  54.175 +    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
  54.176 +    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
  54.177 +    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
  54.178 +        
  54.179 +    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
  54.180 +    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
  54.181 +        
  54.182 +    vsint16_t psum;
  54.183 +    /* spu_mule/spu_mulo give 32-bit products of the even/odd 16-bit lanes; mez re-interleaves the low halves of those products back into lane order. */
  54.184 +    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
  54.185 +    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
  54.186 +    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.187 +
  54.188 +    psum1 = spu_mule(vsrc1ssH, vB);
  54.189 +    psum2 = spu_mulo(vsrc1ssH, vB);
  54.190 +    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.191 +    psum = spu_add(psum3, psum);
  54.192 +
  54.193 +    psum1 = spu_mule(vsrc2ssH, vC);
  54.194 +    psum2 = spu_mulo(vsrc2ssH, vC);
  54.195 +    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.196 +    psum = spu_add(psum3, psum);
  54.197 +
  54.198 +    psum1 = spu_mule(vsrc3ssH, vD);
  54.199 +    psum2 = spu_mulo(vsrc3ssH, vD);
  54.200 +    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.201 +    psum = spu_add(psum3, psum);
  54.202 +    /* Rounding: (sum + 32) >> 6. */
  54.203 +    psum = spu_add(v32ss, psum);
  54.204 +    psum = spu_rlmask(psum, -6);
  54.205 +
  54.206 +    //Saturation from 0 to 255: lanes not greater than 0 are cleared, lanes greater than 255 are replaced by 255
  54.207 +    sat = spu_cmpgt(psum,(vsint16_t)vzero);
  54.208 +    psum = spu_and(psum,(vsint16_t)sat);
  54.209 +    sat = spu_cmpgt(psum,vmax);
  54.210 +    psum = spu_sel(psum,vmax,sat);
  54.211 +    /* packsu keeps the low byte of every 16-bit lane; only the first 4 bytes of ppsum are picked up by dstmask. */
  54.212 +    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
  54.213 +
  54.214 +    const vuint8_t dst1 = *(vuint8_t *)dst;
  54.215 +    /* Merge the 4 result bytes into the existing destination quadword, leaving its other 12 bytes untouched. */
  54.216 +    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
  54.217 +    vuint8_t fsum;
  54.218 +    OP_U8_SPU(fsum, dsum, dst1); // macro supplied by the file that includes this template; presumably selects put vs. avg - confirm
  54.219 +
  54.220 +    *(vuint8_t *)dst=fsum;
  54.221 +    /* The lower row of this iteration becomes the upper row of the next one. */
  54.222 +    vsrc0ssH = vsrc2ssH;
  54.223 +    vsrc1ssH = vsrc3ssH;
  54.224 +        
  54.225 +    dst += dst_stride;
  54.226 +    src += STRIDE_C;
  54.227 +  }
  54.228 +}
  54.229 +
  54.230 +static void PREFIX_h264_chroma_mc2_spu(uint8_t * dst, uint8_t * src, int dst_stride, int h, int x, int y) {
  54.231 +  /* 2-pixel-wide chroma interpolation over h rows: out = (A*s[0] + B*s[1] + C*s[STRIDE_C] + D*s[STRIDE_C+1] + 32) >> 6, clamped to 0..255. */
  54.232 +  register int i;
  54.233 +  /* dst_stride is the byte step between destination rows; source rows always advance by the STRIDE_C macro. x and y only enter through the four weights below. */
  54.234 +  const int16_t i32ss= 32;
  54.235 +  const int16_t imax = 255;
  54.236 +  const int16_t iABCD1 = ((8 - x) * (8 - y));
  54.237 +  const int16_t iABCD2 = ((x) * (8 - y));
  54.238 +  const int16_t iABCD3 = ((8 - x) * (y));
  54.239 +  const int16_t iABCD4 = ((x) * (y));
  54.240 +  /* The four weights always sum to 64, which is why the result is rounded with +32 and shifted right by 6. */
  54.241 +  const vsint16_t vA = spu_splats(iABCD1);
  54.242 +  const vsint16_t vB = spu_splats(iABCD2);
  54.243 +  const vsint16_t vC = spu_splats(iABCD3);
  54.244 +  const vsint16_t vD = spu_splats(iABCD4);
  54.245 +  const vsint32_t vzero = spu_splats(0);
  54.246 +  const vsint16_t v32ss = spu_splats(i32ss);
  54.247 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  54.248 +  vuint16_t sat;
  54.249 +    
  54.250 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
  54.251 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  54.252 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  54.253 +  /* Byte offsets of src and dst inside their 16-byte quadwords; each dstmaskN below drops 2 result bytes at offset N of the destination quadword. */
  54.254 +  const int shift_src = (unsigned int) src & 15;
  54.255 +  const int shift_dst = (unsigned int) dst & 15;
  54.256 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  54.257 +  const vuint8_t dstmask0=  {0x10,0x11,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.258 +  const vuint8_t dstmask2=  {0x00,0x01,0x10,0x11,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.259 +  const vuint8_t dstmask4=  {0x00,0x01,0x02,0x03,0x10,0x11,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.260 +  const vuint8_t dstmask6=  {0x00,0x01,0x02,0x03,0x04,0x05,0x10,0x11,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.261 +  const vuint8_t dstmask8=  {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  54.262 +  const vuint8_t dstmask10= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x10,0x11,0x0C,0x0D,0x0E,0x0F};
  54.263 +  const vuint8_t dstmask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x0E,0x0F};
  54.264 +  const vuint8_t dstmask14= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x10,0x11};
  54.265 +  /* NOTE(review): no default case - for an odd offset dstmask stays all-zero, which would replicate byte 0 of the destination quadword; presumably unreachable - confirm. */
  54.266 +  switch(shift_dst){
  54.267 +    case 0:  dstmask = dstmask0;
  54.268 +             break;
  54.269 +    case 2:  dstmask = dstmask2;
  54.270 +             break;
  54.271 +    case 4:  dstmask = dstmask4;
  54.272 +             break;
  54.273 +    case 6:  dstmask = dstmask6;
  54.274 +             break;
  54.275 +    case 8:  dstmask = dstmask8;
  54.276 +             break;
  54.277 +    case 10: dstmask = dstmask10;
  54.278 +             break;
  54.279 +    case 12: dstmask = dstmask12;
  54.280 +             break;
  54.281 +    case 14: dstmask = dstmask14;
  54.282 +             break;
  54.283 +  }
  54.284 +  /* Load the two quadwords covering the first source row and realign them so the vector starts at src; the second vector is the same row one byte further on. */
  54.285 +  vuint8_t vsrc0uc1;
  54.286 +  vuint8_t vsrc0uc2;
  54.287 +  vuint8_t vsrc0uc;
  54.288 +  vuint8_t vsrc1uc;
  54.289 +  vsrc0uc1 = *(vuint8_t *)(src);
  54.290 +  vsrc0uc2 = *(vuint8_t *)(src+16);
  54.291 +  vsrc0uc = spu_or(spu_slqwbyte(vsrc0uc1, shift_src), spu_rlmaskqwbyte(vsrc0uc2, shift_src-16));
  54.292 +  vsrc1uc = spu_slqwbyte(vsrc0uc, 1);
  54.293 +  /* mergeh widens bytes 0..7 to 16-bit lanes (a 0x80 shuffle control byte yields 0x00 as the high byte). */
  54.294 +  vsint16_t vsrc0ssH = (vsint16_t)spu_shuffle(vsrc0uc, vsrc0uc, mergeh);
  54.295 +  vsint16_t vsrc1ssH = (vsint16_t)spu_shuffle(vsrc1uc, vsrc1uc, mergeh);
  54.296 +
  54.297 +  for (i = 0 ; i < h ; i++) {
  54.298 +    /* Same load/realign/widen sequence for the row STRIDE_C bytes below the current one. */
  54.299 +    vuint8_t vsrc2uc1;
  54.300 +    vuint8_t vsrc2uc2;
  54.301 +    vuint8_t vsrc2uc;
  54.302 +    vuint8_t vsrc3uc;
  54.303 +    vsrc2uc1 = *(vuint8_t *)(src+STRIDE_C);
  54.304 +    vsrc2uc2 = *(vuint8_t *)(src+STRIDE_C+16);
  54.305 +    vsrc2uc = spu_or(spu_slqwbyte(vsrc2uc1, shift_src), spu_rlmaskqwbyte(vsrc2uc2, shift_src-16));
  54.306 +    vsrc3uc = spu_slqwbyte(vsrc2uc, 1);
  54.307 +        
  54.308 +    vsint16_t vsrc2ssH = (vsint16_t)spu_shuffle(vsrc2uc, vsrc2uc, mergeh);
  54.309 +    vsint16_t vsrc3ssH = (vsint16_t)spu_shuffle(vsrc3uc, vsrc3uc, mergeh);
  54.310 +        
  54.311 +    vsint16_t psum;
  54.312 +    /* spu_mule/spu_mulo give 32-bit products of the even/odd 16-bit lanes; mez re-interleaves the low halves of those products back into lane order. */
  54.313 +    vsint32_t psum1 = spu_mule(vsrc0ssH, vA);
  54.314 +    vsint32_t psum2 = spu_mulo(vsrc0ssH, vA);
  54.315 +    psum = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.316 +
  54.317 +    psum1 = spu_mule(vsrc1ssH, vB);
  54.318 +    psum2 = spu_mulo(vsrc1ssH, vB);
  54.319 +    vsint16_t psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.320 +    psum = spu_add(psum3, psum);
  54.321 +
  54.322 +    psum1 = spu_mule(vsrc2ssH, vC);
  54.323 +    psum2 = spu_mulo(vsrc2ssH, vC);
  54.324 +    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.325 +    psum = spu_add(psum3, psum);
  54.326 +
  54.327 +    psum1 = spu_mule(vsrc3ssH, vD);
  54.328 +    psum2 = spu_mulo(vsrc3ssH, vD);
  54.329 +    psum3 = (vsint16_t)spu_shuffle((vsint16_t)psum1, (vsint16_t)psum2, mez);
  54.330 +    psum = spu_add(psum3, psum);
  54.331 +    /* Rounding: (sum + 32) >> 6. */
  54.332 +    psum = spu_add(v32ss, psum);
  54.333 +    psum = spu_rlmask(psum, -6);
  54.334 +
  54.335 +    //Saturation from 0 to 255: lanes not greater than 0 are cleared, lanes greater than 255 are replaced by 255
  54.336 +    sat = spu_cmpgt(psum,(vsint16_t)vzero);
  54.337 +    psum = spu_and(psum,(vsint16_t)sat);
  54.338 +    sat = spu_cmpgt(psum,vmax);
  54.339 +    psum = spu_sel(psum,vmax,sat);
  54.340 +    /* packsu keeps the low byte of every 16-bit lane; only the first 2 bytes of ppsum are picked up by dstmask. */
  54.341 +    const vuint8_t ppsum = (vuint8_t)spu_shuffle(psum, (vsint16_t)vzero, packsu);
  54.342 +
  54.343 +    const vuint8_t dst1 = *(vuint8_t *)dst;
  54.344 +    /* Merge the 2 result bytes into the existing destination quadword, leaving its other 14 bytes untouched. */
  54.345 +    const vuint8_t dsum = spu_shuffle(dst1, ppsum, dstmask);
  54.346 +    vuint8_t fsum;
  54.347 +    OP_U8_SPU(fsum, dsum, dst1); // macro supplied by the file that includes this template; presumably selects put vs. avg - confirm
  54.348 +
  54.349 +    *(vuint8_t *)dst=fsum;
  54.350 +    /* The lower row of this iteration becomes the upper row of the next one. */
  54.351 +    vsrc0ssH = vsrc2ssH;
  54.352 +    vsrc1ssH = vsrc3ssH;
  54.353 +        
  54.354 +    dst += dst_stride;
  54.355 +    src += STRIDE_C;
  54.356 +  }
  54.357 +}
  54.358 +

    55.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    55.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.c	Mon Aug 27 12:09:56 2012 +0200
    55.3 @@ -0,0 +1,266 @@
    55.4 +/*
    55.5 + * Copyright (c) 2009 TUDelft 
    55.6 + * 
    55.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
    55.8 + */
    55.9 +
   55.10 +/**
   55.11 + * @file libavcodec/cell/h264_deblock_spu.c
   55.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   55.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   55.14 + * 
   55.15 + * SIMD kernels 
   55.16 + * H.264/AVC motion compensation
   55.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   55.18 + * @author Albert Paradis <apar7632@hotmail.com>
   55.19 + */ 
   55.20 +
   55.21 +#include "h264_deblock_spu.h"
   55.22 +#include "h264_decode_mb_spu.h"
   55.23 +
   55.24 +extern int print_debug;
   55.25 +
   55.26 +static void filter_mb_edgev( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { // luma edge, handed to the dsp h264_h_loop_filter_luma{,_intra} routines
   55.27 +	H264slice *s= h->s;
   55.28 +    const int index_a = qp + s->slice_alpha_c0_offset; // NOTE(review): used unchecked as an index into alpha_table/tc0_table - confirm the tables cover qp + offset
   55.29 +    const int alpha = alpha_table[index_a];
   55.30 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   55.31 +    if (alpha ==0 || beta == 0) return; // the edge is left untouched when either table value is zero
   55.32 +    // bS[0] < 4: one tc value per bS entry, looked up in tc0_table; otherwise the intra filter runs. Only bS[0] is tested (presumably all four entries are 4 together - confirm).
   55.33 +    if( bS[0] < 4 ) {
   55.34 +        int8_t tc[4];
   55.35 +        tc[0] = tc0_table[index_a][bS[0]];
   55.36 +        tc[1] = tc0_table[index_a][bS[1]];
   55.37 +        tc[2] = tc0_table[index_a][bS[2]];
   55.38 +        tc[3] = tc0_table[index_a][bS[3]];
   55.39 +		
   55.40 +        h->dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
   55.41 +    } else {
   55.42 +        h->dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
   55.43 +    }
   55.44 +}
   55.45 +
   55.46 +static void filter_mb_edgecv( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { // chroma edge, handed to the dsp h264_h_loop_filter_chroma{,_intra} routines
   55.47 +	H264slice *s= h->s;
   55.48 +    const int index_a = qp + s->slice_alpha_c0_offset; // NOTE(review): used unchecked as an index into alpha_table/tc0_table - confirm the tables cover qp + offset
   55.49 +    const int alpha = alpha_table[index_a];
   55.50 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   55.51 +	if (alpha ==0 || beta == 0) return; // the edge is left untouched when either table value is zero
   55.52 +	// bS[0] < 4: one tc value per bS entry; otherwise the intra filter runs. Only bS[0] is tested (presumably all four entries are 4 together - confirm).
   55.53 +    if( bS[0] < 4 ) {
   55.54 +        int8_t tc[4];
   55.55 +		// unlike the luma variant, each tc is the tc0_table value plus one
   55.56 +        tc[0] = tc0_table[index_a][bS[0]]+1;
   55.57 +        tc[1] = tc0_table[index_a][bS[1]]+1;
   55.58 +        tc[2] = tc0_table[index_a][bS[2]]+1;
   55.59 +        tc[3] = tc0_table[index_a][bS[3]]+1;
   55.60 +		
   55.61 +		h->dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
   55.62 +    } else {
   55.63 +        h->dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
   55.64 +    }
   55.65 +}
   55.66 +
   55.67 +static void filter_mb_edgeh( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) { // luma edge, handed to the dsp h264_v_loop_filter_luma{,_intra} routines
   55.68 +	H264slice *s= h->s;
   55.69 +    const int index_a = qp + s->slice_alpha_c0_offset; // NOTE(review): used unchecked as an index into alpha_table/tc0_table - confirm the tables cover qp + offset
   55.70 +    const int alpha = alpha_table[index_a];
   55.71 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   55.72 +    if (alpha ==0 || beta == 0) return; // the edge is left untouched when either table value is zero
   55.73 +    // bS[0] < 4: one tc value per bS entry, looked up in tc0_table; otherwise the intra filter runs. Only bS[0] is tested (presumably all four entries are 4 together - confirm).
   55.74 +    if( bS[0] < 4 ) {
   55.75 +        int8_t tc[4];
   55.76 +		
   55.77 +        tc[0] = tc0_table[index_a][bS[0]];
   55.78 +        tc[1] = tc0_table[index_a][bS[1]];
   55.79 +        tc[2] = tc0_table[index_a][bS[2]];
   55.80 +        tc[3] = tc0_table[index_a][bS[3]];
   55.81 +		
   55.82 +        h->dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
   55.83 +    } else {
   55.84 +        h->dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
   55.85 +    }
   55.86 +}
   55.87 +
   55.88 +static void filter_mb_edgech( H264Context_spu *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
   55.89 +	H264slice *s= h->s;
   55.90 +    const int index_a = qp + s->slice_alpha_c0_offset;
   55.91 +    const int alpha = alpha_table[index_a];
   55.92 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   55.93 +    if (alpha ==0 || beta == 0) return;
   55.94 +
   55.95 +    if( bS[0] < 4 ) {
   55.96 +        int8_t tc[4];
   55.97 +		
   55.98 +		tc[0] = tc0_table[index_a][bS[0]]+1;
   55.99 +        tc[1] = tc0_table[index_a][bS[1]]+1;
  55.100 +        tc[2] = tc0_table[index_a][bS[2]]+1;
  55.101 +        tc[3] = tc0_table[index_a][bS[3]]+1;
  55.102 +		
  55.103 +        h->dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
  55.104 +    } else {
  55.105 +        h->dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
  55.106 +    }
  55.107 +}
  55.108 +
  55.109 +static void filter_mb_dir(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int dir) {
  55.110 +    H264Mb *mb = h->mb;
  55.111 +	H264slice *s = h->s;
  55.112 +	const int qp_xy= mb->qscale_mb_xy;
  55.113 +    const int qp_dir = dir == 0 ? mb->qscale_left_mb_xy : mb->qscale_top_mb_xy;
  55.114 +	const int mbm_type = dir == 0 ? mb->left_type : mb->top_type;
  55.115 +	const int mb_type = mb->mb_type;
  55.116 +	int edge;
  55.117 +	const int edges = mb->edges[dir];
  55.118 +    //int (*ref2frm)[64] = s->ref2frm;
  55.119 +
  55.120 +//     int start;//= h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
  55.121 +// 
  55.122 +//     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
  55.123 +//                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
  55.124 +//     // how often to recheck mv-based bS when iterating between edges
  55.125 +//     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
  55.126 +//                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
  55.127 +//     // how often to recheck mv-based bS when iterating along each edge
  55.128 +//     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
  55.129 +
  55.130 +// 	if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0))
  55.131 +// 		start =1;
  55.132 +// 	else
  55.133 +// 		start =0;
  55.134 +// 
  55.135 +//     /* Calculate bS */
  55.136 +//     for( edge = start; edge < edges; edge++ ) {
  55.137 +// 		const int mbn_type = edge > 0 ? mb_type : mbm_type;
  55.138 +// 		const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm;
  55.139 +//         int (*ref2frmn)[64] = ref2frm;//edge > 0 ? ref2frm : ref2frmm;
  55.140 +//         int16_t bS[4];
  55.141 +//         int qp;
  55.142 +// 
  55.143 +//         if( (edge&1) && IS_8x8DCT(mb_type) )
  55.144 +//             continue;
  55.145 +// 
  55.146 +//         if( IS_INTRA(mb_type) ||
  55.147 +//             IS_INTRA(mbn_type) ) {
  55.148 +//             int value;
  55.149 +// 
  55.150 +//             if (edge == 0) {
  55.151 +//                 value = 4;
  55.152 +//             } else {
  55.153 +//                 value = 3;
  55.154 +//             }
  55.155 +//             bS[0] = bS[1] = bS[2] = bS[3] = value;
  55.156 +//         } else {
  55.157 +//             int i, l;
  55.158 +//             int mv_done;
  55.159 +// 
  55.160 +//             if( edge & mask_edge ) {
  55.161 +// 
  55.162 +//                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
  55.163 +//                 mv_done = 1;
  55.164 +//             }
  55.165 +//             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
  55.166 +//                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
  55.167 +//                 int bn_idx= b_idx - (dir ? 8:1);
  55.168 +//                 int v = 0;
  55.169 +// 
  55.170 +// 				for( l = 0; !v && l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) {
  55.171 +//                     v |= ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] ||
  55.172 +//                          FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
  55.173 +//                          FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit;
  55.174 +//                 }
  55.175 +//                 bS[0] = bS[1] = bS[2] = bS[3] = v;
  55.176 +// 
  55.177 +//                 mv_done = 1;
  55.178 +//             }
  55.179 +//             else
  55.180 +//                 mv_done = 0;
  55.181 +// 
  55.182 +// 			for( i = 0; i < 4; i++ ) {
  55.183 +//                 int x = dir == 0 ? edge : i;
  55.184 +//                 int y = dir == 0 ? i    : edge;
  55.185 +//                 int b_idx= 8 + 4 + x + 8*y;
  55.186 +//                 int bn_idx= b_idx - (dir ? 8:1);
  55.187 +// 
  55.188 +//                 if( mb->non_zero_count_cache[b_idx] |
  55.189 +//                     mb->non_zero_count_cache[bn_idx] ) {
  55.190 +//                     bS[i] = 2;
  55.191 +//                 }
  55.192 +//                 else if(!mv_done)
  55.193 +//                 {
  55.194 +//                     bS[i] = 0;
  55.195 +//                     for( l = 0; l < 1 + (s->slice_type_nos == FF_B_TYPE); l++ ) {
  55.196 +//                         if( ref2frm[l][mb->ref_cache[l][b_idx]] != ref2frmn[l][mb->ref_cache[l][bn_idx]] ||
  55.197 +//                             FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
  55.198 +//                             FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
  55.199 +//                             bS[i] = 1;
  55.200 +//                             break;
  55.201 +//                         }
  55.202 +//                     }
  55.203 +//                 }
  55.204 +//             }
  55.205 +// 
  55.206 +//             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
  55.207 +//                 continue;
  55.208 +//         }
  55.209 +// 		qp = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1;
  55.210 +
  55.211 +    if(mbm_type){
  55.212 +        int16_t* bS=mb->bS[dir][0];
  55.213 +        /* Filter edge */
  55.214 +        // Do not use s->qscale as luma quantizer because it has not the same
  55.215 +        // value in IPCM macroblocks.
  55.216 +        if(bS[0]+bS[1]+bS[2]+bS[3]){
  55.217 +            int qp = ( qp_xy + qp_dir + 1 ) >> 1;
  55.218 +            if( dir == 0 ) {
  55.219 +                filter_mb_edgev(h, &img_y[0], linesize, bS, qp);
  55.220 +                {
  55.221 +                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
  55.222 +                    filter_mb_edgecv(h, &img_cb[0], uvlinesize, bS, qp);
  55.223 +                    filter_mb_edgecv(h, &img_cr[0], uvlinesize, bS, qp);
  55.224 +                }
  55.225 +            } else {
  55.226 +                filter_mb_edgeh(h, &img_y[0], linesize, bS, qp);
  55.227 +                {
  55.228 +                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
  55.229 +                    filter_mb_edgech(h, &img_cb[0], uvlinesize, bS, qp);
  55.230 +                    filter_mb_edgech(h, &img_cr[0], uvlinesize, bS, qp);
  55.231 +                }
  55.232 +            }
  55.233 +        }
  55.234 +    }
  55.235 +
  55.236 +    for( edge = 1; edge < edges; edge++ ) {
  55.237 +        int16_t* bS=mb->bS[dir][edge];
  55.238 +        int qp = qp_xy;
  55.239 +
  55.240 +        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
  55.241 +            continue;
  55.242 +
  55.243 +        /* Filter edge */
  55.244 +        // Do not use s->qscale as luma quantizer because it has not the same
  55.245 +        // value in IPCM macroblocks.
  55.246 +
  55.247 +        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
  55.248 +            continue;
  55.249 +
  55.250 +		if( dir == 0 ) {
  55.251 +            filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
  55.252 +            if( (edge&1) == 0 ) {
  55.253 +                filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) );
  55.254 +                filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) );
  55.255 +            }
  55.256 +        } else {
  55.257 +            filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
  55.258 +            if( (edge&1) == 0 ) {
  55.259 +                filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 0, qp_xy ) );
  55.260 +                filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp( s, 1, qp_xy ) );
  55.261 +            }
  55.262 +        }
  55.263 +    }
  55.264 +}
  55.265 +
  55.266 +void filter_mb( H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
  55.267 +    filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 0);
  55.268 +    filter_mb_dir(h, img_y, img_cb, img_cr, linesize, uvlinesize, 1);
  55.269 +}

    56.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    56.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_deblock_spu.h	Mon Aug 27 12:09:56 2012 +0200
    56.3 @@ -0,0 +1,80 @@
    56.4 +#ifndef H264_FILTER_SPU_H
    56.5 +#define H264_FILTER_SPU_H
    56.6 +
    56.7 +#include "types_spu.h"
    56.8 +#include "h264_decode_mb_spu.h"
    56.9 +
   56.10 +#define FFABS(a)           ((a) >= 0 ? (a) : (-(a))) /* absolute value; expands `a` more than once, so pass no side effects */
   56.11 +
   56.12 +void filter_mb(H264Context_spu *h, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   56.13 +
   56.14 +/* Deblocking filter (p153)
           *
           * alpha_table has 3*52 entries: 52 leading zeros, then the 52 alpha
           * values, then 52 trailing copies of 255.  The padding on both sides
           * means an index built from qp plus a slice offset needs no clamping.
           * NOTE(review): presumably the slice offset already carries the +52
           * bias that selects the middle section -- confirm where
           * slice_alpha_c0_offset is set. */
   56.15 +static const uint8_t alpha_table[52*3] = {
   56.16 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.17 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.18 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.19 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.20 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.21 +     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
   56.22 +     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
   56.23 +    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
   56.24 +    80, 90,101,113,127,144,162,182,203,226,
   56.25 +   255,255,
   56.26 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
   56.27 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
   56.28 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
   56.29 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
   56.30 +};
   56.31 +
   /*
    * beta_table has 3*52 entries: 52 leading zeros, then the 52 beta values,
    * then 52 trailing copies of 18 (the last beta value).  The padding means
    * an index built from qp plus a slice offset needs no clamping.
    * NOTE(review): presumably slice_beta_offset already carries the +52 bias
    * that selects the middle section -- confirm where it is set.
    */
   56.32 +static const uint8_t beta_table[52*3] = {
   56.33 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.34 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.35 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.36 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.37 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   56.38 +     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
   56.39 +     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
   56.40 +     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
   56.41 +    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
   56.42 +    18, 18,
   56.43 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
   56.44 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
   56.45 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
   56.46 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
   56.47 +};
   56.48 +
   /*
    * tc0_table has 3*52 rows of four values, indexed [index_a][bS].  The rows
    * are 52 padding rows of {-1,0,0,0}, then the 52 tc0 rows, then 52 copies
    * of the last row {-1,13,17,25}.
    *
    * Column 0 is written as -1 but the element type is uint8_t, so it is
    * stored as 255.  The filter_mb_edge* callers assign entries to int8_t;
    * NOTE(review): that narrowing is implementation-defined in C and is
    * presumably expected to give -1 again -- confirm for the target compiler.
    */
   56.49 +static const uint8_t tc0_table[52*3][4] = {
   56.50 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.51 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.52 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.53 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.54 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.55 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.56 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.57 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.58 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.59 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.60 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
   56.61 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
   56.62 +    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
   56.63 +    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
   56.64 +    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
   56.65 +    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
   56.66 +    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
   56.67 +    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
   56.68 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.69 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.70 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.71 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.72 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.73 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.74 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.75 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.76 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   56.77 +};
   56.78 +
   56.79 +static inline int get_chroma_qp(H264slice *s, int t, int qscale){
   56.80 +    return s->chroma_qp_table[t][qscale];
   56.81 +}
   56.82 +
   56.83 +#endif 

    57.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    57.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.c	Mon Aug 27 12:09:56 2012 +0200
    57.3 @@ -0,0 +1,725 @@
    57.4 +/*
    57.5 + * Copyright (c) 2009 TUDelft 
    57.6 + * 
    57.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding.
    57.8 + */
    57.9 +
   57.10 +/**
   57.11 + * @file libavcodec/cell/h264_decode_mb_spu.c
   57.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   57.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   57.14 + * 
   57.15 + * SIMD kernels 
   57.16 + * H.264/AVC motion compensation
   57.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   57.18 + * @author Albert Paradis <apar7632@hotmail.com>
   57.19 + */
   57.20 +
   57.21 +#include <stdio.h>
   57.22 +#include <string.h>
   57.23 +#include <spu_intrinsics.h>
   57.24 +//#include "dsputil_cell.h"
   57.25 +#include "types_spu.h"
   57.26 +#include "h264_tables.h"
   57.27 +#include "h264_dma.h"
   57.28 +#include "h264_mc_spu.h"
   57.29 +#include "h264_intra_spu.h"
   57.30 +#include "h264_decode_mb_spu.h"
   57.31 +#include "h264_deblock_spu.h"
   57.32 +
   57.33 +// Border buffers: top_ls is indexed by macroblock column (mb_x); left_ls is the single left border.
   57.34 +DECLARE_ALIGNED_16(TopBorder, top_ls[240]);
   57.35 +LeftBorder left_ls;
   57.36 +
   57.37 +//mb line buffer - statically allocated for up to 1920 width video
   57.38 +DECLARE_ALIGNED_16(uint8_t, dest_y_ls[2*16*20]);
   57.39 +DECLARE_ALIGNED_16(uint8_t, dest_cb_ls[2*8*10]);
   57.40 +DECLARE_ALIGNED_16(uint8_t, dest_cr_ls[2*8*10]);
   57.41 +
   57.42 +// DMA transfer buffers: the extend_edges_* helpers step luma rows by 64 bytes and chroma rows by 32 bytes.
   57.43 +DECLARE_ALIGNED_16(uint8_t, dma_y_ls [64*(32+20)]); //EDGE_WIDTH = 32
   57.44 +DECLARE_ALIGNED_16(uint8_t, dma_cb_ls[32*(16+10)]);
   57.45 +DECLARE_ALIGNED_16(uint8_t, dma_cr_ls[32*(16+10)]);
   57.46 +// Extra right-edge buffers: the extend_extra_edge_* helpers step luma rows by 32 bytes and chroma rows by 16 bytes.
   57.47 +DECLARE_ALIGNED_16(uint8_t, extra_edge_y [32*(32+20)]); //EDGE_WIDTH = 32
   57.48 +DECLARE_ALIGNED_16(uint8_t, extra_edge_cr[16*(16+10)]);
   57.49 +DECLARE_ALIGNED_16(uint8_t, extra_edge_cb[16*(16+10)]);
   57.50 +
   57.51 +
   57.52 +// For intra mode
   57.53 +/// for now do the extra copy before dma, but it's better to skip this and do the dma right away
   57.54 +static void backup_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
   57.55 +	H264Mb* mb= h->mb;
   57.56 +	
   57.57 +    int i;
   57.58 +	uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y;
   57.59 +	uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb;
   57.60 +	uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr;
   57.61 +	
   57.62 +	uint8_t* left_border_y = left_ls.unfiltered_y;
   57.63 +	uint8_t* left_border_cb = left_ls.unfiltered_cb;
   57.64 +	uint8_t* left_border_cr = left_ls.unfiltered_cr;
   57.65 +		
   57.66 +    src_y  -=   linesize;
   57.67 +    src_cb -= uvlinesize;
   57.68 +    src_cr -= uvlinesize;
   57.69 +
   57.70 +    // There are two lines saved, the line above the top macroblock of a pair,
   57.71 +    // and the line above the bottom macroblock
   57.72 +    left_border_y[0] = top_border_y[15];
   57.73 +    for(i=1; i<17; i++){
   57.74 +        left_border_y[i] = src_y[15+i*  linesize];
   57.75 +    }
   57.76 +
   57.77 +   *(qword*)(top_border_y)= *(qword*)(src_y +  16*linesize);
   57.78 +
   57.79 +    left_border_cb[0] = top_border_cb[7];
   57.80 +    left_border_cr[0] = top_border_cr[7];
   57.81 +    for(i=1; i<9; i++){
   57.82 +        left_border_cb[i] = src_cb[7+i*uvlinesize];
   57.83 +        left_border_cr[i] = src_cr[7+i*uvlinesize];
   57.84 +    }
   57.85 +    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
   57.86 +    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
   57.87 +}
   57.88 +
   /*
    * Exchange the pixels just left of and just above the macroblock at
    * src_y/src_cb/src_cr with the unfiltered border buffers (left_ls and
    * top_ls[mb_x]).
    *
    * XCHG(a,b,t,xchg) always stores the old value of `a` into `b`; `a` only
    * receives the old value of `b` when `xchg` is non-zero.  So with xchg == 0
    * the border buffers are copied into the picture and left unchanged, and
    * with xchg != 0 the two are swapped.  Several calls force a swap by
    * passing 1 regardless of the `xchg` argument.
    *
    * The source pointers are first moved one row up and one column left, so
    * index i in the left-column loops is picture row i-1.  The left column is
    * handled only when mb_x > 0 and the top row only when mb_y > 0; when
    * mb_y == 0 the left-column loops start at 1 and skip the corner.  The top
    * border of the next macroblock column is touched only when mb_x+1 is
    * still inside s->mb_width.
    *
    * The XCHG macro is not #undef'd, so it stays defined after this function.
    * NOTE(review): the uint64_t accesses at src+1/+9/+17 depend on the
    * caller's pointer alignment -- confirm the target tolerates them.
    */
   57.89 +static void xchg_mb_border(H264Context_spu *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
   57.90 +	H264Mb* mb= h->mb;
   57.91 +	H264slice* s = h->s;
   57.92 +	
   57.93 +	int temp8, i;
   57.94 +	uint64_t temp64;
   57.95 +	int deblock_left;
   57.96 +	int deblock_top;
   57.97 +	
   57.98 +	uint8_t* top_border_y = top_ls[mb->mb_x].unfiltered_y;	
   57.99 +	uint8_t* top_border_cb = top_ls[mb->mb_x].unfiltered_cb;
  57.100 +	uint8_t* top_border_cr = top_ls[mb->mb_x].unfiltered_cr;
  57.101 +	uint8_t* top_border_y_next = top_ls[mb->mb_x +1].unfiltered_y;
  57.102 +	
  57.103 +	uint8_t* left_border_y = left_ls.unfiltered_y;
  57.104 +	uint8_t* left_border_cb = left_ls.unfiltered_cb;
  57.105 +	uint8_t* left_border_cr = left_ls.unfiltered_cr;
  57.106 +	
  57.107 +	deblock_left = (mb->mb_x > 0);
  57.108 +	deblock_top =  (mb->mb_y > 0);
  57.109 +	
  57.110 +	src_y  -= (  linesize + 1);
  57.111 +	src_cb -= (uvlinesize + 1);
  57.112 +	src_cr -= (uvlinesize + 1);
  57.113 +	
  57.114 +	#define XCHG(a,b,t,xchg)\
  57.115 +	t= a;\
  57.116 +	if(xchg)\
  57.117 +		a= b;\
  57.118 +	b= t;
  57.119 +	
  57.120 +	if(deblock_left){
  57.121 +		for(i = !deblock_top; i<16; i++){
  57.122 +			XCHG(left_border_y[i], src_y [i*  linesize], temp8, xchg);
  57.123 +		}
  57.124 +		XCHG(left_border_y[i], src_y [i*  linesize], temp8, 1);
  57.125 +		
  57.126 +		for(i = !deblock_top; i<8; i++){
  57.127 +			XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg);
  57.128 +			XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg);
  57.129 +		}
  57.130 +		XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1);
  57.131 +		XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1);
  57.132 +	}
  57.133 +	
  57.134 +	if(deblock_top){
  57.135 +		XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg);
  57.136 +		XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1);
  57.137 +		if(mb->mb_x+1 < s->mb_width){
  57.138 +			XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1);
  57.139 +		}
  57.140 +		XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1);
  57.141 +		XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1);
  57.142 +	}
  57.143 +}
  57.144 +
  57.145 +void copy_top_borders(int mb_x, uint8_t *dst_y, uint8_t *dst_cb, uint8_t *dst_cr, int stride_y, int stride_c){			
  57.146 +	qword *qsrc_y = (qword *) (top_ls[mb_x].top_borders_y);
  57.147 +	dst_y-= 4*stride_y;
  57.148 +	
  57.149 +	*((qword *) (dst_y + 0*stride_y)) = *qsrc_y++;
  57.150 +	*((qword *) (dst_y + 1*stride_y)) = *qsrc_y++;
  57.151 +	*((qword *) (dst_y + 2*stride_y)) = *qsrc_y++;
  57.152 +	*((qword *) (dst_y + 3*stride_y)) = *qsrc_y++;
  57.153 +
  57.154 +	dst_cb-=2*stride_c;	
  57.155 +	uint64_t *dsrc_cb = (uint64_t *) (top_ls[mb_x].top_borders_cb);
  57.156 +	*((uint64_t *) (dst_cb + 0*stride_c)) = *dsrc_cb++; 
  57.157 +	*((uint64_t *) (dst_cb + 1*stride_c)) = *dsrc_cb++;
  57.158 +
  57.159 +	dst_cr-=2*stride_c;	
  57.160 +	uint64_t *dsrc_cr = (uint64_t *) (top_ls[mb_x].top_borders_cr);
  57.161 +	*((uint64_t *) (dst_cr + 0*stride_c)) = *dsrc_cr++;
  57.162 +	*((uint64_t *) (dst_cr + 1*stride_c)) = *dsrc_cr++;
  57.163 +}
  57.164 +
  57.165 +static void send_top_borders(H264Context_spu *h, int mb_x, uint8_t* dest_y, uint8_t* dest_cb, uint8_t* dest_cr, int stride_y, int stride_c){
  57.166 +	H264spe *spe= &h->spe;
  57.167 +	//fill borders (unfiltered borders already filled in backup_mb_border)
  57.168 +	dest_y+= 12*stride_y;
  57.169 +	qword *qtop_y = (qword *) top_ls[mb_x].top_borders_y;	
  57.170 +	for(int i=0; i<4; i++){
  57.171 +		qword *qdest_y = (qword *) dest_y;
  57.172 +		*qtop_y++ = *qdest_y;		
  57.173 +		dest_y+=stride_y;
  57.174 +	}
  57.175 +	dest_cb+= 6*stride_c;
  57.176 +	dest_cr+= 6*stride_c;
  57.177 +	uint64_t *dtop_cb = (uint64_t *) top_ls[mb_x].top_borders_cb;
  57.178 +	uint64_t *dtop_cr = (uint64_t *) top_ls[mb_x].top_borders_cr;
  57.179 +	for(int i=0; i<2; i++){
  57.180 +		uint64_t *ddest_cb = (uint64_t *) dest_cb;
  57.181 +		uint64_t *ddest_cr = (uint64_t *) dest_cr;
  57.182 +		
  57.183 +		*dtop_cb++  = *ddest_cb;
  57.184 +		*dtop_cr++  = *ddest_cr;
  57.185 +		
  57.186 +		dest_cb+=stride_c;
  57.187 +		dest_cr+=stride_c;
  57.188 +	}
  57.189 +	uint8_t* top_border_tgt = spe->tgt_spe + (unsigned) &top_ls[mb_x];
  57.190 +	spu_dma_put(&top_ls[mb_x], (unsigned) top_border_tgt, sizeof(TopBorder), MBD_put);
  57.191 +}
  57.192 +
  /*
   * Replicate the first picture pixel of each row into the left edge.
   *
   * Luma rows are 64 bytes apart: bytes 0..31 are filled with byte 32.
   * Chroma rows are 32 bytes apart: bytes 0..15 are filled with byte 16.
   */
  static void extend_edges_left(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c){
      int row;

      for (row = 0; row < lines; row++) {
          uint8_t *line = dma_y + row * 64;
          memset(line, line[32], 32);
      }

      for (row = 0; row < lines_c; row++) {
          uint8_t *line_cb = dma_cb + row * 32;
          uint8_t *line_cr = dma_cr + row * 32;
          memset(line_cb, line_cb[16], 16);
          memset(line_cr, line_cr[16], 16);
      }
  }
  57.205 +
  /*
   * Replicate the pixel just before each row pointer into the right edge.
   *
   * For every luma row (64 bytes apart) slots*16 bytes starting at the
   * pointer are filled with the byte at offset -1; for every chroma row
   * (32 bytes apart) slots*8 bytes are filled the same way.
   */
  static void extend_edges_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr , int lines, int lines_c, int slots){
      int row;

      for (row = 0; row < lines; row++) {
          uint8_t *line = dma_y + row * 64;
          memset(line, line[-1], slots*16);
      }

      for (row = 0; row < lines_c; row++) {
          uint8_t *line_cb = dma_cb + row * 32;
          uint8_t *line_cr = dma_cr + row * 32;
          memset(line_cb, line_cb[-1], slots*8);
          memset(line_cr, line_cr[-1], slots*8);
      }
  }
  57.219 +
  57.220 +static void extend_edges_top(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr ){
  57.221 +	qword *qborder_y = (qword *) dma_y;
  57.222 +	for (int i=1; i<=32; i++){
  57.223 +		qword *qdma_y = (qword *) (dma_y - i*64);
  57.224 +		*qdma_y = *qborder_y;
  57.225 +	}
  57.226 +
  57.227 +	uint64_t *dborder_cb = (uint64_t *) dma_cb;
  57.228 +	uint64_t *dborder_cr = (uint64_t *) dma_cr;
  57.229 +	for (int i=1; i<=16; i++){
  57.230 +		uint64_t *ddma_cb = (uint64_t *) (dma_cb - i*32);
  57.231 +		uint64_t *ddma_cr = (uint64_t *) (dma_cr - i*32);
  57.232 +		*ddma_cb = *dborder_cb;
  57.233 +		*ddma_cr = *dborder_cr;
  57.234 +	}
  57.235 +}
  57.236 +
  57.237 +static void extend_edges_bottom(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr){
  57.238 +	qword *qborder_y = (qword *) dma_y;
  57.239 +	for (int i=1; i<=32; i++){
  57.240 +		qword *qdma_y = (qword *) (dma_y + i*64);
  57.241 +		*qdma_y = *qborder_y;
  57.242 +	}
  57.243 +	
  57.244 +	uint64_t *dborder_cb = (uint64_t *) dma_cb;
  57.245 +	uint64_t *dborder_cr = (uint64_t *) dma_cr;
  57.246 +	for (int i=1; i<=16; i++){
  57.247 +		uint64_t *ddma_cb = (uint64_t *) (dma_cb + i*32);
  57.248 +		uint64_t *ddma_cr = (uint64_t *) (dma_cr + i*32);
  57.249 +		*ddma_cb = *dborder_cb;
  57.250 +		*ddma_cr = *dborder_cr;
  57.251 +	}
  57.252 +}
  57.253 +
  /*
   * Fill the extra right-edge buffers from the DMA buffers.
   *
   * Each extra luma row (32 bytes) is set to the byte just before the
   * matching dma_y row pointer (64-byte stride); each extra chroma row
   * (16 bytes) to the byte just before the matching dma_cb/dma_cr row
   * pointer (32-byte stride).
   */
  static void extend_extra_edge_right(uint8_t *dma_y, uint8_t *dma_cb, uint8_t *dma_cr, uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr, int lines, int lines_c){
      int row;

      for (row = 0; row < lines; row++)
          memset(extra_y + row * 32, dma_y[row * 64 - 1], 32);

      for (row = 0; row < lines_c; row++) {
          memset(extra_cb + row * 16, dma_cb[row * 32 - 1], 16);
          memset(extra_cr + row * 16, dma_cr[row * 32 - 1], 16);
      }
  }
  57.268 +
  57.269 +static void extend_extra_edge_top(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){
  57.270 +	qword *qborder_y = (qword *) extra_y;
  57.271 +	qword *qborder_y2 = (qword *) (extra_y+16);
  57.272 +	
  57.273 +	for (int i=1; i<=32; i++){
  57.274 +		qword *qextra_y = (qword *) (extra_y-i*32);
  57.275 +		*qextra_y = *qborder_y;
  57.276 +		*(qextra_y+1) = *qborder_y2;
  57.277 +	}
  57.278 +	
  57.279 +	qword *qborder_cb = (qword *) extra_cb;
  57.280 +	qword *qborder_cr = (qword *) extra_cr;
  57.281 +	for (int i=1; i<=16; i++){
  57.282 +		qword *qextra_cb = (qword *) (extra_cb - i*16);
  57.283 +		qword *qextra_cr = (qword *) (extra_cr - i*16);
  57.284 +		*qextra_cb = *qborder_cb;
  57.285 +		*qextra_cr = *qborder_cr;
  57.286 +	}
  57.287 +}
  57.288 +
  57.289 +static void extend_extra_edge_bottom(uint8_t *extra_y, uint8_t *extra_cb, uint8_t *extra_cr){
  57.290 +	qword *qborder_y = (qword *) extra_y;
  57.291 +	qword *qborder_y2 = (qword *) (extra_y+16);
  57.292 +	
  57.293 +	for (int i=1; i<=32; i++){
  57.294 +		qword *qextra_y = (qword *) (extra_y+i*32);
  57.295 +		*qextra_y = *qborder_y;
  57.296 +		*(qextra_y+1) = *qborder_y2;
  57.297 +	}
  57.298 +	
  57.299 +	qword *qborder_cb = (qword *) extra_cb;
  57.300 +	qword *qborder_cr = (qword *) extra_cr;
  57.301 +	for (int i=1; i<=16; i++){
  57.302 +		qword *qextra_cb = (qword *) (extra_cb + i*16);
  57.303 +		qword *qextra_cr = (qword *) (extra_cr + i*16);
  57.304 +		*qextra_cb = *qborder_cb;
  57.305 +		*qextra_cr = *qborder_cr;
  57.306 +	}
  57.307 +}
  57.308 +
  57.309 +static void extend_edges(H264Context_spu *h, int mb_x, int mb_y){
  57.310 +	H264slice *s = h->s;
  57.311 +	
  57.312 +	uint8_t *dma_y; 
  57.313 +	uint8_t *dma_cb; 
  57.314 +	uint8_t *dma_cr;
  57.315 +	
  57.316 +	uint8_t *extra_y  = extra_edge_y;
  57.317 +	uint8_t *extra_cb = extra_edge_cb;
  57.318 +	uint8_t *extra_cr = extra_edge_cr;
  57.319 +	
  57.320 +	int pos = (mb_x+2) %4;
  57.321 +	if (mb_x == 0){
  57.322 +		if (mb_y ==0){
  57.323 +			extend_edges_left(&dma_y_ls[32*64], &dma_cb_ls[16*32], &dma_cr_ls[16*32], 12, 6);
  57.324 +		}else if (mb_y == s->mb_height -1){
  57.325 +			extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 20, 10);
  57.326 +		}else {
  57.327 +			extend_edges_left(dma_y_ls, dma_cb_ls, dma_cr_ls, 16, 8);
  57.328 +		}
  57.329 +	}else if (mb_x == s->mb_width-1){
  57.330 +		dma_y  = &dma_y_ls [(pos+1)*16];
  57.331 +		dma_cb = &dma_cb_ls[(pos+1)*8];
  57.332 +		dma_cr = &dma_cr_ls[(pos+1)*8];
  57.333 +		if (mb_y ==0){
  57.334 +			dma_y   += 32*64;
  57.335 +			dma_cb  += 16*32;
  57.336 +			dma_cr  += 16*32;
  57.337 +			extra_y = extra_edge_y  + 32*32;
  57.338 +			extra_cb= extra_edge_cb + 16*16;
  57.339 +			extra_cr= extra_edge_cr + 16*16;
  57.340 +			
  57.341 +			if (pos==2){
  57.342 +				extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 1);
  57.343 +				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6);
  57.344 +			}else if (pos==3){
  57.345 +				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 12, 6);
  57.346 +			}else{
  57.347 +				extend_edges_right(dma_y, dma_cb, dma_cr, 12, 6, 2);
  57.348 +			}
  57.349 +		}else if (mb_y == s->mb_height -1){
  57.350 +			if (pos==2){
  57.351 +				extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 1);
  57.352 +				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10);
  57.353 +			}else if (pos==3){
  57.354 +				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 20, 10);
  57.355 +			}else{
  57.356 +				extend_edges_right(dma_y, dma_cb, dma_cr, 20, 10, 2);
  57.357 +			}				
  57.358 +		}else {
  57.359 +			if (pos==2){
  57.360 +				extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1);
  57.361 +				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8);
  57.362 +			}else if (pos==3){
  57.363 +				extend_extra_edge_right(dma_y, dma_cb, dma_cr, extra_y, extra_cb, extra_cr, 16, 8);
  57.364 +			}else{
  57.365 +				extend_edges_right(dma_y, dma_cb, dma_cr, 16, 8, 1);
  57.366 +			}
  57.367 +		}
  57.368 +	}
  57.369 +		
  57.370 +	if (mb_y == 0){
  57.371 +		dma_y  = &dma_y_ls [32*64];
  57.372 +		dma_cb = &dma_cb_ls[16*32];
  57.373 +		dma_cr = &dma_cr_ls[16*32];
  57.374 +		extra_y = extra_edge_y  + 32*32;
  57.375 +		extra_cb= extra_edge_cb + 16*16;
  57.376 +		extra_cr= extra_edge_cr + 16*16;
  57.377 +		
  57.378 +		if (mb_x ==0){
  57.379 +			extend_edges_top (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8);
  57.380 +			extend_edges_top (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8);
  57.381 +			extend_edges_top (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8);
  57.382 +		}else if (mb_x == s->mb_width -1){
  57.383 +			if (pos==2){
  57.384 +				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.385 +				extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
  57.386 +				extend_extra_edge_top(extra_y, extra_cb, extra_cr);
  57.387 +			}else if (pos == 3){
  57.388 +				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.389 +				extend_extra_edge_top(extra_y, extra_cb, extra_cr);
  57.390 +			}else{
  57.391 +				extend_edges_top (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.392 +				extend_edges_top (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
  57.393 +				extend_edges_top (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8);
  57.394 +			}			
  57.395 +		}else {
  57.396 +			extend_edges_top (dma_y + pos*16, dma_cb + pos*8, dma_cr + pos*8);
  57.397 +		}
  57.398 +	}else if (mb_y == s->mb_height -1){
  57.399 +		dma_y  = &dma_y_ls [19*64];
  57.400 +		dma_cb = &dma_cb_ls[9*32];
  57.401 +		dma_cr = &dma_cr_ls[9*32];
  57.402 +		extra_y = extra_edge_y  + 19*32;
  57.403 +		extra_cb= extra_edge_cb + 9*16;
  57.404 +		extra_cr= extra_edge_cr + 9*16;
  57.405 +		
  57.406 +		if (mb_x ==0){
  57.407 +			extend_edges_bottom (dma_y + 0*16, dma_cb +0*8, dma_cr + 0*8);
  57.408 +			extend_edges_bottom (dma_y + 1*16, dma_cb +1*8, dma_cr + 1*8);
  57.409 +			extend_edges_bottom (dma_y + 2*16, dma_cb +2*8, dma_cr + 2*8);
  57.410 +		}else if (mb_x == s->mb_width -1){
  57.411 +			if (pos==2){
  57.412 +				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.413 +				extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
  57.414 +				extend_extra_edge_bottom(extra_y, extra_cb, extra_cr);
  57.415 +			}else if (pos == 3){
  57.416 +				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.417 +				extend_extra_edge_bottom(extra_y, extra_cb, extra_cr);
  57.418 +			}else{				
  57.419 +				extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.420 +				extend_edges_bottom (dma_y + (pos+1)*16, dma_cb +(pos+1)*8, dma_cr + (pos+1)*8);
  57.421 +				extend_edges_bottom (dma_y + (pos+2)*16, dma_cb +(pos+2)*8, dma_cr + (pos+2)*8);
  57.422 +			}
  57.423 +		}else {
  57.424 +			extend_edges_bottom (dma_y + pos*16, dma_cb +pos*8, dma_cr + pos*8);
  57.425 +		}
  57.426 +	}
  57.427 +}
  57.428 +
  57.429 +static void send_pic_data(H264Context_spu *h, int mb_x, int mb_y, int pos, int stride_y, int stride_c){
  57.430 +	H264slice *s = h->s;
  57.431 +	int lines, lines_c;
  57.432 +	int linesize = s->linesize;
  57.433 +	int uvlinesize = s->uvlinesize;
  57.434 +	
  57.435 +	uint8_t* dst_y  = s->dst_y + (mb_x-pos)*16 + (mb_y*16)*linesize;
  57.436 +	uint8_t* dst_cb = s->dst_cb +(mb_x-pos)*8 + (mb_y*8)*uvlinesize;
  57.437 +	uint8_t* dst_cr = s->dst_cr +(mb_x-pos)*8 + (mb_y*8)*uvlinesize;
  57.438 +
  57.439 +	if (mb_y == 0){
  57.440 +		dst_y -= 32 *linesize;
  57.441 +		dst_cb-= 16 *uvlinesize;
  57.442 +		dst_cr-= 16 *uvlinesize;
  57.443 +	}else {
  57.444 +		dst_y -= 4 *linesize;
  57.445 +		dst_cb-= 2 *uvlinesize;
  57.446 +		dst_cr-= 2 *uvlinesize;
  57.447 +	}
  57.448 +	
  57.449 +	if (mb_y == 0){
  57.450 +		lines = 12+32; lines_c=6+16;
  57.451 +	}else if (mb_y == s->mb_height-1){
  57.452 +		lines = 20+32; lines_c=10+16;
  57.453 +	}else{
  57.454 +		lines = 16; lines_c=8;
  57.455 +	}
  57.456 +	
  57.457 +	put_list = put_list_buf;
  57.458 +	put_dma_list(dma_y_ls, dst_y, stride_y, lines, linesize, MBD_pic);
  57.459 +	put_dma_list(dma_cb_ls, dst_cb, stride_c, lines_c, uvlinesize, MBD_pic);
  57.460 +	put_dma_list(dma_cr_ls, dst_cr, stride_c, lines_c, uvlinesize, MBD_pic);
  57.461 +
  57.462 +	if (mb_x == s->mb_width-1 && pos>1){		
  57.463 +		put_dma_list(extra_edge_y, dst_y+64, 32, lines, linesize, MBD_pic);
  57.464 +		put_dma_list(extra_edge_cb, dst_cb+32, 16, lines_c, uvlinesize, MBD_pic);
  57.465 +		put_dma_list(extra_edge_cr, dst_cr+32, 16, lines_c, uvlinesize, MBD_pic);
  57.466 +   	}
  57.467 +}
  57.468 +
  57.469 +void copy_data_and_send(H264Context_spu *h, int mb_x, int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
  57.470 +	H264slice *s = h->s;
  57.471 +	int lines, lines_c;
  57.472 +	int pos = (mb_x+2)%4; //4 slots in our 64 byte wide transfer buffer. Offset 2 for edge emulation
  57.473 +	uint8_t *dma_y = &dma_y_ls[pos*16];
  57.474 +	uint8_t *dma_cb = &dma_cb_ls[pos*8];
  57.475 +	uint8_t *dma_cr = &dma_cr_ls[pos*8];
  57.476 +	
  57.477 +	if (mb_y == 0){
  57.478 +		dma_y += 32*64;
  57.479 +		dma_cb+= 16*32;
  57.480 +		dma_cr+= 16*32;
  57.481 +	}else{		
  57.482 +		dest_y -= 4*stride_y;
  57.483 +		dest_cb-= 2*stride_c;
  57.484 +		dest_cr-= 2*stride_c;		
  57.485 +	}
  57.486 +	
  57.487 +	if (mb_y == 0){
  57.488 +		lines = 12; lines_c=6;
  57.489 +	}else if (mb_y == s->mb_height-1){
  57.490 +		lines = 20; lines_c=10;
  57.491 +	}else{
  57.492 +		lines = 16; lines_c=8;
  57.493 +	}
  57.494 +
  57.495 +	for(int i=0; i<lines; i++){
  57.496 +		qword *qdest_y = (qword *) dest_y;
  57.497 +		qword *qdma_y  = (qword *) dma_y;
  57.498 +		*qdma_y = *qdest_y;
  57.499 +		dma_y +=64;
  57.500 +		dest_y+=stride_y;
  57.501 +	}
  57.502 +
  57.503 +	for(int i=0; i<lines_c; i++){
  57.504 +		uint64_t *ddest_cb  = (uint64_t *) dest_cb;
  57.505 +		uint64_t *ddest_cr  = (uint64_t *) dest_cr;
  57.506 +		uint64_t *ddma_cb   = (uint64_t *) dma_cb;
  57.507 +		uint64_t *ddma_cr   = (uint64_t *) dma_cr;
  57.508 +		*ddma_cb = *ddest_cb;
  57.509 +		*ddma_cr = *ddest_cr;
  57.510 +		dma_cb +=32;
  57.511 +		dma_cr +=32;
  57.512 +		dest_cb+=stride_c;
  57.513 +		dest_cr+=stride_c;
  57.514 +	}
  57.515 +
  57.516 +	extend_edges(h, mb_x, mb_y);
  57.517 +
  57.518 +	//send when dma buf is full
  57.519 +	if (pos==3){
  57.520 +		send_pic_data(h, mb_x, mb_y, pos, 64, 32);
  57.521 +	} else if (mb_x == s->mb_width-1){
  57.522 +		send_pic_data(h, mb_x, mb_y, pos, 64, 32);
  57.523 +	}
  57.524 +}
  57.525 +
  57.526 +static void shift_left(int mb_y, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
  57.527 +	int lines, lines_c;
  57.528 +	if (mb_y > 0){
  57.529 +		lines  =20;
  57.530 +		lines_c=10;
  57.531 +		dest_y  -= 4*stride_y;
  57.532 +		dest_cb -= 2*stride_c;
  57.533 +		dest_cr -= 2*stride_c;
  57.534 +	}else {
  57.535 +		lines  =16;
  57.536 +		lines_c= 8;		
  57.537 +	}		
  57.538 +		
  57.539 +	for (int i=0; i<lines; i++){
  57.540 +		qword *left_y  = (qword *) (dest_y -16);
  57.541 +		qword *qdest_y = (qword *) dest_y;
  57.542 +		*left_y = *qdest_y;
  57.543 +		dest_y += stride_y;
  57.544 +	}
  57.545 +	
  57.546 +	for (int i=0; i<lines_c; i++){
  57.547 +		uint64_t *left_cb  = (uint64_t *) (dest_cb -8);
  57.548 +		uint64_t *left_cr  = (uint64_t *) (dest_cr -8);
  57.549 +		uint64_t *ddest_cb = (uint64_t *) dest_cb;
  57.550 +		uint64_t *ddest_cr = (uint64_t *) dest_cr;
  57.551 +		*left_cb = *ddest_cb;
  57.552 +		*left_cr = *ddest_cr;
  57.553 +		dest_cb += stride_c;
  57.554 +		dest_cr += stride_c;
  57.555 +	}
  57.556 +}
  57.557 +
  57.558 +void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c){
  57.559 +	H264slice *s = h->s;
  57.560 +	H264Mb *mb = h->mb;
  57.561 +    const int mb_x= mb->mb_x;
  57.562 +    const int mb_y= mb->mb_y;    
  57.563 +    const int mb_type= mb->mb_type;
  57.564 +	
  57.565 +	uint8_t *dest_y, *dest_cb, *dest_cr;	//ls ptrs (abstracts the fact it is operating in a ls buffer)
  57.566 +
  57.567 +    int i;
  57.568 +  
  57.569 +    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
  57.570 +    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
  57.571 +
  57.572 +	dest_y  = dest_y_ls + 16 + 4*stride_y;
  57.573 +	dest_cb = dest_cb_ls + 8 + 2*stride_c;
  57.574 +	dest_cr = dest_cr_ls + 8 + 2*stride_c;
  57.575 +	
  57.576 +	if(IS_8x8DCT(mb_type)){
  57.577 +		idct_dc_add = ff_idct8_dc_add;
  57.578 +		idct_add = h->dsp.h264_idct_add[0];
  57.579 +	}
  57.580 +	else{
  57.581 +		idct_dc_add = ff_idct_dc_add;
  57.582 +		idct_add = h->dsp.h264_idct_add[1];
  57.583 +	}
  57.584 +
  57.585 +	if (mb_y>0){
  57.586 +		copy_top_borders(mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.587 +	}
  57.588 +
  57.589 +	if(IS_INTRA(mb_type)){
  57.590 +		xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 1);
  57.591 +
  57.592 +		h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cb, stride_c);
  57.593 +		h->hpc.pred8x8[ mb->chroma_pred_mode ](dest_cr, stride_c);
  57.594 +
  57.595 +		if(IS_INTRA4x4(mb_type)){
  57.596 +			if(IS_8x8DCT(mb_type)){
  57.597 +
  57.598 +				for(i=0; i<16; i+=4){
  57.599 +					uint8_t * const ptr= dest_y + block_offset[i];
  57.600 +					const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ];
  57.601 +					const int nnz = mb->non_zero_count_cache[ scan8[i] ];
  57.602 +					h->hpc.pred8x8l[ dir ](ptr, (mb->topleft_samples_available<<i)&0x8000,
  57.603 +												(mb->topright_samples_available<<i)&0x4000, stride_y);
  57.604 +
  57.605 +					if(nnz){
  57.606 +						if(nnz == 1 && mb->mb[i*16])
  57.607 +							idct_dc_add(ptr, mb->mb + i*16, stride_y);
  57.608 +						else{
  57.609 +							idct_add   (ptr, mb->mb + i*16, stride_y);
  57.610 +						}
  57.611 +					}
  57.612 +				}
  57.613 +			}else{
  57.614 +				for(i=0; i<16; i++){
  57.615 +					uint8_t * const ptr= dest_y + block_offset[i];
  57.616 +					const int dir= mb->intra4x4_pred_mode_cache[ scan8[i] ];
  57.617 +
  57.618 +					uint8_t *topright;
  57.619 +					int nnz, tr;
  57.620 +					if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
  57.621 +						const int topright_avail= (mb->topright_samples_available<<i)&0x8000;
  57.622 +						if(!topright_avail){
  57.623 +							tr= ptr[3 - stride_y]*0x01010101;
  57.624 +							topright= (uint8_t*) &tr;
  57.625 +						}else
  57.626 +							topright= ptr + 4 - stride_y;
  57.627 +					}else
  57.628 +						topright= NULL;
  57.629 +
  57.630 +					h->hpc.pred4x4[ dir ](ptr, topright, stride_y);
  57.631 +					nnz = mb->non_zero_count_cache[ scan8[i] ];
  57.632 +					if(nnz){
  57.633 +						if(nnz == 1 && mb->mb[i*16])
  57.634 +							idct_dc_add(ptr, mb->mb + i*16, stride_y);
  57.635 +						else
  57.636 +							idct_add   (ptr, mb->mb + i*16, stride_y);
  57.637 +					}
  57.638 +				}
  57.639 +			}
  57.640 +
  57.641 +		}else{
  57.642 +			h->hpc.pred16x16[ mb->intra16x16_pred_mode ](dest_y , stride_y);
  57.643 +			h264_luma_dc_dequant_idct_c(mb->mb, mb->dequant4_coeff_y);
  57.644 +		}
  57.645 +		xchg_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c, 0);
  57.646 +
  57.647 +	}else {
  57.648 +		hl_motion(h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.649 +	}
  57.650 +
  57.651 +	if(!IS_INTRA4x4(mb_type)){
  57.652 +		if(IS_INTRA16x16(mb_type)){
  57.653 +			for(i=0; i<16; i++){
  57.654 +				if(mb->non_zero_count_cache[ scan8[i] ])
  57.655 +					idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
  57.656 +				else if(mb->mb[i*16])
  57.657 +					idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
  57.658 +			}
  57.659 +		}else if(mb->cbp&15){
  57.660 +			const int incr = IS_8x8DCT(mb_type) ? 4 : 1;
  57.661 +			for(i=0; i<16; i+=incr){
  57.662 +				int nnz = mb->non_zero_count_cache[ scan8[i] ];
  57.663 +				if(nnz){
  57.664 +					if(nnz==1 && mb->mb[i*16])
  57.665 +						idct_dc_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
  57.666 +					else
  57.667 +						idct_add(dest_y + block_offset[i], mb->mb + i*16, stride_y);
  57.668 +				}
  57.669 +			}
  57.670 +		}
  57.671 +	}
  57.672 +
  57.673 +	if(mb->cbp&0x30){
  57.674 +		uint8_t *dest[2] = {dest_cb, dest_cr};
  57.675 +		chroma_dc_dequant_idct_c(mb->mb + 16*16, mb->dequant4_coeff_cb);
  57.676 +		chroma_dc_dequant_idct_c(mb->mb + 16*16+4*16, mb->dequant4_coeff_cr);
  57.677 +
  57.678 +		idct_add = h->dsp.h264_idct_add[1];
  57.679 +		idct_dc_add = ff_idct_dc_add;
  57.680 +		for(i=16; i<16+8; i++){
  57.681 +			if(mb->non_zero_count_cache[ scan8[i] ])
  57.682 +				idct_add   (dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c);
  57.683 +			else if(mb->mb[i*16])
  57.684 +				idct_dc_add(dest[(i&4)>>2] + block_offset[i], mb->mb + i*16, stride_c);
  57.685 +		}
  57.686 +	}
  57.687 +
  57.688 +	// save unfiltered borders
  57.689 +	backup_mb_border(h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.690 +	if (mb->deblock_mb){
  57.691 +		filter_mb( h, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.692 +	}
  57.693 +
  57.694 +	if (mb_y < s->mb_height-1){
  57.695 +		if(mb_x>0){
  57.696 +			send_top_borders(h, mb_x-1, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
  57.697 +		}
  57.698 +		if (mb_x == s->mb_width-1){
  57.699 +			send_top_borders(h, mb_x, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.700 +		}
  57.701 +	}
  57.702 +	update_tgt_spe_dep(h, 0);
  57.703 +
  57.704 +	if (h->blocking){
  57.705 +		if (mb_x>0){			
  57.706 +			copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
  57.707 +			wait_dma_id(MBD_pic);
  57.708 +		}
  57.709 +		if (mb_x == s->mb_width-1){			
  57.710 +			copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.711 +			wait_dma_id(MBD_pic);
  57.712 +		}
  57.713 +		
  57.714 +	}else{
  57.715 +		if (mb_x>0){
  57.716 +			wait_dma_id(MBD_pic);
  57.717 +			copy_data_and_send(h, mb_x-1, mb_y, dest_y-16, dest_cb-8, dest_cr-8, stride_y, stride_c);
  57.718 +		}
  57.719 +		if (mb_x == s->mb_width-1){
  57.720 +			wait_dma_id(MBD_pic);
  57.721 +			copy_data_and_send(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.722 +		}
  57.723 +	}
  57.724 +
  57.725 +	if (mb_x < s->mb_width)
  57.726 +		shift_left(mb_y, dest_y, dest_cb, dest_cr, stride_y, stride_c);
  57.727 +	
  57.728 +}

    58.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    58.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_decode_mb_spu.h	Mon Aug 27 12:09:56 2012 +0200
    58.3 @@ -0,0 +1,97 @@
    58.4 +/*
    58.5 + * Copyright (c) 2009 TUDelft 
    58.6 + * 
    58.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
    58.8 + */
    58.9 +
   58.10 +/**
   58.11 + * @file libavcodec/cell/h264_decode_mb_spu.h
   58.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   58.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   58.14 + * 
   58.15 + * SIMD kernels 
   58.16 + * H.264/AVC motion compensation
   58.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   58.18 + * @author Albert Paradis <apar7632@hotmail.com>
   58.19 + */ 
   58.20 +
   58.21 +#ifndef H264_DECODE_MB_SPU_H
   58.22 +#define H264_DECODE_MB_SPU_H
   58.23 +
   58.24 +#define CELL_SPE
   58.25 +#include "libavcodec/avcodec.h"
   58.26 +#include "types_spu.h"
   58.27 +#include "h264_types_spu.h"
   58.28 +#include "h264_mc_spu.h"
   58.29 +#include "h264_dma.h"
   58.30 +#include "dsputil_spu.h"
   58.31 +#include "h264_intra_spu.h"
   58.32 +
   58.33 +/**
   58.34 + * H264Context
   58.35 + */
   58.36 +typedef struct H264Context_spu{
   58.37 +	DECLARE_ALIGNED_16(H264spe, spe);		// contains simple type parameters that doesn't change
   58.38 +    DECLARE_ALIGNED_16(H264Mb, mb_buf[3]);			// contains simple type parameters that changes for macroblock
   58.39 +    DECLARE_ALIGNED_16(H264slice, slice_buf[2]);	// contains simple type parameters that changes for slice
   58.40 +	
   58.41 +	DSPContext_spu dsp;  // struct that contains pointers to mc interpolations functions
   58.42 +	H264PredContext_spu hpc;  // struct that contains pointers to intra prediction functions
   58.43 +
   58.44 +	H264slice *s;
   58.45 +	int sl_idx;
   58.46 +	int frames;
   58.47 +	//mc arg buffer
   58.48 +	H264mc mc_buf[2];
   58.49 +	H264mc *mc;		//mc ptr to current decoded mb
   58.50 +	int mc_idx;
   58.51 +	int n_mc;		//next mb_id to mc
   58.52 +	int mb_proc;
   58.53 +	int mb_total;
   58.54 +	int curr_line;
   58.55 +	
   58.56 +	H264Mb* mb;		//mb ptr to current decoded mb
   58.57 +	int mb_id;		//next mb_id to dma
   58.58 +	int mb_dec; 	//mb_buf index - decoded mb
   58.59 +	int mb_mc;		//mb_buf index - prebuffer motion data
   58.60 +	int mb_dma;		//mb_buf index - target for dma mb data
   58.61 +	int next_mb_idx;
   58.62 +/*// for deblocking filter
   58.63 +    int edges[2];
   58.64 +    int start[2]; 
   58.65 +    int bS[2][4][4];				// dir, edge, bS;
   58.66 +    int qp[2][4];					// dir, edge;
   58.67 +    int chroma_qp[2][2][4];			// cb/cr, dir, edge;	
   58.68 +*/
   58.69 +	int blocking; 
   58.70 +}H264Context_spu;
   58.71 +
   58.72 +void print_output(H264Context_spu* h, const char* msg);
   58.73 +void hl_decode_mb_internal(H264Context_spu *h, int stride_y, int stride_c);
   58.74 +void update_tgt_spe_dep(H264Context_spu *h, int end);
   58.75 +
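   58.76 +// IDCT function pointers (objects defined in this header, not extern declarations) and DC-only IDCT helpers. NOTE(review): hl_decode_mb_internal() declares same-named locals -- confirm these globals are still needed.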
   58.77 +void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
   58.78 +void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
   58.79 +
   58.80 +void ff_idct_dc_add(uint8_t *dst, DCTELEM *block, int stride);
   58.81 +void ff_idct8_dc_add(uint8_t *dst, DCTELEM *block, int stride);
   58.82 +
   58.83 +void ff_cropTbl_init();
   58.84 +void add_pixels8_c(uint8_t *pixels, DCTELEM *block, int line_size);
   58.85 +void add_pixels4_c(uint8_t *pixels, DCTELEM *block, int line_size);
   58.86 +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul);
   58.87 +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul);
   58.88 +// Filter functions
   58.89 +//void calculate_bS_qp(H264Context_spu *h);
   58.90 +
   58.91 +// Motion compensation function
   58.92 +void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc);
   58.93 +void calc_mc_params(H264Mb *mb, H264mc *mc);
   58.94 +void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c);
   58.95 +
   58.96 +
   58.97 +// Function to get traces
   58.98 +void trace_event_SPU(int event, int id);
   58.99 +
  58.100 +#endif

    59.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    59.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.c	Mon Aug 27 12:09:56 2012 +0200
    59.3 @@ -0,0 +1,332 @@
    59.4 +/*
    59.5 + * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
    59.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    59.7 + *
    59.8 + * This file is part of FFmpeg.
    59.9 + *
   59.10 + * FFmpeg is free software; you can redistribute it and/or
   59.11 + * modify it under the terms of the GNU Lesser General Public
   59.12 + * License as published by the Free Software Foundation; either
   59.13 + * version 2.1 of the License, or (at your option) any later version.
   59.14 + *
   59.15 + * FFmpeg is distributed in the hope that it will be useful,
   59.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   59.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   59.18 + * Lesser General Public License for more details.
   59.19 + *
   59.20 + * You should have received a copy of the GNU Lesser General Public
   59.21 + * License along with FFmpeg; if not, write to the Free Software
   59.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   59.23 + */
   59.24 +
   59.25 +/**
   59.26 + * @file
   59.27 + * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
   59.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   59.29 + */
   59.30 +#define CELL_SPE
   59.31 +#include "libavcodec/avcodec.h"
   59.32 +#include "dsputil_spu.h"
   59.33 +#include "h264_tables.h"
   59.34 +#include "h264_types_spu.h"
   59.35 +#include "libavutil/common.h"
   59.36 +#include "libavutil/intreadwrite.h"
   59.37 +#include "mathops_spu.h"
   59.38 +#include "rectangle_spu.h"
   59.39 +
   59.40 +//#undef NDEBUG
   59.41 +#include <assert.h>
   59.42 +static void pred_spatial_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
   59.43 +    H264Mb *m = s->m;
   59.44 +    int b4_stride = hc->b_stride;
   59.45 +	const int mb_x = m->mb_x;    
   59.46 +    int mb_type_col[2];
   59.47 +    const int16_t (*l1mv0)[2], (*l1mv1)[2];
   59.48 +    const int8_t *l1ref0, *l1ref1;
   59.49 +    const int is_b8x8 = IS_8X8(*mb_type);
   59.50 +    unsigned int sub_mb_type= MB_TYPE_L0L1;
   59.51 +    int i8, i4;
   59.52 +    int ref[2];
   59.53 +    int mv[2];
   59.54 +    int list;
   59.55 +
   59.56 +    //assert(h->ref_list[1][0].reference&3);
   59.57 +
   59.58 +#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
   59.59 +
   59.60 +    /* ref = min(neighbors) */
   59.61 +    for(list=0; list<2; list++){
   59.62 +        int left_ref = m->ref_cache[list][scan8[0] - 1];
   59.63 +        int top_ref  = m->ref_cache[list][scan8[0] - 8];
   59.64 +        int refc = m->ref_cache[list][scan8[0] - 8 + 4];
   59.65 +        const int16_t *C= m->mv_cache[list][ scan8[0] - 8 + 4];
   59.66 +        if(refc == PART_NOT_AVAILABLE){
   59.67 +            refc = m->ref_cache[list][scan8[0] - 8 - 1];
   59.68 +            C    = m-> mv_cache[list][scan8[0] - 8 - 1];
   59.69 +        }
   59.70 +        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc);
   59.71 +        if(ref[list] >= 0){
   59.72 +            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
   59.73 +            const int16_t * const A= m->mv_cache[list][ scan8[0] - 1 ];
   59.74 +            const int16_t * const B= m->mv_cache[list][ scan8[0] - 8 ];
   59.75 +
   59.76 +            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
   59.77 +            if(match_count > 1){ //most common
   59.78 +                mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]),
   59.79 +                                     mid_pred(A[1], B[1], C[1]) );
   59.80 +            }else {
   59.81 +                assert(match_count==1);
   59.82 +                if(left_ref==ref[list]){
   59.83 +                    mv[list]= AV_RN32A(A);
   59.84 +                }else if(top_ref==ref[list]){
   59.85 +                    mv[list]= AV_RN32A(B);
   59.86 +                }else{
   59.87 +                    mv[list]= AV_RN32A(C);
   59.88 +                }
   59.89 +            }
   59.90 +        }else{
   59.91 +            int mask= ~(MB_TYPE_L0 << (2*list));
   59.92 +            mv[list] = 0;
   59.93 +            ref[list] = -1;
   59.94 +            if(!is_b8x8)
   59.95 +                *mb_type &= mask;
   59.96 +            sub_mb_type &= mask;
   59.97 +        }
   59.98 +    }
   59.99 +
  59.100 +    if(ref[0] < 0 && ref[1] < 0){
  59.101 +        ref[0] = ref[1] = 0;
  59.102 +        if(!is_b8x8)
  59.103 +            *mb_type |= MB_TYPE_L0L1;
  59.104 +        sub_mb_type |= MB_TYPE_L0L1;
  59.105 +    }
  59.106 +
  59.107 +    if(!(is_b8x8|mv[0]|mv[1])){
  59.108 +        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
  59.109 +        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
  59.110 +        fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
  59.111 +        fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
  59.112 +        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
  59.113 +        return;
  59.114 +    }
  59.115 +
  59.116 +    mb_type_col[0] =
  59.117 +    mb_type_col[1] = hc->list1_mb_type[mb_x];
  59.118 +
  59.119 +    sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
  59.120 +    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
  59.121 +        *mb_type   |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
  59.122 +    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
  59.123 +        *mb_type   |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
  59.124 +    }else{
  59.125 +        if(!s->direct_8x8_inference_flag){
  59.126 +            /* FIXME save sub mb types from previous frames (or derive from MVs)
  59.127 +            * so we know exactly what block size to use */
  59.128 +            sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
  59.129 +        }
  59.130 +        *mb_type   |= MB_TYPE_8x8;
  59.131 +    }
  59.132 +
  59.133 +//     l1mv0  = (void *) &hc->list1_motion_val[0][4*mb_x];
  59.134 +//     l1mv1  = (void *) &hc->list1_motion_val[1][4*mb_x];
  59.135 +	l1mv0  = (void *) hc->list1_motion_val[0];
  59.136 +    l1mv1  = (void *) hc->list1_motion_val[1];
  59.137 +    l1ref0 = &hc->list1_ref_index [0][4*mb_x];
  59.138 +    l1ref1 = &hc->list1_ref_index [1][4*mb_x];
  59.139 +//     if(!b8_stride){
  59.140 +//         if(m->mb_y&1){
  59.141 +//             l1ref0 += 2;
  59.142 +//             l1ref1 += 2;
  59.143 +//             l1mv0  +=  2*b4_stride;
  59.144 +//             l1mv1  +=  2*b4_stride;
  59.145 +//         }
  59.146 +//     }
  59.147 +
  59.148 +    if(IS_16X16(*mb_type)){
  59.149 +        int a,b;
  59.150 +
  59.151 +        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
  59.152 +        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
  59.153 +        if(!IS_INTRA(mb_type_col[0]) && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
  59.154 +            || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
  59.155 +            ))){
  59.156 +            a=b=0;
  59.157 +            if(ref[0] > 0)
  59.158 +                a= mv[0];
  59.159 +            if(ref[1] > 0)
  59.160 +                b= mv[1];
  59.161 +        }else{
  59.162 +            a= mv[0];
  59.163 +            b= mv[1];
  59.164 +        }
  59.165 +        fill_rectangle(&m->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
  59.166 +        fill_rectangle(&m->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
  59.167 +    }else{
  59.168 +        int n=0;
  59.169 +        for(i8=0; i8<4; i8++){
  59.170 +            const int x8 = i8&1;
  59.171 +            const int y8 = i8>>1;
  59.172 +
  59.173 +            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
  59.174 +                continue;
  59.175 +            m->sub_mb_type[i8] = sub_mb_type;
  59.176 +
  59.177 +            fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4);
  59.178 +            fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4);
  59.179 +            fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
  59.180 +            fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
  59.181 +
  59.182 +            /* col_zero_flag */
  59.183 +            if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 ))
  59.184 +                ){
  59.185 +                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
  59.186 +                if(IS_SUB_8X8(sub_mb_type)){
  59.187 +//                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
  59.188 +					const int16_t *mv_col = l1mv[x8*3 + y8*3*4];
  59.189 +                    if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
  59.190 +                        if(ref[0] == 0)
  59.191 +                            fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
  59.192 +                        if(ref[1] == 0)
  59.193 +                            fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
  59.194 +                        n+=4;
  59.195 +                    }
  59.196 +                }else{
  59.197 +                    int k=0;
  59.198 +                    for(i4=0; i4<4; i4++){
  59.199 +                        //const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
  59.200 +						const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4];
  59.201 +                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
  59.202 +                            if(ref[0] == 0)
  59.203 +                                AV_ZERO32(m->mv_cache[0][scan8[i8*4+i4]]);
  59.204 +                            if(ref[1] == 0)
  59.205 +                                AV_ZERO32(m->mv_cache[1][scan8[i8*4+i4]]);
  59.206 +                            k++;
  59.207 +                        }
  59.208 +                    }
  59.209 +                    if(!(k&3))
  59.210 +                        m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8;
  59.211 +                    n+=k;
  59.212 +                }
  59.213 +            }
  59.214 +        }
  59.215 +        if(!is_b8x8 && !(n&15)){
  59.216 +            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
  59.217 +        }
  59.218 +    }
  59.219 +}
  59.220 +
  59.221 +static void pred_temp_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
  59.222 +    H264Mb *m = s->m;
  59.223 +	const int mb_x = m->mb_x;
  59.224 +    int b4_stride = hc->b_stride;    
  59.225 +    int mb_type_col[2];
  59.226 +    const int16_t (*l1mv0)[2], (*l1mv1)[2];
  59.227 +    const int8_t *l1ref0, *l1ref1;
  59.228 +    const int is_b8x8 = IS_8X8(*mb_type);
  59.229 +    unsigned int sub_mb_type;
  59.230 +    int i8, i4;
  59.231 +    const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]};
  59.232 +    const int *dist_scale_factor = s->dist_scale_factor;
  59.233 +
  59.234 +    mb_type_col[0] =
  59.235 +    mb_type_col[1] = hc->list1_mb_type[mb_x];
  59.236 +
  59.237 +    sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
  59.238 +    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
  59.239 +        *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
  59.240 +    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
  59.241 +        *mb_type   |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
  59.242 +    }else{
  59.243 +        if(!s->direct_8x8_inference_flag){
  59.244 +            /* FIXME save sub mb types from previous frames (or derive from MVs)
  59.245 +            * so we know exactly what block size to use */
  59.246 +            sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
  59.247 +        }
  59.248 +        *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
  59.249 +    }
  59.250 +
  59.251 +//     l1mv0  = (void *) &hc->list1_motion_val[0][4*mb_x];
  59.252 +//     l1mv1  = (void *) &hc->list1_motion_val[1][4*mb_x];
  59.253 +	l1mv0  = (void *) hc->list1_motion_val[0];
  59.254 +    l1mv1  = (void *) hc->list1_motion_val[1];
  59.255 +    l1ref0 = &hc->list1_ref_index [0][4*mb_x];
  59.256 +    l1ref1 = &hc->list1_ref_index [1][4*mb_x];
  59.257 +
  59.258 +    /* one-to-one mv scaling */
  59.259 +    if(IS_16X16(*mb_type)){
  59.260 +        int ref, mv0, mv1;
  59.261 +
  59.262 +        fill_rectangle(&m->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
  59.263 +        if(IS_INTRA(mb_type_col[0])){
  59.264 +            ref=mv0=mv1=0;
  59.265 +        }else{
  59.266 +            const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
  59.267 +            : map_col_to_list0[1][l1ref1[0]];
  59.268 +            const int scale = dist_scale_factor[ref0];
  59.269 +            const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
  59.270 +            int mv_l0[2];
  59.271 +            mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
  59.272 +            mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
  59.273 +            ref= ref0;
  59.274 +            mv0= pack16to32(mv_l0[0],mv_l0[1]);
  59.275 +            mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
  59.276 +        }
  59.277 +        fill_rectangle(&m->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
  59.278 +        fill_rectangle(&m-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
  59.279 +        fill_rectangle(&m-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
  59.280 +    }else{
  59.281 +        for(i8=0; i8<4; i8++){
  59.282 +            const int x8 = i8&1;
  59.283 +            const int y8 = i8>>1;
  59.284 +            int ref0, scale;
  59.285 +            const int16_t (*l1mv)[2]= l1mv0;
  59.286 +
  59.287 +            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
  59.288 +                continue;
  59.289 +            m->sub_mb_type[i8] = sub_mb_type;
  59.290 +            fill_rectangle(&m->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
  59.291 +            if(IS_INTRA(mb_type_col[0])){
  59.292 +                fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
  59.293 +                fill_rectangle(&m-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
  59.294 +                fill_rectangle(&m-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
  59.295 +                continue;
  59.296 +            }
  59.297 +
  59.298 +            ref0 = l1ref0[i8];
  59.299 +            if(ref0 >= 0)
  59.300 +                ref0 = map_col_to_list0[0][ref0 ];
  59.301 +            else{
  59.302 +                ref0 = map_col_to_list0[1][l1ref1[i8]];
  59.303 +                l1mv= l1mv1;
  59.304 +            }
  59.305 +            scale = dist_scale_factor[ref0];
  59.306 +
  59.307 +            fill_rectangle(&m->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
  59.308 +            if(IS_SUB_8X8(sub_mb_type)){
  59.309 +//                 const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
  59.310 +				const int16_t *mv_col = l1mv[x8*3 + y8*3*4];
  59.311 +                int mx = (scale * mv_col[0] + 128) >> 8;
  59.312 +                int my = (scale * mv_col[1] + 128) >> 8;
  59.313 +                fill_rectangle(&m->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
  59.314 +                fill_rectangle(&m->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
  59.315 +            }else
  59.316 +            for(i4=0; i4<4; i4++){
  59.317 +//                 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
  59.318 +				const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*4];
  59.319 +                int16_t *mv_l0 = m->mv_cache[0][scan8[i8*4+i4]];
  59.320 +                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
  59.321 +                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
  59.322 +                AV_WN32A(m->mv_cache[1][scan8[i8*4+i4]],
  59.323 +                    pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]));
  59.324 +            }
  59.325 +        }
  59.326 +    }
  59.327 +}
  59.328 +
  59.329 +void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type){
  59.330 +    if(s->direct_spatial_mv_pred){
  59.331 +        pred_spatial_direct_motion(hc, s, mb_type);
  59.332 +    }else{
  59.333 +        pred_temp_direct_motion(hc, s, mb_type);
  59.334 +    }
  59.335 +}

    60.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    60.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_direct_spu.h	Mon Aug 27 12:09:56 2012 +0200
    60.3 @@ -0,0 +1,8 @@
    60.4 +#ifndef H264_DIRECT_H
    60.5 +#define H264_DIRECT_H
    60.6 +
    60.7 +#include "h264_types_spu.h"
    60.8 +
/**
 * Predict the motion data of a direct-mode macroblock.
 * Uses the spatial predictor when s->direct_spatial_mv_pred is non-zero and
 * the temporal predictor otherwise.
 */
void ff_h264_pred_direct_motion(H264Cabac_spu *hc, EDSlice_spu *s, int *mb_type);
   60.10 +
   60.11 +#endif

    61.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    61.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.c	Mon Aug 27 12:09:56 2012 +0200
    61.3 @@ -0,0 +1,74 @@
    61.4 +#include <spu_mfcio.h>
    61.5 +#include "h264_dma.h"
    61.6 +
/* 16-byte aligned pool of DMA list elements for list puts.
 * NOTE(review): presumably the storage put_list points into - confirm at the
 * place where put_list is (re)initialised, which is not in this file. */
DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]);
/* Next unused list element for put_dma_list(); each call takes h elements
 * and advances this pointer by h. Nothing here rewinds or bounds-checks it. */
dma_list_elem_t* put_list;

/* 16-byte aligned pool of DMA list elements for list gets.
 * NOTE(review): presumably the storage get_list points into - confirm. */
DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]);
/* Next unused list element for get_dma_list(); advanced by h per call,
 * never rewound or bounds-checked in this file. */
dma_list_elem_t* get_list;
   61.12 +
/* Start a DMA get of `size` bytes from the 32-bit effective address `ea`
 * into local store at `ls`, in tag group `tag` (tid and rid are 0).
 * Only issues the command; pair it with wait_dma_id(tag) before reading `ls`. */
inline void spu_dma_get(void *ls, unsigned ea, int size, int tag){
	mfc_get(ls, ea, size, tag, 0, 0);
}
   61.16 +
/* Start a DMA put of `size` bytes from local store at `ls` to the 32-bit
 * effective address `ea`, in tag group `tag` (tid and rid are 0).
 * Only issues the command; pair it with wait_dma_id(tag) before reusing `ls`. */
inline void spu_dma_put(void *ls, unsigned ea, int size, int tag){
	mfc_put(ls, ea, size, tag, 0, 0);
}
   61.20 +
/* Same as spu_dma_put() but issues the barrier form (mfc_putb), which is
 * ordered after the commands previously issued to the MFC queue. */
inline void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag){
	mfc_putb(ls, ea, size, tag, 0, 0);
}
   61.24 +
// Wait until every DMA transfer issued with the specific tag `id` has finished.
// `id` is used as a bit position in the tag mask, so it has to be a valid
// tag number; note that `1 << 31` would overflow a signed int.
inline void wait_dma_id(int id){
	spu_writech(MFC_WrTagMask, 1<< id);      // select only tag group `id`
	(void)spu_mfcstat(MFC_TAG_UPDATE_ALL);   // returns once all selected groups are complete; status value unused
}
   61.30 +
   61.31 +// Functions to get/put a block from/to main memory
   61.32 +void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier)
   61.33 +{
   61.34 +    unsigned int i = 0;
   61.35 +    unsigned int listsize;
   61.36 +    unsigned int ea_low;
   61.37 +
   61.38 +	dma_list_elem_t* list = get_list;
   61.39 +	get_list+=h;
   61.40 +
   61.41 +    ea_low=(uint32_t) mfc_ea2l(ea);
   61.42 +
   61.43 +    /* Create the list, size of each list id the "width" parameter defined by the user */
   61.44 +    for ( i=0; i<h; i++ ){
   61.45 +        list[i].size.all32 = w;
   61.46 +        list[i].ea_low = ea_low;
   61.47 +        ea_low += stride;
   61.48 +    }
   61.49 +    /* Specify the list size and initiate the list transfer */
   61.50 +    listsize = h*sizeof(dma_list_elem_t);
   61.51 +    if (barrier)
   61.52 +		mfc_getlb(dst, (unsigned)ea, list, listsize, tag, 0, 0);
   61.53 +	else
   61.54 +		mfc_getl(dst, (unsigned)ea, list, listsize, tag, 0, 0);
   61.55 +}
   61.56 +
   61.57 +
   61.58 +void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag){
   61.59 +    unsigned int i = 0;
   61.60 +    unsigned int listsize;
   61.61 +    unsigned int ea_low;
   61.62 +
   61.63 +	dma_list_elem_t* list = put_list;
   61.64 +	put_list+=h;
   61.65 +
   61.66 +	ea_low=(uint32_t) mfc_ea2l(ea);
   61.67 +
   61.68 +    /* Create the list, size of each list id the "width" parameter defined by the user */
   61.69 +    for ( i=0; i<h; i++ ) {
   61.70 +        list[i].size.all32 = size;
   61.71 +        list[i].ea_low = ea_low;
   61.72 +        ea_low += stride;
   61.73 +    }
   61.74 +    /* Specify the list size and initiate the list transfer */
   61.75 +    listsize = h*sizeof(dma_list_elem_t);
   61.76 +	mfc_putl(src, (unsigned) ea, list, listsize, tag, 0, 0);
   61.77 +}

    62.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    62.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_dma.h	Mon Aug 27 12:09:56 2012 +0200
    62.3 @@ -0,0 +1,59 @@
    62.4 +#ifndef H264_DMA_H
    62.5 +#define H264_DMA_H
    62.6 +
    62.7 +#include "libavutil/mem.h"
    62.8 +
/* One element of a DMA list as built by get_dma_list()/put_dma_list():
 * a size word followed by the low 32 bits of the effective address.
 * The size word can be written whole (all32, which is what the list builders
 * do) or through the bit-field view.
 * NOTE(review): the bit-field order and the 64-bit-typed ea_low bit-field are
 * compiler/ABI dependent; presumably this packs to the 8-byte element the MFC
 * expects on spu-gcc - confirm with sizeof(dma_list_elem_t). */
typedef struct dma_list_elem {
	union {
		unsigned int all32;            /* whole size word */
		struct {
		unsigned int stall    : 1;     /* NOTE(review): presumably the stall-and-notify flag */
		unsigned int reserved : 15;
		unsigned int nbytes   : 16;    /* transfer size of this element */
		} bits;
	} size;
	uint64_t ea_low : 32;              /* low 32 bits of the effective address */
}dma_list_elem_t;
   62.20 +
/* Element pool and allocation cursor for list puts; put_dma_list() takes h
 * elements at `put_list` and advances it by h on every call. */
extern DECLARE_ALIGNED_16(dma_list_elem_t, put_list_buf[2*(52+26+26)]);
extern dma_list_elem_t* put_list;

/* Element pool and allocation cursor for list gets; get_dma_list() takes h
 * elements at `get_list` and advances it by h on every call. */
extern DECLARE_ALIGNED_16(dma_list_elem_t, get_list_buf[16*(4+5 + 2*3)]);
extern dma_list_elem_t* get_list;
   62.26 +
/* MBD_* identifiers, numbered 1..8.
 * NOTE(review): presumably the tag-group ids used by the macroblock-decoding
 * SPU code for the DMA helpers in this header - confirm against the callers. */
enum{
	MBD_slice=1,
	MBD_buf1,
	MBD_buf2,
	MBD_buf3,
	MBD_put,
	MBD_pic,
	MBD_mc_buf1,
	MBD_mc_buf2
};

/* ED_* identifiers, numbered 1..9.
 * NOTE(review): presumably the tag-group ids used by the entropy-decoding
 * SPU code - confirm against the callers. */
enum{
	ED_spe=1,
	ED_slice,
	ED_raw,
	ED_get,
	ED_get2,
	ED_get_mv,
	ED_put,
	ED_putmb0,
	ED_putmb1,
};
   62.49 +
// Strided block transfers from/to main memory using a DMA list of h elements:
// element i moves w (resp. size) bytes at ea + i*stride. Both functions only
// start the transfer; wait on `tag` for completion. A non-zero `barrier`
// makes get_dma_list use the barrier form of the list get.
void get_dma_list(void *dst, void* ea, unsigned int w, unsigned int h, unsigned int stride, unsigned int tag, int barrier);
void put_dma_list(void *src, void* ea, unsigned int size, unsigned int h, unsigned int stride, unsigned int tag);

// Single DMA transfers between local store `ls` and a 32-bit effective address `ea`.
// They only start the transfer; spu_dma_barrier_put uses the barrier form of put.
void spu_dma_get(void *ls, unsigned ea, int size, int tag);
void spu_dma_put(void *ls, unsigned ea, int size, int tag);
void spu_dma_barrier_put(void *ls, unsigned ea, int size, int tag);

// Waits for all DMA transfers issued with the specific tag `id` to finish.
void wait_dma_id(int id);
   62.61 +
   62.62 +#endif

    63.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    63.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_filter_spu_vec.c	Mon Aug 27 12:09:56 2012 +0200
    63.3 @@ -0,0 +1,650 @@
    63.4 +/*
    63.5 + * Copyright (c) 2009 TUDelft 
    63.6 + * 
    63.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
    63.8 + */
    63.9 +
   63.10 +/**
   63.11 + * @file libavcodec/cell/spu/h264_main_spu.c
   63.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   63.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   63.14 + * 
   63.15 + * SIMD kernels 
   63.16 + * H.264/AVC motion compensation
   63.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   63.18 + * @author Albert Paradis <apar7632@hotmail.com>
   63.19 + */ 
   63.20 +
   63.21 +
   63.22 +#include <stdio.h>
   63.23 +#include <spu_mfcio.h>
   63.24 +#include <spu_intrinsics.h>
   63.25 +
   63.26 +#include "h264_filter_spu.h"
   63.27 +#include "h264_decode_mb_spu.h"
   63.28 +// To use scan8 table
   63.29 +#include "h264_mc_spu.h"
   63.30 +
   63.31 +
   63.32 +int get_chroma_qp(H264Context_spu *h, int t, int qscale){
   63.33 +    return h->slice.chroma_qp_table[t][qscale];
   63.34 +}
   63.35 +
   63.36 +static inline int clip(int a, int amin, int amax){
   63.37 +    if (a < amin)
   63.38 +        return amin;
   63.39 +    else if (a > amax)
   63.40 +        return amax;
   63.41 +    else
   63.42 +        return a;
   63.43 +}
   63.44 +
   63.45 +static inline vsint16_t clip_altivec(vsint16_t a, vsint16_t amin, vsint16_t amax){
   63.46 +    vector unsigned short min_mask,max_mask;
   63.47 +    min_mask = spu_cmpgt(amin, a);
   63.48 +    max_mask = spu_cmpgt(a, amax);
   63.49 +
   63.50 +    return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask);
   63.51 +}
   63.52 +
   63.53 +static inline vsint16_t clip_uint8_altivec(vsint16_t a){
   63.54 +    const vsint16_t amax = {255,255,255,255,255,255,255,255};
   63.55 +    const vsint16_t amin = {0, 0, 0, 0, 0, 0, 0, 0};
   63.56 +    vector unsigned short min_mask,max_mask;
   63.57 +    min_mask = spu_cmpgt(amin, a);
   63.58 +    max_mask = spu_cmpgt(a, amax);
   63.59 +
   63.60 +    return spu_sel(spu_sel(a,amin,min_mask),amax,max_mask);
   63.61 +}
   63.62 +
/**
 * Normal (non-intra) chroma edge filter on four vectors around an edge.
 *
 * pix[-2], pix[-1] hold p1, p0 and pix[0], pix[1] hold q0, q1; every vector
 * carries 8 signed 16-bit lanes. Only pix[-1] and pix[0] are rewritten.
 *
 * @param pix    points at the q0 vector; pix[-2..1] are read
 * @param alpha  threshold for |p0-q0|
 * @param beta   threshold for |p1-p0| and |q1-q0|
 * @param tc0    4 clipping values; entry k is applied to lanes 2k and 2k+1.
 *               Lanes whose value is negative are left untouched.
 */
static  inline void h264_loop_filter_chroma(vsint16_t *pix, int alpha, int beta, int8_t *tc0){

    short a = (short) tc0[0];
    short b = (short) tc0[1];
    short c = (short) tc0[2];
    short d = (short) tc0[3];
    const vsint16_t vec_tc0 = {a,a,b,b,c,c,d,d};
    const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector unsigned short mask_B0;

    /* mask_B0: lanes with tc0 < 0, i.e. lanes that must not be filtered */
    mask_B0 = spu_cmpgt(vec_v0, vec_tc0);

    const vsint16_t p0 = pix[-1];
    const vsint16_t p1 = pix[-2];
    const vsint16_t q0 = pix[0];
    const vsint16_t q1 = pix[1];

    const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
    const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
    const vsint16_t v_2 = {2,2,2,2,2,2,2,2};
    const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
    const vsint16_t v_4 = {4,4,4,4,4,4,4,4};

    vsint16_t rp0;
    vsint16_t rq0;
    vsint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0;
    vector unsigned short mask_B1, mask_tmp;
    vsint16_t i_delta;

    /* Absolute differences are taken byte-wise on the reinterpreted vectors.
     * NOTE(review): this equals the 16-bit |a-b| only while every lane holds
     * a value in 0..255 - presumably guaranteed by the caller; confirm. */
    abs_p0mq0 = (vector signed short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
    abs_p1mp0 = (vector signed short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
    abs_q1mq0 = (vector signed short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);

    /* mask_B1: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */
    mask_B1  = spu_cmpgt(v_alpha, abs_p0mq0);
    mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
    mask_B1  = spu_and(mask_B1, mask_tmp);
    mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
    mask_B1  = spu_and(mask_B1, mask_tmp);


    /* delta = clip( (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc0, tc0 );
     * the negative count given to spu_rlmaska is the right-shift amount */
    i_delta = clip_altivec(spu_rlmaska(spu_add(spu_sl(spu_sub(q0,p0 ), (vuint16_t)v_2), spu_add(spu_sub(p1,q1),v_4)), (vsint16_t)-v_3), -vec_tc0, vec_tc0);

    rp0 = clip_uint8_altivec( spu_add(p0,i_delta));
    rq0 = clip_uint8_altivec( spu_sub(q0,i_delta));

    /* take the filtered value where mask_B1 holds, then restore the original
     * in lanes flagged by mask_B0 */
    pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0);
    pix[0]  = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0);
}
  63.111 +
/**
 * Normal (non-intra) luma edge filter on the vectors around an edge.
 *
 * pix[-3..-1] hold p2, p1, p0 and pix[0..2] hold q0, q1, q2; every vector
 * carries 8 signed 16-bit lanes. pix[-2], pix[-1], pix[0] and pix[1] are
 * rewritten.
 *
 * @param pix           points at the q0 vector; pix[-3..2] are read
 * @param alpha         threshold for |p0-q0|
 * @param beta          threshold for the |p1-p0|, |q1-q0|, |p2-p0|, |q2-q0| tests
 * @param tc0           clipping values; two consecutive entries are used,
 *                      each covering 4 lanes. Lanes whose value is negative
 *                      are left untouched.
 * @param inc_low2high  index of the first tc0 entry to use
 *                      (NOTE(review): presumably 0 for the first 8 samples of
 *                      an edge and 2 for the second 8 - confirm at the caller)
 */
static void h264_v_loop_filter_luma_c(vsint16_t *pix, int alpha, int beta, int8_t *tc0, int inc_low2high){

    short a = (short) tc0[0 + inc_low2high];
    short b = (short) tc0[1 + inc_low2high];
    const vsint16_t vec_tc0 = {a,a,a,a,b,b,b,b};
    const vsint16_t vec_v0 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector unsigned short mask_B0;

    /* mask_B0: lanes with tc0 < 0, i.e. lanes that must not be filtered */
    mask_B0 = spu_cmpgt(vec_v0, vec_tc0);
    const vsint16_t p0 = pix[-1];
    const vsint16_t p1 = pix[-2];
    const vsint16_t p2 = pix[-3];
    const vsint16_t q0 = pix[0];
    const vsint16_t q1 = pix[1];
    const vsint16_t q2 = pix[2];

    const vuint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
    const vuint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};

    const vuint16_t v_1 = {1,1,1,1,1,1,1,1};
    const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
    const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
    const vsint16_t v_4 = {4,4,4,4,4,4,4,4};

    vsint16_t rp0, rp1;
    vsint16_t rq0, rq1;
    vsint16_t tc0_B2P, tc0_B2Q, rtc0;
    vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0;
    vector unsigned short mask_B1, mask_B2P, mask_B2Q, mask_tmp;
    vsint16_t i_delta, i_delta2;

    /* Byte-wise absolute differences on the reinterpreted vectors.
     * NOTE(review): equals the 16-bit |a-b| only while every lane holds a
     * value in 0..255 - presumably guaranteed by the caller; confirm. */
    abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
    abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
    abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
    abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0);
    abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0);

    /* mask_B1: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta */
    mask_B1  = spu_cmpgt(v_alpha, abs_p0mq0);
    mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
    mask_B1  = spu_and(mask_B1, mask_tmp);
    mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
    mask_B1  = spu_and(mask_B1, mask_tmp);

    /* mask_B2P / mask_B2Q: |p2-p0| < beta / |q2-q0| < beta */
    mask_B2P = spu_cmpgt(v_beta, abs_p2mp0);
    mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0);

    /* p1' = p1 + clip( ((p2 + avg(p0,q0)) >> 1) - p1, -tc0, tc0 ), and likewise q1' */
    rp1 = spu_add(p1, clip_altivec(spu_sub(spu_rlmaska(spu_add(p2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), p1), -vec_tc0, vec_tc0 ));
    rq1 = spu_add(q1, clip_altivec(spu_sub(spu_rlmaska(spu_add(q2, (vector signed short) spu_avg((vector unsigned char) p0, (vector unsigned char) q0)),(vsint16_t)-v_1), q1), -vec_tc0, vec_tc0 ));

    /* rtc0 = tc0 + (mask_B2P ? 1 : 0) + (mask_B2Q ? 1 : 0) */
    tc0_B2P = spu_add(vec_tc0, (vsint16_t) v_1);
    tc0_B2P = spu_sel(vec_tc0, tc0_B2P, mask_B2P);

    tc0_B2Q = spu_add(tc0_B2P, (vsint16_t) v_1);
    rtc0    = spu_sel(tc0_B2P, tc0_B2Q, mask_B2Q);
    /* delta = clip( (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -rtc0, rtc0 ) */
    i_delta2 = spu_add(spu_sub(p1,q1),v_4);
    i_delta = spu_sl(spu_sub(q0,p0 ), v_2);
    i_delta = spu_add(i_delta,i_delta2 );
    i_delta = spu_rlmaska(i_delta, (vsint16_t)-v_3);
    i_delta = clip_altivec(i_delta, -rtc0, rtc0);

    rp0 = clip_uint8_altivec( spu_add(p0,i_delta));    /* p0' */
    rq0 = clip_uint8_altivec( spu_sub(q0,i_delta));    /* q0' */

    /* p1/q1 change only where mask_B1 and the matching B2 mask hold; p0/q0
     * change where mask_B1 holds; lanes flagged by mask_B0 keep the input. */
    pix[-2] = spu_sel(spu_sel(p1,spu_sel(p1,rp1,mask_B2P) ,mask_B1), p1,mask_B0);
    pix[-1] = spu_sel(spu_sel(p0, rp0, mask_B1), p0,mask_B0);
    pix[0]  = spu_sel(spu_sel(q0, rq0, mask_B1), q0,mask_B0);
    pix[1]  = spu_sel(spu_sel(q1,spu_sel(q1,rq1,mask_B2Q) ,mask_B1), q1,mask_B0);
}
  63.180 +
  63.181 +
  63.182 +
/**
 * Strong chroma edge filter, used by filter_mb_edgecv() when bS[0] >= 4.
 *
 * pix[-2], pix[-1] hold p1, p0 and pix[0], pix[1] hold q0, q1 (8 lanes of
 * 16 bits each). Only pix[-1] and pix[0] are rewritten, and only in lanes
 * where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta.
 *
 * @param pix    points at the q0 vector; pix[-2..1] are read
 * @param alpha  threshold for |p0-q0|
 * @param beta   threshold for |p1-p0| and |q1-q0|
 */
static inline void h264_loop_filter_chroma_intra(vsint16_t *pix, int alpha, int beta){

    const vuint16_t p0 = (vuint16_t) pix[-1];
    const vuint16_t p1 = (vuint16_t) pix[-2];
    const vuint16_t q0 = (vuint16_t) pix[0];
    const vuint16_t q1 = (vuint16_t) pix[1];

    const vsint16_t v_alpha = {(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha,(signed short) alpha};
    const vsint16_t v_beta = {(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta,(signed short) beta};
    const vuint16_t v_2 = {2,2,2,2,2,2,2,2};

    vuint16_t rp0;
    vuint16_t rq0;
    vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0;
    vector unsigned short mask_B0, mask_tmp;

    /* Byte-wise absolute differences on the reinterpreted vectors.
     * NOTE(review): equals the 16-bit |a-b| only while every lane holds a
     * value in 0..255 - presumably guaranteed by the caller; confirm. */
    abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
    abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
    abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);

    /* mask_B0: lanes that pass all three threshold tests (filter enabled) */
    mask_B0  = spu_cmpgt(v_alpha, (vsint16_t)abs_p0mq0);
    mask_tmp = spu_cmpgt(v_beta, (vsint16_t)abs_p1mp0);
    mask_B0  = spu_and(mask_B0, mask_tmp);
    mask_tmp = spu_cmpgt( v_beta, (vsint16_t)abs_q1mq0);
    mask_B0  = spu_and(mask_B0, mask_tmp);

    rp0 = spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2;
    rp0 = spu_rlmaska(rp0, (vsint16_t)-v_2);
    rq0 = spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2;
    rq0 = spu_rlmaska(rq0, (vsint16_t)-v_2);

    pix[-1] = (vsint16_t) spu_sel(p0, rp0, mask_B0);
    pix[0]  = (vsint16_t) spu_sel(q0, rq0, mask_B0);
}
/* Per-slice deblocking offsets. calculate_bS_qp() copies them from the slice
 * data; filter_mb_edgecv() and filter_mb_edgeh() add them to qp to form the
 * alpha/tc0 and beta table indices. */
int slice_alpha_c0_offset;
int slice_beta_offset;
  63.219 +static void filter_mb_edgecv(vsint16_t *pix, int bS[4], int qp ) {
  63.220 +    int i;	
  63.221 +    const int index_a = qp + slice_alpha_c0_offset;
  63.222 +    const int alpha = (alpha_table+52)[index_a];
  63.223 +    const int beta  = (beta_table+52)[qp + slice_beta_offset];
  63.224 +
  63.225 +    if( bS[0] < 4 ) {
  63.226 +        int8_t tc[4];
  63.227 +        for(i=0; i<4; i++)
  63.228 +            tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
  63.229 +        h264_loop_filter_chroma(pix, alpha, beta, tc);
  63.230 +    } else {
  63.231 +        h264_loop_filter_chroma_intra(pix, alpha, beta);
  63.232 +    }
  63.233 +}
  63.234 +
/**
 * Filter 8 lanes of one luma edge.
 *
 * Derives alpha and beta from qp plus the slice offsets. With bS[0] < 4 the
 * normal filter h264_v_loop_filter_luma_c() is run with tc0 values taken from
 * tc0_table (-1 where bS is 0, which disables those lanes). Otherwise the
 * strong filter below is applied in place to pix[-3..2].
 *
 * @param pix           vector at the q0 position; pix[-4..3] are read in the
 *                      strong path
 * @param bS            four boundary strengths along the edge
 * @param qp            quantiser used for the table look-ups
 * @param inc_low2high  forwarded to the normal filter as the index of the
 *                      first tc value to use; unused by the strong path
 */
static void filter_mb_edgeh(vsint16_t *pix, int bS[4], int qp, int inc_low2high ) {
    int i;
    const int index_a = qp + slice_alpha_c0_offset;
    const int alpha = (alpha_table+52)[index_a];
    const int beta  = (beta_table+52)[qp + slice_beta_offset];

    if( bS[0] < 4 ) {
        int8_t tc[4];
        for(i=0; i<4; i++)
            tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
        h264_v_loop_filter_luma_c(pix, alpha, beta, tc, inc_low2high);
    } else {

        const vuint16_t p0 = (vuint16_t) pix[-1];
        const vuint16_t p1 = (vuint16_t) pix[-2];
        const vuint16_t p2 = (vuint16_t) pix[-3];
        const vuint16_t p3 = (vuint16_t) pix[-4];
        const vuint16_t q0 = (vuint16_t) pix[0];
        const vuint16_t q1 = (vuint16_t) pix[1];
        const vuint16_t q2 = (vuint16_t) pix[2];
        const vuint16_t q3 = (vuint16_t) pix[3];

    	const vuint16_t v_alpha = {(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha,(unsigned short) alpha};
    	const vuint16_t v_beta = {(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta,(unsigned short) beta};
    	const vuint16_t v_2 = {2,2,2,2,2,2,2,2};
    	const vuint16_t v_3 = {3,3,3,3,3,3,3,3};
    	const vsint16_t v_4 = {4,4,4,4,4,4,4,4};

        vuint16_t rp0_B1f, rp0_B2t, rp0_B2f, rp1_B2t, rp2_B2t;
        vuint16_t rq0_B1f, rq0_B2t, rq0_B2f, rq1_B2t, rq2_B2t;
        vuint16_t abs_p0mq0, abs_p1mp0, abs_q1mq0, abs_p2mp0, abs_q2mq0;
        /* v_alpha_2 = (alpha >> 2) + 2, threshold of the stronger-filter test */
        vuint16_t v_alpha_2 = spu_rlmaska(v_alpha, (vsint16_t)-v_2);
        vector unsigned short mask_B0, mask_B1, mask_B2P, mask_B2Q, mask_tmp;

        v_alpha_2 = spu_add(v_alpha_2, v_2);

	/* Byte-wise absolute differences on the reinterpreted vectors.
	 * NOTE(review): equals the 16-bit |a-b| only while every lane holds a
	 * value in 0..255 - presumably guaranteed by the caller; confirm. */
	abs_p0mq0 = (vector unsigned short) spu_absd((vector unsigned char) p0,(vector unsigned char) q0);
    	abs_p1mp0 = (vector unsigned short) spu_absd((vector unsigned char) p1,(vector unsigned char) p0);
    	abs_q1mq0 = (vector unsigned short) spu_absd((vector unsigned char) q1,(vector unsigned char) q0);
        abs_p2mp0 = (vector unsigned short) spu_absd((vector unsigned char) p2,(vector unsigned char) p0);
        abs_q2mq0 = (vector unsigned short) spu_absd((vector unsigned char) q2,(vector unsigned char) q0);

	/* mask_B0: |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta (filter enabled) */
	mask_B0  = spu_cmpgt(v_alpha, abs_p0mq0);
	mask_tmp = spu_cmpgt(v_beta, abs_p1mp0);
	mask_B0  = spu_and(mask_B0, mask_tmp);
	mask_tmp = spu_cmpgt( v_beta, abs_q1mq0);
	mask_B0  = spu_and(mask_B0, mask_tmp);

        /* mask_B1: |p0-q0| < (alpha>>2)+2; mask_B2P/Q: |p2-p0| / |q2-q0| < beta */
        mask_B1  = spu_cmpgt(v_alpha_2, abs_p0mq0);
        mask_B2P = spu_cmpgt(v_beta,abs_p2mp0);
        mask_B2Q = spu_cmpgt(v_beta ,abs_q2mq0);

        /* *_B2t: results when the B2 test holds; *_B2f / *_B1f: weaker
         * 3-tap results used when the B2 or the B1 test fails. */
        rp0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p2,p1),spu_add(p1,p0)),spu_add(spu_add(p0,q0),spu_add(q0,q1))),(vuint16_t)v_4),(vsint16_t) -v_3);
        		//( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
        rp1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p2,p1),spu_add(q0,p0)),v_2),(vsint16_t)-v_2);//( p2 + p1 + p0 + q0 + 2 ) >> 2;
        rp2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p3,p3),spu_add(p2,p2)),spu_add(spu_add(p2,p1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3);
        		//( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
        rq0_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(p1,p0),spu_add(p0,q0)),spu_add(spu_add(q0,q1),spu_add(q1,q2))),(vuint16_t)v_4),(vsint16_t)-v_3);

        		//( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
        rq1_B2t = spu_rlmaska(spu_add(spu_add(spu_add(p0,q0),spu_add(q1,q2)),v_2),(vsint16_t)-v_2);//( p0 + q0 + q1 + q2 + 2 ) >> 2;
        rq2_B2t = spu_rlmaska(spu_add(spu_add(spu_add(spu_add(q3,q3),spu_add(q2,q2)),spu_add(spu_add(q2,q1),spu_add(q0,p0))),(vuint16_t)v_4),(vsint16_t)-v_3);
        		//( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
        rp0_B1f =
        rp0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(p1,p0),spu_add(p1,q1)),v_2),(vsint16_t)-v_2);//( 2*p1 + p0 + q1 + 2 ) >> 2;
        rq0_B1f =
        rq0_B2f = spu_rlmaska(spu_add(spu_add(spu_add(q1,q0),spu_add(q1,p1)),v_2),(vsint16_t)-v_2);//( 2*q1 + q0 + p1 + 2 ) >> 2;

        /* Selection nests as B0 ? (B1 ? (B2 ? strong : weak) : weak-or-input) : input */
        pix[-1] = (vsint16_t) spu_sel(p0, spu_sel(rp0_B1f, spu_sel(rp0_B2f, rp0_B2t, mask_B2P), mask_B1), mask_B0);
        pix[-2] = (vsint16_t) spu_sel(p1, spu_sel(p1, spu_sel(p1, rp1_B2t, mask_B2P), mask_B1), mask_B0);
        pix[-3] = (vsint16_t) spu_sel(p2, spu_sel(p2, spu_sel(p2, rp2_B2t, mask_B2P), mask_B1), mask_B0);
        pix[0] = (vsint16_t) spu_sel(q0, spu_sel(rq0_B1f, spu_sel(rq0_B2f, rq0_B2t, mask_B2Q), mask_B1), mask_B0);
        pix[1] = (vsint16_t) spu_sel(q1, spu_sel(q1, spu_sel(q1, rq1_B2t,mask_B2Q), mask_B1), mask_B0);
        pix[2] = (vsint16_t) spu_sel(q2, spu_sel(q2, spu_sel(q2, rq2_B2t,mask_B2Q), mask_B1), mask_B0);
    }
}
  63.311 +
  63.312 +// This function gets bS and qp for luma and chroma before the filter
  63.313 +void calculate_bS_qp(H264Context_spu *h){
  63.314 +	H264mb* mb = &h->mb;
  63.315 +	H264slice* slice = h->slice;
  63.316 +    int dir;
  63.317 +    const int mvy_limit = 4;
  63.318 +    /* FIXME: A given frame may occupy more than one position in
  63.319 +     * the reference list. So ref2frm should be populated with
  63.320 +     * frame numbers, not indices. */
  63.321 +
  63.322 +	int (*ref2frm)[64] = slice->ref2frm;
  63.323 +	int mb_x = mb->mb_x;
  63.324 +	int mb_y = mb->mb_y;
  63.325 +	int mb_type =mb->mb_type;
  63.326 +    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
  63.327 +    for( dir = 0; dir < 2; dir++ ){
  63.328 +        int edge;
  63.329 +		const int mbm_type = dir == 0 ? mb->mb_type_xy_n1 : mb->mb_type_top;
  63.330 +        const int8_t qscale_mbm = dir == 0 ? mb->qscale_mbxy_n1 : mb->qscale_mbxy_top;
  63.331 +
  63.332 +        // how often to recheck mv-based bS when iterating between edges
  63.333 +        const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :(mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
  63.334 +        // how often to recheck mv-based bS when iterating along each edge
  63.335 +        const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
  63.336 +
  63.337 +		h->edges[dir] = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP)) == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
  63.338 +
  63.339 +		if ((dir==0 && mb_x==0) || (dir==1 && mb_y==0))
  63.340 +			h->start[dir] =1;
  63.341 +		else
  63.342 +			h->start[dir] =0;
  63.343 +
  63.344 +        /* Calculate bS */
  63.345 +        for( edge = h->start[dir]; edge < h->edges[dir]; edge++ ) {
  63.346 +            /* mbn_xy: neighbor macroblock */
  63.347 +            const int mbn_type = edge > 0 ? mb_type : mbm_type;
  63.348 +            const int8_t qscale_mbn_xy = edge > 0 ? mb->qscale_mbxy : qscale_mbm;
  63.349 +			int* bS = h->bS[dir][edge];
  63.350 +
  63.351 +            if( (edge&1) && IS_8x8DCT(mb_type) ){
  63.352 +                bS[0] = bS[1] = bS[2] = bS[3] = 0; //extra code due to decoupling
  63.353 +                continue;
  63.354 +            }
  63.355 +            if( IS_INTRA(mb_type) ||
  63.356 +                IS_INTRA(mbn_type) ) {
  63.357 +                int value;
  63.358 +                if (edge == 0) {
  63.359 +					value = 4;
  63.360 +				} else {
  63.361 +					value = 3;
  63.362 +				}
  63.363 +                bS[0] = bS[1] = bS[2] = bS[3] = value;
  63.364 +            } else {
  63.365 +                int i, l;
  63.366 +                int mv_done;
  63.367 +
  63.368 +                if( edge & mask_edge ) {
  63.369 +					bS[0] = bS[1] = bS[2] = bS[3] = 0;
  63.370 +                    mv_done = 1;
  63.371 +                }
  63.372 +                else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
  63.373 +                    int b_idx= 8 + 4 + edge * (dir ? 8:1);
  63.374 +                    int bn_idx= b_idx - (dir ? 8:1);
  63.375 +                    int v = 0;
  63.376 +
  63.377 +                    for( l = 0; !v && l < 1 + (slice->slice_type_nos == FF_B_TYPE); l++ ) {
  63.378 +                        v |= ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] ||
  63.379 +                             FFABS(mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
  63.380 +                             FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit;
  63.381 +                    }
  63.382 +                    bS[0] = bS[1] = bS[2] = bS[3] = v;
  63.383 +
  63.384 +					mv_done = 1;
  63.385 +                }
  63.386 +                else
  63.387 +                    mv_done = 0;
  63.388 +
  63.389 +                for( i = 0; i < 4; i++ ) {
  63.390 +                    int x = dir == 0 ? edge : i;
  63.391 +                    int y = dir == 0 ? i    : edge;
  63.392 +                    int b_idx= 8 + 4 + x + 8*y;
  63.393 +                    int bn_idx= b_idx - (dir ? 8:1);
  63.394 +
  63.395 +                    if( mb->non_zero_count_cache[b_idx] != 0 ||
  63.396 +                        mb->non_zero_count_cache[bn_idx] != 0 ) {
  63.397 +                        bS[i] = 2;
  63.398 +                    }
  63.399 +                    else if(!mv_done)
  63.400 +                    {
  63.401 +                        bS[i] = 0;
  63.402 +                        for( l = 0; l < 1 + (slice->slice_type == B_TYPE); l++ ) {
  63.403 +                            if( ref2frm[mb->ref_cache[l][b_idx]+2] != ref2frm[mb->ref_cache[l][bn_idx]+2] ||
  63.404 +                                FFABS( mb->mv_cache[l][b_idx][0] - mb->mv_cache[l][bn_idx][0] ) >= 4 ||
  63.405 +                                FFABS( mb->mv_cache[l][b_idx][1] - mb->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
  63.406 +                                bS[i] = 1;
  63.407 +                                break;
  63.408 +                            }
  63.409 +                        }
  63.410 +                    }
  63.411 +                }
  63.412 +
  63.413 +                if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
  63.414 +                    continue;
  63.415 +            }
  63.416 +
  63.417 +            /* Filter edge */
  63.418 +            // Do not use s->qscale as luma quantizer because it has not the same
  63.419 +            // value in IPCM macroblocks.
  63.420 +            h->qp[dir][edge] = ( mb->qscale_mbxy + qscale_mbn_xy + 1 ) >> 1;
  63.421 +            h->chroma_qp[0][dir][edge] = ( mb->chroma_qp[0] + get_chroma_qp(h, 0, qscale_mbn_xy ) + 1 ) >> 1;
  63.422 +
  63.423 +			h->chroma_qp[1][dir][edge] = ( mb->chroma_qp[1] + get_chroma_qp(h, 1, qscale_mbn_xy ) + 1 ) >> 1;
  63.424 +        }
  63.425 +		slice_alpha_c0_offset=slice->slice_alpha_c0_offset;
  63.426 +		slice_beta_offset= slice->slice_beta_offset;
  63.427 +    }
  63.428 +}
  63.429 +
  63.430 +
/*
 * Three rounds of pairwise spu_shuffle() over eight vectors, using the two
 * shuffle patterns merge_h and merge_l. The final result is left in b0..b7;
 * a0..a7 are overwritten in the second round, so they are scratch after use.
 * NOTE(review): with merge_h/merge_l being the high/low halfword interleave
 * patterns this is presumably an 8x8 transpose of 16-bit elements - confirm
 * against the patterns passed by the caller.
 *
 * The expansion is a bare statement sequence (no do { } while (0) wrapper),
 * so do not use it as the body of an unbraced if/else/loop.
 */
#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7,merge_h,merge_l) \
    b0 = spu_shuffle( a0, a4, merge_h); \
    b1 = spu_shuffle( a0, a4, merge_l ); \
    b2 = spu_shuffle( a1, a5, merge_h ); \
    b3 = spu_shuffle( a1, a5, merge_l ); \
    b4 = spu_shuffle( a2, a6, merge_h ); \
    b5 = spu_shuffle( a2, a6, merge_l ); \
    b6 = spu_shuffle( a3, a7, merge_h ); \
    b7 = spu_shuffle( a3, a7, merge_l ); \
    a0 = spu_shuffle( b0, b4, merge_h ); \
    a1 = spu_shuffle( b0, b4, merge_l ); \
    a2 = spu_shuffle( b1, b5, merge_h ); \
    a3 = spu_shuffle( b1, b5, merge_l ); \
    a4 = spu_shuffle( b2, b6, merge_h ); \
    a5 = spu_shuffle( b2, b6, merge_l); \
    a6 = spu_shuffle( b3, b7, merge_h ); \
    a7 = spu_shuffle( b3, b7, merge_l ); \
    b0 = spu_shuffle( a0, a4, merge_h ); \
    b1 = spu_shuffle( a0, a4, merge_l ); \
    b2 = spu_shuffle( a1, a5, merge_h ); \
    b3 = spu_shuffle( a1, a5, merge_l); \
    b4 = spu_shuffle( a2, a6, merge_h ); \
    b5 = spu_shuffle( a2, a6, merge_l ); \
    b6 = spu_shuffle( a3, a7, merge_h ); \
    b7 = spu_shuffle( a3, a7, merge_l )
  63.456 +
  63.457 +void filter_mb_spu(vsint16_t *img_y, vsint16_t *img_cb, vsint16_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int edges[2], int bS[2][4][4], int qp[2][4], int chroma_qp[2][2][4], int start[2]){
  63.458 +    // Filters the edges of one macroblock (luma, Cb, Cr): dir 0 edges are filtered on transposed data, dir 1 edges on row data, and results are written back through img_y/img_cb/img_cr.
  63.459 +    int dir,x;
  63.460 +    vsint16_t o_vec_img_y[(16+8)*2];
  63.461 +    vsint16_t t_vec_img_y[(16+8)*2];
  63.462 +    vsint16_t *vec_img_y_o = o_vec_img_y;
  63.463 +    vsint16_t *vec_img_y_t = t_vec_img_y;
  63.464 +
  63.465 +    vsint16_t o_vec_img_cb[8+8+4];
  63.466 +    vsint16_t t_vec_img_cb[8+8];
  63.467 +    vsint16_t *vec_img_cb_o = &o_vec_img_cb[2];
  63.468 +    vsint16_t *vec_img_cb_t = t_vec_img_cb;
  63.469 +
  63.470 +    vsint16_t o_vec_img_cr[8+8+4];
  63.471 +    vsint16_t t_vec_img_cr[8+8];
  63.472 +    vsint16_t *vec_img_cr_o = &o_vec_img_cr[2];
  63.473 +    vsint16_t *vec_img_cr_t = t_vec_img_cr;
  63.474 +
  63.475 +    vuint8_t *pvec_tmp;
  63.476 +    // Shuffle patterns: with v_0 as second operand, patt_high / patt_low pair a zero byte with bytes 0-7 / 8-15 of the first operand, widening them to 16 bit.
  63.477 +    const vuint8_t patt_high = {16,  0, 17,  1, 18,  2, 19,  3, 20,  4, 21,  5, 22,  6, 23,  7};
  63.478 +    const vuint8_t patt_low  = {16,  8, 17,  9, 18, 10, 19, 11, 20, 12, 21, 13, 22, 14, 23, 15};
  63.479 +    const vuint8_t patt_unpack={ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
  63.480 +    const vuint8_t patt_pack_hw={0,  1,  2,  3,  4,  5,  6,  7, 17, 19, 21, 23, 25, 27, 29, 31};
  63.481 +    const vuint8_t patt_pack_chroma_aligned={0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F,
  63.482 +                                             0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F};
  63.483 +    const vuint8_t patt_pack_chroma_unaligned={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  63.484 +                                               0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F};
  63.485 +    const vuint8_t v_0  	   = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  63.486 +    const vuint8_t mergehu16 = {0x00,0x01,0x10,0x11,0x02,0x03,0x12,0x13,0x04,0x05,0x14,0x15,0x06,0x07,0x16,0x17};
  63.487 +    const vuint8_t mergelu16 = {0x08,0x09,0x18,0x19,0x0A,0x0B,0x1A,0x1B,0x0C,0x0D,0x1C,0x1D,0x0E,0x0F,0x1E,0x1F};
  63.488 +    vuint8_t store_chroma, store_chroma_n1, load_chroma, load_chroma_n1;
  63.489 +    int mb_xy_n1;
  63.490 +    const int unalign_chroma = (unsigned int) img_cb & 15;
  63.491 +    // Choose the chroma load/store patterns depending on whether img_cb is 16-byte aligned.
  63.492 +    if(unalign_chroma==0){
  63.493 +        load_chroma = patt_high;
  63.494 +        load_chroma_n1 = patt_low;  // for load chroma mb_x-1
  63.495 +        store_chroma = patt_pack_chroma_aligned;
  63.496 +        store_chroma_n1 = patt_pack_chroma_unaligned;  // for store chroma mb_x-1
  63.497 +        mb_xy_n1 = 1;   //  without misalignment the previous block is needed for the horizontal filtering
  63.498 +    }
  63.499 +    else{
  63.500 +        load_chroma = patt_low;
  63.501 +        load_chroma_n1 = patt_high; // for load mb_x-1
  63.502 +        store_chroma = patt_pack_chroma_unaligned;
  63.503 +        store_chroma_n1 = patt_pack_chroma_aligned;    // for store chroma mb_x-1
  63.504 +        mb_xy_n1 = 0;   //  with a misalignment of 8 the previous block is not needed
  63.505 +    }
  63.506 +
  63.507 +    /* dir : 0 -> vertical edge, 1 -> horizontal edge */
  63.508 +
  63.509 +    // LOAD MB_X -1
  63.510 +    // Left neighbour: luma is read from the vector just before each row (index -1), chroma from mb_xy_n1 vectors before.
  63.511 +    for (x = 0; x < 16; x++){  //Unpack Memory to 8 positions vector
  63.512 +        vec_img_y_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize - 1], v_0 , patt_low);
  63.513 +    }
  63.514 +
  63.515 +    for (x = 0; x < 8; x++){  //Unpack Memory to 8 positions vector
  63.516 +	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cb[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1);
  63.517 +	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t)img_cr[x*uvlinesize - mb_xy_n1], v_0 , load_chroma_n1);
  63.518 +    }
  63.519 +
  63.520 +    VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16);
  63.521 +
  63.522 +    VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16);
  63.523 +
  63.524 +    VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16);
  63.525 +
  63.526 +    VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16);
  63.527 +    // Advance the working pointers past the left-neighbour vectors: index 0 now addresses the current MB, negative indices the neighbour data.
  63.528 +    vec_img_y_t  = &vec_img_y_t[8];
  63.529 +    vec_img_y_o  = &vec_img_y_o[8];
  63.530 +    vec_img_cb_t = &vec_img_cb_t[8];
  63.531 +    vec_img_cb_o = &vec_img_cb_o[10];
  63.532 +    vec_img_cr_t = &vec_img_cr_t[8];
  63.533 +    vec_img_cr_o = &vec_img_cr_o[10];
  63.534 +
  63.535 +    //LOAD CURRENT MB
  63.536 +    for (x = 0; x < 16; x++){  //Unpack Memory to 8 positions vector
  63.537 +        pvec_tmp  	  = (vuint8_t *) &img_y[x*linesize];
  63.538 +	vec_img_y_o[x]    = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_high);
  63.539 +	vec_img_y_o[x+24] = (vsint16_t) spu_shuffle(*pvec_tmp, v_0 , patt_low);
  63.540 +    }
  63.541 +
  63.542 +    for (x = 0; x < 8; x++){  //Unpack Memory to 8 positions vector
  63.543 +	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma);
  63.544 +	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma);
  63.545 +    }
  63.546 +
  63.547 +    //TRANSPOSE MATRIX
  63.548 +
  63.549 +    VEC_TRANSPOSE_8(vec_img_y_o[0], vec_img_y_o[1], vec_img_y_o[2], vec_img_y_o[3], vec_img_y_o[4], vec_img_y_o[5], vec_img_y_o[6], vec_img_y_o[7], vec_img_y_t[0], vec_img_y_t[1], vec_img_y_t[2], vec_img_y_t[3], vec_img_y_t[4], vec_img_y_t[5], vec_img_y_t[6], vec_img_y_t[7],mergehu16, mergelu16);
  63.550 +
  63.551 +    VEC_TRANSPOSE_8(vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15], vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31],mergehu16, mergelu16);
  63.552 +
  63.553 +    VEC_TRANSPOSE_8(vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31], vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15],mergehu16, mergelu16);
  63.554 +
  63.555 +    VEC_TRANSPOSE_8(vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39], vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39],mergehu16, mergelu16);
  63.556 +
  63.557 +    VEC_TRANSPOSE_8(vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7], vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7],mergehu16, mergelu16);
  63.558 +
  63.559 +    VEC_TRANSPOSE_8(vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7], vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7],mergehu16, mergelu16);
  63.560 +
  63.561 +    //PROCESS
  63.562 +    dir = 0;
  63.563 +    {
  63.564 +        int edge;
  63.565 +        for( edge = start[dir]; edge < edges[dir]; edge++ ) {
  63.566 +            if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0)
  63.567 +            {
  63.568 +            	filter_mb_edgeh( &vec_img_y_t[4*edge   ], bS[dir][edge], qp[dir][edge],0);//low
  63.569 +            	filter_mb_edgeh( &vec_img_y_t[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high
  63.570 +
  63.571 +                if( (edge&1) == 0 ) {
  63.572 +                    filter_mb_edgecv( &vec_img_cb_t[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] );
  63.573 +                    filter_mb_edgecv( &vec_img_cr_t[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] );
  63.574 +                }
  63.575 +            }
  63.576 +        }
  63.577 +    }
  63.578 +
  63.579 +    //SAVE MB_X -1 RESULTS
  63.580 +    // The left-neighbour vectors are transposed back and then merged into the memory they were loaded from.
  63.581 +    VEC_TRANSPOSE_8(vec_img_y_t[-8], vec_img_y_t[-7], vec_img_y_t[-6], vec_img_y_t[-5], vec_img_y_t[-4], vec_img_y_t[-3], vec_img_y_t[-2], vec_img_y_t[-1], vec_img_y_o[-8], vec_img_y_o[-7], vec_img_y_o[-6], vec_img_y_o[-5], vec_img_y_o[-4], vec_img_y_o[-3], vec_img_y_o[-2], vec_img_y_o[-1],mergehu16, mergelu16);
  63.582 +
  63.583 +    VEC_TRANSPOSE_8(vec_img_y_t[16], vec_img_y_t[17], vec_img_y_t[18], vec_img_y_t[19], vec_img_y_t[20], vec_img_y_t[21], vec_img_y_t[22], vec_img_y_t[23], vec_img_y_o[16], vec_img_y_o[17], vec_img_y_o[18], vec_img_y_o[19], vec_img_y_o[20], vec_img_y_o[21], vec_img_y_o[22], vec_img_y_o[23],mergehu16, mergelu16);
  63.584 +
  63.585 +    VEC_TRANSPOSE_8(vec_img_cb_t[ -8], vec_img_cb_t[-7], vec_img_cb_t[-6], vec_img_cb_t[-5], vec_img_cb_t[-4], vec_img_cb_t[-3], vec_img_cb_t[-2], vec_img_cb_t[-1], vec_img_cb_o[-10], vec_img_cb_o[-9], vec_img_cb_o[-8], vec_img_cb_o[-7], vec_img_cb_o[-6], vec_img_cb_o[-5], vec_img_cb_o[-4], vec_img_cb_o[-3],mergehu16, mergelu16);
  63.586 +
  63.587 +    VEC_TRANSPOSE_8(vec_img_cr_t[ -8], vec_img_cr_t[-7], vec_img_cr_t[-6], vec_img_cr_t[-5], vec_img_cr_t[-4], vec_img_cr_t[-3], vec_img_cr_t[-2], vec_img_cr_t[-1], vec_img_cr_o[-10], vec_img_cr_o[-9], vec_img_cr_o[-8], vec_img_cr_o[-7], vec_img_cr_o[-6], vec_img_cr_o[-5], vec_img_cr_o[-4], vec_img_cr_o[-3],mergehu16, mergelu16);
  63.588 +
  63.589 +    for (x = 0; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
  63.590 +    	img_y[x*linesize - 1] = spu_shuffle(img_y[x*linesize - 1], vec_img_y_o[-8+x], patt_pack_hw);
  63.591 +    }
  63.592 +
  63.593 +    for (x = 0; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
  63.594 +    	img_y[(x+8)*linesize - 1] = spu_shuffle(img_y[(x+8)*linesize - 1], vec_img_y_o[16+x], patt_pack_hw);
  63.595 +    }
  63.596 +
  63.597 +    for (x = 0; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
  63.598 +    	img_cb[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cb[x*uvlinesize - mb_xy_n1], vec_img_cb_o[-10+x], store_chroma_n1);
  63.599 +    	img_cr[x*uvlinesize - mb_xy_n1] = spu_shuffle(img_cr[x*uvlinesize - mb_xy_n1], vec_img_cr_o[-10+x], store_chroma_n1);
  63.600 +    }
  63.601 +
  63.602 +    //TRANSPOSE MATRIX
  63.603 +
  63.604 +    VEC_TRANSPOSE_8(vec_img_y_t[ 0], vec_img_y_t[ 1], vec_img_y_t[ 2], vec_img_y_t[ 3], vec_img_y_t[ 4], vec_img_y_t[ 5], vec_img_y_t[ 6], vec_img_y_t[ 7], vec_img_y_o[ 0], vec_img_y_o[ 1], vec_img_y_o[ 2], vec_img_y_o[ 3], vec_img_y_o[ 4], vec_img_y_o[ 5], vec_img_y_o[ 6], vec_img_y_o[ 7],mergehu16, mergelu16);
  63.605 +
  63.606 +    VEC_TRANSPOSE_8(vec_img_y_t[ 8], vec_img_y_t[ 9], vec_img_y_t[10], vec_img_y_t[11], vec_img_y_t[12], vec_img_y_t[13], vec_img_y_t[14], vec_img_y_t[15], vec_img_y_o[24], vec_img_y_o[25], vec_img_y_o[26], vec_img_y_o[27], vec_img_y_o[28], vec_img_y_o[29], vec_img_y_o[30], vec_img_y_o[31],mergehu16, mergelu16);
  63.607 +
  63.608 +    VEC_TRANSPOSE_8(vec_img_y_t[24], vec_img_y_t[25], vec_img_y_t[26], vec_img_y_t[27], vec_img_y_t[28], vec_img_y_t[29], vec_img_y_t[30], vec_img_y_t[31], vec_img_y_o[ 8], vec_img_y_o[ 9], vec_img_y_o[10], vec_img_y_o[11], vec_img_y_o[12], vec_img_y_o[13], vec_img_y_o[14], vec_img_y_o[15],mergehu16, mergelu16);
  63.609 +
  63.610 +    VEC_TRANSPOSE_8(vec_img_y_t[32], vec_img_y_t[33], vec_img_y_t[34], vec_img_y_t[35], vec_img_y_t[36], vec_img_y_t[37], vec_img_y_t[38], vec_img_y_t[39], vec_img_y_o[32], vec_img_y_o[33], vec_img_y_o[34], vec_img_y_o[35], vec_img_y_o[36], vec_img_y_o[37], vec_img_y_o[38], vec_img_y_o[39],mergehu16, mergelu16);
  63.611 +
  63.612 +    VEC_TRANSPOSE_8(vec_img_cb_t[0], vec_img_cb_t[1], vec_img_cb_t[2], vec_img_cb_t[3], vec_img_cb_t[4], vec_img_cb_t[5], vec_img_cb_t[6], vec_img_cb_t[7], vec_img_cb_o[0], vec_img_cb_o[1], vec_img_cb_o[2], vec_img_cb_o[3], vec_img_cb_o[4], vec_img_cb_o[5], vec_img_cb_o[6], vec_img_cb_o[7],mergehu16, mergelu16);
  63.613 +
  63.614 +    VEC_TRANSPOSE_8(vec_img_cr_t[0], vec_img_cr_t[1], vec_img_cr_t[2], vec_img_cr_t[3], vec_img_cr_t[4], vec_img_cr_t[5], vec_img_cr_t[6], vec_img_cr_t[7], vec_img_cr_o[0], vec_img_cr_o[1], vec_img_cr_o[2], vec_img_cr_o[3], vec_img_cr_o[4], vec_img_cr_o[5], vec_img_cr_o[6], vec_img_cr_o[7],mergehu16, mergelu16);
  63.615 +
  63.616 +    // Rows above the current MB: four luma rows and two chroma rows are loaded at negative indices.
  63.617 +    //LOAD MB_Y - 1
  63.618 +    for (x = -4; x < 0; x++){  //Unpack Memory to 8 positions vector
  63.619 +	vec_img_y_o[x]    = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_high);
  63.620 +	vec_img_y_o[x+24] = (vsint16_t) spu_shuffle((vuint8_t) img_y[x*linesize], v_0 , patt_low);
  63.621 +    }
  63.622 +
  63.623 +    for (x = -2; x < 0; x++){  //Unpack Memory to 8 positions vector
  63.624 +	vec_img_cb_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cb[x*uvlinesize], v_0 , load_chroma);
  63.625 +	vec_img_cr_o[x] = (vsint16_t) spu_shuffle((vuint8_t) img_cr[x*uvlinesize], v_0 , load_chroma);
  63.626 +    }
  63.627 +
  63.628 +    //PROCESS
  63.629 +    dir = 1;
  63.630 +    {
  63.631 +        int edge;
  63.632 +        for( edge = start[dir]; edge < edges[dir]; edge++ ) {
  63.633 +            if(bS[dir][edge][0]+bS[dir][edge][1]+bS[dir][edge][2]+bS[dir][edge][3] != 0)
  63.634 +            {
  63.635 +            	filter_mb_edgeh( &vec_img_y_o[4*edge   ], bS[dir][edge], qp[dir][edge],0);//low
  63.636 +            	filter_mb_edgeh( &vec_img_y_o[4*edge+24], bS[dir][edge], qp[dir][edge],2);//high
  63.637 +            	if( (edge&1) == 0 ) {
  63.638 +            	    filter_mb_edgecv( &vec_img_cb_o[2*edge], bS[dir][edge], chroma_qp[0][dir][edge] );
  63.639 +                    filter_mb_edgecv( &vec_img_cr_o[2*edge], bS[dir][edge], chroma_qp[1][dir][edge] );
  63.640 +            	}
  63.641 +            }
  63.642 +        }
  63.643 +        // Write back luma rows -3..15 and chroma rows -1..7 (rows -4 and -2 were loaded above but are not stored).
  63.644 +        for (x = -3; x < 16; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
  63.645 +    	    img_y[x*linesize] = spu_shuffle(vec_img_y_o[x], vec_img_y_o[x+24], patt_unpack);
  63.646 +        }
  63.647 +
  63.648 +        for (x = -1; x < 8; x++){  //pack Memory to 8 positions vector ERROR - No check for writing out of the memory
  63.649 +            img_cb[x*uvlinesize] = spu_shuffle(img_cb[x*uvlinesize], vec_img_cb_o[x], store_chroma);
  63.650 +            img_cr[x*uvlinesize] = spu_shuffle(img_cr[x*uvlinesize], vec_img_cr_o[x], store_chroma);
  63.651 +        }
  63.652 +    }
  63.653 +}

    64.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    64.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.c	Mon Aug 27 12:09:56 2012 +0200
    64.3 @@ -0,0 +1,408 @@
    64.4 +/*
    64.5 + * Copyright (c) 2009 TUDelft 
    64.6 + * 
    64.7 + * Cell Parallel SPU - Macroblock Decoding.
    64.8 + */
    64.9 +
   64.10 +/**
   64.11 + * @file libavcodec/cell/h264_idct_spu.c
   64.12 + * Cell Parallel SPU - Macroblock Decoding
   64.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   64.14 + * 
   64.15 + * SIMD kernels 
   64.16 + * H.264/AVC motion compensation
   64.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   64.18 + * @author Albert Paradis <apar7632@hotmail.com>
   64.19 + */ 
   64.20 +
   64.21 +#include <spu_intrinsics.h>
   64.22 +#include "types_spu.h"
   64.23 +#include "h264_tables.h"
   64.24 +#include "h264_idct_spu.h"
   64.25 +#include "h264_intra_spu.h"
   64.26 +
   64.27 +/***********************************************************************
   64.28 + * ff_h264_idct_add_spu
   64.29 + ***********************************************************************
   64.30 + *  h264 idct 4x4 transform with SPU SIMD intrinsics
   64.31 + *  using the factorized algorithm 
   64.32 + *  Mauricio Alvarez: alvarez@ac.upc.edu
   64.33 + *  - DCTELEM* block: transformed coefficients are stored consecutively in memory,
   64.34 + *  - for the 4x4 transform the layout is:
   64.35 + *       || coef_00 | coef_01 || coef_02 | coef_03 ||..||coef_0F||
   64.36 + *  - Usually the DCTELEM block is declared with an alignment specifier so
   64.37 + *    that the array is 128-bit (16 bytes, 8 shorts) aligned.
   64.38 + *  - The dst pointer may be unaligned, with a misalignment that is a multiple of 4.
   64.39 + ***********************************************************************/
   64.40 +
   64.41 +// idct_dc: add the rounded DC coefficient of a 4x4 block to dst through the ff_cropTbl lookup (presumably a clamp to 0..255)
   64.42 +void ff_idct_dc_add(uint8_t *dst, short *block, int stride){
   64.43 +    const int dc_term = (block[0] + 32) >> 6;  // +32 rounds the shift by 6
   64.44 +    uint8_t *const clamp = ff_cropTbl + MAX_NEG_CROP;  // lookups are relative to this offset
   64.45 +    int rows_left, col;
   64.46 +    for( rows_left = 4; rows_left > 0; rows_left-- ){
   64.47 +        for( col = 0; col < 4; col++ )
   64.48 +            dst[col] = clamp[ dst[col] + dc_term ];
   64.49 +        dst += stride;
   64.50 +    }
   64.51 +}
   64.52 +
   64.53 +// 8x8 counterpart of ff_idct_dc_add: one rounded DC value is added to all 64 pixels through the ff_cropTbl lookup
   64.54 +void ff_idct8_dc_add(uint8_t *dst, short *block, int stride){
   64.55 +    const int dc_term = (block[0] + 32) >> 6;  // +32 rounds the shift by 6
   64.56 +    uint8_t *const clamp = ff_cropTbl + MAX_NEG_CROP;
   64.57 +    int rows_left, col;
   64.58 +    for( rows_left = 8; rows_left > 0; rows_left--, dst += stride ){
   64.59 +        for( col = 0; col < 8; col++ )
   64.60 +            dst[col] = clamp[ dst[col] + dc_term ];
   64.61 +    }
   64.62 +}
   64.63 +
   64.64 +// add without idct
   64.65 +
   64.66 +// add_pixels8_c: add an 8x8 block of residuals onto an 8x8 pixel area.
   64.67 +//   pixels    - top-left destination pixel
   64.68 +//   block     - 64 residuals stored row by row (8 per row)
   64.69 +//   line_size - distance in bytes between two destination rows
   64.70 +// Each sum is stored straight back into a uint8_t; no clamping is applied.
   64.71 +void add_pixels8_c(uint8_t *pixels, short *block, int line_size)
   64.72 +{
   64.73 +    int row, col;
   64.74 +    for (row = 0; row < 8; row++) {
   64.75 +        uint8_t *dst_row = pixels + row * line_size;
   64.76 +        const short *src_row = block + row * 8;
   64.77 +        for (col = 0; col < 8; col++) {
   64.78 +            dst_row[col] += src_row[col];
   64.79 +        }
   64.80 +    }
   64.81 +}
   64.82 +
   64.83 +// add_pixels4_c: 4x4 counterpart of add_pixels8_c (16 residuals, 4 per row);
   64.84 +// sums are stored straight back into uint8_t without clamping.
   64.85 +void add_pixels4_c(uint8_t *pixels, short *block, int line_size)
   64.86 +{
   64.87 +    int row, col;
   64.88 +    for (row = 0; row < 4; row++) {
   64.89 +        uint8_t *dst_row = pixels + row * line_size;
   64.90 +        const short *src_row = block + row * 4;
   64.91 +        for (col = 0; col < 4; col++)
   64.92 +            dst_row[col] += src_row[col];
   64.93 +    }
   64.94 +}
   64.95 +
   64.96 +void h264_luma_dc_dequant_idct_c(short *block, int qmul){
   64.97 +	#define stride 16
   64.98 +	// In-place transform of the 16 luma DC terms held at strided positions of block:
   64.99 +	// two butterfly passes, then each value is scaled by qmul and rounded (+128, >>8).
  64.100 +	static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
  64.101 +	static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
  64.102 +	int temp[16]; //FIXME check if this is a good idea
  64.103 +	int n;
  64.104 +	// Pass 1: four inputs per y_offset entry are combined into temp[4*n .. 4*n+3].
  64.105 +	for(n=0; n<4; n++){
  64.106 +		const short *in= block + y_offset[n];
  64.107 +		const int sum04= in[stride*0] + in[stride*4];
  64.108 +		const int dif04= in[stride*0] - in[stride*4];
  64.109 +		const int dif15= in[stride*1] - in[stride*5];
  64.110 +		const int sum15= in[stride*1] + in[stride*5];
  64.111 +		temp[4*n+0]= sum04+sum15;
  64.112 +		temp[4*n+1]= dif04+dif15;
  64.113 +		temp[4*n+2]= dif04-dif15;
  64.114 +		temp[4*n+3]= sum04-sum15;
  64.115 +	}
  64.116 +	// Pass 2: combine column n of temp, scale, and store relative to x_offset[n].
  64.117 +	for(n=0; n<4; n++){
  64.118 +		short *out= block + x_offset[n];
  64.119 +		const int sum02= temp[4*0+n] + temp[4*2+n];
  64.120 +		const int dif02= temp[4*0+n] - temp[4*2+n];
  64.121 +		const int dif13= temp[4*1+n] - temp[4*3+n];
  64.122 +		const int sum13= temp[4*1+n] + temp[4*3+n];
  64.123 +		out[stride*0 ]= ((sum02 + sum13)*qmul + 128 ) >> 8; //FIXME think about merging this into decode_residual
  64.124 +		out[stride*2 ]= ((dif02 + dif13)*qmul + 128 ) >> 8;
  64.125 +		out[stride*8 ]= ((dif02 - dif13)*qmul + 128 ) >> 8;
  64.126 +		out[stride*10]= ((sum02 - sum13)*qmul + 128 ) >> 8;
  64.127 +	}
  64.128 +}
  64.129 +#undef stride
  64.130 +
  64.131 +void chroma_dc_dequant_idct_c(short *block, int qmul){
  64.132 +	// 2x2 butterfly over the four chroma DC terms at block[0], [16], [32] and [48];
  64.133 +	// every result is scaled as (value*qmul) >> 7 and written back in place.
  64.134 +	enum { ROW_STEP= 16*2, COL_STEP= 16 };
  64.135 +	short *const top= block;
  64.136 +	short *const bottom= block + ROW_STEP;
  64.137 +
  64.138 +	const int tl= top[0];
  64.139 +	const int tr= top[COL_STEP];
  64.140 +	const int bl= bottom[0];
  64.141 +	const int br= bottom[COL_STEP];
  64.142 +
  64.143 +	const int top_sum= tl+tr, top_dif= tl-tr;
  64.144 +	const int bot_sum= bl+br, bot_dif= bl-br;
  64.145 +
  64.146 +	top[0]=           ((top_sum+bot_sum)*qmul) >> 7;
  64.147 +	top[COL_STEP]=    ((top_dif+bot_dif)*qmul) >> 7;
  64.148 +	bottom[0]=        ((top_sum-bot_sum)*qmul) >> 7;
  64.149 +	bottom[COL_STEP]= ((top_dif-bot_dif)*qmul) >> 7;
  64.150 +}
  64.151 +
  64.152 +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride)
  64.153 +{ // 4x4 IDCT of block; the result is added onto the 4x4 pixel area at dst, whose rows are stride bytes apart
  64.154 +  vsint16_t __vz0, __vz1, __vz2, __vz3; // temporary storage, used by VEC_1D_DCT
  64.155 +  vsint16_t va0, va1, va2, va3;
  64.156 +  vsint16_t vtmp0, vtmp1, vtmp2, vtmp3;
  64.157 +  vuint16_t sat;
  64.158 +  vuint8_t va_u8;
  64.159 +  vsint16_t vdst_ss;
  64.160 +  vuint8_t dstperm;
  64.161 +  vuint8_t vdst, vdst_orig, vfdst;
  64.162 +  const int16_t imax = 255;
  64.163 +  const vsint32_t vzero = spu_splats(0);
  64.164 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  64.165 +  const int shift_dst = (unsigned int) dst  & 15;
  64.166 +  const vuint8_t packu16   = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F);
  64.167 +  const vuint8_t mergehu8  = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17);
  64.168 +  //for optimized matrix transpose:
  64.169 +  const vuint8_t tr0 =AVV(0x00,0x01,0x08,0x09,0x10,0x11,0x18,0x19,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
  64.170 +  const vuint8_t tr1 =AVV(0x02,0x03,0x0A,0x0B,0x12,0x13,0x1A,0x1B,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
  64.171 +  const vuint8_t tr2 =AVV(0x04,0x05,0x0C,0x0D,0x14,0x15,0x1C,0x1D,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
  64.172 +  const vuint8_t tr3 =AVV(0x06,0x07,0x0E,0x0F,0x16,0x17,0x1E,0x1F,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
  64.173 +  const vuint8_t conc =AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17);
  64.174 +
  64.175 +  block[0] += 32;  // add 32 as a DC-level for rounding (note: this modifies the caller's block)
  64.176 +
  64.177 +  //load matrix: two vector loads cover the 16 coefficients; rotating each by 8 bytes brings its second half to the front
  64.178 +  vtmp0 = *(vsint16_t *)(block);
  64.179 +  vtmp1 = spu_rlqwbyte(vtmp0,8);
  64.180 +  vtmp2 = *(vsint16_t *)(block+8);
  64.181 +  vtmp3 = spu_rlqwbyte(vtmp2,8);
  64.182 +
  64.183 +  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
  64.184 +
  64.185 +  //concatenate first two rows of matrix
  64.186 +  va0=spu_shuffle(va0,va1,conc);
  64.187 +  //concatenate last two rows of matrix
  64.188 +  va2=spu_shuffle(va2,va3,conc);
  64.189 +
  64.190 +  //do transpose starting from two vectors, storing as four vectors of which the second part is unused
  64.191 +  vtmp0 = spu_shuffle( va0, va2, tr0);
  64.192 +  vtmp1 = spu_shuffle( va0, va2, tr1);
  64.193 +  vtmp2 = spu_shuffle( va0, va2, tr2);
  64.194 +  vtmp3 = spu_shuffle( va0, va2, tr3);
  64.195 +
  64.196 +  VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
  64.197 +
  64.198 +  // division by 64
  64.199 +  va0 = spu_rlmaska(va0,-6);
  64.200 +  va1 = spu_rlmaska(va1,-6);
  64.201 +  va2 = spu_rlmaska(va2,-6);
  64.202 +  va3 = spu_rlmaska(va3,-6);
  64.203 +  // dstperm routes bytes 0x10-0x13 (start of a shuffle's second operand) to byte offset shift_dst; all other bytes come from the first operand. Offsets other than 0/4/8/12 fall back to the offset-0 pattern.
  64.204 +  switch (shift_dst){
  64.205 +    case 0: {
  64.206 +      dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
  64.207 +                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
  64.208 +    } break;
  64.209 +    case 4: {
  64.210 +      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
  64.211 +                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
  64.212 +    } break;
  64.213 +    case 8: {
  64.214 +      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  64.215 +  	                      0x10, 0x11, 0x12, 0x13, 0x0C, 0x0D, 0x0E, 0x0F);
  64.216 +    } break;
  64.217 +    case 12: {
  64.218 +      dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  64.219 +                              0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13);
  64.220 +    } break;
  64.221 +    default: {
  64.222 +      dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07,
  64.223 +                              0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
  64.224 +    } break;
  64.225 +  }
  64.226 +  // One result row per destination line. The macro is defined outside this view and presumably uses locals such as vzero, vmax, sat and packu16 by name - TODO confirm before renaming any of them.
  64.227 +  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm);
  64.228 +  dst += stride;
  64.229 +  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm);
  64.230 +  dst += stride;
  64.231 +  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm);
  64.232 +  dst += stride;
  64.233 +  VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm);
  64.234 +}
  64.235 +
  64.236 +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride)
  64.237 +{ // 8x8 IDCT of block; the result is added onto the 8x8 pixel area at dst, whose rows are stride bytes apart
  64.238 +	vsint16_t va0, va1, va2, va3, va4, va5, va6, va7;
  64.239 +	vsint16_t vza0, vza1, vza2, vza3, vza4, vza5, vza6, vza7, vzal,vzah;
  64.240 +	vsint16_t vzb0, vzb1, vzb2, vzb3, vzb4, vzb5, vzb6, vzb7;
  64.241 +	vsint16_t vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, vtmp6, vtmp7;
  64.242 +	vuint16_t sat;
  64.243 +	vuint8_t va_u8;
  64.244 +	const int block_stride=8;
  64.245 +	vsint16_t vdst_ss;
  64.246 +	const int16_t imax = 255;
  64.247 +	const vsint32_t vzero = spu_splats(0);
  64.248 +	const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  64.249 +	vuint8_t vdst, vdst_orig, vfdst;
  64.250 +	vuint8_t dstperm;
  64.251 +	const int shift_dst = (unsigned int) dst  & 15;
  64.252 +	const vuint8_t packu16   = AVV(0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F);
  64.253 +	const vuint8_t mergehu8  = AVV(0x00,0x10,0x01,0x11,0x02,0x12,0x03,0x13,0x04,0x14,0x05,0x15,0x06,0x16,0x07,0x17);
  64.254 +	const vuint8_t m1        = AVV(0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17);
  64.255 +	const vuint8_t m2        = AVV(0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F);
  64.256 +	const vuint8_t m3        = AVV(0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x18,0x19,0x1A,0x1B);
  64.257 +	const vuint8_t m4        = AVV(0x14,0x15,0x16,0x17,0x04,0x05,0x06,0x07,0x1C,0x1D,0x1E,0x1F,0x0C,0x0D,0x0E,0x0F);
  64.258 +	const vuint8_t m5        = AVV(0x00,0x01,0x10,0x11,0x04,0x05,0x14,0x15,0x08,0x09,0x18,0x19,0x0C,0x0D,0x1C,0x1D);
  64.259 +	const vuint8_t m6        = AVV(0x12,0x13,0x02,0x03,0x16,0x17,0x06,0x07,0x1A,0x1B,0x0A,0x0B,0x1E,0x1F,0x0E,0x0F);
  64.260 +
  64.261 +	block[0] += 32;  // add 32 as a DC-level for rounding (note: this modifies the caller's block)
  64.262 +	// Load eight vectors of coefficients, block_stride shorts apart.
  64.263 +	vtmp0 = *(vsint16_t *)(block);
  64.264 +	vtmp1 = *(vsint16_t *)(block + block_stride);
  64.265 +	vtmp2 = *(vsint16_t *)(block + 2*block_stride);
  64.266 +	vtmp3 = *(vsint16_t *)(block + 3*block_stride);
  64.267 +	vtmp4 = *(vsint16_t *)(block + 4*block_stride);
  64.268 +	vtmp5 = *(vsint16_t *)(block + 5*block_stride);
  64.269 +	vtmp6 = *(vsint16_t *)(block + 6*block_stride);
  64.270 +	vtmp7 = *(vsint16_t *)(block + 7*block_stride);
  64.271 +	// 1-D pass, transpose, second 1-D pass. VEC_TRANSPOSE_8 takes 16 arguments here, so it is presumably defined in an included header and presumably consumes m1..m6 by name - TODO confirm.
  64.272 +	VEC_1D_DCT8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7);
  64.273 +	VEC_TRANSPOSE_8(vtmp0,vtmp1,vtmp2,vtmp3,vtmp4,vtmp5,vtmp6,vtmp7,va0,va1,va2,va3,va4,va5,va6,va7);
  64.274 +	VEC_1D_DCT8(va0, va1, va2, va3, va4, va5, va6, va7);
  64.275 +	// division by 64, as in h264_idct4_add_spu
  64.276 +	va0 = spu_rlmaska(va0,-6);
  64.277 +	va1 = spu_rlmaska(va1,-6);
  64.278 +	va2 = spu_rlmaska(va2,-6);
  64.279 +	va3 = spu_rlmaska(va3,-6);
  64.280 +	va4 = spu_rlmaska(va4,-6);
  64.281 +	va5 = spu_rlmaska(va5,-6);
  64.282 +	va6 = spu_rlmaska(va6,-6);
  64.283 +	va7 = spu_rlmaska(va7,-6);
  64.284 +	// dstperm places bytes 0x10-0x17 of a shuffle's second operand at byte offset 8 when shift_dst is 8, and at offset 0 for every other value.
  64.285 +	if (shift_dst==8)
  64.286 +		dstperm = (vuint8_t)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  64.287 +				   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
  64.288 +	else																		    dstperm = (vuint8_t)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  64.289 +			0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
  64.290 +	// One result row per destination line; the macro is defined outside this view.
  64.291 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va0,dstperm);
  64.292 +	dst += stride;
  64.293 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va1,dstperm);
  64.294 +	dst += stride;
  64.295 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va2,dstperm);
  64.296 +	dst += stride;
  64.297 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va3,dstperm);
  64.298 +	dst += stride;
  64.299 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va4,dstperm);
  64.300 +	dst += stride;
  64.301 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va5,dstperm);
  64.302 +	dst += stride;
  64.303 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va6,dstperm);
  64.304 +	dst += stride;
  64.305 +	VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(dst,shift_dst,va7,dstperm);
  64.306 +
  64.307 +}
  64.308 +
  64.309 +/*
  64.310 +
  64.311 +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride){
  64.312 +    int i;
  64.313 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
  64.314 +
  64.315 +    block[0] += 32;
  64.316 +
  64.317 +    for(i=0; i<4; i++){
  64.318 +        const int z0=  block[0 + 4*i]     +  block[2 + 4*i];
  64.319 +        const int z1=  block[0 + 4*i]     -  block[2 + 4*i];
  64.320 +        const int z2= (block[1 + 4*i]>>1) -  block[3 + 4*i];
  64.321 +        const int z3=  block[1 + 4*i]     + (block[3 + 4*i]>>1);
  64.322 +
  64.323 +        block[0 + 4*i]= z0 + z3;
  64.324 +        block[1 + 4*i]= z1 + z2;
  64.325 +        block[2 + 4*i]= z1 - z2;
  64.326 +        block[3 + 4*i]= z0 - z3;
  64.327 +    }
  64.328 +
  64.329 +    for(i=0; i<4; i++){
  64.330 +        const int z0=  block[i + 4*0]     +  block[i + 4*2];
  64.331 +        const int z1=  block[i + 4*0]     -  block[i + 4*2];
  64.332 +        const int z2= (block[i + 4*1]>>1) -  block[i + 4*3];
  64.333 +        const int z3=  block[i + 4*1]     + (block[i + 4*3]>>1);
  64.334 +
  64.335 +        dst[i + 0*stride]= cm[ dst[i + 0*stride] + ((z0 + z3) >> 6) ];
  64.336 +        dst[i + 1*stride]= cm[ dst[i + 1*stride] + ((z1 + z2) >> 6) ];
  64.337 +        dst[i + 2*stride]= cm[ dst[i + 2*stride] + ((z1 - z2) >> 6) ];
  64.338 +        dst[i + 3*stride]= cm[ dst[i + 3*stride] + ((z0 - z3) >> 6) ];
  64.339 +    }
  64.340 +}
  64.341 +
  64.342 +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride){
  64.343 +    int i;
  64.344 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
  64.345 +
  64.346 +    block[0] += 32;
  64.347 +	
  64.348 +    for( i = 0; i < 8; i++ )
  64.349 +    {
  64.350 +        const int a0 =  block[0+i*8] + block[4+i*8];
  64.351 +        const int a2 =  block[0+i*8] - block[4+i*8];
  64.352 +        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
  64.353 +        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
  64.354 +
  64.355 +        const int b0 = a0 + a6;
  64.356 +        const int b2 = a2 + a4;
  64.357 +        const int b4 = a2 - a4;
  64.358 +        const int b6 = a0 - a6;
  64.359 +
  64.360 +        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1);
  64.361 +        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
  64.362 +        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
  64.363 +        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
  64.364 +
  64.365 +        const int b1 = (a7>>2) + a1;
  64.366 +        const int b3 =  a3 + (a5>>2);
  64.367 +        const int b5 = (a3>>2) - a5;
  64.368 +        const int b7 =  a7 - (a1>>2);
  64.369 +
  64.370 +        block[0+i*8] = b0 + b7;
  64.371 +        block[7+i*8] = b0 - b7;
  64.372 +        block[1+i*8] = b2 + b5;
  64.373 +        block[6+i*8] = b2 - b5;
  64.374 +        block[2+i*8] = b4 + b3;
  64.375 +        block[5+i*8] = b4 - b3;
  64.376 +        block[3+i*8] = b6 + b1;
  64.377 +        block[4+i*8] = b6 - b1;
  64.378 +    }
  64.379 +    for( i = 0; i < 8; i++ )
  64.380 +    {
  64.381 +        const int a0 =  block[i+0*8] + block[i+4*8];
  64.382 +        const int a2 =  block[i+0*8] - block[i+4*8];
  64.383 +        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
  64.384 +        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
  64.385 +
  64.386 +        const int b0 = a0 + a6;
  64.387 +        const int b2 = a2 + a4;
  64.388 +        const int b4 = a2 - a4;
  64.389 +        const int b6 = a0 - a6;
  64.390 +
  64.391 +        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
  64.392 +        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
  64.393 +        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
  64.394 +        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
  64.395 +
  64.396 +        const int b1 = (a7>>2) + a1;
  64.397 +        const int b3 =  a3 + (a5>>2);
  64.398 +        const int b5 = (a3>>2) - a5;
  64.399 +        const int b7 =  a7 - (a1>>2);
  64.400 +			
  64.401 +		dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
  64.402 +		dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
  64.403 +		dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
  64.404 +		dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
  64.405 +		dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
  64.406 +		dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
  64.407 +		dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
  64.408 +		dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
  64.409 +	}
  64.410 +}*/
  64.411 +

    65.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    65.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_idct_spu.h	Mon Aug 27 12:09:56 2012 +0200
    65.3 @@ -0,0 +1,141 @@
    65.4 +#ifndef H264_IDCT_SPU_H
    65.5 +#define H264_IDCT_SPU_H
    65.6 +
    65.7 +void h264_idct4_add_spu(uint8_t *dst, short *block, int stride);
    65.8 +void h264_idct8_add_spu(uint8_t *dst, short *block, int stride);
    65.9 +
   65.10 +/***********************************************************************
   65.11 + * VEC_1D_DCT
   65.12 + ***********************************************************************
   65.13 + * 1-dimensional 4x4 H264 integer DCT inverse transform.
   65.14 + * Actually source and destination are 8x4. The low elements of the
   65.15 + * source are discarded and the low elements of the destination mustn't
   65.16 + * be used. 
   65.17 + * __vz0-__vz3 registers need to be declared in the caller function
   65.18 + ***********************************************************************/
   65.19 +#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)				\
   65.20 +  /* 1st stage */								\
   65.21 +  __vz0 = spu_add(vb0,vb2);		/* temp[0] = Y[0] + Y[2] 	*/	\
   65.22 +  __vz1 = spu_sub(vb0,vb2);		/* temp[1] = Y[0] - Y[2] 	*/	\
   65.23 +  __vz2 = spu_rlmaska(vb1,-1);							\
   65.24 +  __vz2 = spu_sub(__vz2,vb3);		/* temp[2] = Y[1].1/2 - Y[3] 	*/	\
   65.25 +  __vz3 = spu_rlmaska(vb3,-1);							\
   65.26 +  __vz3 = spu_add(vb1,__vz3);		/* temp[3] = Y[1] + Y[3].1/2 	*/	\
   65.27 +										\
   65.28 +  /* 2nd stage: output */							\
   65.29 +  va0 = spu_add(__vz0,__vz3);		/* x[0] = temp[0] + temp[3] 	*/	\
   65.30 +  va1 = spu_add(__vz1,__vz2);		/* x[1] = temp[1] + temp[2] 	*/	\
   65.31 +  va2 = spu_sub(__vz1,__vz2);		/* x[2] = temp[1] - temp[2] 	*/  	\
   65.32 +  va3 = spu_sub(__vz0,__vz3)		/* x[3] = temp[0] - temp[3] 	*/	
   65.33 +
   65.34 +/***********************************************************************
   65.35 + * VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8
   65.36 + ***********************************************************************
   65.37 + * Loads the quadword at p, shifts it left by `shift` bytes, expands it to
   65.38 + * vsint16_t (mergehu8 shuffle) and adds it to va; va is clamped to
   65.39 + * [0, vmax], packed to vuint8_t (packu16), merged into the loaded quadword
   65.40 + * via align_dst and stored back to p. Caller must declare vdst_orig, vdst,
   65.41 + * vdst_ss, sat, va_u8, vfdst and provide vzero, vmax, mergehu8, packu16.
   65.42 + **********************************************************************/
   65.43 +#define VEC_LOAD_UNALIGNED_U8_ADD_S16_STORE_U8(p,shift,va,align_dst)	\
   65.44 +    vdst_orig = *(vuint8_t *) (p);					\
   65.45 +    vdst = spu_or(spu_slqwbyte(vdst_orig, shift),(vuint8_t) vzero);	\
   65.46 +    vdst_ss = (vsint16_t) spu_shuffle((vuint8_t)vzero,vdst,mergehu8);	\
   65.47 +    va = spu_add(va,vdst_ss);						\
   65.48 +    sat = spu_cmpgt(va,(vsint16_t)vzero);	/* lanes > 0 */		\
   65.49 +    va = spu_and(va,(vsint16_t)sat);	/* other lanes become 0 */	\
   65.50 +    sat = spu_cmpgt(va,vmax);		/* lanes above vmax */		\
   65.51 +    va = spu_sel(va,vmax,sat);		/* ... are set to vmax */	\
   65.52 +    va_u8 = (vuint8_t) spu_shuffle(va,(vsint16_t) vzero,packu16);	\
   65.53 +    vfdst = spu_shuffle(vdst_orig, va_u8, align_dst);			\
   65.54 +    *(vuint8_t *) (p) = vfdst	/* store where we loaded from, not to a caller variable named dst */
   65.55 +
   65.56 +/***********************************************************************
   65.57 + * VEC_TRANSPOSE_8
   65.58 + ***********************************************************************
   65.59 + * Transposes a 8x8 matrix of s16 vectors
   65.60 + **********************************************************************/
   65.61 +#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
   65.62 +    b0 = spu_shuffle( a0, a4, m1 ); \
   65.63 +    b1 = spu_shuffle( a1, a5, m1 ); \
   65.64 +    b2 = spu_shuffle( a2, a6, m1 ); \
   65.65 +    b3 = spu_shuffle( a3, a7, m1 ); \
   65.66 +    b4 = spu_shuffle( a4, a0, m2 ); \
   65.67 +    b5 = spu_shuffle( a5, a1, m2 ); \
   65.68 +    b6 = spu_shuffle( a6, a2, m2 ); \
   65.69 +    b7 = spu_shuffle( a7, a3, m2 ); \
   65.70 +    a0 = spu_shuffle( b0, b2, m3 ); \
   65.71 +    a1 = spu_shuffle( b1, b3, m3 ); \
   65.72 +    a2 = spu_shuffle( b2, b0, m4 ); \
   65.73 +    a3 = spu_shuffle( b3, b1, m4 ); \
   65.74 +    a4 = spu_shuffle( b4, b6, m3 ); \
   65.75 +    a5 = spu_shuffle( b5, b7, m3 ); \
   65.76 +    a6 = spu_shuffle( b6, b4, m4 ); \
   65.77 +    a7 = spu_shuffle( b7, b5, m4 ); \
   65.78 +    b0 = spu_shuffle( a0, a1, m5 ); \
   65.79 +    b1 = spu_shuffle( a1, a0, m6 ); \
   65.80 +    b2 = spu_shuffle( a2, a3, m5 ); \
   65.81 +    b3 = spu_shuffle( a3, a2, m6 ); \
   65.82 +    b4 = spu_shuffle( a4, a5, m5 ); \
   65.83 +    b5 = spu_shuffle( a5, a4, m6 ); \
   65.84 +    b6 = spu_shuffle( a6, a7, m5 ); \
   65.85 +    b7 = spu_shuffle( a7, a6, m6 )
   65.86 +
   65.87 +/***********************************************************************
   65.88 + * VEC_1D_DCT8
   65.89 + ***********************************************************************
   65.90 + * 1-dimensional 8x8 H264 integer DCT inverse transform.
   65.91 + ***********************************************************************/
   65.92 +#define VEC_1D_DCT8(vb0,vb1,vb2,vb3,vb4,vb5,vb6,vb7)						\
   65.93 +  vza0 = spu_add(vb0,vb4);		/* a[0] = Y[0] + Y[4] 	*/				\
   65.94 +  vza2 = spu_sub(vb0,vb4);		/* a[2] = Y[0] - Y[4]	*/				\
   65.95 +  vza4 = spu_rlmaska(vb2,-1);									\
   65.96 +  vza4 = spu_sub(vza4,vb6);		/* a[4] = Y[2]>>1 - Y[6]	*/			\
   65.97 +  vza6 = spu_rlmaska(vb6,-1	);								\
   65.98 +  vza6 = spu_add(vb2,vza6);		/* a[6] = Y[2]    + Y[6]>>1	*/			\
   65.99 +  												\
  65.100 +  vzb0 = spu_add(vza0,vza6);		/* b[0] = a[0] + a[6]	*/				\
  65.101 +  vzb2 = spu_add(vza2,vza4);		/* b[2] = a[2] + a[4]	*/				\
  65.102 +  vzb4 = spu_sub(vza2,vza4);		/* b[4] = a[2] - a[4]	*/				\
  65.103 +  vzb6 = spu_sub(vza0,vza6);		/* b[6] = a[0] - a[6]	*/				\
  65.104 +  												\
  65.105 +  vza1 = spu_rlmaska(vb7,-1);									\
  65.106 +  vzal = spu_add(vza1,vb7);									\
  65.107 +  vzah = spu_sub(vb5,vb3);									\
  65.108 +  vza1 = spu_sub(vzah,vzal);	/* a1 = (-Y[3] + Y[5]) - (Y[7] + (Y[7]>>1))	*/		\
  65.109 +  												\
  65.110 +  vza3 = spu_rlmaska(vb3,-1);									\
  65.111 +  vzal = spu_add(vza3,vb3);									\
  65.112 +  vzah = spu_add(vb1,vb7);									\
  65.113 +  vza3 = spu_sub(vzah,vzal);  	/* a3 =  (Y[1] + Y[7]) - (Y[3] + (Y[3]>>1))	*/		\
  65.114 +  												\
  65.115 +  vza5 = spu_rlmaska(vb5,-1);									\
  65.116 +  vzal = spu_add(vza5,vb5);									\
  65.117 +  vzah = spu_sub(vb7,vb1);									\
  65.118 +  vza5 = spu_add(vzah,vzal);	/* a5 = (-Y[1] + Y[7]) + (Y[5] + Y[5]>>1))	*/		\
  65.119 +												\
  65.120 +  vza7 = spu_rlmaska(vb1,-1);									\
  65.121 +  vzal = spu_add(vza7,vb1);									\
  65.122 +  vzah = spu_add(vb3,vb5);									\
  65.123 +  vza7 = spu_add(vzah,vzal);	/* a7 =  (Y[3] + Y[5]) + (Y[1] + (Y[1]>>1))	*/		\
  65.124 +  												\
  65.125 +  vzb1 = spu_rlmaska(vza7,-2);									\
  65.126 +  vzb1 = spu_add(vzb1,vza1);		/* b1 = (a7>>2) + a1	*/				\
  65.127 +  vzb3 = spu_rlmaska(vza5,-2);									\
  65.128 +  vzb3 = spu_add(vzb3,vza3);		/* b3 =  a3 + (a5>>2)	*/				\
  65.129 +  vzb5 = spu_rlmaska(vza3,-2);									\
  65.130 +  vzb5 = spu_sub(vzb5,vza5);  		/* b5 = (a3>>2) - a5	*/				\
  65.131 +  vzb7 = spu_rlmaska(vza1,-2);									\
  65.132 +  vzb7 = spu_sub(vza7,vzb7);		/* b7 =  a7 - (a1>>2)	*/				\
  65.133 +  												\
  65.134 +  vb0 = spu_add(vzb0,vzb7); 		/* src[i][0] = b0 + b7	*/				\
  65.135 +  vb7 = spu_sub(vzb0,vzb7);		/* src[i][7] = b0 - b7	*/				\
  65.136 +  vb1 = spu_add(vzb2,vzb5);		/* src[i][1] = b2 + b5	*/				\
  65.137 +  vb6 = spu_sub(vzb2,vzb5);		/* src[i][6] = b2 - b5	*/				\
  65.138 +  vb2 = spu_add(vzb4,vzb3);		/* src[i][2] = b4 + b3	*/				\
  65.139 +  vb5 = spu_sub(vzb4,vzb3);		/* src[i][5] = b4 - b3	*/				\
  65.140 +  vb3 = spu_add(vzb6,vzb1);		/* src[i][3] = b6 + b1	*/				\
  65.141 +  vb4 = spu_sub(vzb6,vzb1);		/* src[i][4] = b6 - b1	*/
  65.142 +  
  65.143 +
  65.144 +#endif /*H264_IDCT_SPU_H*/

    66.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    66.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.c	Mon Aug 27 12:09:56 2012 +0200
    66.3 @@ -0,0 +1,802 @@
    66.4 +#include "types_spu.h"
    66.5 +#include "h264_tables.h"
    66.6 +#include "h264_intra_spu.h"
    66.7 +#include <assert.h>
    66.8 +
    66.9 +void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
   66.10 +	(void) topright;
   66.11 +    const uint32_t a= ((uint32_t*)(src-stride))[0];
   66.12 +    ((uint32_t*)(src+0*stride))[0]= a;
   66.13 +    ((uint32_t*)(src+1*stride))[0]= a;
   66.14 +    ((uint32_t*)(src+2*stride))[0]= a;
   66.15 +    ((uint32_t*)(src+3*stride))[0]= a;
   66.16 +}
   66.17 +
   66.18 +void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 horizontal prediction: row y is filled with its left neighbour src[-1+y*stride] */
   66.19 +	(void) topright; /* unused; all pred4x4_* functions share this signature */
   66.20 +    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101U; /* unsigned splat: x*0x01010101 exceeds INT_MAX for x >= 128 */
   66.21 +    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101U;
   66.22 +    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101U;
   66.23 +    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101U;
   66.24 +}
   66.25 +
   66.26 +void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction from the 4 top and 4 left neighbours */
   66.27 +	(void) topright; /* unused */
   66.28 +    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
   66.29 +                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; /* rounded mean of 8 samples, 0..255 */
   66.30 +    ((uint32_t*)(src+0*stride))[0]=
   66.31 +    ((uint32_t*)(src+1*stride))[0]=
   66.32 +    ((uint32_t*)(src+2*stride))[0]=
   66.33 +    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U; /* unsigned splat: dc*0x01010101 exceeds INT_MAX for dc >= 128 */
   66.34 +}
   66.35 +
   66.36 +void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction from the 4 left neighbours only */
   66.37 +	(void) topright; /* unused */
   66.38 +    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
   66.39 +    /* dc is the rounded mean (0..255); splat it in unsigned arithmetic, dc*0x01010101 exceeds INT_MAX for dc >= 128 */
   66.40 +    ((uint32_t*)(src+0*stride))[0]=
   66.41 +    ((uint32_t*)(src+1*stride))[0]=
   66.42 +    ((uint32_t*)(src+2*stride))[0]=
   66.43 +    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
   66.44 +}
   66.45 +
   66.46 +void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 DC prediction from the 4 top neighbours only */
   66.47 +	(void) topright; /* unused */
   66.48 +    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
   66.49 +    /* dc is the rounded mean (0..255); splat it in unsigned arithmetic, dc*0x01010101 exceeds INT_MAX for dc >= 128 */
   66.50 +    ((uint32_t*)(src+0*stride))[0]=
   66.51 +    ((uint32_t*)(src+1*stride))[0]=
   66.52 +    ((uint32_t*)(src+2*stride))[0]=
   66.53 +    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101U;
   66.54 +}
   66.55 +
   66.56 +void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
   66.57 +	(void) topright;
   66.58 +    ((uint32_t*)(src+0*stride))[0]=
   66.59 +    ((uint32_t*)(src+1*stride))[0]=
   66.60 +    ((uint32_t*)(src+2*stride))[0]=
   66.61 +    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
   66.62 +}
   66.63 +
   66.64 +
   66.65 +#define LOAD_TOP_RIGHT_EDGE\
   66.66 +    const int t4= topright[0];\
   66.67 +    const int t5= topright[1];\
   66.68 +    const int t6= topright[2];\
   66.69 +    const int t7= topright[3];\
   66.70 +
   66.71 +#define LOAD_LEFT_EDGE\
   66.72 +    const int l0= src[-1+0*stride];\
   66.73 +    const int l1= src[-1+1*stride];\
   66.74 +    const int l2= src[-1+2*stride];\
   66.75 +    const int l3= src[-1+3*stride];\
   66.76 +
   66.77 +#define LOAD_TOP_EDGE\
   66.78 +    const int t0= src[ 0-1*stride];\
   66.79 +    const int t1= src[ 1-1*stride];\
   66.80 +    const int t2= src[ 2-1*stride];\
   66.81 +    const int t3= src[ 3-1*stride];\
   66.82 +
   66.83 +void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){	
   66.84 +	(void) topright;
   66.85 +    const int lt= src[-1-1*stride];
   66.86 +    LOAD_TOP_EDGE
   66.87 +    LOAD_LEFT_EDGE
   66.88 +
   66.89 +    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
   66.90 +    src[0+2*stride]=
   66.91 +    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
   66.92 +    src[0+1*stride]=
   66.93 +    src[1+2*stride]=
   66.94 +    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
   66.95 +    src[0+0*stride]=
   66.96 +    src[1+1*stride]=
   66.97 +    src[2+2*stride]=
   66.98 +    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
   66.99 +    src[1+0*stride]=
  66.100 +    src[2+1*stride]=
  66.101 +    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
  66.102 +    src[2+0*stride]=
  66.103 +    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
  66.104 +    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
  66.105 +}
  66.106 +
  66.107 +void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
  66.108 +    LOAD_TOP_EDGE
  66.109 +    LOAD_TOP_RIGHT_EDGE
  66.110 +//    LOAD_LEFT_EDGE
  66.111 +
  66.112 +    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
  66.113 +    src[1+0*stride]=
  66.114 +    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
  66.115 +    src[2+0*stride]=
  66.116 +    src[1+1*stride]=
  66.117 +    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
  66.118 +    src[3+0*stride]=
  66.119 +    src[2+1*stride]=
  66.120 +    src[1+2*stride]=
  66.121 +    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
  66.122 +    src[3+1*stride]=
  66.123 +    src[2+2*stride]=
  66.124 +    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
  66.125 +    src[3+2*stride]=
  66.126 +    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
  66.127 +    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
  66.128 +}
  66.129 +
  66.130 +void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
  66.131 +	(void) topright;
  66.132 +    const int lt= src[-1-1*stride];
  66.133 +    LOAD_TOP_EDGE
  66.134 +    LOAD_LEFT_EDGE
  66.135 +	(void) l3;
  66.136 +
  66.137 +    src[0+0*stride]=
  66.138 +    src[1+2*stride]=(lt + t0 + 1)>>1;
  66.139 +    src[1+0*stride]=
  66.140 +    src[2+2*stride]=(t0 + t1 + 1)>>1;
  66.141 +    src[2+0*stride]=
  66.142 +    src[3+2*stride]=(t1 + t2 + 1)>>1;
  66.143 +    src[3+0*stride]=(t2 + t3 + 1)>>1;
  66.144 +    src[0+1*stride]=
  66.145 +    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
  66.146 +    src[1+1*stride]=
  66.147 +    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
  66.148 +    src[2+1*stride]=
  66.149 +    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
  66.150 +    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
  66.151 +    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
  66.152 +    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
  66.153 +}
  66.154 +
  66.155 +void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
  66.156 +    LOAD_TOP_EDGE
  66.157 +    LOAD_TOP_RIGHT_EDGE
  66.158 +	(void) t7;
  66.159 +
  66.160 +    src[0+0*stride]=(t0 + t1 + 1)>>1;
  66.161 +    src[1+0*stride]=
  66.162 +    src[0+2*stride]=(t1 + t2 + 1)>>1;
  66.163 +    src[2+0*stride]=
  66.164 +    src[1+2*stride]=(t2 + t3 + 1)>>1;
  66.165 +    src[3+0*stride]=
  66.166 +    src[2+2*stride]=(t3 + t4+ 1)>>1;
  66.167 +    src[3+2*stride]=(t4 + t5+ 1)>>1;
  66.168 +    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
  66.169 +    src[1+1*stride]=
  66.170 +    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
  66.171 +    src[2+1*stride]=
  66.172 +    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
  66.173 +    src[3+1*stride]=
  66.174 +    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
  66.175 +    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
  66.176 +}
  66.177 +
  66.178 +void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
  66.179 +	(void) topright;
  66.180 +    LOAD_LEFT_EDGE
  66.181 +
  66.182 +    src[0+0*stride]=(l0 + l1 + 1)>>1;
  66.183 +    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
  66.184 +    src[2+0*stride]=
  66.185 +    src[0+1*stride]=(l1 + l2 + 1)>>1;
  66.186 +    src[3+0*stride]=
  66.187 +    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
  66.188 +    src[2+1*stride]=
  66.189 +    src[0+2*stride]=(l2 + l3 + 1)>>1;
  66.190 +    src[3+1*stride]=
  66.191 +    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
  66.192 +    src[3+2*stride]=
  66.193 +    src[1+3*stride]=
  66.194 +    src[0+3*stride]=
  66.195 +    src[2+2*stride]=
  66.196 +    src[2+3*stride]=
  66.197 +    src[3+3*stride]=l3;
  66.198 +}
  66.199 +
  66.200 +void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
  66.201 +	(void) topright;
  66.202 +    const int lt= src[-1-1*stride];
  66.203 +    LOAD_TOP_EDGE
  66.204 +    LOAD_LEFT_EDGE
  66.205 +	(void) t3;
  66.206 +
  66.207 +    src[0+0*stride]=
  66.208 +    src[2+1*stride]=(lt + l0 + 1)>>1;
  66.209 +    src[1+0*stride]=
  66.210 +    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
  66.211 +    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
  66.212 +    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
  66.213 +    src[0+1*stride]=
  66.214 +    src[2+2*stride]=(l0 + l1 + 1)>>1;
  66.215 +    src[1+1*stride]=
  66.216 +    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
  66.217 +    src[0+2*stride]=
  66.218 +    src[2+3*stride]=(l1 + l2+ 1)>>1;
  66.219 +    src[1+2*stride]=
  66.220 +    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
  66.221 +    src[0+3*stride]=(l2 + l3 + 1)>>1;
  66.222 +    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
  66.223 +}
  66.224 +
  66.225 +void ff_pred16x16_vertical_c(uint8_t *src, int stride){
  66.226 +    int i;
  66.227 +	const vuint32_t v= *((vuint32_t*)(src-stride));
  66.228 +    for(i=0; i<4; i++){
  66.229 +		*((vuint32_t*) src 			 ) =v;
  66.230 +		*((vuint32_t*)(src +   stride)) =v;
  66.231 +		*((vuint32_t*)(src + 2*stride)) =v;
  66.232 +		*((vuint32_t*)(src + 3*stride)) =v;
  66.233 +		src+= 4*stride;
  66.234 +    }
  66.235 +	
  66.236 +	/*const uint32_t a= ((uint32_t*)(src-stride))[0];
  66.237 +	const uint32_t b= ((uint32_t*)(src-stride))[1];
  66.238 +	const uint32_t c= ((uint32_t*)(src-stride))[2];
  66.239 +	const uint32_t d= ((uint32_t*)(src-stride))[3];
  66.240 +
  66.241 +	for(i=0; i<16; i++){
  66.242 +		((uint32_t*)(src+i*stride))[0]= a;
  66.243 +		((uint32_t*)(src+i*stride))[1]= b;
  66.244 +		((uint32_t*)(src+i*stride))[2]= c;
  66.245 +		((uint32_t*)(src+i*stride))[3]= d;
  66.246 +	}*/
  66.247 +}
  66.248 +
  66.249 +void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
  66.250 +    int i;
  66.251 +    /* 16x16 horizontal prediction: each of the 16 rows is filled with its left neighbour src[-1+i*stride] */
  66.252 +    for(i=0; i<16; i++){
  66.253 +        ((uint32_t*)(src+i*stride))[0]=
  66.254 +        ((uint32_t*)(src+i*stride))[1]=
  66.255 +        ((uint32_t*)(src+i*stride))[2]=
  66.256 +        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101U; /* unsigned splat: x*0x01010101 exceeds INT_MAX for x >= 128 */
  66.257 +    }
  66.258 +}
  66.259 +
  66.260 +void ff_pred16x16_dc_c(uint8_t *src, int stride){ /* 16x16 DC prediction: fill the block with the rounded mean of the 16 left and 16 top neighbours */
  66.261 +    int i;
  66.262 +	uint32_t dc=0; /* unsigned so the byte splat below cannot overflow a signed int */
  66.263 +    for(i=0;i<16; i++){
  66.264 +        dc+= src[-1+i*stride]; /* left column */
  66.265 +    }
  66.266 +
  66.267 +    for(i=0;i<16; i++){
  66.268 +		dc+= src[i-stride]; /* top row */
  66.269 +    }
  66.270 +	dc= 0x01010101U*((dc + 16)>>5); /* mean of 32 samples replicated into all four bytes */
  66.271 +    
  66.272 +    for(i=0; i<16; i++){
  66.273 +        ((uint32_t*)(src+i*stride))[0]=
  66.274 +        ((uint32_t*)(src+i*stride))[1]=
  66.275 +        ((uint32_t*)(src+i*stride))[2]=
  66.276 +        ((uint32_t*)(src+i*stride))[3]= dc;
  66.277 +    }
  66.278 +}
  66.279 +
  66.280 +void ff_pred16x16_left_dc_c(uint8_t *src, int stride){
  66.281 +    int i;
  66.282 +    /* 16x16 DC prediction from the 16 left neighbours only; dc is unsigned so the byte splat cannot overflow a signed int */
  66.283 +	uint32_t dc=0;
  66.284 +    for(i=0;i<16; i++){
  66.285 +        dc+= src[-1+i*stride];
  66.286 +    }
  66.287 +	dc= 0x01010101U*((dc + 8)>>4); /* rounded mean of 16 samples replicated into all four bytes */
  66.288 +	
  66.289 +    for(i=0; i<16; i++){
  66.290 +        ((uint32_t*)(src+i*stride))[0]=
  66.291 +        ((uint32_t*)(src+i*stride))[1]=
  66.292 +        ((uint32_t*)(src+i*stride))[2]=
  66.293 +        ((uint32_t*)(src+i*stride))[3]= dc;
  66.294 +    }
  66.295 +}
  66.296 +
  66.297 +void ff_pred16x16_top_dc_c(uint8_t *src, int stride){ /* 16x16 DC prediction from the 16 top neighbours only */
  66.298 +    int i;
  66.299 +	uint32_t dc0=0; /* unsigned so the byte splat below cannot overflow a signed int */
  66.300 +    for(i=0;i<16; i++){
  66.301 +        dc0+= src[i-stride];
  66.302 +    }
  66.303 +	
  66.304 +	dc0= 0x01010101U*((dc0 + 8)>>4); /* rounded mean of 16 samples replicated into all four bytes */
  66.305 +	
  66.306 +    for(i=0; i<16; i++){
  66.307 +        ((uint32_t*)(src+i*stride))[0]=
  66.308 +        ((uint32_t*)(src+i*stride))[1]=
  66.309 +        ((uint32_t*)(src+i*stride))[2]=
  66.310 +        ((uint32_t*)(src+i*stride))[3]= dc0;
  66.311 +    }
  66.312 +}
  66.313 +
  66.314 +void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
  66.315 +    int i;
  66.316 +	
  66.317 +	/*const vuint32_t v= AVV(0x01010101U*128U, 0x01010101U*128U,0x01010101U*128U,0x01010101U*128U);
  66.318 +	for(i=0; i<4; i++){
  66.319 +		*((vuint32_t*) src 			  ) =v;
  66.320 +		*((vuint32_t*)(src +   stride)) =v;
  66.321 +		*((vuint32_t*)(src + 2*stride)) =v;
  66.322 +		*((vuint32_t*)(src + 3*stride)) =v;
  66.323 +		src+= 4*stride;
  66.324 +	}*/
  66.325 +	
  66.326 +    for(i=0; i<16; i++){
  66.327 +        ((uint32_t*)(src+i*stride))[0]=
  66.328 +        ((uint32_t*)(src+i*stride))[1]=
  66.329 +        ((uint32_t*)(src+i*stride))[2]=
  66.330 +        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
  66.331 +    }
  66.332 +}
  66.333 +
  66.334 +void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
  66.335 +	int i, j, k;
  66.336 +	int a;
  66.337 +	uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
  66.338 +	const uint8_t * const src0 = src+7-stride;
  66.339 +	const uint8_t *src1 = src+8*stride-1;
  66.340 +	const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
  66.341 +	int H = src0[1] - src0[-1];
  66.342 +	int V = src1[0] - src2[ 0];
  66.343 +	for(k=2; k<=8; ++k) {
  66.344 +		src1 += stride; src2 -= stride;
  66.345 +		H += k*(src0[k] - src0[-k]);
  66.346 +		V += k*(src1[0] - src2[ 0]);
  66.347 +	}
  66.348 +	if(svq3){
  66.349 +		H = ( 5*(H/4) ) / 16;
  66.350 +		V = ( 5*(V/4) ) / 16;
  66.351 +
  66.352 +		/* required for 100% accuracy */
  66.353 +		i = H; H = V; V = i;
  66.354 +	}else{
  66.355 +		H = ( 5*H+32 ) >> 6;
  66.356 +		V = ( 5*V+32 ) >> 6;
  66.357 +	}
  66.358 +
  66.359 +	a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
  66.360 +	for(j=16; j>0; --j) {
  66.361 +		int b = a;
  66.362 +		a += V;
  66.363 +		for(i=-16; i<0; i+=4) {
  66.364 +		src[16+i] = cm[ (b    ) >> 5 ];
  66.365 +		src[17+i] = cm[ (b+  H) >> 5 ];
  66.366 +		src[18+i] = cm[ (b+2*H) >> 5 ];
  66.367 +		src[19+i] = cm[ (b+3*H) >> 5 ];
  66.368 +		b += 4*H;
  66.369 +		}
  66.370 +		src += stride;
  66.371 +	}
  66.372 +}
  66.373 +
  66.374 +void ff_pred16x16_plane_c(uint8_t *src, int stride){
  66.375 +    pred16x16_plane_compat_c(src, stride, 0);
  66.376 +}
  66.377 +
  66.378 +void ff_pred8x8_vertical_c(uint8_t *src, int stride){
  66.379 +    int i;
  66.380 +    const uint32_t a= ((uint32_t*)(src-stride))[0];
  66.381 +    const uint32_t b= ((uint32_t*)(src-stride))[1];
  66.382 +
  66.383 +    for(i=0; i<8; i++){
  66.384 +        ((uint32_t*)(src+i*stride))[0]= a;
  66.385 +        ((uint32_t*)(src+i*stride))[1]= b;
  66.386 +    }
  66.387 +}
  66.388 +
  66.389 +void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
  66.390 +    int i;
  66.391 +    /* 8x8 horizontal prediction: each of the 8 rows is filled with its left neighbour src[-1+i*stride] */
  66.392 +    for(i=0; i<8; i++){
  66.393 +        ((uint32_t*)(src+i*stride))[0]=
  66.394 +        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U; /* unsigned splat: x*0x01010101 exceeds INT_MAX for x >= 128 */
  66.395 +    }
  66.396 +}
  66.397 +
  66.398 +void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
  66.399 +    int i;
  66.400 +
  66.401 +    for(i=0; i<8; i++){
  66.402 +        ((uint32_t*)(src+i*stride))[0]=
  66.403 +        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
  66.404 +    }
  66.405 +}
  66.406 +
  66.407 +void ff_pred8x8_left_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction from the left column: rows 0-3 and rows 4-7 each get their own mean */
  66.408 +    int i;
  66.409 +    uint32_t dc0, dc2; /* unsigned so the byte splat below cannot overflow a signed int */
  66.410 +
  66.411 +    dc0=dc2=0;
  66.412 +    for(i=0;i<4; i++){
  66.413 +        dc0+= src[-1+i*stride];     /* left neighbours of rows 0-3 */
  66.414 +        dc2+= src[-1+(i+4)*stride]; /* left neighbours of rows 4-7 */
  66.415 +    }
  66.416 +    dc0= 0x01010101U*((dc0 + 2)>>2); /* rounded mean replicated into all four bytes */
  66.417 +    dc2= 0x01010101U*((dc2 + 2)>>2);
  66.418 +
  66.419 +    for(i=0; i<4; i++){
  66.420 +        ((uint32_t*)(src+i*stride))[0]=
  66.421 +        ((uint32_t*)(src+i*stride))[1]= dc0;
  66.422 +    }
  66.423 +    for(i=4; i<8; i++){
  66.424 +        ((uint32_t*)(src+i*stride))[0]=
  66.425 +        ((uint32_t*)(src+i*stride))[1]= dc2;
  66.426 +    }
  66.427 +}
  66.428 +
  66.429 +void ff_pred8x8_top_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction from the top row: columns 0-3 and columns 4-7 each get their own mean */
  66.430 +    int i;
  66.431 +    uint32_t dc0, dc1; /* unsigned so the byte splat below cannot overflow a signed int */
  66.432 +
  66.433 +    dc0=dc1=0;
  66.434 +    for(i=0;i<4; i++){
  66.435 +        dc0+= src[i-stride];   /* top neighbours of columns 0-3 */
  66.436 +        dc1+= src[4+i-stride]; /* top neighbours of columns 4-7 */
  66.437 +    }
  66.438 +    dc0= 0x01010101U*((dc0 + 2)>>2); /* rounded mean replicated into all four bytes */
  66.439 +    dc1= 0x01010101U*((dc1 + 2)>>2);
  66.440 +
  66.441 +    for(i=0; i<4; i++){
  66.442 +        ((uint32_t*)(src+i*stride))[0]= dc0;
  66.443 +        ((uint32_t*)(src+i*stride))[1]= dc1;
  66.444 +    }
  66.445 +    for(i=4; i<8; i++){
  66.446 +        ((uint32_t*)(src+i*stride))[0]= dc0;
  66.447 +        ((uint32_t*)(src+i*stride))[1]= dc1;
  66.448 +    }
  66.449 +}
  66.450 +
  66.451 +
  66.452 +void ff_pred8x8_dc_c(uint8_t *src, int stride){ /* 8x8 DC prediction: each 4x4 quadrant gets its own mean of the neighbouring top/left samples */
  66.453 +    int i;
  66.454 +    uint32_t dc0, dc1, dc2, dc3; /* unsigned so the byte splats below cannot overflow a signed int */
  66.455 +
  66.456 +    dc0=dc1=dc2=0;
  66.457 +    for(i=0;i<4; i++){
  66.458 +        dc0+= src[-1+i*stride] + src[i-stride]; /* left of rows 0-3 plus top of columns 0-3 */
  66.459 +        dc1+= src[4+i-stride];                  /* top of columns 4-7 */
  66.460 +        dc2+= src[-1+(i+4)*stride];             /* left of rows 4-7 */
  66.461 +    }
  66.462 +    dc3= 0x01010101U*((dc1 + dc2 + 4)>>3); /* must be computed before dc1/dc2 are overwritten with their splats */
  66.463 +    dc0= 0x01010101U*((dc0 + 4)>>3);
  66.464 +    dc1= 0x01010101U*((dc1 + 2)>>2);
  66.465 +    dc2= 0x01010101U*((dc2 + 2)>>2);
  66.466 +
  66.467 +    for(i=0; i<4; i++){
  66.468 +        ((uint32_t*)(src+i*stride))[0]= dc0;
  66.469 +        ((uint32_t*)(src+i*stride))[1]= dc1;
  66.470 +    }
  66.471 +    for(i=4; i<8; i++){
  66.472 +        ((uint32_t*)(src+i*stride))[0]= dc2;
  66.473 +        ((uint32_t*)(src+i*stride))[1]= dc3;
  66.474 +    }
  66.475 +}
  66.476 +
  66.477 +void ff_pred8x8_plane_c(uint8_t *src, int stride){
  66.478 +  int j, k;
  66.479 +  int a;
  66.480 +  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
  66.481 +  const uint8_t * const src0 = src+3-stride;
  66.482 +  const uint8_t *src1 = src+4*stride-1;
  66.483 +  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
  66.484 +  int H = src0[1] - src0[-1];
  66.485 +  int V = src1[0] - src2[ 0];
  66.486 +  for(k=2; k<=4; ++k) {
  66.487 +    src1 += stride; src2 -= stride;
  66.488 +    H += k*(src0[k] - src0[-k]);
  66.489 +    V += k*(src1[0] - src2[ 0]);
  66.490 +  }
  66.491 +  H = ( 17*H+16 ) >> 5;
  66.492 +  V = ( 17*V+16 ) >> 5;
  66.493 +
  66.494 +  a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
  66.495 +  for(j=8; j>0; --j) {
  66.496 +    int b = a;
  66.497 +    a += V;
  66.498 +    src[0] = cm[ (b    ) >> 5 ];
  66.499 +    src[1] = cm[ (b+  H) >> 5 ];
  66.500 +    src[2] = cm[ (b+2*H) >> 5 ];
  66.501 +    src[3] = cm[ (b+3*H) >> 5 ];
  66.502 +    src[4] = cm[ (b+4*H) >> 5 ];
  66.503 +    src[5] = cm[ (b+5*H) >> 5 ];
  66.504 +    src[6] = cm[ (b+6*H) >> 5 ];
  66.505 +    src[7] = cm[ (b+7*H) >> 5 ];
  66.506 +    src += stride;
  66.507 +  }
  66.508 +}
  66.509 +
  66.510 +
  /* Helper macros shared by the pred8x8l_* predictors that follow. They expand
   * in the caller's scope and rely on `src` and `stride` (and, where referenced,
   * `has_topleft` / `has_topright`) being visible there.
   * SRC(x,y) addresses the sample x columns to the right of and y rows below
   * src; x = -1 is the column to the left, y = -1 the row above. */
  66.511 +#define SRC(x,y) src[(x)+(y)*stride]
          /* PL(y): declare l<y> as the rounded (1,2,1)/4 filter of the left-column
           * samples at rows y-1, y and y+1. */
  66.512 +#define PL(y) \
  66.513 +    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
          /* Declare l0..l7 (filtered left column). l0 uses the top-left sample as its
           * upper neighbour when has_topleft is non-zero and repeats SRC(-1,0)
           * otherwise; l7 has no lower neighbour, so SRC(-1,7) is weighted by 3. */
  66.514 +#define PREDICT_8x8_LOAD_LEFT \
  66.515 +    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
  66.516 +                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
  66.517 +    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
  66.518 +    const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
  66.519 +
          /* PT(x): declare t<x> as the rounded (1,2,1)/4 filter of the samples in the
           * row above at columns x-1, x and x+1. */
  66.520 +#define PT(x) \
  66.521 +    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
          /* Declare t0..t7 (filtered top row). t0 uses the top-left sample as its left
           * neighbour when has_topleft is non-zero, else repeats SRC(0,-1); t7 uses
           * SRC(8,-1) as its right neighbour when has_topright is non-zero, else
           * repeats SRC(7,-1). */
  66.522 +#define PREDICT_8x8_LOAD_TOP \
  66.523 +    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
  66.524 +                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
  66.525 +    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
  66.526 +    const int t7 = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
  66.527 +                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
  66.528 +
          /* PTR(x): same filter as PT(x) but assigns to an already declared t<x>. */
  66.529 +#define PTR(x) \
  66.530 +    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
          /* Declare t8..t15 (top-right row). With has_topright they are filtered from
           * SRC(7..15,-1), t15 weighting SRC(15,-1) by 3; without it all eight are set
           * to the unfiltered SRC(7,-1). The expansion already ends in ';'. */
  66.531 +#define PREDICT_8x8_LOAD_TOPRIGHT \
  66.532 +    int t8, t9, t10, t11, t12, t13, t14, t15; \
  66.533 +    if(has_topright) { \
  66.534 +        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
  66.535 +        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
  66.536 +    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
  66.537 +
          /* Declare lt: the top-left corner sample filtered with its right neighbour
           * SRC(0,-1) and lower neighbour SRC(-1,0). Reads SRC(-1,-1) unconditionally. */
  66.538 +#define PREDICT_8x8_LOAD_TOPLEFT \
  66.539 +    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
  66.540 +
          /* Fill 8 rows of 8 bytes with the 32-bit pattern v (stored twice per row).
           * Declares a local `y` and advances `src` by 8*stride as a side effect. */
  66.541 +#define PREDICT_8x8_DC(v) \
  66.542 +    int y; \
  66.543 +    for( y = 0; y < 8; y++ ) { \
  66.544 +        ((uint32_t*)src)[0] = \
  66.545 +        ((uint32_t*)src)[1] = v; \
  66.546 +        src += stride; \
  66.547 +    }
  66.548 +
  /* 8x8 luma prediction without neighbours: sets every byte of the 8x8 block at
   * src (row pitch `stride`) to 0x80. Both availability flags are ignored. */
  66.549 +static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.550 +{
  66.551 +	(void) has_topright;
  66.552 +	(void) has_topleft;
  66.553 +    PREDICT_8x8_DC(0x80808080);
  66.554 +}
  /* 8x8 luma DC prediction from the left column only: fills the block with the
   * rounded mean of the filtered left samples l0..l7. has_topright is unused;
   * has_topleft only affects how l0 is filtered. */
  66.555 +static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.556 +{
  66.557 +	(void) has_topright;
  66.558 +    PREDICT_8x8_LOAD_LEFT;
          /* Multiplying by 0x01010101 replicates the 8-bit mean into all four bytes. */
  66.559 +    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
  66.560 +    PREDICT_8x8_DC(dc);
  66.561 +}
  /* 8x8 luma DC prediction from the top row only: fills the block with the
   * rounded mean of the filtered top samples t0..t7. Both flags influence the
   * edge filtering of t0 and t7. */
  66.562 +static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.563 +{
  66.564 +    PREDICT_8x8_LOAD_TOP;
          /* Multiplying by 0x01010101 replicates the 8-bit mean into all four bytes. */
  66.565 +    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
  66.566 +    PREDICT_8x8_DC(dc);
  66.567 +}
  /* 8x8 luma DC prediction: fills the block with the rounded mean of the 16
   * filtered neighbours (l0..l7 from the left column, t0..t7 from the top row). */
  66.568 +static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.569 +{
  66.570 +    PREDICT_8x8_LOAD_LEFT;
  66.571 +    PREDICT_8x8_LOAD_TOP;
          /* +8 rounds the 16-sample sum before the divide-by-16 shift. */
  66.572 +    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
  66.573 +                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
  66.574 +    PREDICT_8x8_DC(dc);
  66.575 +}
  /* 8x8 luma horizontal prediction: every row y of the block is filled with the
   * filtered left sample l<y>. has_topright is unused. */
  66.576 +static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.577 +{
  66.578 +	(void) has_topright;
  66.579 +    PREDICT_8x8_LOAD_LEFT;
          /* ROW(y): store l<y>, replicated into four bytes, to both halves of row y. */
  66.580 +#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
  66.581 +               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
  66.582 +    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
  66.583 +#undef ROW
  66.584 +}
  /* 8x8 luma vertical prediction: writes the filtered top samples t0..t7 into
   * row 0 and copies that row into rows 1..7. */
  66.585 +static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.586 +{
  66.587 +    int y;
  66.588 +    PREDICT_8x8_LOAD_TOP;
  66.589 +    src[0] = t0;
  66.590 +    src[1] = t1;
  66.591 +    src[2] = t2;
  66.592 +    src[3] = t3;
  66.593 +    src[4] = t4;
  66.594 +    src[5] = t5;
  66.595 +    src[6] = t6;
  66.596 +    src[7] = t7;
          /* Replicate row 0 with one 64-bit copy per row.
           * NOTE(review): this reads/writes uint8_t data through uint64_t pointers and
           * so presumably expects 8-byte-aligned rows - confirm for the target. */
  66.597 +    for( y = 1; y < 8; y++ )
  66.598 +        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
  66.599 +}
  /* 8x8 luma diagonal down-left prediction. All positions with the same x+y
   * share one value: a (1,2,1)/4 filter over three consecutive entries of the
   * filtered top / top-right row t0..t15. The last diagonal, SRC(7,7), weights
   * t15 by 3 because there is no t16. */
  66.600 +static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.601 +{
  66.602 +    PREDICT_8x8_LOAD_TOP;
  66.603 +    PREDICT_8x8_LOAD_TOPRIGHT;
  66.604 +    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
  66.605 +    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
  66.606 +    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
  66.607 +    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
  66.608 +    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
  66.609 +    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
  66.610 +    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
  66.611 +    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
  66.612 +    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
  66.613 +    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
  66.614 +    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
  66.615 +    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
  66.616 +    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
  66.617 +    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
  66.618 +    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
  66.619 +}
  /* 8x8 luma diagonal down-right prediction. All positions with the same x-y
   * share one value: a (1,2,1)/4 filter over three consecutive entries of the
   * sequence l7..l0, lt, t0..t7 (filtered left column, corner, top row). */
  66.620 +static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.621 +{
  66.622 +    PREDICT_8x8_LOAD_TOP;
  66.623 +    PREDICT_8x8_LOAD_LEFT;
  66.624 +    PREDICT_8x8_LOAD_TOPLEFT;
  66.625 +    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
  66.626 +    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
  66.627 +    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
  66.628 +    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
  66.629 +    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
  66.630 +    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
  66.631 +    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
  66.632 +    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
  66.633 +    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
  66.634 +    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
  66.635 +    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
  66.636 +    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
  66.637 +    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
  66.638 +    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
  66.639 +    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
  66.640 +
  66.641 +}
  /* 8x8 luma vertical-right prediction. A value is shared by positions that
   * differ by (+1 column, +2 rows). Values alternate between a 2-tap average
   * (a + b + 1) >> 1 and a 3-tap (1,2,1)/4 filter over the filtered neighbours
   * l6..l0, lt, t0..t7. l7 is loaded by the macro but not needed. */
  66.642 +static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.643 +{
  66.644 +    PREDICT_8x8_LOAD_TOP;
  66.645 +    PREDICT_8x8_LOAD_LEFT;
  66.646 +    PREDICT_8x8_LOAD_TOPLEFT;
  66.647 +	(void) l7;
  66.648 +    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
  66.649 +    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
  66.650 +    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
  66.651 +    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
  66.652 +    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
  66.653 +    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
  66.654 +    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
  66.655 +    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
  66.656 +    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
  66.657 +    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
  66.658 +    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
  66.659 +    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
  66.660 +    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
  66.661 +    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
  66.662 +    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
  66.663 +    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
  66.664 +    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
  66.665 +    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
  66.666 +    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
  66.667 +    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
  66.668 +    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
  66.669 +    SRC(7,0)= (t6 + t7 + 1) >> 1;
  66.670 +}
  /* 8x8 luma horizontal-down prediction. A value is shared by positions that
   * differ by (+2 columns, +1 row). Values alternate between a 2-tap average
   * (a + b + 1) >> 1 and a 3-tap (1,2,1)/4 filter over the filtered neighbours
   * l7..l0, lt, t0..t6. t7 is loaded by the macro but not needed. */
  66.671 +static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.672 +{
  66.673 +    PREDICT_8x8_LOAD_TOP;
  66.674 +    PREDICT_8x8_LOAD_LEFT;
  66.675 +    PREDICT_8x8_LOAD_TOPLEFT;
  66.676 +	(void) t7;
  66.677 +    SRC(0,7)= (l6 + l7 + 1) >> 1;
  66.678 +    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
  66.679 +    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
  66.680 +    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
  66.681 +    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
  66.682 +    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
  66.683 +    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
  66.684 +    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
  66.685 +    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
  66.686 +    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
  66.687 +    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
  66.688 +    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
  66.689 +    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
  66.690 +    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
  66.691 +    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
  66.692 +    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
  66.693 +    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
  66.694 +    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
  66.695 +    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
  66.696 +    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
  66.697 +    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
  66.698 +    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
  66.699 +}
  /* 8x8 luma vertical-left prediction. A value is shared by positions that
   * differ by (-1 column, +2 rows). Even rows use the 2-tap average
   * (a + b + 1) >> 1, odd rows the 3-tap (1,2,1)/4 filter, both over the
   * filtered top / top-right samples. Only t0..t12 are consumed; t13..t15 are
   * computed by the macro but not used here. */
  66.700 +static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.701 +{
  66.702 +    PREDICT_8x8_LOAD_TOP;
  66.703 +    PREDICT_8x8_LOAD_TOPRIGHT;
  66.704 +    SRC(0,0)= (t0 + t1 + 1) >> 1;
  66.705 +    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
  66.706 +    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
  66.707 +    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
  66.708 +    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
  66.709 +    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
  66.710 +    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
  66.711 +    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
  66.712 +    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
  66.713 +    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
  66.714 +    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
  66.715 +    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
  66.716 +    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
  66.717 +    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
  66.718 +    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
  66.719 +    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
  66.720 +    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
  66.721 +    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
  66.722 +    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
  66.723 +    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
  66.724 +    SRC(7,6)= (t10 + t11 + 1) >> 1;
  66.725 +    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
  66.726 +}
  /* 8x8 luma horizontal-up prediction, built from the filtered left column
   * only. A value is shared by positions that differ by (-2 columns, +1 row);
   * even columns use the 2-tap average, odd columns the 3-tap filter. Positions
   * that would need samples below l7 are set to l7 itself. has_topright is
   * unused. */
  66.727 +static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
  66.728 +{
  66.729 +	(void) has_topright;
  66.730 +    PREDICT_8x8_LOAD_LEFT;
  66.731 +    SRC(0,0)= (l0 + l1 + 1) >> 1;
  66.732 +    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
  66.733 +    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
  66.734 +    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
  66.735 +    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
  66.736 +    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
  66.737 +    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
  66.738 +    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
  66.739 +    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
  66.740 +    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
  66.741 +    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
  66.742 +    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
  66.743 +    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
  66.744 +    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
          /* Remaining lower-right positions: plain copy of l7. */
  66.745 +    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
  66.746 +    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
  66.747 +    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
  66.748 +    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
  66.749 +}
  /* The neighbour-loading helpers are only meant for the pred8x8l_* functions
   * above; remove them so the short names do not leak into later code. */
  66.750 +#undef PREDICT_8x8_LOAD_LEFT
  66.751 +#undef PREDICT_8x8_LOAD_TOP
  66.752 +#undef PREDICT_8x8_LOAD_TOPLEFT
  66.753 +#undef PREDICT_8x8_LOAD_TOPRIGHT
  66.754 +#undef PREDICT_8x8_DC
  66.755 +#undef PTR
  66.756 +#undef PT
  66.757 +#undef PL
  66.758 +#undef SRC
  66.759 +
  /* Populate every slot of the four intra-prediction dispatch tables in *i with
   * the C implementations: pred4x4[] and pred8x8l[] are indexed by the *_PRED
   * constants, pred8x8[] and pred16x16[] by the *_PRED8x8 constants.
   * i must point to a writable H264PredContext_spu; it is not checked for NULL. */
  66.760 +void init_pred_ptrs(H264PredContext_spu *i){
  66.761 +
          /* 4x4 luma predictors. */
  66.762 +    i->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
  66.763 +    i->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
  66.764 +    i->pred4x4[DC_PRED             ]= pred4x4_dc_c;
  66.765 +    i->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
  66.766 +    i->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
  66.767 +    i->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
  66.768 +    i->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
  66.769 +    i->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
  66.770 +    i->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
  66.771 +    i->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
  66.772 +    i->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
  66.773 +    i->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
  66.774 +
          /* 8x8 luma predictors (same mode indices as the 4x4 table). */
  66.775 +    i->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
  66.776 +    i->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
  66.777 +    i->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
  66.778 +    i->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
  66.779 +    i->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
  66.780 +    i->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
  66.781 +    i->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
  66.782 +    i->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
  66.783 +    i->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
  66.784 +    i->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
  66.785 +    i->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
  66.786 +    i->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
  66.787 +
  66.788 +  
          /* pred8x8[] table, indexed by the *_PRED8x8 constants. */
  66.789 +    i->pred8x8[VERT_PRED8x8   ]= ff_pred8x8_vertical_c;
  66.790 +    i->pred8x8[HOR_PRED8x8    ]= ff_pred8x8_horizontal_c;
  66.791 +    i->pred8x8[PLANE_PRED8x8  ]= ff_pred8x8_plane_c;
  66.792 +	i->pred8x8[DC_PRED8x8     ]= ff_pred8x8_dc_c;
  66.793 +    i->pred8x8[LEFT_DC_PRED8x8]= ff_pred8x8_left_dc_c;
  66.794 +    i->pred8x8[TOP_DC_PRED8x8 ]= ff_pred8x8_top_dc_c;
  66.795 +    i->pred8x8[DC_128_PRED8x8 ]= ff_pred8x8_128_dc_c;
  66.796 +
          /* pred16x16[] table; it reuses the *_PRED8x8 index constants. */
  66.797 +    i->pred16x16[DC_PRED8x8     ]= ff_pred16x16_dc_c;
  66.798 +    i->pred16x16[VERT_PRED8x8   ]= ff_pred16x16_vertical_c;
  66.799 +    i->pred16x16[HOR_PRED8x8    ]= ff_pred16x16_horizontal_c;
  66.800 +    i->pred16x16[PLANE_PRED8x8  ]= ff_pred16x16_plane_c;
  66.801 +    i->pred16x16[LEFT_DC_PRED8x8]= ff_pred16x16_left_dc_c;
  66.802 +    i->pred16x16[TOP_DC_PRED8x8 ]= ff_pred16x16_top_dc_c;
  66.803 +    i->pred16x16[DC_128_PRED8x8 ]= ff_pred16x16_128_dc_c;
  66.804 +
  66.805 +}

    67.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    67.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_intra_spu.h	Mon Aug 27 12:09:56 2012 +0200
    67.3 @@ -0,0 +1,48 @@
    /* Intra-prediction mode indices and the function-pointer tables filled in by
     * init_pred_ptrs(). */
    67.4 +#ifndef H264_INTRA_SPU_H
    67.5 +#define H264_INTRA_SPU_H
    67.6 +
    67.7 +#define MAX_NEG_CROP       1024
    67.8 +
    67.9 +// For Intra mode
          /* IS_INTRA(a) is non-zero when any of the three low bits of a is set;
           * IS_INTRA4x4(a) tests only the MB_TYPE_INTRA4x4 bit. */
   67.10 +#define MB_TYPE_INTRA4x4   0x0001
   67.11 +#define IS_INTRA(a)       ((a)&7)
   67.12 +#define IS_INTRA4x4(a)    ((a)&MB_TYPE_INTRA4x4)
   67.13 +
   67.14 +#define CODEC_FLAG_GRAY   0x2000
   67.15 +
          /* Mode indices for the pred4x4[] and pred8x8l[] tables (12 entries each:
           * nine directional/DC modes followed by three DC fallbacks). */
   67.16 +#define VERT_PRED             0
   67.17 +#define HOR_PRED              1
   67.18 +#define DC_PRED               2
   67.19 +#define DIAG_DOWN_LEFT_PRED   3
   67.20 +#define DIAG_DOWN_RIGHT_PRED  4
   67.21 +#define VERT_RIGHT_PRED       5
   67.22 +#define HOR_DOWN_PRED         6
   67.23 +#define VERT_LEFT_PRED        7
   67.24 +#define HOR_UP_PRED           8
   67.25 +
   67.26 +#define LEFT_DC_PRED          9
   67.27 +#define TOP_DC_PRED           10
   67.28 +#define DC_128_PRED           11
   67.29 +
   67.30 +
          /* Mode indices for the pred8x8[] and pred16x16[] tables (7 entries each).
           * Note the ordering differs from the list above: DC is 0 and VERT is 2. */
   67.31 +#define DC_PRED8x8            0
   67.32 +#define HOR_PRED8x8           1
   67.33 +#define VERT_PRED8x8          2
   67.34 +#define PLANE_PRED8x8         3
   67.35 +
   67.36 +#define LEFT_DC_PRED8x8       4
   67.37 +#define TOP_DC_PRED8x8        5
   67.38 +#define DC_128_PRED8x8        6
   67.39 +
          /* Dispatch tables of prediction functions, one array per block size.
           * NOTE(review): the intra_pred4x4 / intra_pred16x16 / intra_pred8x8 /
           * intra_pred8x8l types are not declared in this header, so they presumably
           * have to be visible before it is included - confirm against the includers. */
   67.40 +typedef struct H264PredContext_spu{
   67.41 +
   67.42 +  intra_pred4x4 pred4x4[9+3];
   67.43 +  intra_pred16x16 pred16x16[4+3];
   67.44 +  intra_pred8x8 pred8x8[4+3];
   67.45 +  intra_pred8x8l pred8x8l[9+3];
   67.46 +
   67.47 +}H264PredContext_spu;
   67.48 +
          /* Fills all four tables of *i with the C predictor implementations. */
   67.49 +void init_pred_ptrs(H264PredContext_spu *i);
   67.50 +
   67.51 +#endif

    68.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    68.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_luma_template_spu.c	Mon Aug 27 12:09:56 2012 +0200
    68.3 @@ -0,0 +1,1560 @@
    /* Vertical 6-tap low-pass filter over a 16-pixel-wide column strip, h rows
     * high, using SPU vector intrinsics. For every output pixel it evaluates
     *     (M2 + P3 - 5*(M1 + P2) + 20*(P0 + P1) + 16) >> 5
     * where M2..P3 are the source pixels two rows above to three rows below, and
     * limits the result to at most 255 before combining it with the existing
     * destination through OP_U8_SPU.
     *   dst       - destination, read and written one 16-byte vector per row
     *   src       - source pixels; rows are STRIDE_Y bytes apart and may be
     *               unaligned (the low four address bits are handled explicitly)
     *   dstStride - byte distance between destination rows
     *   h         - number of rows to produce
     * STRIDE_Y, OP_U8_SPU and the PREFIX_ name are supplied by the file that
     * includes this template. */
    68.4 +static void PREFIX_h264_qpel16_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
    68.5 +  
    68.6 +  register int i;
    68.7 +
    68.8 +  const int16_t i20ss= 20;
    68.9 +  const int16_t i5ss= 5;
   68.10 +  const int16_t i16ss= 16;
   68.11 +  const int16_t imax = 255;
   68.12 +
   68.13 +  const vsint32_t vzero = spu_splats(0);
   68.14 +  const vsint16_t v20ss = spu_splats(i20ss);
   68.15 +  const vsint16_t v5ss = spu_splats(i5ss);
   68.16 +  const vsint16_t v16ss = spu_splats(i16ss);
   68.17 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
   68.18 +  vuint16_t sat;
   68.19 +
            /* Byte offset of src inside its 16-byte quadword; the same offset is reused
             * for every row. Shuffle patterns: mergeh/mergel presumably widen bytes 0..7 /
             * 8..15 to 16-bit lanes (0x80 selector bytes), packsu picks the low byte of
             * every 16-bit lane of two vectors, mez picks the low halfword of every
             * 32-bit lane of two vectors, interleaved. */
   68.20 +  const int shift_src =(unsigned int) src & 15;
   68.21 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
   68.22 +  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
   68.23 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
   68.24 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
   68.25 +
            /* Prime the filter window with the five rows src-2*STRIDE_Y .. src+2*STRIDE_Y.
             * Each row is assembled from two adjacent quadwords shifted by shift_src. */
   68.26 +  uint8_t *srcbis = src - (STRIDE_Y * 2);
   68.27 +
   68.28 +  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
   68.29 +  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
   68.30 +  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
   68.31 +
   68.32 +  srcbis += STRIDE_Y;
   68.33 +  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
   68.34 +  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
   68.35 +  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
   68.36 +
   68.37 +  srcbis += STRIDE_Y;
   68.38 +  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
   68.39 +  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
   68.40 +  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
   68.41 +
   68.42 +  srcbis += STRIDE_Y;
   68.43 +  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
   68.44 +  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
   68.45 +  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
   68.46 +
   68.47 +  srcbis += STRIDE_Y;
   68.48 +  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
   68.49 +  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
   68.50 +  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
   68.51 +
   68.52 +  srcbis += STRIDE_Y;
   68.53 +
            /* Split every primed row into two vectors of eight 16-bit lanes
             * (A = pixels 0..7, B = pixels 8..15). */
   68.54 +  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
   68.55 +  vsint16_t srcM2ssB = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
   68.56 +  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
   68.57 +  vsint16_t srcM1ssB = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
   68.58 +  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
   68.59 +  vsint16_t srcP0ssB = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
   68.60 +  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
   68.61 +  vsint16_t srcP1ssB = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
   68.62 +  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
   68.63 +  vsint16_t srcP2ssB = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
   68.64 +
   68.65 +  for (i = 0 ; i < h ; i++) {
              /* Load the one new row (P3) needed for this output row. */
   68.66 +    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
   68.67 +    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
   68.68 +    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
   68.69 +
   68.70 +    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
   68.71 +    const vsint16_t srcP3ssB = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
   68.72 +    srcbis += STRIDE_Y;
   68.73 +
              /* Pair up the symmetric taps: sum1 gets weight 20, sum2 weight -5,
               * sum3 weight 1. */
   68.74 +    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
   68.75 +    const vsint16_t sum1B = spu_add(srcP0ssB, srcP1ssB);
   68.76 +    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
   68.77 +    const vsint16_t sum2B = spu_add(srcM1ssB, srcP2ssB);
   68.78 +    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
   68.79 +    const vsint16_t sum3B = spu_add(srcM2ssB, srcP3ssB);
   68.80 +
              /* Slide the six-row window down by one row for the next iteration. */
   68.81 +    srcM2ssA = srcM1ssA;
   68.82 +    srcM2ssB = srcM1ssB;
   68.83 +    srcM1ssA = srcP0ssA;
   68.84 +    srcM1ssB = srcP0ssB;
   68.85 +    srcP0ssA = srcP1ssA;
   68.86 +    srcP0ssB = srcP1ssB;
   68.87 +    srcP1ssA = srcP2ssA;
   68.88 +    srcP1ssB = srcP2ssB;
   68.89 +    srcP2ssA = srcP3ssA;
   68.90 +    srcP2ssB = srcP3ssB;
   68.91 +
              /* 16-bit multiply by 20 done as even/odd 32-bit products whose low
               * halfwords are re-interleaved with mez; then add the rounding term 16. */
   68.92 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
   68.93 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
   68.94 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
   68.95 +    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
   68.96 +
   68.97 +    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
   68.98 +    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
   68.99 +    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
  68.100 +    const vsint16_t pp1B = spu_add(pp1B3, v16ss);
  68.101 +
              /* Same scheme for the multiply by 5. */
  68.102 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
  68.103 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
  68.104 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
  68.105 +
  68.106 +    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
  68.107 +    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
  68.108 +    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
  68.109 +
  68.110 +    const vsint16_t pp3A = spu_add(sum3A, pp1A);
  68.111 +    const vsint16_t pp3B = spu_add(sum3B, pp1B);
  68.112 +
  68.113 +    const vsint16_t psumA = spu_sub(pp3A, pp2A);
  68.114 +    const vsint16_t psumB = spu_sub(pp3B, pp2B);
  68.115 +
              /* Divide by 32.
               * NOTE(review): psum can be negative (the -5 taps), and spu_rlmask is, as far
               * as the SPU intrinsics reference describes it, a zero-filling shift; a
               * negative psum would then arrive below as a large positive value and be
               * limited to 255 instead of 0. Confirm whether spu_rlmaska was intended. */
  68.116 +    vsint16_t sumA = spu_rlmask(psumA, -5);
  68.117 +    vsint16_t sumB = spu_rlmask(psumB, -5);
  68.118 +
  68.119 +    //Saturation to 0 and 255
              /* Lanes that are not > 0 are zeroed by the AND with the compare mask;
               * lanes > 255 are replaced by 255 via spu_sel. */
  68.120 +    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
  68.121 +    sumA = spu_and(sumA,(vsint16_t)sat);
  68.122 +    sat = spu_cmpgt(sumA,vmax);
  68.123 +    sumA = spu_sel(sumA,vmax,sat);
  68.124 +    sat = spu_cmpgt(sumB,(vsint16_t)vzero);
  68.125 +    sumB = spu_and(sumB,(vsint16_t)sat);
  68.126 +    sat = spu_cmpgt(sumB,vmax);
  68.127 +    sumB = spu_sel(sumB,vmax,sat);
  68.128 +
              /* Narrow the two 8-lane results back to 16 bytes. */
  68.129 +    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu);
  68.130 +
  68.131 +    /* 16x16 dest luma blocks are always aligned */
  68.132 +    const vuint8_t vdst = *(vuint8_t *)dst;
  68.133 +
              /* OP_U8_SPU decides how the filtered row is combined with the current
               * destination contents; its definition is outside this template. */
  68.134 +    vuint8_t fsum;
  68.135 +    OP_U8_SPU(fsum, sum, vdst);
  68.136 +
  68.137 +    *(vuint8_t *)dst=fsum;
  68.138 +    
  68.139 +    dst += dstStride; /* stride is  multiple of 16 ,so dstperm and dstmask can remain out of the loop */
  68.140 +  }
  68.141 +}
  68.142 +
  /* Horizontal 6-tap low-pass filter over a 16-pixel-wide block, h rows high,
   * using SPU vector intrinsics. For every output pixel it evaluates
   *     (M2 + P3 - 5*(M1 + P2) + 20*(P0 + P1) + 16) >> 5
   * where M2..P3 are the source pixels at horizontal offsets -2..+3, and limits
   * the result to at most 255 before combining it with the existing destination
   * through OP_U8_SPU.
   *   dst       - destination, read and written one 16-byte vector per row
   *   src       - source pixels; rows are STRIDE_Y bytes apart, any alignment
   *   dstStride - byte distance between destination rows
   *   h         - number of rows to produce
   * STRIDE_Y, OP_U8_SPU and the PREFIX_ name are supplied by the file that
   * includes this template. */
  68.143 +static void PREFIX_h264_qpel16_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
  68.144 +
  68.145 +  register int i;
  68.146 +  
  68.147 +  const int16_t i20ss = 20;
  68.148 +  const int16_t i5ss = 5;
  68.149 +  const int16_t i16ss = 16;
  68.150 +  const int16_t imax = 255;
  68.151 +
  68.152 +  const vsint32_t vzero = spu_splats(0);
  68.153 +  const vsint16_t v20ss = spu_splats(i20ss);
  68.154 +  const vsint16_t v5ss = spu_splats(i5ss);
  68.155 +  const vsint16_t v16ss = spu_splats(i16ss);
  68.156 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  68.157 +  vuint16_t sat;
  68.158 +
            /* Shuffle patterns: mergeh/mergel presumably widen bytes 0..7 / 8..15 to
             * 16-bit lanes (0x80 selector bytes), packsu picks the low byte of every
             * 16-bit lane of two vectors, mez picks the low halfword of every 32-bit
             * lane of two vectors, interleaved. */
  68.159 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
  68.160 +  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
  68.161 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  68.162 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  68.163 +
            /* In-quadword byte offsets of the six tap positions src-2 .. src+3. They are
             * computed once; src only ever advances by STRIDE_Y per row. */
  68.164 +  const int permM2 = (unsigned int) (src-2) & 15;
  68.165 +  const int permM1 = (unsigned int) (src-1) & 15;
  68.166 +  const int permP0 = (unsigned int) (src) & 15;
  68.167 +  const int permP1 = (unsigned int) (src+1) & 15;
  68.168 +  const int permP2 = (unsigned int) (src+2) & 15;
  68.169 +  const int permP3 = (unsigned int) (src+3) & 15;
  68.170 +
  68.171 +  register int align = ((((unsigned long)src) - 2) % 16);
  68.172 +
  68.173 +  for (i = 0 ; i < h ; i ++) {
  68.174 +    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  68.175 +    vuint8_t srcR1 = *(vuint8_t *)(src-2);
  68.176 +    vuint8_t srcR2 = *(vuint8_t *)(src+14);
  68.177 +
              /* Build the six shifted 16-byte views of the row. In the default case all
               * of them are cut from srcR1|srcR2. For align 11..15 one view is taken as
               * srcR2 directly and, from align 12 on, the later views are cut from
               * srcR2|srcR3 with srcR3 loaded at src+30.
               * NOTE(review): this presumably relies on quadword loads ignoring the low
               * four address bits - confirm against the SPU load semantics. */
  68.178 +    switch (align) {
  68.179 +    default: {
  68.180 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.181 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.182 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.183 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.184 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.185 +      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
  68.186 +    } break;
  68.187 +    case 11: {
  68.188 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.189 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.190 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.191 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.192 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.193 +      srcP3 = srcR2;
  68.194 +    } break;
  68.195 +    case 12: {
  68.196 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.197 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.198 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.199 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.200 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.201 +      srcP2 = srcR2;
  68.202 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.203 +    } break;
  68.204 +    case 13: {
  68.205 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.206 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.207 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.208 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.209 +      srcP1 = srcR2;
  68.210 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.211 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.212 +    } break;
  68.213 +    case 14: {
  68.214 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.215 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.216 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.217 +      srcP0 = srcR2;
  68.218 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.219 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.220 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.221 +    } break;
  68.222 +    case 15: {
  68.223 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.224 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.225 +      srcM1 = srcR2;
  68.226 +      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
  68.227 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.228 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.229 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.230 +    } break;
  68.231 +    }
  68.232 +
              /* Split each view into two vectors of eight 16-bit lanes
               * (A = pixels 0..7, B = pixels 8..15). */
  68.233 +    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
  68.234 +    const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
  68.235 +    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
  68.236 +    const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
  68.237 +
  68.238 +    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
  68.239 +    const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
  68.240 +    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
  68.241 +    const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
  68.242 +
  68.243 +    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
  68.244 +    const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
  68.245 +    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
  68.246 +    const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
  68.247 +
              /* Pair up the symmetric taps: sum1 gets weight 20, sum2 weight -5,
               * sum3 weight 1. */
  68.248 +    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
  68.249 +    const vsint16_t sum1B = spu_add(srcP0B, srcP1B);
  68.250 +    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
  68.251 +    const vsint16_t sum2B = spu_add(srcM1B, srcP2B);
  68.252 +    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
  68.253 +    const vsint16_t sum3B = spu_add(srcM2B, srcP3B);
  68.254 +
              /* 16-bit multiply by 20 done as even/odd 32-bit products whose low
               * halfwords are re-interleaved with mez; then add the rounding term 16. */
  68.255 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
  68.256 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
  68.257 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
  68.258 +    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
  68.259 +
  68.260 +    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
  68.261 +    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
  68.262 +    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
  68.263 +    const vsint16_t pp1B = spu_add(pp1B3, v16ss);
  68.264 +
              /* Same scheme for the multiply by 5. */
  68.265 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
  68.266 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
  68.267 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
  68.268 +
  68.269 +    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
  68.270 +    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
  68.271 +    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
  68.272 +
  68.273 +    const vsint16_t pp3A = spu_add(sum3A, pp1A);
  68.274 +    const vsint16_t pp3B = spu_add(sum3B, pp1B);
  68.275 +
  68.276 +    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
  68.277 +    const vsint16_t psumB = spu_sub(pp3B, (vsint16_t)pp2B);
  68.278 +
              /* Divide by 32.
               * NOTE(review): psum can be negative (the -5 taps), and spu_rlmask is, as far
               * as the SPU intrinsics reference describes it, a zero-filling shift; a
               * negative psum would then arrive below as a large positive value and be
               * limited to 255 instead of 0. Confirm whether spu_rlmaska was intended. */
  68.279 +    vsint16_t sumA = spu_rlmask(psumA, -5);
  68.280 +    vsint16_t sumB = spu_rlmask(psumB, -5);
  68.281 +
  68.282 +    //Saturation to 0 and 255
              /* Lanes that are not > 0 are zeroed by the AND with the compare mask;
               * lanes > 255 are replaced by 255 via spu_sel. */
  68.283 +    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
  68.284 +    sumA = spu_and(sumA,(vsint16_t)sat);
  68.285 +    sat = spu_cmpgt(sumA,vmax);
  68.286 +    sumA = spu_sel(sumA,vmax,sat);
  68.287 +    sat = spu_cmpgt(sumB,(vsint16_t)vzero);
  68.288 +    sumB = spu_and(sumB,(vsint16_t)sat);
  68.289 +    sat = spu_cmpgt(sumB,vmax);
  68.290 +    sumB = spu_sel(sumB,vmax,sat);
  68.291 +
              /* Narrow the two 8-lane results back to 16 bytes. */
  68.292 +    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, sumB, packsu);
  68.293 +
  68.294 +    /* 16x16 dest luma blocks are always aligned */
  68.295 +    const vuint8_t vdst = *(vuint8_t *)dst;
  68.296 +
              /* OP_U8_SPU decides how the filtered row is combined with the current
               * destination contents; its definition is outside this template. */
  68.297 +    vuint8_t fsum;
  68.298 +    OP_U8_SPU(fsum, sum, vdst);
  68.299 +
  68.300 +    *(vuint8_t *)dst=fsum;
  68.301 +    
  68.302 +    src += STRIDE_Y;
  68.303 +    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
  68.304 +   }
  68.305 +}
  68.306 +
  68.307 +/* this code assume stride % 16 == 0 *and* tmp is properly aligned */
  68.308 +static void PREFIX_h264_qpel16_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
  68.309 +  register int i;
  68.310 +
  68.311 +  const int16_t i20ss = 20;
  68.312 +  const int16_t i5ss = 5;
  68.313 +  const int16_t imax = 255;
  68.314 +
  68.315 +  const vsint32_t vzero = spu_splats(0);
  68.316 +  const vsint16_t v20ss = spu_splats(i20ss);
  68.317 +  const vsint16_t v5ss = spu_splats(i5ss);
  68.318 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  68.319 +  vuint16_t sat;
  68.320 +
  68.321 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
  68.322 +  const vuint8_t mergel = {0x80,0x08,0x80,0x09,0x80,0x0A,0x80,0x0B,0x80,0x0C,0x80,0x0D,0x80,0x0E,0x80,0x0F};
  68.323 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  68.324 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  68.325 +
  68.326 +  const int permM2 = (unsigned int) (src-2) & 15;
  68.327 +  const int permM1 = (unsigned int) (src-1) & 15;
  68.328 +  const int permP0 = (unsigned int) (src) & 15;
  68.329 +  const int permP1 = (unsigned int) (src+1) & 15;
  68.330 +  const int permP2 = (unsigned int) (src+2) & 15;
  68.331 +  const int permP3 = (unsigned int) (src+3) & 15;
  68.332 +
  68.333 +  register int align = ((((unsigned long)src) - 2) % 16);
  68.334 +
  68.335 +  src -= (2 * STRIDE_Y);
  68.336 +
  68.337 +  for (i = 0 ; i < (h+5) ; i ++) {
  68.338 +    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  68.339 +    vuint8_t srcR1 = *(vuint8_t *)(src-2);
  68.340 +    vuint8_t srcR2 = *(vuint8_t *)(src+14);
  68.341 +
  68.342 +    switch (align) {
  68.343 +    default: {
  68.344 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.345 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.346 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.347 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.348 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.349 +      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
  68.350 +    } break;
  68.351 +    case 11: {
  68.352 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.353 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.354 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.355 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.356 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.357 +      srcP3 = srcR2;
  68.358 +    } break;
  68.359 +    case 12: {
  68.360 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.361 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.362 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.363 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.364 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.365 +      srcP2 = srcR2;
  68.366 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.367 +    } break;
  68.368 +    case 13: {
  68.369 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.370 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.371 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.372 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.373 +      srcP1 = srcR2;
  68.374 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.375 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.376 +    } break;
  68.377 +    case 14: {
  68.378 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.379 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.380 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.381 +      srcP0 = srcR2;
  68.382 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.383 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.384 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.385 +    } break;
  68.386 +    case 15: {
  68.387 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.388 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.389 +      srcM1 = srcR2;
  68.390 +      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
  68.391 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.392 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.393 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.394 +    } break;
  68.395 +    }
  68.396 +
  68.397 +    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
  68.398 +    const vsint16_t srcP0B = (vsint16_t)spu_shuffle(srcP0, srcP0, mergel);
  68.399 +    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
  68.400 +    const vsint16_t srcP1B = (vsint16_t)spu_shuffle(srcP1, srcP1, mergel);
  68.401 +
  68.402 +    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
  68.403 +    const vsint16_t srcP2B = (vsint16_t)spu_shuffle(srcP2, srcP2, mergel);
  68.404 +    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
  68.405 +    const vsint16_t srcP3B = (vsint16_t)spu_shuffle(srcP3, srcP3, mergel);
  68.406 +
  68.407 +    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
  68.408 +    const vsint16_t srcM2B = (vsint16_t)spu_shuffle(srcM2, srcM2, mergel);
  68.409 +    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
  68.410 +    const vsint16_t srcM1B = (vsint16_t)spu_shuffle(srcM1, srcM1, mergel);
  68.411 +
  68.412 +    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
  68.413 +    const vsint16_t sum1B = spu_add(srcP0B, srcP1B);
  68.414 +    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
  68.415 +    const vsint16_t sum2B = spu_add(srcM1B, srcP2B);
  68.416 +    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
  68.417 +    const vsint16_t sum3B = spu_add(srcM2B, srcP3B);
  68.418 +
  68.419 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
  68.420 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
  68.421 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
  68.422 +    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
  68.423 +
  68.424 +    const vsint32_t pp1B1 = spu_mule(sum1B, v20ss);
  68.425 +    const vsint32_t pp1B2 = spu_mulo(sum1B, v20ss);
  68.426 +    const vsint16_t pp1B3 = (vsint16_t)spu_shuffle((vsint16_t)pp1B1, (vsint16_t)pp1B2, mez);
  68.427 +    const vsint16_t pp1B = spu_add(pp1B3, sum3B);
  68.428 +
  68.429 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
  68.430 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
  68.431 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
  68.432 +
  68.433 +    const vsint32_t pp2B1 = spu_mule(sum2B, v5ss);
  68.434 +    const vsint32_t pp2B2 = spu_mulo(sum2B, v5ss);
  68.435 +    const vsint16_t pp2B = (vsint16_t)spu_shuffle((vsint16_t)pp2B1, (vsint16_t)pp2B2, mez);
  68.436 +
  68.437 +    const vsint16_t psumA = spu_sub(pp1A, pp2A);
  68.438 +    const vsint16_t psumB = spu_sub(pp1B, pp2B);
  68.439 +
  68.440 +    *(vsint16_t *)tmp = psumA;
  68.441 +    *(vsint16_t *)(tmp+8) = psumB;
  68.442 +
  68.443 +    src += STRIDE_Y;
  68.444 +    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  68.445 +  }
  68.446 +
  68.447 +  const int32_t ni10si = -10;
  68.448 +  const int16_t i1ss = 1;
  68.449 +  const int32_t i512si = 512;
  68.450 +  const int32_t ni16si = -16;
  68.451 +
  68.452 +  const vsint32_t nv10si = spu_splats(ni10si);
  68.453 +  const vsint16_t v1ss = spu_splats(i1ss);
  68.454 +  const vsint32_t v512si = spu_splats(i512si);
  68.455 +  const vsint32_t nv16si = spu_splats(ni16si);
  68.456 +
  68.457 +  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
  68.458 +  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
  68.459 +
  68.460 +  int16_t *tmpbis = tmp - (tmpStride * (h+5));
  68.461 +
  68.462 +  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
  68.463 +  vsint16_t tmpM2ssB = *(vsint16_t *)(tmpbis+8);
  68.464 +  tmpbis += tmpStride;
  68.465 +  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
  68.466 +  vsint16_t tmpM1ssB = *(vsint16_t *)(tmpbis+8);
  68.467 +  tmpbis += tmpStride;
  68.468 +  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
  68.469 +  vsint16_t tmpP0ssB = *(vsint16_t *)(tmpbis+8);
  68.470 +  tmpbis += tmpStride;
  68.471 +  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
  68.472 +  vsint16_t tmpP1ssB = *(vsint16_t *)(tmpbis+8);
  68.473 +  tmpbis += tmpStride;
  68.474 +  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
  68.475 +  vsint16_t tmpP2ssB = *(vsint16_t *)(tmpbis+8);
  68.476 +  tmpbis += tmpStride;
  68.477 +
  68.478 +  for (i = 0 ; i < h ; i++) {
  68.479 +    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
  68.480 +    const vsint16_t tmpP3ssB = *(vsint16_t *)(tmpbis+8);
  68.481 +    tmpbis += tmpStride;
  68.482 +
  68.483 +    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
  68.484 +    const vsint16_t sum1B = spu_add(tmpP0ssB, tmpP1ssB);
  68.485 +    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
  68.486 +    const vsint16_t sum2B = spu_add(tmpM1ssB, tmpP2ssB);
  68.487 +    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
  68.488 +    const vsint16_t sum3B = spu_add(tmpM2ssB, tmpP3ssB);
  68.489 +
  68.490 +    tmpM2ssA = tmpM1ssA;
  68.491 +    tmpM2ssB = tmpM1ssB;
  68.492 +    tmpM1ssA = tmpP0ssA;
  68.493 +    tmpM1ssB = tmpP0ssB;
  68.494 +    tmpP0ssA = tmpP1ssA;
  68.495 +    tmpP0ssB = tmpP1ssB;
  68.496 +    tmpP1ssA = tmpP2ssA;
  68.497 +    tmpP1ssB = tmpP2ssB;
  68.498 +    tmpP2ssA = tmpP3ssA;
  68.499 +    tmpP2ssB = tmpP3ssB;
  68.500 +
  68.501 +    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
  68.502 +    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
  68.503 +    const vsint32_t pp1Be = spu_mule(sum1B, v20ss);
  68.504 +    const vsint32_t pp1Bo = spu_mulo(sum1B, v20ss);
  68.505 +
  68.506 +    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
  68.507 +    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
  68.508 +    const vsint32_t pp2Be = spu_mule(sum2B, v5ss);
  68.509 +    const vsint32_t pp2Bo = spu_mulo(sum2B, v5ss);
  68.510 +
  68.511 +    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
  68.512 +    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
  68.513 +    const vsint32_t pp3Be = spu_rlmask((vsint32_t)sum3B, nv16si);
  68.514 +    const vsint32_t pp3Bo = spu_mulo(sum3B, v1ss);
  68.515 +
  68.516 +    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
  68.517 +    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
  68.518 +    const vsint32_t pp1cBe = spu_add(pp1Be, v512si);
  68.519 +    const vsint32_t pp1cBo = spu_add(pp1Bo, v512si);
  68.520 +
  68.521 +    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
  68.522 +    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
  68.523 +    const vsint32_t pp32Be = spu_sub(pp3Be, pp2Be);
  68.524 +    const vsint32_t pp32Bo = spu_sub(pp3Bo, pp2Bo);
  68.525 +
  68.526 +    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
  68.527 +    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
  68.528 +    const vsint32_t sumBe = spu_add(pp1cBe, pp32Be);
  68.529 +    const vsint32_t sumBo = spu_add(pp1cBo, pp32Bo);
  68.530 +
  68.531 +    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
  68.532 +    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
  68.533 +    const vsint32_t ssumBe = spu_rlmask(sumBe, nv10si);
  68.534 +    const vsint32_t ssumBo = spu_rlmask(sumBo, nv10si);
  68.535 +
  68.536 +    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, ssumBe, packs);
  68.537 +    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, ssumBo, packs);
  68.538 +
  68.539 +    //Saturation to 0 and 255
  68.540 +    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
  68.541 +    ssume = spu_and(ssume,(vsint16_t)sat);
  68.542 +    sat = spu_cmpgt(ssume,vmax);
  68.543 +    ssume = spu_sel(ssume,vmax,sat);
  68.544 +    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
  68.545 +    ssumo = spu_and(ssumo,(vsint16_t)sat);
  68.546 +    sat = spu_cmpgt(ssumo,vmax);
  68.547 +    ssumo = spu_sel(ssumo,vmax,sat);
  68.548 +
  68.549 +    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
  68.550 +
  68.551 +    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
  68.552 +
  68.553 +    /* 16x16 dest luma blocks are alway aligned */
  68.554 +    const vuint8_t vdst = *(vuint8_t *)dst;
  68.555 +
  68.556 +    vuint8_t fsum;
  68.557 +    OP_U8_SPU(fsum, sum, vdst);
  68.558 +
  68.559 +    *(vuint8_t *)dst=fsum;
  68.560 +    
  68.561 +    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
  68.562 +
  68.563 +  }
  68.564 +}
  68.565 +
  68.566 +static void PREFIX_h264_qpel8_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
  68.567 +  
  68.568 +  register int i;
  68.569 +
  68.570 +  const int16_t i20ss= 20;
  68.571 +  const int16_t i5ss= 5;
  68.572 +  const int16_t i16ss= 16;
  68.573 +  const int16_t imax = 255;
  68.574 +
  68.575 +  const vsint32_t vzero = spu_splats(0);
  68.576 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  68.577 +  vuint16_t sat;
  68.578 +
  68.579 +  const vsint16_t v20ss = spu_splats(i20ss);
  68.580 +  const vsint16_t v5ss = spu_splats(i5ss);
  68.581 +  const vsint16_t v16ss = spu_splats(i16ss);
  68.582 +  const int shift_src = (unsigned int) src & 15;
  68.583 +
  68.584 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
  68.585 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  68.586 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  68.587 +
  68.588 +  /* 8x8 dest luma blocks are aligned or desaligned by 8*/
  68.589 +  const int shift_dst = (unsigned int) dst & 15;
  68.590 +  vuint8_t dstmask;
  68.591 +  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  68.592 +  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
  68.593 +
  68.594 +  if(shift_dst==0){
  68.595 +    dstmask = dst8mask1;
  68.596 +  }
  68.597 +  else{
  68.598 +    dstmask = dst8mask2;
  68.599 +  }
  68.600 +
  68.601 +  uint8_t *srcbis = src - (STRIDE_Y * 2);
  68.602 +
  68.603 +  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
  68.604 +  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
  68.605 +  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
  68.606 +
  68.607 +  srcbis += STRIDE_Y;
  68.608 +  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
  68.609 +  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
  68.610 +  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
  68.611 +
  68.612 +  srcbis += STRIDE_Y;
  68.613 +  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
  68.614 +  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
  68.615 +  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
  68.616 +
  68.617 +  srcbis += STRIDE_Y;
  68.618 +  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
  68.619 +  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
  68.620 +  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
  68.621 +
  68.622 +  srcbis += STRIDE_Y;
  68.623 +  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
  68.624 +  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
  68.625 +  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
  68.626 +
  68.627 +  srcbis += STRIDE_Y;
  68.628 +
  68.629 +  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
  68.630 +  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
  68.631 +  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
  68.632 +  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
  68.633 +  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
  68.634 +
  68.635 +  for (i = 0 ; i < h ; i++) {
  68.636 +    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
  68.637 +    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
  68.638 +    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
  68.639 +
  68.640 +    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
  68.641 +    srcbis += STRIDE_Y;
  68.642 +
  68.643 +    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
  68.644 +    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
  68.645 +    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
  68.646 +
  68.647 +    srcM2ssA = srcM1ssA;
  68.648 +    srcM1ssA = srcP0ssA;
  68.649 +    srcP0ssA = srcP1ssA;
  68.650 +    srcP1ssA = srcP2ssA;
  68.651 +    srcP2ssA = srcP3ssA;
  68.652 +
  68.653 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
  68.654 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
  68.655 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
  68.656 +    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
  68.657 +
  68.658 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
  68.659 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
  68.660 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
  68.661 +
  68.662 +    const vsint16_t pp3A = spu_add(sum3A, pp1A);
  68.663 +    const vsint16_t psumA = spu_sub(pp3A, pp2A);
  68.664 +    vsint16_t sumA = spu_rlmask(psumA, -5);
  68.665 +
  68.666 +    //Saturation to 0 and 255
  68.667 +    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
  68.668 +    sumA = spu_and(sumA,(vsint16_t)sat);
  68.669 +    sat = spu_cmpgt(sumA,vmax);
  68.670 +    sumA = spu_sel(sumA,vmax,sat);
  68.671 +
  68.672 +    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
  68.673 +
  68.674 +    const vuint8_t dst1 = *(vuint8_t *)dst;
  68.675 +
  68.676 +    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
  68.677 +    vuint8_t fsum;
  68.678 +    OP_U8_SPU(fsum, dsum, dst1);
  68.679 +
  68.680 +    *(vuint8_t *)dst=fsum;
  68.681 +    
  68.682 +    dst += dstStride; 
  68.683 +  }
  68.684 +}
  68.685 +
  68.686 +static void PREFIX_h264_qpel8_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
  68.687 +
  68.688 +  register int i;
  68.689 +  
  68.690 +  const int16_t i20ss = 20;
  68.691 +  const int16_t i5ss = 5;
  68.692 +  const int16_t i16ss = 16;
  68.693 +  const int16_t imax = 255;
  68.694 +
  68.695 +  const vsint32_t vzero = spu_splats(0);
  68.696 +  const vsint16_t v20ss = spu_splats(i20ss);
  68.697 +  const vsint16_t v5ss = spu_splats(i5ss);
  68.698 +  const vsint16_t v16ss = spu_splats(i16ss);
  68.699 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  68.700 +  vuint16_t sat;
  68.701 +
  68.702 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
  68.703 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  68.704 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  68.705 +
  68.706 +  /* 8x8 dest luma blocks are aligned or desaligned by 8*/
  68.707 +  const int shift_dst = (unsigned int) dst & 15;
  68.708 +  vuint8_t dstmask;
  68.709 +  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  68.710 +  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
  68.711 +
  68.712 +  if(shift_dst==0){
  68.713 +    dstmask = dst8mask1;
  68.714 +  }
  68.715 +  else{
  68.716 +    dstmask = dst8mask2;
  68.717 +  }
  68.718 +
  68.719 +  const int permM2 = (unsigned int) (src-2) & 15;
  68.720 +  const int permM1 = (unsigned int) (src-1) & 15;
  68.721 +  const int permP0 = (unsigned int) (src) & 15;
  68.722 +  const int permP1 = (unsigned int) (src+1) & 15;
  68.723 +  const int permP2 = (unsigned int) (src+2) & 15;
  68.724 +  const int permP3 = (unsigned int) (src+3) & 15;
  68.725 +
  68.726 +  register int align = ((((unsigned long)src) - 2) % 16);
  68.727 +
  68.728 +  for (i = 0 ; i < h ; i ++) {
  68.729 +    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  68.730 +    vuint8_t srcR1 = *(vuint8_t *)(src-2);
  68.731 +    vuint8_t srcR2 = *(vuint8_t *)(src+14);
  68.732 +
  68.733 +    switch (align) {
  68.734 +    default: {
  68.735 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.736 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.737 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.738 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.739 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.740 +      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
  68.741 +    } break;
  68.742 +    case 11: {
  68.743 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.744 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.745 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.746 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.747 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.748 +      srcP3 = srcR2;
  68.749 +    } break;
  68.750 +    case 12: {
  68.751 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.752 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.753 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.754 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.755 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.756 +      srcP2 = srcR2;
  68.757 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.758 +    } break;
  68.759 +    case 13: {
  68.760 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.761 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.762 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.763 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.764 +      srcP1 = srcR2;
  68.765 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.766 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.767 +    } break;
  68.768 +    case 14: {
  68.769 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.770 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.771 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.772 +      srcP0 = srcR2;
  68.773 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.774 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.775 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.776 +    } break;
  68.777 +    case 15: {
  68.778 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.779 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.780 +      srcM1 = srcR2;
  68.781 +      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
  68.782 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.783 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.784 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.785 +    } break;
  68.786 +    }
  68.787 +
  68.788 +    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
  68.789 +    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
  68.790 +
  68.791 +    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
  68.792 +    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
  68.793 +
  68.794 +    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
  68.795 +    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
  68.796 +
  68.797 +    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
  68.798 +    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
  68.799 +    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
  68.800 +
  68.801 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
  68.802 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
  68.803 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
  68.804 +    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
  68.805 +
  68.806 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
  68.807 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
  68.808 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
  68.809 +
  68.810 +    const vsint16_t pp3A = spu_add(sum3A, pp1A);
  68.811 +
  68.812 +    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
  68.813 +    
  68.814 +    vsint16_t sumA = spu_rlmask(psumA, -5);
  68.815 +
  68.816 +    //Saturation to 0 and 255
  68.817 +    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
  68.818 +    sumA = spu_and(sumA,(vsint16_t)sat);
  68.819 +    sat = spu_cmpgt(sumA,vmax);
  68.820 +    sumA = spu_sel(sumA,vmax,sat);
  68.821 +
  68.822 +    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
  68.823 +
  68.824 +    const vuint8_t dst1 = *(vuint8_t *)dst;
  68.825 +
  68.826 +    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
  68.827 +    vuint8_t fsum;
  68.828 +    OP_U8_SPU(fsum, dsum, dst1);
  68.829 +
  68.830 +    *(vuint8_t *)dst=fsum;
  68.831 +    
  68.832 +    src += STRIDE_Y;
  68.833 +    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
  68.834 +   }
  68.835 +}
  68.836 +
   68.837 +/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
  68.838 +static void PREFIX_h264_qpel8_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
  68.839 +  register int i;
  68.840 +
  68.841 +  const int16_t i20ss = 20;
  68.842 +  const int16_t i5ss = 5;
  68.843 +  const int16_t imax = 255;
  68.844 +
  68.845 +  const vsint32_t vzero = spu_splats(0);
  68.846 +  const vsint16_t v20ss = spu_splats(i20ss);
  68.847 +  const vsint16_t v5ss = spu_splats(i5ss);
  68.848 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
  68.849 +  vuint16_t sat;
  68.850 +
  68.851 +  const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07};
  68.852 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
  68.853 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
  68.854 +
  68.855 +  const int permM2 = (unsigned int) (src-2) & 15;
  68.856 +  const int permM1 = (unsigned int) (src-1) & 15;
  68.857 +  const int permP0 = (unsigned int) (src) & 15;
  68.858 +  const int permP1 = (unsigned int) (src+1) & 15;
  68.859 +  const int permP2 = (unsigned int) (src+2) & 15;
  68.860 +  const int permP3 = (unsigned int) (src+3) & 15;
  68.861 +
  68.862 +  register int align = ((((unsigned long)src) - 2) % 16);
  68.863 +
  68.864 +  src -= (2 * STRIDE_Y);
  68.865 +
  68.866 +  for (i = 0 ; i < (h+5) ; i ++) {
  68.867 +    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  68.868 +    vuint8_t srcR1 = *(vuint8_t *)(src-2);
  68.869 +    vuint8_t srcR2 = *(vuint8_t *)(src+14);
  68.870 +
  68.871 +    switch (align) {
  68.872 +    default: {
  68.873 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.874 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.875 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.876 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.877 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.878 +      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
  68.879 +    } break;
  68.880 +    case 11: {
  68.881 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.882 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.883 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.884 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.885 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
  68.886 +      srcP3 = srcR2;
  68.887 +    } break;
  68.888 +    case 12: {
  68.889 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.890 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.891 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.892 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.893 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
  68.894 +      srcP2 = srcR2;
  68.895 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.896 +    } break;
  68.897 +    case 13: {
  68.898 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.899 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.900 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.901 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
  68.902 +      srcP1 = srcR2;
  68.903 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.904 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.905 +    } break;
  68.906 +    case 14: {
  68.907 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.908 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.909 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
  68.910 +      srcP0 = srcR2;
  68.911 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.912 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.913 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.914 +    } break;
  68.915 +    case 15: {
  68.916 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
  68.917 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
  68.918 +      srcM1 = srcR2;
  68.919 +      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
  68.920 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
  68.921 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
  68.922 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
  68.923 +    } break;
  68.924 +    }
  68.925 +
  68.926 +    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh);
  68.927 +    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh);
  68.928 +    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh);
  68.929 +    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh);
  68.930 +    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh);
  68.931 +    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh);
  68.932 +
  68.933 +    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
  68.934 +    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
  68.935 +    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
  68.936 +
  68.937 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
  68.938 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
  68.939 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
  68.940 +    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
  68.941 +
  68.942 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
  68.943 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
  68.944 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
  68.945 +
  68.946 +    const vsint16_t psumA = spu_sub(pp1A, pp2A);
  68.947 +
  68.948 +    *(vsint16_t *)tmp = psumA;
  68.949 +
  68.950 +    src += STRIDE_Y;
  68.951 +    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  68.952 +  }
  68.953 +
  68.954 +  const int32_t ni10si = -10;
  68.955 +  const int16_t i1ss = 1;
  68.956 +  const int32_t i512si = 512;
  68.957 +  const int32_t ni16si = -16;
  68.958 +
  68.959 +  const vsint32_t nv10si = spu_splats(ni10si);
  68.960 +  const vsint16_t v1ss = spu_splats(i1ss);
  68.961 +  const vsint32_t v512si = spu_splats(i512si);
  68.962 +  const vsint32_t nv16si = spu_splats(ni16si);
  68.963 +
  68.964 +  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
  68.965 +  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
  68.966 +
  68.967 +  const int shift_dst = (unsigned int) (dst) & 15;
  68.968 +  /* 8x8 dest luma blocks are aligned or desaligned by 8*/
  68.969 +  vuint8_t dstmask;
  68.970 +  const vuint8_t dst8mask1= {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
  68.971 +  const vuint8_t dst8mask2= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17};
  68.972 +
  68.973 +  if(shift_dst==0){
  68.974 +    dstmask = dst8mask1;
  68.975 +  }
  68.976 +  else{
  68.977 +    dstmask = dst8mask2;
  68.978 +  }
  68.979 +
  68.980 +  int16_t *tmpbis = tmp - (tmpStride * (h+5));
  68.981 +
  68.982 +  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
  68.983 +  tmpbis += tmpStride;
  68.984 +  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
  68.985 +  tmpbis += tmpStride;
  68.986 +  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
  68.987 +  tmpbis += tmpStride;
  68.988 +  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
  68.989 +  tmpbis += tmpStride;
  68.990 +  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
  68.991 +  tmpbis += tmpStride;
  68.992 +
  68.993 +  for (i = 0 ; i < h ; i++) {
  68.994 +    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
  68.995 +    tmpbis += tmpStride;
  68.996 +
  68.997 +    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
  68.998 +    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
  68.999 +    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
 68.1000 +
 68.1001 +    tmpM2ssA = tmpM1ssA;
 68.1002 +    tmpM1ssA = tmpP0ssA;
 68.1003 +    tmpP0ssA = tmpP1ssA;
 68.1004 +    tmpP1ssA = tmpP2ssA;
 68.1005 +    tmpP2ssA = tmpP3ssA;
 68.1006 +
 68.1007 +    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
 68.1008 +    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
 68.1009 +    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
 68.1010 +    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
 68.1011 +
 68.1012 +    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
 68.1013 +    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
 68.1014 +
 68.1015 +    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
 68.1016 +    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
 68.1017 +
 68.1018 +    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
 68.1019 +    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
 68.1020 +
 68.1021 +    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
 68.1022 +    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
 68.1023 +
 68.1024 +    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
 68.1025 +    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
 68.1026 +
 68.1027 +    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs);
 68.1028 +    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs);
 68.1029 +
 68.1030 +    //Saturation to 0 and 255
 68.1031 +    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
 68.1032 +    ssume = spu_and(ssume,(vsint16_t)sat);
 68.1033 +    sat = spu_cmpgt(ssume,vmax);
 68.1034 +    ssume = spu_sel(ssume,vmax,sat);
 68.1035 +    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
 68.1036 +    ssumo = spu_and(ssumo,(vsint16_t)sat);
 68.1037 +    sat = spu_cmpgt(ssumo,vmax);
 68.1038 +    ssumo = spu_sel(ssumo,vmax,sat);
 68.1039 +
 68.1040 +    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
 68.1041 +
 68.1042 +    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
 68.1043 +
 68.1044 +    const vuint8_t dst1 = *(vuint8_t *)dst;
 68.1045 +
 68.1046 +    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
 68.1047 +    vuint8_t fsum;
 68.1048 +    OP_U8_SPU(fsum, dsum, dst1);
 68.1049 +
 68.1050 +    *(vuint8_t *)dst=fsum;
 68.1051 +    
 68.1052 +    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
 68.1053 +
 68.1054 +  }
 68.1055 +}
 68.1056 +
 68.1057 +static void PREFIX_h264_qpel4_v_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
 68.1058 +  /* Vertical 6-tap (1,-5,20,20,-5,1) luma lowpass for a 4-pixel-wide block: (sum + 16) >> 5, clamped to 0..255. */
 68.1059 +  register int i;
 68.1060 +  /* dst: output rows, dstStride bytes apart; src: first input pixel, rows STRIDE_Y bytes apart; h: number of rows produced. */
 68.1061 +  const int16_t i20ss= 20;
 68.1062 +  const int16_t i5ss= 5;
 68.1063 +  const int16_t i16ss= 16;
 68.1064 +  const int16_t imax = 255;
 68.1065 +
 68.1066 +  const vsint32_t vzero = spu_splats(0);
 68.1067 +  const vsint16_t v20ss = spu_splats(i20ss);
 68.1068 +  const vsint16_t v5ss = spu_splats(i5ss);
 68.1069 +  const vsint16_t v16ss = spu_splats(i16ss);
 68.1070 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
 68.1071 +  vuint16_t sat;
 68.1072 +  /* byte offset of src inside its 16-byte quadword; the same offset is reused for every row */
 68.1073 +  const int shift_src = (unsigned int) src & 15;
 68.1074 +  /* mergeh widens bytes 0..7 to halfwords (0x80 is presumably the shuffle code for a zero byte - confirm); packsu keeps the low byte of each halfword; mez re-interleaves the low halfwords of the even/odd 32-bit products */
 68.1075 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
 68.1076 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
 68.1077 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
 68.1078 +
 68.1079 +  /* 4x4 dest luma blocks are aligned, or misaligned by 4, 8 or 12 bytes */
 68.1080 +  const int shift_dst = (unsigned int) dst & 15;
 68.1081 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 68.1082 +  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
 68.1083 +  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
 68.1084 +  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
 68.1085 +  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
 68.1086 +  /* each mask drops bytes 0x10..0x13 (the four filtered pixels) into the dst column and keeps the other twelve dst bytes; any other alignment leaves the all-zero mask */
 68.1087 +  switch(shift_dst){
 68.1088 +    case 0:  dstmask = dst4mask0;
 68.1089 +             break;
 68.1090 +    case 4:  dstmask = dst4mask4;
 68.1091 +             break;
 68.1092 +    case 8:  dstmask = dst4mask8;
 68.1093 +             break;
 68.1094 +    case 12: dstmask = dst4mask12;
 68.1095 +             break;
 68.1096 +  }
 68.1097 +  /* Prime the sliding window with rows -2..+2; each row is realigned from two quadword loads (assumes vector loads ignore the low 4 address bits - confirm for the target) */
 68.1098 +  uint8_t *srcbis = src - (STRIDE_Y * 2);
 68.1099 +
 68.1100 +  const vuint8_t srcM2a = *(vuint8_t *)(srcbis);
 68.1101 +  const vuint8_t srcM2b = *(vuint8_t *)(srcbis+16);
 68.1102 +  const vuint8_t srcM2= spu_or(spu_slqwbyte(srcM2a, shift_src), spu_rlmaskqwbyte(srcM2b, shift_src-16));
 68.1103 +
 68.1104 +  srcbis += STRIDE_Y;
 68.1105 +  const vuint8_t srcM1a = *(vuint8_t *)(srcbis);
 68.1106 +  const vuint8_t srcM1b = *(vuint8_t *)(srcbis+16);
 68.1107 +  const vuint8_t srcM1= spu_or(spu_slqwbyte(srcM1a, shift_src), spu_rlmaskqwbyte(srcM1b, shift_src-16));
 68.1108 +
 68.1109 +  srcbis += STRIDE_Y;
 68.1110 +  const vuint8_t srcP0a = *(vuint8_t *)(srcbis);
 68.1111 +  const vuint8_t srcP0b = *(vuint8_t *)(srcbis+16);
 68.1112 +  const vuint8_t srcP0= spu_or(spu_slqwbyte(srcP0a, shift_src), spu_rlmaskqwbyte(srcP0b, shift_src-16));
 68.1113 +
 68.1114 +  srcbis += STRIDE_Y;
 68.1115 +  const vuint8_t srcP1a = *(vuint8_t *)(srcbis);
 68.1116 +  const vuint8_t srcP1b = *(vuint8_t *)(srcbis+16);
 68.1117 +  const vuint8_t srcP1= spu_or(spu_slqwbyte(srcP1a, shift_src), spu_rlmaskqwbyte(srcP1b, shift_src-16));
 68.1118 +
 68.1119 +  srcbis += STRIDE_Y;
 68.1120 +  const vuint8_t srcP2a = *(vuint8_t *)(srcbis);
 68.1121 +  const vuint8_t srcP2b = *(vuint8_t *)(srcbis+16);
 68.1122 +  const vuint8_t srcP2= spu_or(spu_slqwbyte(srcP2a, shift_src), spu_rlmaskqwbyte(srcP2b, shift_src-16));
 68.1123 +
 68.1124 +  srcbis += STRIDE_Y;
 68.1125 +  /* widen the five primed rows to 16-bit lanes */
 68.1126 +  vsint16_t srcM2ssA = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
 68.1127 +  vsint16_t srcM1ssA = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
 68.1128 +  vsint16_t srcP0ssA = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
 68.1129 +  vsint16_t srcP1ssA = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
 68.1130 +  vsint16_t srcP2ssA = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
 68.1131 +
 68.1132 +  for (i = 0 ; i < h ; i++) {
 68.1133 +    const vuint8_t srcP3a = *(vuint8_t *)(srcbis);
 68.1134 +    const vuint8_t srcP3b = *(vuint8_t *)(srcbis+16);
 68.1135 +    const vuint8_t srcP3= spu_or(spu_slqwbyte(srcP3a, shift_src), spu_rlmaskqwbyte(srcP3b, shift_src-16));
 68.1136 +
 68.1137 +    const vsint16_t srcP3ssA = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
 68.1138 +    srcbis += STRIDE_Y;
 68.1139 +    /* the filter is symmetric, so add the mirrored tap pairs first */
 68.1140 +    const vsint16_t sum1A = spu_add(srcP0ssA, srcP1ssA);
 68.1141 +    const vsint16_t sum2A = spu_add(srcM1ssA, srcP2ssA);
 68.1142 +    const vsint16_t sum3A = spu_add(srcM2ssA, srcP3ssA);
 68.1143 +    /* slide the window down one row for the next iteration */
 68.1144 +    srcM2ssA = srcM1ssA;
 68.1145 +    srcM1ssA = srcP0ssA;
 68.1146 +    srcP0ssA = srcP1ssA;
 68.1147 +    srcP1ssA = srcP2ssA;
 68.1148 +    srcP2ssA = srcP3ssA;
 68.1149 +    /* 20*(P0+P1) + 16, low 16 bits of the even/odd products put back in lane order */
 68.1150 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
 68.1151 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
 68.1152 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
 68.1153 +    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
 68.1154 +    /* 5*(M1+P2) */
 68.1155 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
 68.1156 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
 68.1157 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
 68.1158 +
 68.1159 +    const vsint16_t pp3A = spu_add(sum3A, pp1A);
 68.1160 +    const vsint16_t psumA = spu_sub(pp3A, pp2A);
 68.1161 +    vsint16_t sumA = spu_rlmaska(psumA, -5); /* arithmetic >>5: psumA can be negative and must keep its sign so it clamps to 0, not 255 (spu_rlmask shifts in zeros) */
 68.1162 +
 68.1163 +    //Saturation to 0 and 255
 68.1164 +    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
 68.1165 +    sumA = spu_and(sumA,(vsint16_t)sat);
 68.1166 +    sat = spu_cmpgt(sumA,vmax);
 68.1167 +    sumA = spu_sel(sumA,vmax,sat);
 68.1168 +    /* narrow to bytes: results land in bytes 0..7, bytes 8..15 come from vzero */
 68.1169 +    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
 68.1170 +
 68.1171 +    const vuint8_t dst1 = *(vuint8_t *)dst;
 68.1172 +    /* splice the four new pixels into the existing quadword, then let OP_U8_SPU (defined elsewhere) combine it with the old contents */
 68.1173 +    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
 68.1174 +    vuint8_t fsum;
 68.1175 +    OP_U8_SPU(fsum, dsum, dst1);
 68.1176 +
 68.1177 +    *(vuint8_t *)dst=fsum;
 68.1178 +    
 68.1179 +    dst += dstStride; /* dstmask is computed once, so dstStride is presumably a multiple of 16 - confirm with callers */
 68.1180 +  }
 68.1181 +}
 68.1182 +
 68.1183 +static void PREFIX_h264_qpel4_h_lowpass_spu(uint8_t * dst, uint8_t * src, int dstStride, int h) {
 68.1184 +  /* Horizontal 6-tap (1,-5,20,20,-5,1) luma lowpass for a 4-pixel-wide block: (sum + 16) >> 5, clamped to 0..255. */
 68.1185 +  register int i;
 68.1186 +  /* dst: output rows, dstStride bytes apart; src: first input pixel, rows STRIDE_Y bytes apart; h: number of rows produced. */
 68.1187 +  const int16_t i20ss = 20;
 68.1188 +  const int16_t i5ss = 5;
 68.1189 +  const int16_t i16ss = 16;
 68.1190 +  const int16_t imax = 255;
 68.1191 +
 68.1192 +  const vsint32_t vzero = spu_splats(0);
 68.1193 +  const vsint16_t v20ss = spu_splats(i20ss);
 68.1194 +  const vsint16_t v5ss = spu_splats(i5ss);
 68.1195 +  const vsint16_t v16ss = spu_splats(i16ss);
 68.1196 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
 68.1197 +  vuint16_t sat;
 68.1198 +  /* mergeh widens bytes 0..7 to halfwords (0x80 is presumably the shuffle code for a zero byte - confirm); packsu keeps the low byte of each halfword; mez re-interleaves the low halfwords of the even/odd 32-bit products */
 68.1199 +  const vuint8_t mergeh = {0x80,0x00,0x80,0x01,0x80,0x02,0x80,0x03,0x80,0x04,0x80,0x05,0x80,0x06,0x80,0x07};
 68.1200 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
 68.1201 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
 68.1202 +
 68.1203 +  /* 4x4 dest luma blocks are aligned, or misaligned by 4, 8 or 12 bytes */
 68.1204 +  const int shift_dst = (unsigned int) dst & 15;
 68.1205 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 68.1206 +  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
 68.1207 +  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
 68.1208 +  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
 68.1209 +  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
 68.1210 +  /* each mask drops bytes 0x10..0x13 (the four filtered pixels) into the dst column and keeps the other twelve dst bytes; any other alignment leaves the all-zero mask */
 68.1211 +  switch(shift_dst){
 68.1212 +    case 0:  dstmask = dst4mask0;
 68.1213 +             break;
 68.1214 +    case 4:  dstmask = dst4mask4;
 68.1215 +             break;
 68.1216 +    case 8:  dstmask = dst4mask8;
 68.1217 +             break;
 68.1218 +    case 12: dstmask = dst4mask12;
 68.1219 +             break;
 68.1220 +  }
 68.1221 +  /* byte offsets, within a quadword, of the six tap start addresses src-2 .. src+3 */
 68.1222 +  const int permM2 = (unsigned int) (src-2) & 15;
 68.1223 +  const int permM1 = (unsigned int) (src-1) & 15;
 68.1224 +  const int permP0 = (unsigned int) (src) & 15;
 68.1225 +  const int permP1 = (unsigned int) (src+1) & 15;
 68.1226 +  const int permP2 = (unsigned int) (src+2) & 15;
 68.1227 +  const int permP3 = (unsigned int) (src+3) & 15;
 68.1228 +  /* align and the perms are computed once for all rows, so STRIDE_Y is presumably a multiple of 16 - confirm */
 68.1229 +  register int align = ((((unsigned long)src) - 2) % 16);
 68.1230 +
 68.1231 +  for (i = 0 ; i < h ; i ++) {
 68.1232 +    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
 68.1233 +    vuint8_t srcR1 = *(vuint8_t *)(src-2);
 68.1234 +    vuint8_t srcR2 = *(vuint8_t *)(src+14);
 68.1235 +    /* Build the six shifted tap vectors. Up to align 10 every tap starts inside srcR1; from 11 on, the tap whose offset wraps to 0 is srcR2 itself and the taps after it straddle srcR2/srcR3 */
 68.1236 +    switch (align) {
 68.1237 +    default: {
 68.1238 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1239 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1240 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1241 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
 68.1242 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
 68.1243 +      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
 68.1244 +    } break;
 68.1245 +    case 11: {
 68.1246 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1247 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1248 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1249 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
 68.1250 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
 68.1251 +      srcP3 = srcR2;
 68.1252 +    } break;
 68.1253 +    case 12: {
 68.1254 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1255 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1256 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1257 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1258 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
 68.1259 +      srcP2 = srcR2;
 68.1260 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1261 +    } break;
 68.1262 +    case 13: {
 68.1263 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1264 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1265 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1266 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1267 +      srcP1 = srcR2;
 68.1268 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
 68.1269 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1270 +    } break;
 68.1271 +    case 14: {
 68.1272 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1273 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1274 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1275 +      srcP0 = srcR2;
 68.1276 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
 68.1277 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
 68.1278 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1279 +    } break;
 68.1280 +    case 15: {
 68.1281 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1282 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1283 +      srcM1 = srcR2;
 68.1284 +      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
 68.1285 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
 68.1286 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
 68.1287 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1288 +    } break;
 68.1289 +    }
 68.1290 +    /* widen the six tap vectors to 16-bit lanes */
 68.1291 +    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, srcP0, mergeh);
 68.1292 +    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, srcP1, mergeh);
 68.1293 +
 68.1294 +    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, srcP2, mergeh);
 68.1295 +    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, srcP3, mergeh);
 68.1296 +
 68.1297 +    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, srcM2, mergeh);
 68.1298 +    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, srcM1, mergeh);
 68.1299 +    /* the filter is symmetric, so add the mirrored tap pairs first */
 68.1300 +    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
 68.1301 +    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
 68.1302 +    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
 68.1303 +    /* 20*(P0+P1) + 16, low 16 bits of the even/odd products put back in lane order */
 68.1304 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
 68.1305 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
 68.1306 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
 68.1307 +    const vsint16_t pp1A = spu_add(pp1A3, v16ss);
 68.1308 +    /* 5*(M1+P2) */
 68.1309 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
 68.1310 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
 68.1311 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
 68.1312 +
 68.1313 +    const vsint16_t pp3A = spu_add(sum3A, pp1A);
 68.1314 +
 68.1315 +    const vsint16_t psumA = spu_sub(pp3A, (vsint16_t)pp2A);
 68.1316 +    /* arithmetic >>5: psumA can be negative and must keep its sign so it clamps to 0, not 255 (spu_rlmask shifts in zeros) */
 68.1317 +    vsint16_t sumA = spu_rlmaska(psumA, -5);
 68.1318 +
 68.1319 +    //Saturation to 0 and 255
 68.1320 +    sat = spu_cmpgt(sumA,(vsint16_t)vzero);
 68.1321 +    sumA = spu_and(sumA,(vsint16_t)sat);
 68.1322 +    sat = spu_cmpgt(sumA,vmax);
 68.1323 +    sumA = spu_sel(sumA,vmax,sat);
 68.1324 +    /* narrow to bytes: results land in bytes 0..7, bytes 8..15 come from vzero */
 68.1325 +    const vuint8_t sum = (vuint8_t)spu_shuffle(sumA, (vsint16_t)vzero, packsu);
 68.1326 +
 68.1327 +    const vuint8_t dst1 = *(vuint8_t *)dst;
 68.1328 +    /* splice the four new pixels into the existing quadword, then let OP_U8_SPU (defined elsewhere) combine it with the old contents */
 68.1329 +    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
 68.1330 +    vuint8_t fsum;
 68.1331 +    OP_U8_SPU(fsum, dsum, dst1);
 68.1332 +
 68.1333 +    *(vuint8_t *)dst=fsum;
 68.1334 +    
 68.1335 +    src += STRIDE_Y;
 68.1336 +    dst += dstStride; /* dstStride is assumed to be a multiple of 16, so dstmask can stay outside the loop */
 68.1337 +   }
 68.1338 +}
 68.1339 +
 68.1340 +static void PREFIX_h264_qpel4_hv_lowpass_spu(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int h) {
 68.1341 +  register int i;
 68.1342 +
 68.1343 +  const int16_t i20ss = 20;
 68.1344 +  const int16_t i5ss = 5;
 68.1345 +  const int16_t imax = 255;
 68.1346 +
 68.1347 +  const vsint32_t vzero = spu_splats(0);
 68.1348 +  const vsint16_t v20ss = spu_splats(i20ss);
 68.1349 +  const vsint16_t v5ss = spu_splats(i5ss);
 68.1350 +  const vsint16_t vmax = (vsint16_t)spu_splats(imax);
 68.1351 +  vuint16_t sat;
 68.1352 +
 68.1353 +  const vuint8_t mergeh = {0x10,0x00,0x11,0x01,0x12,0x02,0x13,0x03,0x14,0x04,0x15,0x05,0x16,0x06,0x17,0x07};
 68.1354 +  const vuint8_t packsu = {0x01,0x03,0x05,0x07,0x09,0x0B,0x0D,0x0F,0x11,0x13,0x15,0x17,0x19,0x1B,0x1D,0x1F};
 68.1355 +  const vuint8_t mez = {0x02,0x03,0x12,0x13,0x06,0x07,0x16,0x17,0x0A,0x0B,0x1A,0x1B,0x0E,0x0F,0x1E,0x1F};
 68.1356 +
 68.1357 +  const int permM2 = (unsigned int) (src-2) & 15;
 68.1358 +  const int permM1 = (unsigned int) (src-1) & 15;
 68.1359 +  const int permP0 = (unsigned int) (src) & 15;
 68.1360 +  const int permP1 = (unsigned int) (src+1) & 15;
 68.1361 +  const int permP2 = (unsigned int) (src+2) & 15;
 68.1362 +  const int permP3 = (unsigned int) (src+3) & 15;
 68.1363 +
 68.1364 +  register int align = ((((unsigned long)src) - 2) % 16);
 68.1365 +
 68.1366 +  src -= (2 * STRIDE_Y);
 68.1367 +
 68.1368 +  for (i = 0 ; i < (h+5) ; i ++) {
 68.1369 +    vuint8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
 68.1370 +    vuint8_t srcR1 = *(vuint8_t *)(src-2);
 68.1371 +    vuint8_t srcR2 = *(vuint8_t *)(src+14);
 68.1372 +
 68.1373 +    switch (align) {
 68.1374 +    default: {
 68.1375 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1376 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1377 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1378 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
 68.1379 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
 68.1380 +      srcP3 = spu_or(spu_slqwbyte(srcR1, permP3), spu_rlmaskqwbyte(srcR2, permP3-16));
 68.1381 +    } break;
 68.1382 +    case 11: {
 68.1383 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1384 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1385 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1386 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
 68.1387 +      srcP2 = spu_or(spu_slqwbyte(srcR1, permP2), spu_rlmaskqwbyte(srcR2, permP2-16));
 68.1388 +      srcP3 = srcR2;
 68.1389 +    } break;
 68.1390 +    case 12: {
 68.1391 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1392 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1393 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1394 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1395 +      srcP1 = spu_or(spu_slqwbyte(srcR1, permP1), spu_rlmaskqwbyte(srcR2, permP1-16));
 68.1396 +      srcP2 = srcR2;
 68.1397 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1398 +    } break;
 68.1399 +    case 13: {
 68.1400 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1401 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1402 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1403 +      srcP0 = spu_or(spu_slqwbyte(srcR1, permP0), spu_rlmaskqwbyte(srcR2, permP0-16));
 68.1404 +      srcP1 = srcR2;
 68.1405 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
 68.1406 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1407 +    } break;
 68.1408 +    case 14: {
 68.1409 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1410 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1411 +      srcM1 = spu_or(spu_slqwbyte(srcR1, permM1), spu_rlmaskqwbyte(srcR2, permM1-16));
 68.1412 +      srcP0 = srcR2;
 68.1413 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
 68.1414 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
 68.1415 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1416 +    } break;
 68.1417 +    case 15: {
 68.1418 +      vuint8_t srcR3 = *(vuint8_t *)(src+30);
 68.1419 +      srcM2 = spu_or(spu_slqwbyte(srcR1, permM2), spu_rlmaskqwbyte(srcR2, permM2-16));
 68.1420 +      srcM1 = srcR2;
 68.1421 +      srcP0 = spu_or(spu_slqwbyte(srcR2, permP0), spu_rlmaskqwbyte(srcR3, permP0-16));
 68.1422 +      srcP1 = spu_or(spu_slqwbyte(srcR2, permP1), spu_rlmaskqwbyte(srcR3, permP1-16));
 68.1423 +      srcP2 = spu_or(spu_slqwbyte(srcR2, permP2), spu_rlmaskqwbyte(srcR3, permP2-16));
 68.1424 +      srcP3 = spu_or(spu_slqwbyte(srcR2, permP3), spu_rlmaskqwbyte(srcR3, permP3-16));
 68.1425 +    } break;
 68.1426 +    }
 68.1427 +
 68.1428 +    const vsint16_t srcP0A = (vsint16_t)spu_shuffle(srcP0, (vuint8_t)vzero, mergeh);
 68.1429 +    const vsint16_t srcP1A = (vsint16_t)spu_shuffle(srcP1, (vuint8_t)vzero, mergeh);
 68.1430 +    const vsint16_t srcP2A = (vsint16_t)spu_shuffle(srcP2, (vuint8_t)vzero, mergeh);
 68.1431 +    const vsint16_t srcP3A = (vsint16_t)spu_shuffle(srcP3, (vuint8_t)vzero, mergeh);
 68.1432 +    const vsint16_t srcM2A = (vsint16_t)spu_shuffle(srcM2, (vuint8_t)vzero, mergeh);
 68.1433 +    const vsint16_t srcM1A = (vsint16_t)spu_shuffle(srcM1, (vuint8_t)vzero, mergeh);
 68.1434 +
 68.1435 +    const vsint16_t sum1A = spu_add(srcP0A, srcP1A);
 68.1436 +    const vsint16_t sum2A = spu_add(srcM1A, srcP2A);
 68.1437 +    const vsint16_t sum3A = spu_add(srcM2A, srcP3A);
 68.1438 +
 68.1439 +    const vsint32_t pp1A1 = spu_mule(sum1A, v20ss);
 68.1440 +    const vsint32_t pp1A2 = spu_mulo(sum1A, v20ss);
 68.1441 +    const vsint16_t pp1A3 = (vsint16_t)spu_shuffle((vsint16_t)pp1A1, (vsint16_t)pp1A2, mez);
 68.1442 +    const vsint16_t pp1A = spu_add(pp1A3, sum3A);
 68.1443 +
 68.1444 +    const vsint32_t pp2A1 = spu_mule(sum2A, v5ss);
 68.1445 +    const vsint32_t pp2A2 = spu_mulo(sum2A, v5ss);
 68.1446 +    const vsint16_t pp2A = (vsint16_t)spu_shuffle((vsint16_t)pp2A1, (vsint16_t)pp2A2, mez);
 68.1447 +
 68.1448 +    const vsint16_t psumA = spu_sub(pp1A, pp2A);
 68.1449 +
 68.1450 +    *(vsint16_t *)tmp = psumA;
 68.1451 +
 68.1452 +    src += STRIDE_Y;
 68.1453 +    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
 68.1454 +  }
 68.1455 +
 68.1456 +  const int32_t ni10si = -10;
 68.1457 +  const int16_t i1ss = 1;
 68.1458 +  const int32_t i512si = 512;
 68.1459 +  const int32_t ni16si = -16;
 68.1460 +
 68.1461 +  const vsint32_t nv10si = spu_splats(ni10si);
 68.1462 +  const vsint16_t v1ss = spu_splats(i1ss);
 68.1463 +  const vsint32_t v512si = spu_splats(i512si);
 68.1464 +  const vsint32_t nv16si = spu_splats(ni16si);
 68.1465 +
 68.1466 +  const vuint8_t mperm = {0x00,0x08,0x01,0x09,0x02,0x0A,0x03,0x0B,0x04,0x0C,0x05,0x0D,0x06,0x0E,0x07,0x0F};
 68.1467 +  const vuint8_t packs = {0x02,0x03,0x06,0x07,0x0A,0x0B,0x0E,0x0F,0x12,0x13,0x16,0x17,0x1A,0x1B,0x1E,0x1F};
 68.1468 +
 68.1469 +  const int shift_dst = (unsigned int) (dst) & 15;
 68.1470 +  /* 4x4 dest luma blocks are aligned or desaligned by 4,8 or 12*/
 68.1471 +  vuint8_t dstmask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 68.1472 +  const vuint8_t dst4mask0= {0x10,0x11,0x12,0x13,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
 68.1473 +  const vuint8_t dst4mask4= {0x00,0x01,0x02,0x03,0x10,0x11,0x12,0x13,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F};
 68.1474 +  const vuint8_t dst4mask8= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x10,0x11,0x12,0x13,0x0C,0x0D,0x0E,0x0F};
 68.1475 +  const vuint8_t dst4mask12= {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x10,0x11,0x12,0x13};
 68.1476 +
 68.1477 +  switch(shift_dst){
 68.1478 +    case 0:  dstmask = dst4mask0;
 68.1479 +             break;
 68.1480 +    case 4:  dstmask = dst4mask4;
 68.1481 +             break;
 68.1482 +    case 8:  dstmask = dst4mask8;
 68.1483 +             break;
 68.1484 +    case 12: dstmask = dst4mask12;
 68.1485 +             break;
 68.1486 +  }
 68.1487 +
 68.1488 +  int16_t *tmpbis = tmp - (tmpStride * (h+5));
 68.1489 +
 68.1490 +  vsint16_t tmpM2ssA = *(vsint16_t *)(tmpbis);
 68.1491 +  tmpbis += tmpStride;
 68.1492 +  vsint16_t tmpM1ssA = *(vsint16_t *)(tmpbis);
 68.1493 +  tmpbis += tmpStride;
 68.1494 +  vsint16_t tmpP0ssA = *(vsint16_t *)(tmpbis);
 68.1495 +  tmpbis += tmpStride;
 68.1496 +  vsint16_t tmpP1ssA = *(vsint16_t *)(tmpbis);
 68.1497 +  tmpbis += tmpStride;
 68.1498 +  vsint16_t tmpP2ssA = *(vsint16_t *)(tmpbis);
 68.1499 +  tmpbis += tmpStride;
 68.1500 +
 68.1501 +  for (i = 0 ; i < h ; i++) {
 68.1502 +    const vsint16_t tmpP3ssA = *(vsint16_t *)(tmpbis);
 68.1503 +    tmpbis += tmpStride;
 68.1504 +
 68.1505 +    const vsint16_t sum1A = spu_add(tmpP0ssA, tmpP1ssA);
 68.1506 +    const vsint16_t sum2A = spu_add(tmpM1ssA, tmpP2ssA);
 68.1507 +    const vsint16_t sum3A = spu_add(tmpM2ssA, tmpP3ssA);
 68.1508 +
 68.1509 +    tmpM2ssA = tmpM1ssA;
 68.1510 +    tmpM1ssA = tmpP0ssA;
 68.1511 +    tmpP0ssA = tmpP1ssA;
 68.1512 +    tmpP1ssA = tmpP2ssA;
 68.1513 +    tmpP2ssA = tmpP3ssA;
 68.1514 +
 68.1515 +    const vsint32_t pp1Ae = spu_mule(sum1A, v20ss);
 68.1516 +    const vsint32_t pp1Ao = spu_mulo(sum1A, v20ss);
 68.1517 +    const vsint32_t pp2Ae = spu_mule(sum2A, v5ss);
 68.1518 +    const vsint32_t pp2Ao = spu_mulo(sum2A, v5ss);
 68.1519 +
 68.1520 +    const vsint32_t pp3Ae = spu_rlmask((vsint32_t)sum3A, nv16si);
 68.1521 +    const vsint32_t pp3Ao = spu_mulo(sum3A, v1ss);
 68.1522 +
 68.1523 +    const vsint32_t pp1cAe = spu_add(pp1Ae, v512si);
 68.1524 +    const vsint32_t pp1cAo = spu_add(pp1Ao, v512si);
 68.1525 +
 68.1526 +    const vsint32_t pp32Ae = spu_sub(pp3Ae, pp2Ae);
 68.1527 +    const vsint32_t pp32Ao = spu_sub(pp3Ao, pp2Ao);
 68.1528 +
 68.1529 +    const vsint32_t sumAe = spu_add(pp1cAe, pp32Ae);
 68.1530 +    const vsint32_t sumAo = spu_add(pp1cAo, pp32Ao);
 68.1531 +
 68.1532 +    const vsint32_t ssumAe = spu_rlmask(sumAe, nv10si);
 68.1533 +    const vsint32_t ssumAo = spu_rlmask(sumAo, nv10si);
 68.1534 +
 68.1535 +    vsint16_t ssume = (vsint16_t)spu_shuffle(ssumAe, vzero, packs);
 68.1536 +    vsint16_t ssumo = (vsint16_t)spu_shuffle(ssumAo, vzero, packs);
 68.1537 +
 68.1538 +    //Saturation to 0 and 255
 68.1539 +    sat = spu_cmpgt(ssume,(vsint16_t)vzero);
 68.1540 +    ssume = spu_and(ssume,(vsint16_t)sat);
 68.1541 +    sat = spu_cmpgt(ssume,vmax);
 68.1542 +    ssume = spu_sel(ssume,vmax,sat);
 68.1543 +    sat = spu_cmpgt(ssumo,(vsint16_t)vzero);
 68.1544 +    ssumo = spu_and(ssumo,(vsint16_t)sat);
 68.1545 +    sat = spu_cmpgt(ssumo,vmax);
 68.1546 +    ssumo = spu_sel(ssumo,vmax,sat);
 68.1547 +
 68.1548 +    const vuint8_t sumv = (vuint8_t)spu_shuffle(ssume, ssumo, packsu);
 68.1549 +
 68.1550 +    const vuint8_t sum = spu_shuffle(sumv, sumv, mperm);
 68.1551 +
 68.1552 +    const vuint8_t dst1 = *(vuint8_t *)dst;
 68.1553 +
 68.1554 +    const vuint8_t dsum = spu_shuffle(dst1, sum, dstmask);
 68.1555 +    vuint8_t fsum;
 68.1556 +    OP_U8_SPU(fsum, dsum, dst1);
 68.1557 +
 68.1558 +    *(vuint8_t *)dst=fsum;
 68.1559 +    
 68.1560 +    dst += dstStride; /* stride is multiple of 16 so dstperm and dstmask can remain out of the loop */
 68.1561 +
 68.1562 +  }
 68.1563 +}

    69.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    69.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.c	Mon Aug 27 12:09:56 2012 +0200
    69.3 @@ -0,0 +1,362 @@
    69.4 +/*
    69.5 + * Copyright (c) 2009 TUDelft 
    69.6 + * 
    69.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
    69.8 + */
    69.9 +
   69.10 +/**
   69.11 + * @file libavcodec/cell/h264_mc_spu.c
   69.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   69.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   69.14 + * 
   69.15 + * SIMD kernels 
   69.16 + * H.264/AVC motion compensation
   69.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   69.18 + * @author Albert Paradis <apar7632@hotmail.com>
   69.19 + */ 
   69.20 +
   69.21 +
   69.22 +#include <stdio.h>
   69.23 +#include <spu_intrinsics.h>
   69.24 +#include <spu_mfcio.h>
   69.25 +#include <assert.h>
   69.26 +
   69.27 +#include "h264_mc_spu.h"
   69.28 +#include "h264_dma.h"
   69.29 +#include "h264_tables.h"
   69.30 +#include "h264_decode_mb_spu.h"
   69.31 +
   69.32 +
   69.33 +//biweight buffer 
   69.34 +DECLARE_ALIGNED_16(uint8_t, tmp_y_ls[48*16]);	      		
   69.35 +DECLARE_ALIGNED_16(uint8_t, tmp_cb_ls[32*8]);
   69.36 +DECLARE_ALIGNED_16(uint8_t, tmp_cr_ls[32*8]);
   69.37 +
   69.38 +//ref buffer (double buffered)
   69.39 +DECLARE_ALIGNED_16(uint8_t, mc_ref[2][16*(4+5)*48 + 2*16*(2+1)*32]);
   69.40 +uint8_t* ref_ptr;
   69.41 +
   69.42 +/** Motion Compensation functions*/
   69.43 +
   69.44 +static void fill_mc_part(H264mc *mc, int n, int chroma_height, int x_offset, int y_offset, int itp, int weight, int list0, int list1){
   69.45 +	H264mc_part *mc_part = mc->mc_part + mc->npart;
   69.46 +	mc_part->n =n;
   69.47 +	mc_part->chroma_height =chroma_height;
   69.48 +	mc_part->x_offset = x_offset;
   69.49 +	mc_part->y_offset = y_offset;
   69.50 +	mc_part->itp = itp;
   69.51 +	mc_part->weight = weight;
   69.52 +	mc_part->list0 = list0;
   69.53 +	mc_part->list1 = list1;
   69.54 +	
   69.55 +	mc->npart++;
   69.56 +}
   69.57 +
   69.58 +void calc_mc_params(H264Mb* mb, H264mc *mc){
   69.59 +	int mb_type = mb->mb_type;
   69.60 +	mc->npart=0;	
   69.61 +
   69.62 +	assert(!IS_INTRA(mb_type));
   69.63 +	if(IS_16X16(mb_type)){
   69.64 +		fill_mc_part(mc, 0, 8, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
   69.65 +    }else if(IS_16X8(mb_type)){
   69.66 +		fill_mc_part(mc, 0, 4, 0, 0, 0, 0, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
   69.67 +		fill_mc_part(mc, 8, 4, 0, 4, 0, 1, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
   69.68 +    }else if(IS_8X16(mb_type)){
   69.69 +		fill_mc_part(mc, 0, 8, 0, 0, 1, 2, IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
   69.70 +		fill_mc_part(mc, 4, 8, 4, 0, 1, 2, IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
   69.71 +    }else{
   69.72 +        int i;
   69.73 +        assert(IS_8X8(mb_type));
   69.74 +
   69.75 +        for(i=0; i<4; i++){
   69.76 +            const int sub_mb_type= mb->sub_mb_type[i];
   69.77 +            const int n= 4*i;
   69.78 +            int x_offset= (i&1)<<2;
   69.79 +            int y_offset= (i&2)<<1;
   69.80 +
   69.81 +			if(IS_SUB_8X8(sub_mb_type)){
   69.82 +				fill_mc_part(mc, n, 4, x_offset, y_offset, 1, 3, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
   69.83 +            }else if(IS_SUB_8X4(sub_mb_type)){
   69.84 +				fill_mc_part(mc, n, 2, x_offset, y_offset, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
   69.85 +				fill_mc_part(mc, n+2, 2, x_offset, y_offset+2, 1, 4, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
   69.86 +            }else if(IS_SUB_4X8(sub_mb_type)){
   69.87 +				fill_mc_part(mc, n, 4, x_offset, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
   69.88 +				fill_mc_part(mc, n+1, 4, x_offset+2, y_offset, 2, 5, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
   69.89 +            }else{
   69.90 +                int j;
   69.91 +                assert(IS_SUB_4X4(sub_mb_type));
   69.92 +                for(j=0; j<4; j++){
   69.93 +                    int sub_x_offset= x_offset + 2*(j&1);
   69.94 +                    int sub_y_offset= y_offset +   (j&2);
   69.95 +					fill_mc_part(mc, n+j, 2, sub_x_offset, sub_y_offset, 2, 6, IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
   69.96 +                }
   69.97 +            }
   69.98 +        }
   69.99 +    }
  69.100 +}
  69.101 +
  69.102 +/**
  69.103 +*	Returns a pointer to mc_buf 
  69.104 +*/
  69.105 +static void* alloc_mc_buf(int size){
  69.106 +	void* ptr = ref_ptr;
  69.107 +	ref_ptr += size;
  69.108 +	return ptr;
  69.109 +}
  69.110 +
  69.111 +#define TAG_OFFSET_MC MBD_mc_buf1
  69.112 +static uint8_t* get_mc_data(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){
  69.113 +	assert(src_ea);
  69.114 +	int unalign;
  69.115 +	unsigned address_align;
  69.116 +	
  69.117 +	uint8_t* ea;
  69.118 +	uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride);
  69.119 +
  69.120 +	ea = src_ea + pic_xoffset + pic_yoffset*linesize; 
  69.121 +	address_align = ((unsigned) ea) & 0xFFFFFFF0;
  69.122 +	unalign = ((unsigned) ea) & 0xF;
  69.123 +	get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, idx + TAG_OFFSET_MC, 0);
  69.124 +	return (ref_ptr + unalign);
  69.125 +}
  69.126 +
  69.127 +static uint8_t* get_mc_data_blocking(uint8_t* src_ea, int pic_xoffset, int pic_yoffset, int blk_h, int stride, int linesize, int idx){
  69.128 +	assert(src_ea);
  69.129 +	int unalign;
  69.130 +	unsigned address_align;
  69.131 +
  69.132 +	uint8_t* ea;
  69.133 +	uint8_t* ref_ptr = alloc_mc_buf(blk_h*stride);
  69.134 +
  69.135 +	ea = src_ea + pic_xoffset + pic_yoffset*linesize;
  69.136 +	address_align = ((unsigned) ea) & 0xFFFFFFF0;
  69.137 +	unalign = ((unsigned) ea) & 0xF;
  69.138 +	get_dma_list(ref_ptr, (void *)address_align, stride, blk_h, linesize, MBD_mc_buf1, 0);
  69.139 +	wait_dma_id(MBD_mc_buf1);
  69.140 +	return (ref_ptr + unalign);
  69.141 +}
  69.142 +
  69.143 +//#undef TAG_OFFSET_MC
  69.144 +
  69.145 +static void get_mc_components(H264Context_spu *h, H264Mb *mb, H264mc_part* mc_part, Picture_spu *pic, int n, int chroma_height, int list, int src_x_offset, int src_y_offset, int idx){
  69.146 +	assert(pic);
  69.147 +	H264slice *s = h->s;
  69.148 +	ref_data *ref = &mc_part->ref[list];
  69.149 +    const int mx= mb->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
  69.150 +    const int my= mb->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
  69.151 +
  69.152 +    const int pic_width  = 16*s->mb_width;
  69.153 +    const int pic_height = 16*s->mb_height;
  69.154 +	
  69.155 +	int blk_h= chroma_height*2+5;
  69.156 +	//int blk_w= 8*2+5;
  69.157 +	
  69.158 +	int blk_h_c= chroma_height+1;
  69.159 +	//int blk_w_c= 9;
  69.160 +
  69.161 +	int ymx= mx>>2;
  69.162 +    int ymy= my>>2;
  69.163 +    int cmy= my>>3;
  69.164 +    int cmx= mx>>3;
  69.165 +
  69.166 +    //truncate the motion vectors references
  69.167 +    if(ymy>= pic_height+2){
  69.168 +        ymy=pic_height+1;
  69.169 +    }else if(ymy <=-19){
  69.170 +        ymy=-18;
  69.171 +    }
  69.172 +    if(ymx>= pic_width+2){
  69.173 +        ymx= pic_width+1;
  69.174 +    }else if(ymx<=-19){
  69.175 +        ymx=-19;
  69.176 +    }
  69.177 +
  69.178 +	if(cmy >= pic_height>>1){
  69.179 +        cmy = (pic_height>>1) -1;
  69.180 +    }else if(cmy<=-9){
  69.181 +        cmy=-8;
  69.182 +    }
  69.183 +    if(cmx >= pic_width>>1){
  69.184 +        cmx = (pic_width>>1) -1;
  69.185 +    }else if(cmx<=-9){
  69.186 +        cmx=-8;
  69.187 +    }
  69.188 +	if (!h->blocking){
  69.189 +		ref->data[0]=get_mc_data(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx);
  69.190 +		ref->data[1]=get_mc_data(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
  69.191 +		ref->data[2]=get_mc_data(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
  69.192 +	} else {
  69.193 +		ref->data[0]=get_mc_data_blocking(pic->data[0], ymx-2, ymy-2, blk_h, STRIDE_Y, s->linesize, idx);
  69.194 +		ref->data[1]=get_mc_data_blocking(pic->data[1], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
  69.195 +		ref->data[2]=get_mc_data_blocking(pic->data[2], cmx, cmy, blk_h_c, STRIDE_C, s->uvlinesize, idx);
  69.196 +
  69.197 +	}
  69.198 +	
  69.199 +}
  69.200 +
  69.201 +static void get_ref_data(H264Context_spu *h, H264Mb *mb, H264mc_part *mc_part, int idx){
  69.202 +	H264slice *s = h->s;
  69.203 +	int x_offset = mc_part->x_offset;
  69.204 +	int y_offset = mc_part->y_offset;
  69.205 +	int list0 = mc_part->list0;
  69.206 +	int list1 = mc_part->list1;
  69.207 +	int n = mc_part->n;
  69.208 +	int chroma_height = mc_part->chroma_height;
  69.209 +	Picture_spu *refpic;
  69.210 +	
  69.211 +	x_offset += 8*mb->mb_x;
  69.212 +    y_offset += 8*mb->mb_y;
  69.213 +	
  69.214 +	if(list0){
  69.215 +		refpic= &s->ref_list[0][ mb->ref_cache[0][ scan8[n] ] ];
  69.216 +		get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 0, x_offset, y_offset, idx);
  69.217 +	}
  69.218 +	if(list1){
  69.219 +		refpic= &s->ref_list[1][ mb->ref_cache[1][ scan8[n] ] ];
  69.220 +		get_mc_components(h, mb, mc_part, refpic, n, chroma_height, 1, x_offset, y_offset, idx);
  69.221 +	}
  69.222 +}
  69.223 +
  69.224 +void fill_ref_buf(H264Context_spu *h, H264Mb *mb, H264mc *mc){
  69.225 +	int idx = h->mc_idx;
  69.226 +	int i;
  69.227 +
  69.228 +	get_list = get_list_buf;
  69.229 +	ref_ptr = mc_ref[idx];
  69.230 +	for(i=0; i<mc->npart; i++){
  69.231 +		get_ref_data(h, mb, &mc->mc_part[i], idx);
  69.232 +	}
  69.233 +}
  69.234 +
  69.235 +static void mc_dir_part(H264Context_spu *h, H264mc_part* mc_part, int n, int chroma_height, int list, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, int stride_y, int stride_c){
  69.236 +	
  69.237 +	H264Mb *mb = h->mb;
  69.238 +	ref_data* ref = &mc_part->ref[list];
  69.239 +    const int mx= mb->mv_cache[list][ scan8[n] ][0];	//to determine the interpolation mode
  69.240 +    const int my= mb->mv_cache[list][ scan8[n] ][1];
  69.241 +    const int luma_xy= (mx&3) + ((my&3)<<2);
  69.242 +	uint8_t *src_y, *src_cb, *src_cr;
  69.243 +    
  69.244 +	src_y = ref->data[0] +2+2*STRIDE_Y;
  69.245 +	src_cb = ref->data[1];
  69.246 +	src_cr = ref->data[2];
  69.247 +	
  69.248 +	qpix_op[luma_xy](dest_y, src_y, stride_y, chroma_height*2);
  69.249 +	chroma_op(dest_cb, src_cb, stride_c, chroma_height, mx&7, my&7);
  69.250 +	chroma_op(dest_cr, src_cr, stride_c, chroma_height, mx&7, my&7);
  69.251 +}
  69.252 +
  69.253 +
  69.254 +static void mc_part_biweighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg){
  69.255 +
  69.256 +	H264Mb *mb = h->mb;
  69.257 +	H264slice *s = h->s;
  69.258 +	int n = mc_part->n;
  69.259 +	int chroma_height = mc_part->chroma_height;
  69.260 +	int itp = mc_part->itp;
  69.261 +	int refn0 = mb->ref_cache[0][ scan8[n] ];
  69.262 +	int refn1 = mb->ref_cache[1][ scan8[n] ];        
  69.263 +	qpel_mc_func *qpix_put=  h->dsp.put_h264_qpel_pixels_tab[itp];
  69.264 +    h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp];
  69.265 +    
  69.266 +	// don't optimize for luma-only case, since B-frames usually
  69.267 +	// use implicit weights => chroma too. 
  69.268 +	mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c);
  69.269 +	
  69.270 +	mc_dir_part(h, mc_part, n, chroma_height, 1, tmp_y_ls, tmp_cb_ls, tmp_cr_ls, qpix_put, chroma_put, STRIDE_Y, STRIDE_C);
  69.271 +
  69.272 +	if(s->use_weight == 2){
  69.273 +		int weight0 = s->implicit_weight[refn0][refn1][mb->mb_y&1];
  69.274 +		int weight1 = 64 - weight0;
  69.275 +		luma_weight_avg(  dest_y,  tmp_y_ls, stride_y, STRIDE_Y, 5, weight0, weight1, 0);
  69.276 +		chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0);
  69.277 +		chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, 5, weight0, weight1, 0);
  69.278 +	}else{
  69.279 +		luma_weight_avg(dest_y, tmp_y_ls, stride_y, STRIDE_Y, s->luma_log2_weight_denom,  s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0], s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]);
  69.280 +		
  69.281 +		chroma_weight_avg(dest_cb, tmp_cb_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0], s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]);
  69.282 +		
  69.283 +		chroma_weight_avg(dest_cr, tmp_cr_ls, stride_c, STRIDE_C, s->chroma_log2_weight_denom, s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0], s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]);
  69.284 +	}
  69.285 +}
  69.286 +
  69.287 +static void mc_part_weighted(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op, int list1){
  69.288 +
  69.289 +	H264Mb *mb = h->mb;
  69.290 +	H264slice *s = h->s;
  69.291 +
  69.292 +	int n = mc_part->n;
  69.293 +	int chroma_height = mc_part->chroma_height;
  69.294 +	int itp = mc_part->itp;
  69.295 +	qpel_mc_func *qpix_put=  h->dsp.put_h264_qpel_pixels_tab[itp];
  69.296 +    h264_chroma_mc_func chroma_put= h->dsp.put_h264_chroma_pixels_tab[itp];
  69.297 +    
  69.298 +    int list = list1 ? 1 : 0;
  69.299 +	int refn = mb->ref_cache[list][ scan8[n] ];      
  69.300 +
  69.301 +	mc_dir_part(h, mc_part, n, chroma_height, list, dest_y, dest_cb, dest_cr, qpix_put, chroma_put, stride_y, stride_c);
  69.302 +
  69.303 +	luma_weight_op(dest_y, stride_y, s->luma_log2_weight_denom, s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]);
  69.304 +	if(s->use_weight_chroma){
  69.305 +		chroma_weight_op(dest_cb, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]);
  69.306 +		
  69.307 +		chroma_weight_op(dest_cr, stride_c, s->chroma_log2_weight_denom, s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]);
  69.308 +	}
  69.309 +}
  69.310 +
  69.311 +
  69.312 +static void mc_part_std(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c, int list0, int list1){
  69.313 +	int n = mc_part->n;
  69.314 +	int chroma_height = mc_part->chroma_height;
  69.315 +	int itp = mc_part->itp;
  69.316 +
  69.317 +    qpel_mc_func *qpix_op=  h->dsp.put_h264_qpel_pixels_tab[itp];
  69.318 +    h264_chroma_mc_func chroma_op= h->dsp.put_h264_chroma_pixels_tab[itp];
  69.319 +    
  69.320 +    if(list0){
  69.321 +        mc_dir_part(h, mc_part, n, chroma_height, 0, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c);
  69.322 +
  69.323 +        qpix_op=  h->dsp.avg_h264_qpel_pixels_tab[itp];
  69.324 +        chroma_op= h->dsp.avg_h264_chroma_pixels_tab[itp];
  69.325 +    }
  69.326 +
  69.327 +    if(list1){
  69.328 +        mc_dir_part(h, mc_part, n, chroma_height, 1, dest_y, dest_cb, dest_cr, qpix_op, chroma_op, stride_y, stride_c);
  69.329 +    }
  69.330 +}
  69.331 +
  69.332 +static void mc_part(H264Context_spu *h, H264mc_part *mc_part, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
  69.333 +	H264slice *s = h->s;
  69.334 +	
  69.335 +	int weight = mc_part->weight;
  69.336 +	
  69.337 +	int x_offset = mc_part->x_offset;
  69.338 +	int y_offset = mc_part->y_offset;
  69.339 +	int list0 = mc_part->list0;
  69.340 +	int list1 = mc_part->list1;
  69.341 +    
  69.342 +	dest_y  += 2*x_offset + 2*y_offset*stride_y;
  69.343 +    dest_cb +=   x_offset +   y_offset*stride_c;
  69.344 +    dest_cr +=   x_offset +   y_offset*stride_c;
  69.345 +    
  69.346 +	if(list0 && list1 && s->use_weight !=0){
  69.347 +		h264_biweight_func *weight_avg = &h->dsp.biweight_h264_pixels_tab[weight];
  69.348 +        mc_part_biweighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_avg[0], weight_avg[3]);
  69.349 +	}
  69.350 +	else if ((list0 || list1) && s->use_weight ==1){
  69.351 +		h264_weight_func *weight_op = &h->dsp.weight_h264_pixels_tab[weight];
  69.352 +		mc_part_weighted(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, weight_op[0], weight_op[3], list1);
  69.353 +	}
  69.354 +	else{
  69.355 +        mc_part_std(h, mc_part, dest_y, dest_cb, dest_cr, stride_y, stride_c, list0, list1);
  69.356 +	}
  69.357 +}
  69.358 +
  69.359 +void hl_motion(H264Context_spu *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, int stride_y, int stride_c){
  69.360 +	int i;
  69.361 +	H264mc *mc =h->mc; 
  69.362 +	for(i=0; i<mc->npart; i++){
  69.363 +		mc_part(h, &mc->mc_part[i], dest_y, dest_cb, dest_cr, stride_y, stride_c);
  69.364 +	}
  69.365 +}

    70.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    70.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_mc_spu.h	Mon Aug 27 12:09:56 2012 +0200
    70.3 @@ -0,0 +1,53 @@
    70.4 +#ifndef H264_MC_SPU_H
    70.5 +#define H264_MC_SPU_H
    70.6 +
    70.7 +//#include "types_spu.h"
    70.8 +
    70.9 +// motion compensation constants:
   70.10 +#define MB_TYPE_16x16      0x0008
   70.11 +#define MB_TYPE_16x8       0x0010
   70.12 +#define MB_TYPE_8x16       0x0020
   70.13 +#define MB_TYPE_8x8        0x0040
   70.14 +#define MB_TYPE_P0L0       0x1000
   70.15 +#define IS_16X16(a)        ((a)&MB_TYPE_16x16)
   70.16 +#define IS_16X8(a)         ((a)&MB_TYPE_16x8)
   70.17 +#define IS_8X16(a)         ((a)&MB_TYPE_8x16)
   70.18 +#define IS_8X8(a)          ((a)&MB_TYPE_8x8)
   70.19 +#define IS_SUB_8X8(a)      ((a)&MB_TYPE_16x16) //note reused
   70.20 +#define IS_SUB_8X4(a)      ((a)&MB_TYPE_16x8)  //note reused
   70.21 +#define IS_SUB_4X8(a)      ((a)&MB_TYPE_8x16)  //note reused
   70.22 +#define IS_SUB_4X4(a)      ((a)&MB_TYPE_8x8)   //note reused
   70.23 +#define IS_DIR(a, part, list) ((a) & (MB_TYPE_P0L0<<((part)+2*(list))))
   70.24 +
   70.25 +#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
   70.26 +#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
   70.27 +
   70.28 +//Motion compensation buffer strides
   70.29 +#define STRIDE_Y 48 
   70.30 +#define STRIDE_C 32
   70.31 +
   70.32 +typedef struct ref_data{
   70.33 +	uint8_t *data[3];
   70.34 +}ref_data;
   70.35 +
   70.36 +typedef struct H264mc_part{ /* one motion-compensation partition of a macroblock, filled by fill_mc_part() */
   70.37 +	int n; /* block index; used as scan8[n] to index mv_cache / ref_cache */
   70.38 +	int chroma_height; /* chroma rows of the partition; luma uses chroma_height*2 rows */
   70.39 +	int x_offset; /* horizontal offset inside the macroblock; doubled when applied to the luma destination */
   70.40 +	int y_offset; /* vertical offset inside the macroblock; doubled when applied to the luma destination */
   70.41 +	int itp; /* index into the put/avg qpel and chroma function tables */
   70.42 +	int weight; /* index into the weight / biweight function tables */
   70.43 +	int list0; /* non-zero when the partition predicts from reference list 0 */
   70.44 +	int list1; /* non-zero when the partition predicts from reference list 1 */
   70.45 +	int use_weight; /* NOTE(review): not written by fill_mc_part() - confirm whether it is still used */
   70.46 +	ref_data ref[2]; /* per-list pointers to the fetched reference blocks (Y, Cb, Cr) */
   70.47 +
   70.48 +}H264mc_part;
   70.49 +
   70.50 +typedef struct H264mc{
   70.51 +	H264mc_part mc_part[16];
   70.52 +	int npart;
   70.53 +}H264mc;
   70.54 +
   70.55 +
   70.56 +#endif

    71.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    71.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_pred_spu.h	Mon Aug 27 12:09:56 2012 +0200
    71.3 @@ -0,0 +1,90 @@
    71.4 +/*
    71.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    71.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    71.7 + *
    71.8 + * This file is part of FFmpeg.
    71.9 + *
   71.10 + * FFmpeg is free software; you can redistribute it and/or
   71.11 + * modify it under the terms of the GNU Lesser General Public
   71.12 + * License as published by the Free Software Foundation; either
   71.13 + * version 2.1 of the License, or (at your option) any later version.
   71.14 + *
   71.15 + * FFmpeg is distributed in the hope that it will be useful,
   71.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   71.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   71.18 + * Lesser General Public License for more details.
   71.19 + *
   71.20 + * You should have received a copy of the GNU Lesser General Public
   71.21 + * License along with FFmpeg; if not, write to the Free Software
   71.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   71.23 + */
   71.24 +
   71.25 +/**
   71.26 + * @file
   71.27 + * H.264 / AVC / MPEG4 prediction functions.
   71.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   71.29 + */
   71.30 +
   71.31 +#ifndef AVCODEC_H264PRED_H
   71.32 +#define AVCODEC_H264PRED_H
   71.33 +
   71.34 +//#include "libavutil/common.h"
   71.35 +//#include "dsputil.h"
   71.36 +
   71.37 +/**
   71.38 + * Prediction types
   71.39 + */
   71.40 +//@{
   71.41 +#define VERT_PRED             0
   71.42 +#define HOR_PRED              1
   71.43 +#define DC_PRED               2
   71.44 +#define DIAG_DOWN_LEFT_PRED   3
   71.45 +#define DIAG_DOWN_RIGHT_PRED  4
   71.46 +#define VERT_RIGHT_PRED       5
   71.47 +#define HOR_DOWN_PRED         6
   71.48 +#define VERT_LEFT_PRED        7
   71.49 +#define HOR_UP_PRED           8
   71.50 +
   71.51 +#define LEFT_DC_PRED          9
   71.52 +#define TOP_DC_PRED           10
   71.53 +#define DC_128_PRED           11
   71.54 +
   71.55 +#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12
   71.56 +#define HOR_UP_PRED_RV40_NODOWN           13
   71.57 +#define VERT_LEFT_PRED_RV40_NODOWN        14
   71.58 +
   71.59 +#define DC_PRED8x8            0
   71.60 +#define HOR_PRED8x8           1
   71.61 +#define VERT_PRED8x8          2
   71.62 +#define PLANE_PRED8x8         3
   71.63 +
   71.64 +#define LEFT_DC_PRED8x8       4
   71.65 +#define TOP_DC_PRED8x8        5
   71.66 +#define DC_128_PRED8x8        6
   71.67 +
   71.68 +#define ALZHEIMER_DC_L0T_PRED8x8 7
   71.69 +#define ALZHEIMER_DC_0LT_PRED8x8 8
   71.70 +#define ALZHEIMER_DC_L00_PRED8x8 9
   71.71 +#define ALZHEIMER_DC_0L0_PRED8x8 10
   71.72 +//@}
   71.73 +
   71.74 +/**
   71.75 + * Context for storing H.264 prediction functions
   71.76 + */
   71.77 +typedef struct H264PredContext{
   71.78 +    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
   71.79 +    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
   71.80 +    void (*pred8x8  [4+3+4])(uint8_t *src, int stride);
   71.81 +    void (*pred16x16[4+3])(uint8_t *src, int stride);
   71.82 +
   71.83 +    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride);
   71.84 +    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride);
   71.85 +    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
   71.86 +    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride);
   71.87 +}H264PredContext;
   71.88 +
   71.89 +void ff_h264_pred_init(H264PredContext *h);
   71.90 +void ff_h264_pred_init_arm(H264PredContext *h);
   71.91 +
   71.92 +
   71.93 +#endif /* AVCODEC_H264PRED_H */

    72.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    72.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.c	Mon Aug 27 12:09:56 2012 +0200
    72.3 @@ -0,0 +1,26 @@
    72.4 +#include <stdint.h>
    72.5 +#include "h264_tables.h"
    72.6 +
    72.7 +uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP] = {0, };
    72.8 +
    72.9 +int block_offset[16+4+4];
   72.10 +
   72.11 +void ff_cropTbl_init(){
   72.12 +    int i;
   72.13 +    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
   72.14 +    for(i=0;i<MAX_NEG_CROP;i++) {
   72.15 +        ff_cropTbl[i] = 0;
   72.16 +        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
   72.17 +    }
   72.18 +}
   72.19 +
   72.20 +void init_block_offset(int linesize, int uvlinesize){
   72.21 +	int i;
   72.22 +	for(i=0; i<16; i++){
   72.23 +        block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*linesize*((scan8[i] - scan8[0])>>3);
   72.24 +    }
   72.25 +    for(i=0; i<4; i++){
   72.26 +        block_offset[16+i]=
   72.27 +        block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*uvlinesize*((scan8[i] - scan8[0])>>3);
   72.28 +    }
   72.29 +}
   72.30 \ No newline at end of file

    73.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    73.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_tables.h	Mon Aug 27 12:09:56 2012 +0200
    73.3 @@ -0,0 +1,83 @@
    73.4 +#ifndef H264_TABLES_H
    73.5 +#define H264_TABLES_H
    73.6 +
    73.7 +#define MAX_NEG_CROP       1024
    73.8 +
    73.9 +extern uint8_t ff_cropTbl[256+2 *MAX_NEG_CROP];
   73.10 +extern int block_offset[16+4+4];
   73.11 +
   73.12 +static const uint8_t scan8[16 + 2*4]={
   73.13 +	4+1*8, 5+1*8, 4+2*8, 5+2*8,
   73.14 +	6+1*8, 7+1*8, 6+2*8, 7+2*8,
   73.15 +	4+3*8, 5+3*8, 4+4*8, 5+4*8,
   73.16 +	6+3*8, 7+3*8, 6+4*8, 7+4*8,
   73.17 +	1+1*8, 2+1*8,
   73.18 +	1+2*8, 2+2*8,
   73.19 +	1+4*8, 2+4*8,
   73.20 +	1+5*8, 2+5*8,
   73.21 +};
   73.22 +
   73.23 +static const uint8_t ff_zigzag_direct[64] = {
   73.24 +    0,   1,  8, 16,  9,  2,  3, 10,
   73.25 +    17, 24, 32, 25, 18, 11,  4,  5,
   73.26 +    12, 19, 26, 33, 40, 48, 41, 34,
   73.27 +    27, 20, 13,  6,  7, 14, 21, 28,
   73.28 +    35, 42, 49, 56, 57, 50, 43, 36,
   73.29 +    29, 22, 15, 23, 30, 37, 44, 51,
   73.30 +    58, 59, 52, 45, 38, 31, 39, 46,
   73.31 +    53, 60, 61, 54, 47, 55, 62, 63
   73.32 +};
   73.33 +
   73.34 +static const uint8_t zigzag_scan[16]={
   73.35 + 0+0*4, 1+0*4, 0+1*4, 0+2*4,
   73.36 + 1+1*4, 2+0*4, 3+0*4, 2+1*4,
   73.37 + 1+2*4, 0+3*4, 1+3*4, 2+2*4,
   73.38 + 3+1*4, 3+2*4, 2+3*4, 3+3*4,
   73.39 +};
   73.40 +
   73.41 +static const uint8_t luma_dc_zigzag_scan[16]={
   73.42 + 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
   73.43 + 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
   73.44 + 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
   73.45 + 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
   73.46 +};
   73.47 +
   73.48 +static const uint8_t chroma_dc_scan[4]={
   73.49 + (0+0*2)*16, (1+0*2)*16,
   73.50 + (0+1*2)*16, (1+1*2)*16,  //FIXME
   73.51 +};
   73.52 +
   73.53 +static const uint8_t rem6[52]={
   73.54 +0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   73.55 +};
   73.56 +
   73.57 +static const uint8_t div6[52]={
   73.58 +0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
   73.59 +};
   73.60 +
   73.61 +static const uint8_t dequant4_coeff_init[6][3]={
   73.62 +  {10,13,16},
   73.63 +  {11,14,18},
   73.64 +  {13,16,20},
   73.65 +  {14,18,23},
   73.66 +  {16,20,25},
   73.67 +  {18,23,29},
   73.68 +};
   73.69 +
   73.70 +static const uint8_t dequant8_coeff_init_scan[16] = {
   73.71 +  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
   73.72 +};
   73.73 +static const uint8_t dequant8_coeff_init[6][6]={
   73.74 +  {20,18,32,19,25,24},
   73.75 +  {22,19,35,21,28,26},
   73.76 +  {26,23,42,24,33,31},
   73.77 +  {28,25,45,26,35,33},
   73.78 +  {32,28,51,30,40,38},
   73.79 +  {36,32,58,34,46,43},
   73.80 +};
   73.81 +
   73.82 +
   73.83 +void init_block_offset(int linesize, int uvlinesize);
   73.84 +void ff_cropTbl_init();
   73.85 +
   73.86 +#endif

    74.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    74.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/h264_types_spu.h	Mon Aug 27 12:09:56 2012 +0200
    74.3 @@ -0,0 +1,203 @@
    74.4 +#ifndef H264_CELL_TYPES_H
    74.5 +#define H264_CELL_TYPES_H
    74.6 +
    74.7 +#include <libsync.h>
    74.8 +#include <libavcodec/avcodec.h>
    74.9 +
   74.10 +typedef struct spe_pos{
   74.11 +	volatile int count;		//number of mb processed
   74.12 +	uint32_t pad[3];
   74.13 +}spe_pos;
   74.14 +
   74.15 +//only the picture pointers are needed from the picture struct;
   74.16 +typedef struct Picture_spu {
   74.17 +	uint8_t* data[3];
   74.18 +} Picture_spu;
   74.19 +
   74.20 +///For Cell, it might be an idea to use this instead for everything
   74.21 +// struct that contains the params that change per slice
   74.22 +typedef struct H264slice{ // per-slice parameter bundle; field order is part of the layout, do not reorder
   74.23 +	int deblocking_filter;
   74.24 +    int linesize;
   74.25 +    int uvlinesize;
   74.26 +	int mb_width;
   74.27 +	int mb_height;
   74.28 +    // --- weighted-prediction fields ---
   74.29 +    int use_weight;
   74.30 +    int use_weight_chroma;
   74.31 +    int luma_log2_weight_denom;
   74.32 +    int chroma_log2_weight_denom;
   74.33 +    // leading dimension 16 matches the 16 entries per list in ref_list below
   74.34 +    int16_t luma_weight[16][2][2];
   74.35 +    int16_t chroma_weight[16][2][2][2];
   74.36 +    int16_t implicit_weight[16][16][2];
   74.37 +
   74.38 +	// ref picture ptr
   74.39 +    Picture_spu ref_list[2][16]; // two lists of up to 16 reference pictures (pointers only)
   74.40 +	int state;
   74.41 +	int emu_edge_width;
   74.42 +    int emu_edge_height;
   74.43 +
   74.44 +    int slice_type;
   74.45 +	int slice_type_nos;
   74.46 +	int slice_alpha_c0_offset;
   74.47 +    int slice_beta_offset;
   74.48 +
   74.49 +	uint8_t chroma_qp_table[2][64]; // two 64-entry tables, presumably one per chroma plane - confirm
   74.50 +
   74.51 +	H264Mb *blocks;
   74.52 +	uint8_t  *dst_y, *dst_cb, *dst_cr;
   74.53 +
   74.54 +    //uint32_t pad[2];		// padding the structure for multiple of 16 bytes
   74.55 +}H264slice;
   74.56 +
   74.57 +typedef struct 	H264spe{ // per-SPE configuration and synchronisation handles
   74.58 +#define EDIP 0
   74.59 +#define EDB  1
   74.60 +#define MBD  2
   74.61 +	int type; // presumably one of EDIP / EDB / MBD defined just above - confirm
   74.62 +	int idx;
   74.63 +	int spe_id; // spe_ed.c treats id 0 specially in dep_resolved()
   74.64 +	int spe_total;
   74.65 +	int mb_width;
   74.66 +	int mb_stride;
   74.67 +	int mb_height;
   74.68 +	int linesize;
   74.69 +	int uvlinesize;
   74.70 +	//H264slice* slice_params;
   74.71 +	void* src_spe;
   74.72 +	void* tgt_spe; // spe_ed.c adds the local address of its src_spe record to this to form a DMA target
   74.73 +
   74.74 +	mutex_ea_t lock; // libsync handles (see <libsync.h>)
   74.75 +	cond_ea_t cond;
   74.76 +	atomic_ea_t cnt;
   74.77 +
   74.78 +	mutex_ea_t rl_lock; // second lock/cond/counter triple with the rl_ prefix
   74.79 +	cond_ea_t rl_cond;
   74.80 +	atomic_ea_t rl_cnt;
   74.81 +}H264spe;
   74.82 +
   74.83 +typedef struct H264Cabac_spu{ // entropy-decoding context: dequant tables, row-context pointers and per-MB caches
   74.84 +	int blocking;
   74.85 +
   74.86 +    int top_cbp;
   74.87 +    int left_cbp;
   74.88 +    int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct
   74.89 +    // dequant storage ([table][q 0..51][coefficient]) and the pointers used to reach it
   74.90 +    uint32_t dequant4_buffer[6][52][16];
   74.91 +    uint32_t dequant8_buffer[2][52][64];
   74.92 +    uint32_t (*dequant4_coeff[6])[16]; // several entries may point at the same buffer (set up in spe_ed.c)
   74.93 +    uint32_t (*dequant8_coeff[2])[64];
   74.94 +    // row-context pointers come in pairs: X_top and X (assigned from two-entry row tables in spe_ed.c)
   74.95 +    uint8_t (*non_zero_count_top)[32];
   74.96 +	uint8_t (*non_zero_count)[32];
   74.97 +
   74.98 +	uint8_t (*mvd_top[2])[2];
   74.99 +	uint8_t (*mvd[2])[2];
  74.100 +
  74.101 +	uint8_t *direct_top;
  74.102 +	uint8_t *direct;    
  74.103 +
  74.104 +	uint8_t *chroma_pred_mode_top;
  74.105 +	uint8_t *chroma_pred_mode;    
  74.106 +
  74.107 +	int8_t  *intra4x4_pred_mode_top;
  74.108 +    int8_t  *intra4x4_pred_mode;	
  74.109 +
  74.110 +	uint16_t *cbp_top;
  74.111 +	uint16_t *cbp;    
  74.112 +
  74.113 +	int8_t *qscale_top;
  74.114 +	int8_t *qscale;	
  74.115 +
  74.116 +	int8_t *ref_index_top[2];
  74.117 +	int8_t *ref_index[2];
  74.118 +
  74.119 +	int16_t (*motion_val_top[2])[2];
  74.120 +	int16_t (*motion_val[2])[2];
  74.121 +	uint32_t *mb_type_top;
  74.122 +	uint32_t *mb_type;
  74.123 +	// list1_* fields: only assigned for B slices in spe_ed.c
  74.124 +	int8_t *list1_ref_index[2];		
  74.125 +	uint32_t *list1_mb_type;
  74.126 +	DECLARE_ALIGNED_16(int16_t, list1_motion_val[2][4*4][2]); // fill for a macroblock when required
  74.127 +
  74.128 +	int b_stride; // set to 4*mb_width by init_cabac() in spe_ed.c
  74.129 +	int mb_stride;
  74.130 +	int mb_width;
  74.131 +	int mb_height;
  74.132 +	// scan orders, filled in by init_cabac() in spe_ed.c
  74.133 +    uint8_t zigzag_scan[16];
  74.134 +    uint8_t zigzag_scan8x8[64];
  74.135 +
  74.136 +    uint8_t direct_cache[5*8];
  74.137 +    // Used to calculate loopfilter bS.
  74.138 +    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
  74.139 +    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
  74.140 +    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
  74.141 +    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
  74.142 +
  74.143 +} H264Cabac_spu;
  74.144 +
  74.145 +typedef struct EDSlice_spu{ // slice state used by the entropy-decoding SPE (commented-out fields kept for reference)
  74.146 +    PPS pps;                 ///< current pps
  74.147 +    
  74.148 +    H264Mb *mbs;
  74.149 +
  74.150 +    int state;
  74.151 +    int qp_thresh;      ///< QP threshold to skip loopfilter
  74.152 +
  74.153 +	PictureInfo pic;   // spe_ed.c writes mb_type / ref_index / motion_val rows back through this
  74.154 +	PictureInfo list1; // spe_ed.c reads mb_type / ref_index rows from this for B slices
  74.155 +//    Picture *ref_list[2][16];         ///Reordered version of default_ref_list according to picture reordering in slice header
  74.156 +    int ref_count[2];   ///< counts frames or fields, depending on current mb mode
  74.157 +	int slice_type;
  74.158 +    int slice_type_nos; // compared against FF_B_TYPE in spe_ed.c
  74.159 +	int direct_8x8_inference_flag;
  74.160 +
  74.161 +    uint8_t list_count;
  74.162 +    uint32_t coded_pic_num;
  74.163 +///stuff only needed for nal/entropy decoding
  74.164 +    H264Mb *m;
  74.165 +    //GetBitContext gb;
  74.166 +	const uint8_t *bytestream_start;
  74.167 +	int byte_bufsize;
  74.168 +    int transform_bypass; // when set, spe_ed.c overwrites the q==0 dequant rows with 1<<6
  74.169 +    int direct_spatial_mv_pred;
  74.170 +    int map_col_to_list0[2][16];
  74.171 +    int dist_scale_factor[16];
  74.172 +
  74.173 +    int cabac_init_idc;
  74.174 +    int ref2frm[2][64];  ///< reference to frame number lists, the first 2 are for -2,-1
  74.175 +    int qscale;
  74.176 +    int chroma_qp[2]; //QPc
  74.177 +    int last_qscale_diff;
  74.178 +
  74.179 +//  Picture* release_ref[MAX_MMCO_COUNT];
  74.180 +//   int release_cnt;
  74.181 +
  74.182 +
  74.183 +//     int use_weight;
  74.184 +//     int use_weight_chroma;
  74.185 +//    int luma_log2_weight_denom;
  74.186 +//    int chroma_log2_weight_denom;
  74.187 +
  74.188 +//     int8_t luma_weight[16][2][2];
  74.189 +//     int8_t chroma_weight[16][2][2][2];
  74.190 +//     int8_t implicit_weight[16][16][2];
  74.191 +
  74.192 +
  74.193 +
  74.194 +//  int slice_alpha_c0_offset;
  74.195 +//  int slice_beta_offset;
  74.196 +    
  74.197 +//    int nal_ref_idc;
  74.198 +//    int nal_unit_type;
  74.199 +//     uint8_t *rbsp_buffer;
  74.200 +//     unsigned int rbsp_buffer_size;
  74.201 +
  74.202 +
  74.203 +
  74.204 +} EDSlice_spu;
  74.205 +
  74.206 +#endif

    75.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    75.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/mathops_spu.h	Mon Aug 27 12:09:56 2012 +0200
    75.3 @@ -0,0 +1,137 @@
    75.4 +/*
    75.5 + * simple math operations
    75.6 + * Copyright (c) 2001, 2002 Fabrice Bellard
    75.7 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
    75.8 + *
    75.9 + * This file is part of FFmpeg.
   75.10 + *
   75.11 + * FFmpeg is free software; you can redistribute it and/or
   75.12 + * modify it under the terms of the GNU Lesser General Public
   75.13 + * License as published by the Free Software Foundation; either
   75.14 + * version 2.1 of the License, or (at your option) any later version.
   75.15 + *
   75.16 + * FFmpeg is distributed in the hope that it will be useful,
   75.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   75.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   75.19 + * Lesser General Public License for more details.
   75.20 + *
   75.21 + * You should have received a copy of the GNU Lesser General Public
   75.22 + * License along with FFmpeg; if not, write to the Free Software
   75.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   75.24 + */
   75.25 +#ifndef AVCODEC_MATHOPS_H
   75.26 +#define AVCODEC_MATHOPS_H
   75.27 +
   75.28 +// #include "libavutil/common.h"
   75.29 +// #include "libavutil/internal.h"
   75.30 +// 
   75.31 +// /* generic implementation */
   75.32 +// 
   75.33 +// #ifndef MULL
   75.34 +// #   define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s))
   75.35 +// #endif
   75.36 +// 
   75.37 +// #ifndef MULH
   75.38 +// //gcc 3.4 creates an incredibly bloated mess out of this
   75.39 +// //#    define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32)
   75.40 +// 
   75.41 +// static av_always_inline int MULH(int a, int b){
   75.42 +//     return ((int64_t)(a) * (int64_t)(b))>>32;
   75.43 +// }
   75.44 +// #endif
   75.45 +// 
   75.46 +// #ifndef UMULH
   75.47 +// static av_always_inline unsigned UMULH(unsigned a, unsigned b){
   75.48 +//     return ((uint64_t)(a) * (uint64_t)(b))>>32;
   75.49 +// }
   75.50 +// #endif
   75.51 +// 
   75.52 +// #ifndef MUL64
   75.53 +// #   define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
   75.54 +// #endif
   75.55 +// 
   75.56 +// #ifndef MAC64
   75.57 +// #   define MAC64(d, a, b) ((d) += MUL64(a, b))
   75.58 +// #endif
   75.59 +// 
   75.60 +// #ifndef MLS64
   75.61 +// #   define MLS64(d, a, b) ((d) -= MUL64(a, b))
   75.62 +// #endif
   75.63 +// 
   75.64 +// /* signed 16x16 -> 32 multiply add accumulate */
   75.65 +// #ifndef MAC16
   75.66 +// #   define MAC16(rt, ra, rb) rt += (ra) * (rb)
   75.67 +// #endif
   75.68 +// 
   75.69 +// /* signed 16x16 -> 32 multiply */
   75.70 +// #ifndef MUL16
   75.71 +// #   define MUL16(ra, rb) ((ra) * (rb))
   75.72 +// #endif
   75.73 +// 
   75.74 +// #ifndef MLS16
   75.75 +// #   define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb))
   75.76 +// #endif
   75.77 +
   75.78 +/* median of 3 */
   75.79 +#ifndef mid_pred
   75.80 +#define mid_pred mid_pred
   75.81 +static inline av_const int mid_pred(int a, int b, int c)
   75.82 +{
   75.83 +#if 0
   75.84 +    int t= (a-b)&((a-b)>>31);
   75.85 +    a-=t;
   75.86 +    b+=t;
   75.87 +    b-= (b-c)&((b-c)>>31);
   75.88 +    b+= (a-b)&((a-b)>>31);
   75.89 +
   75.90 +    return b;
   75.91 +#else
   75.92 +    /* Median of three == c clamped into the closed range [min(a,b), max(a,b)]. */
   75.93 +    int lo, hi;
   75.94 +
   75.95 +    if (a > b) {
   75.96 +        lo = b;
   75.97 +        hi = a;
   75.98 +    } else {
   75.99 +        lo = a;
  75.100 +        hi = b;
  75.101 +    }
  75.102 +    if (c < lo) return lo;
  75.103 +    return c > hi ? hi : c;
  75.104 +#endif
  75.105 +}
  75.106 +#endif
  75.107 +
  75.108 +// #ifndef sign_extend
  75.109 +// static inline av_const int sign_extend(int val, unsigned bits)
  75.110 +// {
  75.111 +//     return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
  75.112 +// }
  75.113 +// #endif
  75.114 +// 
  75.115 +// #ifndef zero_extend
  75.116 +// static inline av_const unsigned zero_extend(unsigned val, unsigned bits)
  75.117 +// {
  75.118 +//     return (val << (INT_BIT - bits)) >> (INT_BIT - bits);
  75.119 +// }
  75.120 +// #endif
  75.121 +// 
  75.122 +// #ifndef COPY3_IF_LT
  75.123 +// #define COPY3_IF_LT(x, y, a, b, c, d)\
  75.124 +// if ((y) < (x)) {\
  75.125 +//     (x) = (y);\
  75.126 +//     (a) = (b);\
  75.127 +//     (c) = (d);\
  75.128 +// }
  75.129 +// #endif
  75.130 +// 
  75.131 +// #ifndef NEG_SSR32
  75.132 +// #   define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
  75.133 +// #endif
  75.134 +// 
  75.135 +// #ifndef NEG_USR32
  75.136 +// #   define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
  75.137 +// #endif
  75.138 +
  75.139 +#endif /* AVCODEC_MATHOPS_H */
  75.140 +

    76.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    76.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/rectangle_spu.h	Mon Aug 27 12:09:56 2012 +0200
    76.3 @@ -0,0 +1,92 @@
    76.4 +/*
    76.5 + * rectangle filling function
    76.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    76.7 + *
    76.8 + * This file is part of FFmpeg.
    76.9 + *
   76.10 + * FFmpeg is free software; you can redistribute it and/or
   76.11 + * modify it under the terms of the GNU Lesser General Public
   76.12 + * License as published by the Free Software Foundation; either
   76.13 + * version 2.1 of the License, or (at your option) any later version.
   76.14 + *
   76.15 + * FFmpeg is distributed in the hope that it will be useful,
   76.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   76.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   76.18 + * Lesser General Public License for more details.
   76.19 + *
   76.20 + * You should have received a copy of the GNU Lesser General Public
   76.21 + * License along with FFmpeg; if not, write to the Free Software
   76.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   76.23 + */
   76.24 +
   76.25 +/**
   76.26 + * @file
   76.27 + * useful rectangle filling function
   76.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   76.29 + */
   76.30 +
   76.31 +#ifndef AVCODEC_RECTANGLE_H
   76.32 +#define AVCODEC_RECTANGLE_H
   76.33 +
   76.34 +#include <assert.h>
   76.35 +
   76.36 +#define STRIDE_ALIGN 16
   76.37 +
   76.38 +
   76.39 +/**
   76.40 + * fill a rectangle.
   76.41 + * @param h height of the rectangle, should be a constant
   76.42 + * @param w width of the rectangle, should be a constant
   76.43 + * @param size the size of val (1, 2 or 4), should be a constant
   76.44 + */
   76.45 +static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
   76.46 +    uint8_t *p= (uint8_t*)vp;
   76.47 +    assert(size==1 || size==2 || size==4);
   76.48 +    assert(w<=4); /* w is still in elements here */
   76.49 +    /* from here on w and stride are in bytes */
   76.50 +    w      *= size;
   76.51 +    stride *= size;
   76.52 +    /* destination must be aligned to the row width in bytes (capped at STRIDE_ALIGN) */
   76.53 +    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
   76.54 +    assert((stride&(w-1))==0);
   76.55 +    if(w==2){ /* 2-byte rows: one 16-bit store per row */
   76.56 +        const uint16_t v= size==4 ? val : val*0x0101;
   76.57 +        *(uint16_t*)(p + 0*stride)= v;
   76.58 +        if(h==1) return;
   76.59 +        *(uint16_t*)(p + 1*stride)= v;
   76.60 +        if(h==2) return;
   76.61 +        *(uint16_t*)(p + 2*stride)= v;
   76.62 +        *(uint16_t*)(p + 3*stride)= v;
   76.63 +    }else if(w==4){ /* 4-byte rows: val replicated to fill 32 bits */
   76.64 +        const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
   76.65 +        *(uint32_t*)(p + 0*stride)= v;
   76.66 +        if(h==1) return;
   76.67 +        *(uint32_t*)(p + 1*stride)= v;
   76.68 +        if(h==2) return;
   76.69 +        *(uint32_t*)(p + 2*stride)= v;
   76.70 +        *(uint32_t*)(p + 3*stride)= v;
   76.71 +    }else if(w==8){ /* 8-byte rows: four 16-bit copies (size 2) or two 32-bit copies of val */
   76.72 +        const uint64_t v=  size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL;
   76.73 +        *(uint64_t*)(p + 0*stride)= v;
   76.74 +        if(h==1) return;
   76.75 +        *(uint64_t*)(p + 1*stride)= v;
   76.76 +        if(h==2) return;
   76.77 +        *(uint64_t*)(p + 2*stride)= v;
   76.78 +        *(uint64_t*)(p + 3*stride)= v;
   76.79 +    }else if(w==16){ /* 16-byte rows (only reachable with w==4, size==4): two 64-bit stores per row */
   76.80 +        const uint64_t v= val*0x0100000001ULL;
   76.81 +        *(uint64_t*)(p + 0+0*stride)= v;
   76.82 +        *(uint64_t*)(p + 8+0*stride)= v;
   76.83 +        *(uint64_t*)(p + 0+1*stride)= v;
   76.84 +        *(uint64_t*)(p + 8+1*stride)= v;
   76.85 +        if(h==2) return; /* NOTE(review): no h==1 exit in this branch; at least two rows are always written */
   76.86 +        *(uint64_t*)(p + 0+2*stride)= v;
   76.87 +        *(uint64_t*)(p + 8+2*stride)= v;
   76.88 +        *(uint64_t*)(p + 0+3*stride)= v;
   76.89 +        *(uint64_t*)(p + 8+3*stride)= v;
   76.90 +    }else
   76.91 +        assert(0); /* any other byte width is unsupported */
   76.92 +    assert(h==4); /* only reached after four rows were written */
   76.93 +}
   76.94 +
   76.95 +#endif /* AVCODEC_RECTANGLE_H */

    77.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    77.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/spe_ed.c	Mon Aug 27 12:09:56 2012 +0200
    77.3 @@ -0,0 +1,508 @@
    77.4 +#define CELL_SPE
    77.5 +
    77.6 +#include <string.h>
    77.7 +#include <stdio.h>
    77.8 +#include <spu_intrinsics.h>
    77.9 +#include <spu_mfcio.h>
   77.10 +#include "libavcodec/avcodec.h"
   77.11 +#include "h264_cabac_spu.h"
   77.12 +#include "cabac_spu.h"
   77.13 +#include "h264_types_spu.h"
   77.14 +#include "h264_tables.h"
   77.15 +#include "h264_dma.h"
   77.16 +#include "h264_tables.h"
   77.17 +
   77.18 +#define MB_WIDTH 240 /* sizes the row tables below */
   77.19 +#define MB_STRIDE (MB_WIDTH+16)
   77.20 +/* File-scope state of this SPE program. */
   77.21 +H264Cabac_spu hcabac;
   77.22 +CABACContext cabac;
   77.23 +DECLARE_ALIGNED_16(EDSlice_spu, slice[2]);
   77.24 +DECLARE_ALIGNED_16(H264Mb, mb[2]);
   77.25 +DECLARE_ALIGNED_16(H264spe, spe);
   77.26 +/* Row-context tables: the dimension of size 2 indexed by top/cur in update_entropy_buf() selects the row buffer. */
   77.27 +DECLARE_ALIGNED_16(uint8_t, non_zero_count_table[2][MB_STRIDE][32]);
   77.28 +DECLARE_ALIGNED_16(uint8_t, mvd_table[2][2][8*MB_STRIDE][2]); /* [list][row buffer][...] */
   77.29 +DECLARE_ALIGNED_16(uint8_t, direct_table[2][4*MB_STRIDE]);
   77.30 +DECLARE_ALIGNED_16(uint8_t, chroma_pred_mode_table[2][MB_STRIDE]);
   77.31 +DECLARE_ALIGNED_16(uint8_t, intra4x4_pred_mode_table[2][8*MB_STRIDE]); /* NOTE(review): assigned to int8_t* fields below */
   77.32 +DECLARE_ALIGNED_16(uint16_t,cbp_table[2][MB_STRIDE]);
   77.33 +DECLARE_ALIGNED_16(uint8_t, qscale_table[2][MB_STRIDE]); /* NOTE(review): assigned to int8_t* fields below */
   77.34 +/* Tables whose rows are also DMA-written to the picture in update_entropy_buf(). */
   77.35 +DECLARE_ALIGNED_16(uint32_t, mb_type_table[2][MB_STRIDE]);
   77.36 +DECLARE_ALIGNED_16(int8_t, ref_index_table[2][2][4*MB_STRIDE]); /* [list][row buffer][...] */
   77.37 +DECLARE_ALIGNED_16(int16_t, motion_val_table[2][2][4*4*MB_WIDTH][2]); /* [list][row buffer][...]; sized with MB_WIDTH, not MB_STRIDE */
   77.38 +/* Local-store window onto the bitstream, plus row buffers for list1 data fetched for B slices. */
   77.39 +DECLARE_ALIGNED(128, uint8_t, bytestream_ls[4096]);
   77.40 +DECLARE_ALIGNED_16(uint32_t, list1_mb_type_table[2][MB_STRIDE]);
   77.41 +DECLARE_ALIGNED_16(int8_t, list1_ref_index_table[2][2][4*MB_STRIDE]); /* [row buffer][list][...] - index order differs from ref_index_table */
   77.42 +
   77.43 +DECLARE_ALIGNED_16(spe_pos, dma_temp); //dma temp for sending
   77.44 +//mb position of neighbouring spes
   77.45 +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); //written by SPE_ID -1
   77.46 +static int total_lines; /* incremented once per update_tgt_spe_dep() call */
   77.47 +
   77.48 +static inline int dep_resolved(H264spe *p){ /* returns 1 while total_lines is below the limit derived from src_spe.count, else 0 */
   77.49 +	volatile int lines_proc = src_spe.count; /* progress value written by another SPE */
   77.50 +	int limit = lines_proc - 1;
   77.51 +	/* SPE 0 gets an additional mb_height lines of head room */
   77.52 +	if (p->spe_id == 0)
   77.53 +		limit += p->mb_height;
   77.54 +	return total_lines < limit;
   77.55 +}
   77.56 +
   77.57 +static void update_tgt_spe_dep(H264spe *p, int end){ /* bump total_lines and DMA the new count to the target SPE */
   77.58 +	// 	if (end ){
   77.59 +   total_lines++;
   77.60 +   spe_pos* dma_spe = &dma_temp; /* staging record sent by DMA */
   77.61 +   spe_pos* tgt_spe = p->tgt_spe + (unsigned) &src_spe; //located in target spe local store
   77.62 +   dma_spe->count = end? total_lines+1: total_lines; /* a non-zero 'end' publishes one extra line */
   77.63 +   spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), ED_put);
   77.64 +   // 	}
   77.65 +   
   77.66 +}
   77.67 +
   77.68 +static int init_cabac(H264spe *p, H264Cabac_spu *hc){ /* copy frame geometry from p into hc and build the scan tables; returns 0 */
   77.69 +	hc->mb_height = p->mb_height;
   77.70 +	hc->mb_width = p->mb_width;
   77.71 +	hc->b_stride = 4*p->mb_width;
   77.72 +	hc->mb_stride = p->mb_stride;
   77.73 +	
   77.74 +	for(int i=0; i<16; i++){
   77.75 +		/* swap the two 2-bit fields of each 4-bit scan position (row/column swap of a 4x4 index) */
   77.76 +		hc->zigzag_scan[i] = (zigzag_scan[i]>>2) | ((zigzag_scan[i]<<2) & 0xF);
   77.77 +	}
   77.78 +	for(int i=0; i<64; i++){
   77.79 +		/* swap the two 3-bit fields of each 6-bit scan position (row/column swap of an 8x8 index) */
   77.80 +		hc->zigzag_scan8x8[i] = (ff_zigzag_direct[i]>>3) | ((ff_zigzag_direct[i]&7)<<3);
   77.81 +	}
   77.82 +
   77.83 +	return 0; /* the function is declared int; previously control fell off the end without a value */
   77.84 +}
   77.85 +
   77.86 +static void reset_cabac_buffers(){ /* zero both row buffers of the tables listed here; non_zero_count_table is not among them */
   77.87 +	memset(mb_type_table,            0, sizeof mb_type_table);
   77.88 +	memset(ref_index_table,          0, sizeof ref_index_table);
   77.89 +	memset(motion_val_table,         0, sizeof motion_val_table);
   77.90 +	memset(mvd_table,                0, sizeof mvd_table);
   77.91 +	memset(direct_table,             0, sizeof direct_table);
   77.92 +	memset(cbp_table,                0, sizeof cbp_table);
   77.93 +	memset(qscale_table,             0, sizeof qscale_table);
   77.94 +	memset(chroma_pred_mode_table,   0, sizeof chroma_pred_mode_table);
   77.95 +	memset(intra4x4_pred_mode_table, 0, sizeof intra4x4_pred_mode_table);
   77.96 +}
   77.97 +
   77.98 +static void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int bufsize){ /* fetch the first window of the stream at buf into bytestream_ls and prime c->low / c->range */
   77.99 +	int align = (unsigned) buf & 0xF; /* offset of buf inside its 16-byte block */
  77.100 +	int dma_size;
  77.101 +	
  77.102 +	c->bytestream_ea_start=
  77.103 +	c->bytestream_ea= buf; /* NOTE(review): both are overwritten again after the DMA is issued below */
  77.104 +	c->bytestream_ea_end= buf + bufsize;
  77.105 +	c->bufsize = bufsize;
  77.106 +	
  77.107 +	if (bufsize + align >= sizeof(bytestream_ls)){ /* stream does not fit: fill the whole local buffer */
  77.108 +		dma_size = sizeof(bytestream_ls);
  77.109 +		c->bufsize = c->bufsize +align - sizeof(bytestream_ls);	/* bytes left beyond this first window */
  77.110 +	}else{
  77.111 +		int align_end = (bufsize+align) &0xF;
  77.112 +		if (align_end)
  77.113 +			dma_size = bufsize+align + 16-align_end; /* round the transfer up to a multiple of 16 bytes */
  77.114 +		else
  77.115 +			dma_size = bufsize+align;
  77.116 +		c->bufsize = 0; /* everything fits in this one transfer */
  77.117 +	}
  77.118 +// 	printf("%d\n", dma_size);
  77.119 +	c->bytestream_end  = &bytestream_ls[dma_size]; 
  77.120 +	c->bytestream_start= c->bytestream = &bytestream_ls[align]; /* first real stream byte sits 'align' bytes into the buffer */
  77.121 + 	spu_dma_get(bytestream_ls, (unsigned) buf - align, dma_size, ED_get ); /* source address rounded down to 16 bytes */
  77.122 +	c->bytestream_ea_start=
  77.123 +	c->bytestream_ea= buf + dma_size -align; /* address just past the fetched window */
  77.124 +
  77.125 +	wait_dma_id(ED_get);
  77.126 +	
  77.127 +	if (align %2){ /* odd start offset: consume three bytes */
  77.128 +		c->low =  (*c->bytestream++)<<18;
  77.129 +		c->low+=  (*c->bytestream++)<<10;
  77.130 +		c->low+= ((*c->bytestream++)<<2) + 2;
  77.131 +	}else { /* even start offset: consume two bytes and add the constant 2<<8 */
  77.132 +		c->low =  (*c->bytestream++)<<18;
  77.133 +		c->low+=  (*c->bytestream++)<<10;
  77.134 +		c->low+=  (2<<8);
  77.135 +	}
  77.136 +
  77.137 +	c->range= 0x1FE;
  77.138 +	bytecount=0; /* counter defined outside this function */
  77.139 +}
  77.140 +
  77.141 +static void init_dequant8_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ /* fill the 8x8 dequant tables in hc from the PPS scaling matrices in s */
  77.142 +    int i,q,x;
  77.143 +    const int transpose = HAVE_ALTIVEC; /* when non-zero, coefficients are stored at the row/column-swapped index */
  77.144 +    hc->dequant8_coeff[0] = hc->dequant8_buffer[0];
  77.145 +    hc->dequant8_coeff[1] = hc->dequant8_buffer[1];
  77.146 +
  77.147 +    for(i=0; i<2; i++){
  77.148 +        if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
  77.149 +            hc->dequant8_coeff[1] = hc->dequant8_buffer[0]; /* identical matrices: share table 0 instead of recomputing */
  77.150 +            break;
  77.151 +        }
  77.152 +
  77.153 +        for(q=0; q<52; q++){
  77.154 +            int shift = div6[q]; /* q/6 */
  77.155 +            int idx = rem6[q];   /* q%6 */
  77.156 +            for(x=0; x<64; x++)
  77.157 +                hc->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
  77.158 +                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
  77.159 +                    s->pps.scaling_matrix8[i][x]) << shift;
  77.160 +        }
  77.161 +    }
  77.162 +}
  77.163 +
  77.164 +static void init_dequant4_coeff_table(EDSlice_spu *s, H264Cabac_spu *hc){ /* fill the six 4x4 dequant tables in hc from the PPS scaling matrices in s */
  77.165 +    int i,j,q,x;
  77.166 +    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON; /* when non-zero, coefficients are stored at the row/column-swapped index */
  77.167 +    for(i=0; i<6; i++ ){
  77.168 +        hc->dequant4_coeff[i] = hc->dequant4_buffer[i];
  77.169 +        for(j=0; j<i; j++){ /* reuse an earlier table if its scaling matrix is identical */
  77.170 +            if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
  77.171 +                hc->dequant4_coeff[i] = hc->dequant4_buffer[j];
  77.172 +                break;
  77.173 +            }
  77.174 +        }
  77.175 +        if(j<i) /* a match was found above, nothing to compute for table i */
  77.176 +            continue;
  77.177 +
  77.178 +        for(q=0; q<52; q++){
  77.179 +            int shift = div6[q] + 2; /* q/6 + 2 */
  77.180 +            int idx = rem6[q];       /* q%6 */
  77.181 +            for(x=0; x<16; x++)
  77.182 +                hc->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
  77.183 +                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
  77.184 +                    s->pps.scaling_matrix4[i][x]) << shift;
  77.185 +        }
  77.186 +    }
  77.187 +}
  77.188 +
  77.189 +static void init_dequant_tables(EDSlice_spu *s, H264Cabac_spu *hc){ /* build the 4x4 tables, the 8x8 tables when enabled, then apply the bypass override */
  77.190 +    int t, k;
  77.191 +    init_dequant4_coeff_table(s, hc);
  77.192 +    if(s->pps.transform_8x8_mode)
  77.193 +        init_dequant8_coeff_table(s, hc);
  77.194 +    if(!s->transform_bypass)
  77.195 +        return; /* no bypass: tables stay as computed */
  77.196 +    for(t=0; t<6; t++) /* bypass: the q==0 row of every 4x4 table becomes 1<<6 */
  77.197 +        for(k=0; k<16; k++)
  77.198 +            hc->dequant4_coeff[t][0][k] = 1<<6;
  77.199 +    if(!s->pps.transform_8x8_mode)
  77.200 +        return;
  77.201 +    for(t=0; t<2; t++) /* same override for the 8x8 tables */
  77.202 +        for(k=0; k<64; k++)
  77.203 +            hc->dequant8_coeff[t][0][k] = 1<<6;
  77.204 +}
  77.205 +
  77.206 +static void init_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s){ /* initial pointer setup: every X_top uses row buffer 0, every X uses row buffer 1 */
  77.207 +	hc->non_zero_count_top 		= non_zero_count_table[0];
  77.208 +	hc->non_zero_count     		= non_zero_count_table[1];
  77.209 +	hc->mvd_top[0]				= mvd_table[0][0];
  77.210 +	hc->mvd[0]					= mvd_table[0][1];
  77.211 +	hc->mvd_top[1]				= mvd_table[1][0];
  77.212 +	hc->mvd[1]					= mvd_table[1][1];
  77.213 +	hc->direct_top		   		= direct_table[0];
  77.214 +	hc->direct			   		= direct_table[1];
  77.215 +	hc->chroma_pred_mode_top	= chroma_pred_mode_table[0];
  77.216 +	hc->chroma_pred_mode  		= chroma_pred_mode_table[1];
  77.217 +	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[0];
  77.218 +	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[1];
  77.219 +	hc->cbp_top			   		= cbp_table[0];
  77.220 +	hc->cbp				   		= cbp_table[1];
  77.221 +	hc->qscale_top			   	= qscale_table[0] +1; /* +1: pointer starts one element into the row */
  77.222 +	hc->qscale				   	= qscale_table[1] +1;
  77.223 +
  77.224 +	hc->mb_type_top 			= mb_type_table[0]+1; /* +1 here too */
  77.225 +	hc->mb_type		 			= mb_type_table[1]+1;
  77.226 +	hc->ref_index_top[0]		= ref_index_table[0][0];
  77.227 +	hc->ref_index_top[1]		= ref_index_table[1][0];
  77.228 +	hc->ref_index[0]			= ref_index_table[0][1];
  77.229 +	hc->ref_index[1]			= ref_index_table[1][1];
  77.230 +	hc->motion_val_top[0] 		= motion_val_table[0][0];
  77.231 +	hc->motion_val_top[1] 		= motion_val_table[1][0];
  77.232 +	hc->motion_val[0] 			= motion_val_table[0][1];
  77.233 +	hc->motion_val[1] 			= motion_val_table[1][1];
  77.234 +
  77.235 +	int mb_stride = hc->mb_stride;
  77.236 +	/* B slices: fetch the first two rows of list1 mb_type / ref_index data */
  77.237 +	if (s->slice_type_nos == FF_B_TYPE){
  77.238 +		while(!dep_resolved(&spe)); /* busy-wait until dep_resolved() reports 1 */
  77.239 +		spu_dma_get(list1_mb_type_table[0], (unsigned) (s->list1.mb_type -1), mb_stride*sizeof(uint32_t), ED_get);
  77.240 +		spu_dma_get(list1_ref_index_table[0][0], (unsigned) s->list1.ref_index[0], mb_stride*4*sizeof(int8_t), ED_get);
  77.241 +		spu_dma_get(list1_ref_index_table[0][1], (unsigned) s->list1.ref_index[1], mb_stride*4*sizeof(int8_t), ED_get);
  77.242 +		wait_dma_id(ED_get); /* row 0 is waited for; the row-1 gets below are left in flight */
  77.243 +		spu_dma_get(list1_mb_type_table[1], (unsigned) (s->list1.mb_type -1 + mb_stride), mb_stride*sizeof(uint32_t), ED_get);
  77.244 +		spu_dma_get(list1_ref_index_table[1][0], (unsigned) (s->list1.ref_index[0] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
  77.245 +		spu_dma_get(list1_ref_index_table[1][1], (unsigned) (s->list1.ref_index[1] + 4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
  77.246 +		hc->list1_mb_type = list1_mb_type_table[0]+1;
  77.247 +		hc->list1_ref_index[0] = list1_ref_index_table[0][0];
  77.248 +		hc->list1_ref_index[1] = list1_ref_index_table[0][1];
  77.249 +	}	
  77.250 +
  77.251 +}
  77.252 +
  77.253 +static void update_entropy_buf(H264Cabac_spu *hc, EDSlice_spu *s, int line){ /* swap the top/current row buffers by parity of line and write the finished row back to the picture */
  77.254 +	int mb_stride = hc->mb_stride;
  77.255 +	int mb_width = hc->mb_width;
  77.256 +	int top = (line+1)%2;
  77.257 +	int cur = line%2;
  77.258 +	int bottom = (line+1)%2; //same as top, but to identify prebuffering of next line.
  77.259 +
  77.260 +	hc->non_zero_count_top 		= non_zero_count_table[top];
  77.261 +	hc->non_zero_count     		= non_zero_count_table[cur];
  77.262 +	hc->mvd_top[0]				= mvd_table[0][top];
  77.263 +	hc->mvd[0]					= mvd_table[0][cur];
  77.264 +	hc->mvd_top[1]				= mvd_table[1][top];
  77.265 +	hc->mvd[1]					= mvd_table[1][cur];
  77.266 +	hc->direct_top		   		= direct_table[top];
  77.267 +	hc->direct			   		= direct_table[cur];
  77.268 +	hc->chroma_pred_mode_top	= chroma_pred_mode_table[top];
  77.269 +	hc->chroma_pred_mode  		= chroma_pred_mode_table[cur];
  77.270 +	hc->intra4x4_pred_mode_top	= intra4x4_pred_mode_table[top];
  77.271 +	hc->intra4x4_pred_mode  	= intra4x4_pred_mode_table[cur];
  77.272 +	hc->cbp_top			   		= cbp_table[top];
  77.273 +	hc->cbp				   		= cbp_table[cur];
  77.274 +	hc->qscale_top			   	= qscale_table[top] +1;
  77.275 +	hc->qscale				   	= qscale_table[cur] +1;
  77.276 +
  77.277 +	hc->mb_type_top 			= mb_type_table[top]+1;
  77.278 +	hc->mb_type		 			= mb_type_table[cur]+1;
  77.279 +	hc->ref_index_top[0]		= ref_index_table[0][top];
  77.280 +	hc->ref_index_top[1]		= ref_index_table[1][top];
  77.281 +	hc->ref_index[0]			= ref_index_table[0][cur];
  77.282 +	hc->ref_index[1]			= ref_index_table[1][cur];
  77.283 +	hc->motion_val_top[0] 		= motion_val_table[0][top];
  77.284 +	hc->motion_val_top[1] 		= motion_val_table[1][top];
  77.285 +	hc->motion_val[0] 			= motion_val_table[0][cur];
  77.286 +	hc->motion_val[1] 			= motion_val_table[1][cur];
  77.287 +
  77.288 +	wait_dma_id(ED_put); /* earlier puts must finish before the buffers are sent again */
  77.289 +	/* the buffer now labelled 'top' is stored at picture row 'line' (so 'line' appears to be the row just decoded - confirm with caller) */
  77.290 +	spu_dma_put(mb_type_table[top], (unsigned) (s->pic.mb_type -1 + line*mb_stride), mb_stride*sizeof(uint32_t), ED_put);
  77.291 +	spu_dma_put(ref_index_table[0][top], (unsigned) (s->pic.ref_index[0] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
  77.292 +	spu_dma_put(ref_index_table[1][top], (unsigned) (s->pic.ref_index[1] + line*4*mb_stride), 4*mb_stride*sizeof(int8_t), ED_put);
  77.293 +	spu_dma_put(motion_val_table[0][top], (unsigned) (s->pic.motion_val[0]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
  77.294 +	spu_dma_put(motion_val_table[1][top], (unsigned) (s->pic.motion_val[1]+ line*16*mb_width), 16*mb_width*2*sizeof(int16_t), ED_put);
  77.295 +
  77.296 +	if (s->slice_type_nos == FF_B_TYPE){
  77.297 +		update_tgt_spe_dep(&spe, 0); /* NOTE(review): progress is published to the target SPE only on this B-slice path */
  77.298 +		wait_dma_id(ED_get); /* complete the list1 gets issued earlier */
  77.299 +		/* prefetch list1 data for row line+2 into the buffer indexed by 'cur' */
  77.300 +		if (line + 2 < hc->mb_height){
  77.301 +			while(!dep_resolved(&spe));
  77.302 +			spu_dma_get(list1_mb_type_table[cur], (unsigned) (s->list1.mb_type -1 + (line+2)*mb_stride), mb_stride*sizeof(uint32_t), ED_get);
  77.303 +			spu_dma_get(list1_ref_index_table[cur][0], (unsigned) (s->list1.ref_index[0] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
  77.304 +			spu_dma_get(list1_ref_index_table[cur][1], (unsigned) (s->list1.ref_index[1] + (line+2)*4*mb_stride), mb_stride*4*sizeof(int8_t), ED_get);
  77.305 +		}
  77.306 +		hc->list1_mb_type = list1_mb_type_table[bottom]+1;
  77.307 +		hc->list1_ref_index[0] = list1_ref_index_table[bottom][0];
  77.308 +		hc->list1_ref_index[1] = list1_ref_index_table[bottom][1];
  77.309 +	}
  77.310 +
  77.311 +}
  77.312 +
  77.313 +// void printmbdiff(EDSlice_spu *s, H264Cabac_spu *hc, H264Mb *mp, H264Mb *ms){
  77.314 +// 
  77.315 +// 	printf("mb_x %d, %d\n", mp->mb_x, ms->mb_x);
  77.316 +// 	printf("mb_y %d, %d\n", mp->mb_y, ms->mb_y);
  77.317 +// 	printf("mb_xy %d, %d\n", mp->mb_xy, ms->mb_xy);
  77.318 +// 	printf("top_mb_xy %d, %d\n", mp->top_mb_xy, ms->top_mb_xy);
  77.319 +// 	printf("left_mb_xy %d, %d\n", mp->left_mb_xy, ms->left_mb_xy);
  77.320 +// 	printf("chroma_pred_mode %d, %d\n", mp->chroma_pred_mode, ms->chroma_pred_mode);
  77.321 +// 	printf("intra16x16_pred_mode %d, %d\n", mp->intra16x16_pred_mode, ms->intra16x16_pred_mode);
  77.322 +// 	printf("topleft_samples %d, %d\n", mp->topleft_samples_available, ms->topleft_samples_available);
  77.323 +// 	printf("topright_samples %d, %d\n", mp->topright_samples_available, ms->topright_samples_available);
  77.324 +// 	printf("top_samples %d, %d\n", mp->top_samples_available, ms->top_samples_available);
  77.325 +// 	printf("left_samples %d, %d\n", mp->left_samples_available, ms->left_samples_available);
  77.326 +// 
  77.327 +// 	if (memcmp(mp->intra4x4_pred_mode_cache, ms->intra4x4_pred_mode_cache, 40)){
  77.328 +// 		for (int i=0; i<5; i++){
  77.329 +// 			for (int j=0; j<8; j++){
  77.330 +// 				printf("%d, %d\t", mp->intra4x4_pred_mode_cache[i*8+j],ms->intra4x4_pred_mode_cache[i*8+j]);
  77.331 +// 			}
  77.332 +// 			printf("\n");
  77.333 +// 		}
  77.334 +// 	}
  77.335 +// 
  77.336 +// 	if (memcmp(mp->non_zero_count_cache, ms->non_zero_count_cache, 48)){
  77.337 +// 		for (int i=0; i<6; i++){
  77.338 +// 			for (int j=0; j<8; j++){
  77.339 +// 				printf("%u, %u\t", mp->non_zero_count_cache[i*8+j],ms->non_zero_count_cache[i*8+j]);
  77.340 +// 			}
  77.341 +// 			printf("\n");
  77.342 +// 		}
  77.343 +// 	}
  77.344 +// 
  77.345 +// 	if (memcmp(mp->sub_mb_type, ms->sub_mb_type, 8)){
  77.346 +// 		for (int i=0; i<4; i++){
  77.347 +// 			printf("%u, %u\t", mp->sub_mb_type[i], mp->sub_mb_type[i]);
  77.348 +// 			printf("\n");
  77.349 +// 		}
  77.350 +// 	}
  77.351 +// 
  77.352 +// 	if (memcmp(mp->mv_cache, ms->mv_cache, 320)){
  77.353 +// 		for (int k=0; k<2; k++){
  77.354 +// 			for (int i=0; i<5; i++){
  77.355 +// 				for (int j=0; j<8; j++){
  77.356 +// 					printf("%d, %d, %d, %d\t", mp->mv_cache[k][i*8+j][0], mp->mv_cache[k][i*8+j][1], ms->mv_cache[k][i*8+j][0], ms->mv_cache[k][i*8+j][1]);
  77.357 +// 				}
  77.358 +// 				printf("\n");
  77.359 +// 			}
  77.360 +// 		}
  77.361 +// 	}
  77.362 +// 
  77.363 +// 	if (memcmp(mp->ref_cache, ms->ref_cache, 80)){
  77.364 +// 		for (int k=0; k<2; k++){
  77.365 +// 			for (int i=0; i<5; i++){
  77.366 +// 				for (int j=0; j<8; j++){
  77.367 +// 					printf("%d, %d\t", mp->ref_cache[k][i*8+j], ms->ref_cache[k][i*8+j]);
  77.368 +// 				}
  77.369 +// 				printf("\n");
  77.370 +// 			}
  77.371 +// 		}
  77.372 +// 	}
  77.373 +// 
  77.374 +// 	printf("cbp %d, %d\n", mp->cbp, ms->cbp);
  77.375 +// 	for (int i=0; i<hc->mb_stride; i++){
  77.376 +//    		printf("%d, ", hc->cbp[i]); fflush(0);
  77.377 +//    	}
  77.378 +// 	printf("\n");
  77.379 +// 
  77.380 +// 	printf("mb_type %x, %x\n", mp->mb_type, ms->mb_type);
  77.381 +// 	printf("mb_type IS_INTRA %d, IS_INTRA16x16 %d, IS_DIRECT %d\n", IS_INTRA(ms->mb_type), IS_INTRA16x16(ms->mb_type), IS_DIRECT(ms->mb_type) );
  77.382 +// 	printf("left_type %d, %d\n", mp->left_type, ms->left_type);
  77.383 +// 	printf("top_type %d, %d\n", mp->top_type, ms->top_type);
  77.384 +// 	printf("qscale_mb_xy %d, %d\n", mp->qscale_mb_xy, ms->qscale_mb_xy);
  77.385 +// 	printf("qscale_left_mb_xy %d, %d\n", mp->qscale_left_mb_xy, ms->qscale_left_mb_xy);
  77.386 +// 	printf("qscale_top_mb_xy %d, %d\n", mp->qscale_top_mb_xy, ms->qscale_top_mb_xy);
  77.387 +// // 	for (int i=0; i<hc->mb_stride; i++){
  77.388 +// // 		printf("%d, ", qscale_table[0][i]); fflush(0);
  77.389 +// // 	}
  77.390 +// 
  77.391 +// 	if (memcmp(mp->mb, ms->mb, 768)){
  77.392 +// 		for (int i=0; i<16; i++){
  77.393 +// 			for (int j=0; j<16; j++){
  77.394 +// 				printf("%d, %d\t", mp->mb[j + i*16], ms->ref_cache[j + i*16]);
  77.395 +// 			}
  77.396 +// 			printf("\n");
  77.397 +// 		}
  77.398 +// 		for (int i=0; i<8; i++){
  77.399 +// 			for (int j=0; j<8; j++){
  77.400 +// 				printf("%d, %d\t", mp->mb[256 + j + i*8], ms->ref_cache[j + i*8]);
  77.401 +// 			}
  77.402 +// 			printf("\n");
  77.403 +// 		}
  77.404 +// 		for (int i=0; i<8; i++){
  77.405 +// 			for (int j=0; j<8; j++){
  77.406 +// 				printf("%d, %d\t", mp->mb[320+ j + i*8], ms->ref_cache[j + i*8]);
  77.407 +// 			}
  77.408 +// 			printf("\n");
  77.409 +// 		}
  77.410 +// 	}
  77.411 +// 
  77.412 +// 	if (memcmp(mp->bS, ms->bS, 32)){
  77.413 +// 		for (int k=0; k<2; k++){
  77.414 +// 			for (int i=0; i<4; i++){
  77.415 +// 				for (int j=0; j<4; j++){
  77.416 +// 					printf("%d, %d\t", mp->bS[k][i][j], mp->mv_cache[k][i][j]);
  77.417 +// 				}
  77.418 +// 				printf("\n");
  77.419 +// 			}
  77.420 +// 		}
  77.421 +// 	}
  77.422 +// 	if (memcmp(mp->edges, ms->edges, 4)){
  77.423 +// 		printf("edges %d, %d, %d, %d\n", mp->edges[0], ms->edges[0], mp->edges[1], ms->edges[1]);
  77.424 +// 		printf("deblock %d, %d\n", mp->deblock_mb, ms->deblock_mb);
  77.425 +// 	}
  77.426 +// 
  77.427 +// 	printf("dequant4_coeff_y %d, %d\n", mp->dequant4_coeff_y, ms->dequant4_coeff_y);
  77.428 +// 	printf("dequant4_coeff_cb %d, %d\n", mp->dequant4_coeff_cb, ms->dequant4_coeff_cb);
  77.429 +// 	printf("dequant4_coeff_cr %d, %d\n", mp->dequant4_coeff_cr, ms->dequant4_coeff_cr);
  77.430 +// }
  77.431 +// DECLARE_ALIGNED_16(H264Mb, tmp);
  77.432 +
  77.433 +
  77.434 +int main(unsigned long long id, unsigned long long argp){ // SPU entry point of the CABAC entropy-decoding program; id is never read, argp is the address the H264spe parameters are DMA'd from
  77.435 +	EDSlice_spu *s;
  77.436 +	H264Cabac_spu *hc = &hcabac;
  77.437 +	CABACContext *c = &cabac;
  77.438 +	H264spe *p = &spe;
  77.439 +	
  77.440 +	spu_write_out_mbox((unsigned) slice); // publish the local address of slice[] through the outbound mailbox
  77.441 +	spu_dma_get(p, (unsigned) argp, sizeof(H264spe), ED_spe); //the ED_spe tag is used for this one-off parameter fetch out of convenience
  77.442 +	wait_dma_id(ED_spe);
  77.443 +
  77.444 +	ff_init_cabac_states();
  77.445 +	init_cabac(p, hc);
  77.446 +	hc->blocking=0; // non-blocking mode: the write-back below alternates between mb[0] and mb[1]
  77.447 +	for(;;){ // one pass per inbound mailbox message; the only way out is the error return below
  77.448 +		spu_read_in_mbox(); // the mailbox value is discarded; only its arrival matters here
  77.449 +		s = &slice[0];
  77.450 +		reset_cabac_buffers();
  77.451 +		init_entropy_buf(hc, s);
  77.452 +
  77.453 +		if (hc->blocking) wait_dma_id(ED_get);
  77.454 +		//printf("framesize %d\n", s->byte_bufsize);fflush(0);
  77.455 + 		init_dequant_tables(s, hc);
  77.456 +		ff_init_cabac_decoder( c, s->bytestream_start, s->byte_bufsize );
  77.457 + 		ff_h264_init_cabac_states(s, c);
  77.458 +
  77.459 +		int mb_slot=0; // index into mb[] of the macroblock buffer filled next
  77.460 + 		for(int j=0; j<hc->mb_height; j++){
  77.461 +			for(int i=0; i<hc->mb_width; i++){
  77.462 +				int eos,ret;
  77.463 +				H264Mb *m = &mb[mb_slot];
  77.464 +				m->mb_x=i;
  77.465 +				m->mb_y=j;
  77.466 +				s->m = m;
  77.467 +
  77.468 +				ret = ff_h264_decode_mb_cabac(hc, s, c);
  77.469 +
  77.470 +// 				spu_dma_get(&tmp, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_get);
  77.471 +// 				wait_dma_id(ED_get);
  77.472 +// 				if (memcmp(&tmp, m, sizeof(H264Mb))){
  77.473 +// 					printf("coded pic num %d\n", s->coded_pic_num);
  77.474 +// 					printmbdiff(s, hc,&tmp, m);
  77.475 +// 					return 0;
  77.476 +// 				}
  77.477 +				//printf("qscale %d\n", m->qscale_mb_xy);
  77.478 +				if (!hc->blocking){ // double-buffered write-back: start this slot's put, then wait on the OTHER slot's tag so the buffer used next iteration is free
  77.479 +					if (mb_slot){
  77.480 +						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb1);
  77.481 +						wait_dma_id(ED_putmb0);
  77.482 +					}else {
  77.483 +						spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
  77.484 +						wait_dma_id(ED_putmb1);
  77.485 +					}
  77.486 +					mb_slot++; mb_slot%=2;
  77.487 +				}else { // blocking mode: a single buffer, each put is waited for immediately
  77.488 +					spu_dma_put(m, (unsigned) &s->mbs[j*hc->mb_width + i], sizeof(H264Mb), ED_putmb0);
  77.489 +					wait_dma_id(ED_putmb0);
  77.490 +				}
  77.491 +				
  77.492 +
  77.493 +				eos = get_cabac_terminate( c); // result is stored but never read in this function; the call is kept for its effect on the decoder state
  77.494 +
  77.495 +				if( ret < 0) { // NOTE(review): checked only after the macroblock has already been written back above
  77.496 +					fprintf(stderr, "error at %d bytecount\n", bytecount);
  77.497 +					return -1;
  77.498 +				}
  77.499 +			}
  77.500 +			update_entropy_buf(hc, s, j);
  77.501 +			if (hc->blocking){ wait_dma_id(ED_get); wait_dma_id(ED_put);}
  77.502 +		}
  77.503 +		wait_dma_id(ED_put); // NOTE(review): in non-blocking mode the last ED_putmb0/ED_putmb1 transfer is not explicitly waited for here -- confirm ED_put covers it
  77.504 +		spu_write_out_mbox(1); // report completion of this pass through the outbound mailbox
  77.505 +
  77.506 +	}
  77.507 +
  77.508 +	return 0; // not reached: the loop above has no break
  77.509 +
  77.510 +
  77.511 +}

    78.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    78.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/spe_mbd.c	Mon Aug 27 12:09:56 2012 +0200
    78.3 @@ -0,0 +1,356 @@
    78.4 +/*
    78.5 + * Copyright (c) 2009 TUDelft 
    78.6 + * 
    78.7 + * Cell Parallel SPU - 2DWave Macroblock Decoding. 
    78.8 + */
    78.9 +
   78.10 +/**
   78.11 + * @file libavcodec/cell/spe_mbd.c
   78.12 + * Cell Parallel SPU - 2DWave Macroblock Decoding
   78.13 + * @author C C Chi <c.c.chi@student.tudelft.nl>
   78.14 + * 
   78.15 + * SIMD kernels 
   78.16 + * H.264/AVC motion compensation
   78.17 + * @author Mauricio Alvarez <alvarez@ac.upc.edu>
   78.18 + * @author Albert Paradis <apar7632@hotmail.com>
   78.19 + */ 
   78.20 +
   78.21 +
   78.22 +/* Uncomment the lines below to enable simulator statistics or to generate traces */
   78.23 +
   78.24 +//#define ENABLE_SIMULATOR
   78.25 +//#define ENABLE_PARAVER_TRACING_CELL
   78.26 +
   78.27 +#ifdef ENABLE_SIMULATOR
   78.28 +	#include "/opt/ibm/systemsim-cell/include/callthru/spu/profile.h"
   78.29 +#endif
   78.30 +
   78.31 +#ifdef ENABLE_TRACES
   78.32 +	#include "spu_trace.h"
   78.33 +#endif
   78.34 +#include <unistd.h>
   78.35 +#include <stdio.h>
   78.36 +#include <spu_intrinsics.h>
   78.37 +#include <spu_mfcio.h>
   78.38 +#include <libsync.h>
   78.39 +#include <sys/time.h>
   78.40 +#include <assert.h>
   78.41 +
   78.42 +//#include "dsputil_cell.h"
   78.43 +#include "types_spu.h"
   78.44 +#include "h264_intra_spu.h"
   78.45 +#include "h264_decode_mb_spu.h"
   78.46 +#include "h264_mc_spu.h"
   78.47 +#include "h264_tables.h"
   78.48 +#include "h264_dma.h"
   78.49 +
   78.50 +
   78.51 +/** Hooks for Paraver tracing on the SPU; trace_init_SPU calls SPUtrace_init() and is an empty function unless ENABLE_PARAVER_TRACING_CELL is defined.
   78.52 + * NOTE(review): spu_trace.h is included under ENABLE_TRACES, a different macro from the one tested here -- confirm the two are meant to be defined together.
   78.53 + */
   78.54 +inline void trace_init_SPU(){
   78.55 +#ifdef ENABLE_PARAVER_TRACING_CELL
   78.56 +	SPUtrace_init ();
   78.57 +#endif
   78.58 +}
   78.59 +
   78.60 +inline void trace_fini_SPU(){ // calls SPUtrace_fini() when ENABLE_PARAVER_TRACING_CELL is defined; otherwise does nothing
   78.61 +#ifdef ENABLE_PARAVER_TRACING_CELL
   78.62 +	SPUtrace_fini ();
   78.63 +#endif
   78.64 +}
   78.65 +
   78.66 +inline void trace_event_SPU(int event, int id){ // forwards (event, id) to SPUtrace_event() when ENABLE_PARAVER_TRACING_CELL is defined; otherwise does nothing
   78.67 +#ifdef ENABLE_PARAVER_TRACING_CELL
   78.68 +	SPUtrace_event (event, id);
   78.69 +#else
   78.70 +	(void) event; // mark both parameters as deliberately unused in the non-tracing build
   78.71 +	(void) id;
   78.72 +#endif
   78.73 +}
   78.74 +
   78.75 +// Simulator statistics hooks: each one is an empty function unless ENABLE_SIMULATOR is defined (the same macro that pulls in the simulator's profile.h above).
   78.76 +inline void clear_statistic(){ // calls prof_clear() in simulator builds
   78.77 +#ifdef ENABLE_SIMULATOR
   78.78 +	prof_clear();
   78.79 +#endif
   78.80 +}
   78.81 +
   78.82 +inline void start_statistic(){ // calls prof_start() when ENABLE_SIMULATOR is defined; otherwise does nothing
   78.83 +#ifdef ENABLE_SIMULATOR
   78.84 +	prof_start();
   78.85 +#endif
   78.86 +}
   78.87 +
   78.88 +inline void stop_statistic(){ // calls prof_stop() when ENABLE_SIMULATOR is defined; otherwise does nothing
   78.89 +#ifdef ENABLE_SIMULATOR
   78.90 +	prof_stop();
   78.91 +#endif
   78.92 +}
   78.93 +
   78.94 +H264Context_spu h_context;  // decoder context of this SPU program; main() below works on it through the pointer h
   78.95 +
   78.96 +DECLARE_ALIGNED_16(spe_pos, dma_temp); // staging copy of the progress counter that update_tgt_spe_dep() DMAs out
   78.97 +// macroblock progress of the neighbouring SPE
   78.98 +DECLARE_ALIGNED_16(volatile spe_pos, src_spe); // written remotely by SPE_ID-1 (hence volatile); read by dep_resolved()
   78.99 +//DECLARE_ALIGNED_16(spe_pos, tgt_spe); //written by SPE_ID +1
  78.100 +
  78.101 +/**	
  78.102 +*	Initializes the buffering of the mb data and associated mc data. init_mb_buffer needs to
  78.103 +*	be called before any get_next_mb and only once at the beginning of the slice.
  78.104 +*	It resets the ring indices, starts the DMA of the first two macroblocks of row h->curr_line and prepares the mc data of the first one.
  78.105 +*	Note: init_mb_buffer and get_next_mb expect the width of the picture to be more than 2 mb's (two consecutive macroblocks are fetched here without a row-wrap check)
  78.106 +*/
  78.107 +#define TAG_OFFSET_MB MBD_buf1 // base DMA tag for the macroblock buffers; the buffer index is added to it
  78.108 +#define TAG_OFFSET_MC MBD_mc_buf1 // base DMA tag for the mc buffers; the buffer index is added to it
  78.109 +static void init_mb_buffer(H264Context_spu* h){
  78.110 +	H264slice *s = h->s;
  78.111 +	H264Mb *next_mb;
  78.112 +	int mb_height = s->mb_height;
  78.113 +	int mb_width = s->mb_width;
  78.114 +
  78.115 +	h->mc_idx =0;
  78.116 +	
  78.117 +	h->mb_dec = 0;
  78.118 +	h->mb_mc = 0;
  78.119 +	h->mb_dma = 0;
  78.120 +		
  78.121 +	h->curr_line %= mb_height; // get_next_mb leaves curr_line at or beyond mb_height when a picture ends; fold it back into range
  78.122 +	h->next_mb_idx = h->curr_line * mb_width; // all three counters start at the first macroblock of that row
  78.123 +	h->mb_id = h->curr_line * mb_width;
  78.124 +	h->n_mc= h->curr_line * mb_width;
  78.125 +	
  78.126 +	next_mb = s->blocks + h->mb_id;
  78.127 +	spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); // first macroblock -> mb_buf[0]
  78.128 +	h->mb_dma++;
  78.129 +	h->mb_id++;
  78.130 +	
  78.131 +	next_mb = s->blocks + h->mb_id;
  78.132 +	spu_dma_get(&h->mb_buf[h->mb_dma], (unsigned) next_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB); // second macroblock -> mb_buf[1]
  78.133 +	h->mb_dma++;
  78.134 +	h->mb_id++;
  78.135 +	wait_dma_id(0 + TAG_OFFSET_MB);	// only the first transfer is waited for; the second stays in flight
  78.136 +	
  78.137 +	H264Mb *mb = &h->mb_buf[0];
  78.138 +	H264mc *mc = &h->mc_buf[0];
  78.139 +	if(!IS_INTRA(mb->mb_type)){ // mc data is prepared for non-intra macroblocks only
  78.140 +		calc_mc_params(mb, mc);
  78.141 +		fill_ref_buf(h, mb, mc);
  78.142 +	}
  78.143 +	h->n_mc++;
  78.144 +	h->mb_mc++;
  78.145 +}
  78.146 +
  78.147 +static void *get_next_mb(H264Context_spu *h){ // returns the next macroblock buffer to decode (an H264Mb* inside h->mb_buf), or NULL once h->curr_line has reached s->mb_height; also sets h->mc
  78.148 +	H264slice *s = h->s;
  78.149 +	H264spe *spe = &h->spe;
  78.150 +	H264Mb *mb_buf = h->mb_buf;	
  78.151 +	H264mc *mc_buf = h->mc_buf;
  78.152 +	H264Mb *next_mb;
  78.153 +	H264Mb *next_dma_mb;
  78.154 +	
  78.155 +	if (h->curr_line >= s->mb_height)
  78.156 +		return NULL;
  78.157 +	
  78.158 +	if (h->mb_id < h->mb_total){ // stage 1: start the DMA of a further macroblock into the 3-entry ring
  78.159 +		next_dma_mb = s->blocks + h->mb_id;
  78.160 +		spu_dma_get(&mb_buf[h->mb_dma], (unsigned) next_dma_mb, sizeof(H264Mb), h->mb_dma + TAG_OFFSET_MB);
  78.161 +		h->mb_dma = (h->mb_dma+1)%3;
  78.162 +		h->mb_id++;
  78.163 +		if (h->mb_id%s->mb_width ==0){ // end of a row: skip the rows of the other spe_total-1 SPEs
  78.164 +			h->mb_id+=(spe->spe_total-1)*s->mb_width;			
  78.165 +		}
  78.166 +	}
  78.167 +	
  78.168 +	h->mc = &mc_buf[h->mc_idx]; // mc data that belongs to the macroblock returned below
  78.169 +	wait_dma_id(h->mc_idx + TAG_OFFSET_MC); // presumably waits for transfers started by fill_ref_buf for this mc buffer -- TODO confirm
  78.170 +	h->mc_idx = (h->mc_idx+1)%2;
  78.171 +	if (h->n_mc < h->mb_total){ // stage 2: prepare mc data for the macroblock after the returned one, in the other mc buffer
  78.172 +		wait_dma_id(h->mb_mc + TAG_OFFSET_MB);
  78.173 +		H264Mb *mb = &mb_buf[h->mb_mc];
  78.174 +		H264mc *mc = &mc_buf[h->mc_idx];
  78.175 +		if(!IS_INTRA(mb->mb_type)){
  78.176 +			calc_mc_params(mb, mc);
  78.177 +			fill_ref_buf(h, mb, mc);
  78.178 +		}
  78.179 +		h->n_mc++;
  78.180 +		if (h->n_mc%s->mb_width ==0){
  78.181 +			h->n_mc+=(spe->spe_total-1)*s->mb_width;			
  78.182 +		}
  78.183 +	}
  78.184 +	h->next_mb_idx++;
  78.185 +	if (h->next_mb_idx % s->mb_width ==0){ // the returned macroblock is the last of its row: move on to this SPE's next row
  78.186 +		h->next_mb_idx+=(spe->spe_total-1)*s->mb_width;
  78.187 +		h->curr_line+=spe->spe_total;		
  78.188 +	}
  78.189 +	
  78.190 +	h->mb_mc = (h->mb_mc+1)%3;	
  78.191 +	next_mb = &mb_buf[h->mb_dec]; // stage 3: hand out the oldest ring entry
  78.192 +	h->mb_dec = (h->mb_dec+1)%3;
  78.193 +	return next_mb;
  78.194 +}
  78.195 +
  78.196 +static void *get_next_mb_blocking(H264Context_spu *h){ // unpipelined variant of get_next_mb: fetches one macroblock into mb_buf[0], waits for it, prepares mc_buf[0] and returns it; NULL once h->mb_id >= h->mb_total
  78.197 +	H264slice *s = h->s;
  78.198 +	H264spe *spe = &h->spe;
  78.199 +	H264Mb *mb_buf = h->mb_buf;
  78.200 +	H264mc *mc_buf = h->mc_buf;
  78.201 +	H264Mb *next_mb;
  78.202 +	H264Mb *next_dma_mb;
  78.203 +
  78.204 +	if (h->mb_id >= h->mb_total)
  78.205 +		return NULL;
  78.206 +
  78.207 +	//printf("%d\n", h->mb_id);
  78.208 +	next_dma_mb = s->blocks + h->mb_id;
  78.209 +	spu_dma_get(&mb_buf[0], (unsigned) next_dma_mb, sizeof(H264Mb), MBD_buf1);
  78.210 +	//h->mb_dma = (h->mb_dma+1)%3;
  78.211 +	h->mb_id++;
  78.212 +	if (h->mb_id%s->mb_width ==0){ // end of a row: skip the rows of the other spe_total-1 SPEs
  78.213 +		h->mb_id+=(spe->spe_total-1)*s->mb_width;
  78.214 +	}
  78.215 +	wait_dma_id(MBD_buf1); // the transfer is completed before the macroblock is looked at
  78.216 +
  78.217 +	h->mc = &mc_buf[0];	
  78.218 +	//h->mc_idx = (h->mc_idx+1)%2;
  78.219 +	//if (h->n_mc < h->mb_total){
  78.220 +	H264Mb *mb = &mb_buf[0];
  78.221 +	H264mc *mc = &mc_buf[0];
  78.222 +	if(!IS_INTRA(mb->mb_type)){ // mc data is prepared for non-intra macroblocks only
  78.223 +		calc_mc_params(mb, mc);
  78.224 +		fill_ref_buf(h, mb, mc);
  78.225 +	}
  78.226 +	//h->n_mc++;
  78.227 +	/*if (h->n_mc%s->mb_width ==0){
  78.228 +		h->n_mc+=(spe->spe_total-1)*s->mb_width;
  78.229 +	}*/	
  78.230 +//	wait_dma_id(MBD_mc_buf1);
  78.231 +
  78.232 +// 	h->next_mb_idx++;
  78.233 +// 	if (h->next_mb_idx % s->mb_width ==0){
  78.234 +// 		h->next_mb_idx+=(spe->spe_total-1)*s->mb_width;
  78.235 +// 		h->curr_line+=spe->spe_total;
  78.236 +// 	}
  78.237 +
  78.238 +// 	h->mb_mc = (h->mb_mc+1)%3;
  78.239 +	next_mb = &mb_buf[0]; // always the same buffer; curr_line and next_mb_idx are not advanced in this variant
  78.240 +// 	h->mb_dec = (h->mb_dec+1)%3;
  78.241 +	return next_mb;
  78.242 +}
  78.243 +
  78.244 +
  78.245 +#undef TAG_OFFSET_MB
  78.246 +#undef TAG_OFFSET_MC
  78.247 +static inline int dep_resolved(H264Context_spu *h){ // returns 1 when this SPE's macroblock counter h->mb_proc is far enough behind the count published in src_spe, else 0; main() busy-waits on it
  78.248 +	H264slice *s = h->s;
  78.249 +	int spe_id = h->spe.spe_id;
  78.250 +	volatile int mb_proc_dep = src_spe.count; // snapshot of the counter that the neighbouring SPE writes remotely
  78.251 +	if (spe_id==0)
  78.252 +		return (h->mb_proc < mb_proc_dep-1 +s->mb_width)? 1:0; // SPE 0 gets one extra row (mb_width) of slack; presumably because its source SPE works on the previous group of rows -- TODO confirm
  78.253 +	else
  78.254 +		return (h->mb_proc < mb_proc_dep-1)? 1:0;
  78.255 +}
  78.256 +
  78.257 +void update_tgt_spe_dep(H264Context_spu *h, int end){ // counts one processed macroblock in h->mb_proc and, at selected columns or when end is non-zero, DMAs the count into src_spe of the target SPE
  78.258 +	H264Mb *mb = h->mb;
  78.259 +	H264slice *s = h->s;
  78.260 +	H264spe *spe = &h->spe;
  78.261 +	int mb_x = end? 0: mb->mb_x; // main() calls this with end=1 right after get_next_mb() stored NULL in h->mb, so mb must not be dereferenced then; mb_x is irrelevant when end is set
  78.262 +	
  78.263 +	if (end || (mb_x%2==0 && mb_x!=0) || mb_x==s->mb_width-1){ // publish at every even column except 0, at the last column, and at the end of a picture
  78.264 +		spe_pos* dma_spe = &dma_temp;
  78.265 +		spe_pos* tgt_spe = (spe_pos*) ((unsigned) spe->tgt_spe + (unsigned) &src_spe); //located in target spe local store
  78.266 +		dma_spe->count = end? h->mb_proc+1: h->mb_proc; // the final update reports one more than the running count
  78.267 +		spu_dma_barrier_put(dma_spe, (unsigned) tgt_spe, sizeof(dma_temp), MBD_put);
  78.268 +	}
  78.269 +	h->mb_proc++;
  78.270 +}
  78.271 +
  78.272 +
  78.273 +int main(unsigned long long id, unsigned long long argp) // SPU entry point of the macroblock-decoding program; argp is the address the H264spe parameters are DMA'd from; returns 0 after a slice with state < 0 is seen
  78.274 +{
  78.275 +	(void) id;
  78.276 +	H264Context_spu* h = &h_context;
  78.277 +	H264spe *spe_params = (H264spe *) (unsigned) argp;    
  78.278 +	
  78.279 +	spu_dma_get(&h->spe, (unsigned) spe_params, sizeof(H264spe), MBD_slice); //the MBD_slice tag is used for this one-off parameter fetch out of convenience
  78.280 +	wait_dma_id(MBD_slice);
  78.281 +
  78.282 +    //clear_statistic();
  78.283 +    dsputil_h264_init_cell(&h->dsp);
  78.284 +    ff_cropTbl_init();
  78.285 +    init_pred_ptrs(&h->hpc);
  78.286 +
  78.287 +	//send slice_buf to ppe
  78.288 +	spu_write_out_mbox((unsigned) h->slice_buf);
  78.289 +	h->sl_idx=0;
  78.290 +	// initialize tracing with paraver
  78.291 +    //trace_init_SPU();
  78.292 +	h->frames =0;	
  78.293 +	src_spe.count =0;
  78.294 +	h->mb_proc = 0;
  78.295 +
  78.296 +	h->mb_id=0;
  78.297 +	h->mc_idx=0;
  78.298 +	h->mb_dec=0;
  78.299 +	h->mb_mc=0;
  78.300 +	h->mb_dma=0;
  78.301 +	h->next_mb_idx=0;
  78.302 +
  78.303 +	h->blocking=0; // never changed in this function, so the pipelined branch below is the one taken
  78.304 +
  78.305 +
  78.306 +	H264spe* p = &h->spe;
  78.307 +	h->curr_line =p->spe_id; // the first macroblock row of this SPE is the row with its own id
  78.308 +	h->mb_total = p->mb_height*p->mb_width;
  78.309 +	int stride_y = 32; // strides handed to init_block_offset and hl_decode_mb_internal
  78.310 +	int stride_c = 16;
  78.311 +	//init block_offset array
  78.312 +	init_block_offset(stride_y, stride_c);
  78.313 +	for(;;){ // one iteration per inbound mailbox message
  78.314 +		spu_read_in_mbox(); // the mailbox value is discarded; only its arrival matters here
  78.315 +
  78.316 +		h->s = &h->slice_buf[h->sl_idx]; // the two slice_buf entries are used alternately
  78.317 +		h->sl_idx++; h->sl_idx%=2;
  78.318 +
  78.319 +		if (h->s->state< 0){			
  78.320 +			break; // a negative state ends the program
  78.321 +		}
  78.322 +
  78.323 +		{
  78.324 +			if(!h->blocking){
  78.325 +				init_mb_buffer(h);
  78.326 +				while((h->mb=(H264Mb *)get_next_mb(h))){
  78.327 +					while(!dep_resolved(h)); // busy-wait until the neighbouring SPE has published enough progress
  78.328 +					//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p->spe_id);
  78.329 +					hl_decode_mb_internal(h, stride_y, stride_c);
  78.330 +				}
  78.331 +				update_tgt_spe_dep(h, 1); // h->mb is NULL here (the loop ended on it); end=1 publishes the final count
  78.332 +			}else{
  78.333 +				h->mb_id=0;
  78.334 +				while((h->mb=(H264Mb *)get_next_mb_blocking(h))){
  78.335 +					while(!dep_resolved(h));
  78.336 +					//printf("frame %d mbx %d\t mby %d id %d\n", h->frames, h->mb->mb_x, h->mb->mb_y, p->spe_id);
  78.337 +					hl_decode_mb_internal(h, stride_y, stride_c);
  78.338 +				}
  78.339 +				update_tgt_spe_dep(h, 1); // h->mb is NULL here (the loop ended on it); end=1 publishes the final count
  78.340 +			}
  78.341 +			
  78.342 +		}
  78.343 +
  78.344 +		h->frames++;
  78.345 +		
  78.346 +		if (p->spe_id == ((h->frames*p->mb_height -1)%p->spe_total)){ // presumably selects the SPE that decoded the last macroblock row of the picture just finished -- TODO confirm
  78.347 +			//printf("spe %d, %d\n", atomic_read(p->rl_cnt), h->frames);
  78.348 +			//MBSlice is copied beforehand.
  78.349 +			//only inc cnt.
  78.350 +			atomic_inc(p->rl_cnt);		
  78.351 +		}
  78.352 +		{
  78.353 +			atomic_dec(p->cnt); // done unconditionally, once per picture
  78.354 +		}
  78.355 +	}
  78.356 +	
  78.357 +	return 0;
  78.358 +}
  78.359 +

    79.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    79.2 +++ b/ffmpeg_smp/h264dec/libavcodec/cell/types_spu.h	Mon Aug 27 12:09:56 2012 +0200
    79.3 @@ -0,0 +1,69 @@
    79.4 +/*
    79.5 + * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
    79.6 + *
    79.7 + * This file is part of FFmpeg.
    79.8 + *
    79.9 + * FFmpeg is free software; you can redistribute it and/or
   79.10 + * modify it under the terms of the GNU Lesser General Public
   79.11 + * License as published by the Free Software Foundation; either
   79.12 + * version 2.1 of the License, or (at your option) any later version.
   79.13 + *
   79.14 + * FFmpeg is distributed in the hope that it will be useful,
   79.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   79.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   79.17 + * Lesser General Public License for more details.
   79.18 + *
   79.19 + * You should have received a copy of the GNU Lesser General Public
   79.20 + * License along with FFmpeg; if not, write to the Free Software
   79.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   79.22 + */
   79.23 +
   79.24 +#ifndef TYPES_SPU_H
   79.25 +#define TYPES_SPU_H
   79.26 +
   79.27 +/***********************************************************************
   79.28 + * Scalar types: fixed-width integer names declared locally (no int64_t is provided); the widths assume 8-bit char, 16-bit short, 32-bit int and 64-bit long long on the target -- TODO confirm
   79.29 + **********************************************************************/
   79.30 +    typedef signed char  int8_t;
   79.31 +    typedef signed short int16_t;
   79.32 +    typedef signed int   int32_t;
   79.33 +    typedef unsigned char  uint8_t;
   79.34 +    typedef unsigned short uint16_t;
   79.35 +    typedef unsigned int   uint32_t;
   79.36 +    typedef unsigned long long uint64_t;
   79.37 +
   79.38 +//     typedef short DCTELEM;		// transform coefficients of dct
   79.39 +
   79.40 +/***********************************************************************
   79.41 + * Vector types: short names for the compiler's `vector` types, one per element type
   79.42 + **********************************************************************/
   79.43 +    typedef	vector	signed int	vsint32_t;
   79.44 +    typedef	vector	unsigned int	vuint32_t;
   79.45 +    typedef	vector	signed short	vsint16_t;
   79.46 +    typedef	vector	unsigned short	vuint16_t;
   79.47 +    typedef	vector	signed char	vsint8_t;
   79.48 +    typedef	vector	unsigned char	vuint8_t;
   79.49 +
   79.50 +/***********************************************************************
   79.51 + * Functions: function-pointer types; judging by their names they cover motion-compensation, IDCT, weighting and intra-prediction routines
   79.52 + **********************************************************************/
   79.53 +    typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h);
   79.54 +    typedef void (*h264_chroma_mc_func)(uint8_t *dst, uint8_t *src, int dst_stride, int h, int x, int y);
   79.55 +    typedef void (*h264_idct_func)(uint8_t *dst, short *block, int stride);
   79.56 +    typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
   79.57 +    typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride, int log2_denom, int weightd,
   79.58 +                  int weights, int offset);
   79.59 +    typedef void(* intra_pred4x4)(uint8_t *src, uint8_t *topright, int stride);
   79.60 +    typedef void(* intra_pred16x16)(uint8_t *src, int stride);
   79.61 +    typedef void(* intra_pred8x8)(uint8_t *src, int stride);
   79.62 +    typedef void(* intra_pred8x8l)(uint8_t *src, int topleft, int topright, int stride);
   79.63 +
   79.64 +// AVV wraps its arguments in braces, e.g. AVV(1,2) expands to {1,2}; the `x...` parameter is the GNU named-variadic macro form
   79.65 +#define AVV(x...) {x}
   79.66 +	
   79.67 +	
   79.68 +#endif // TYPES_SPU_H
   79.69 +
   79.70 +
   79.71 +
   79.72 +

    80.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    80.2 +++ b/ffmpeg_smp/h264dec/libavcodec/dsputil.c	Mon Aug 27 12:09:56 2012 +0200
    80.3 @@ -0,0 +1,1057 @@
    80.4 +/*
    80.5 + * DSP utils
    80.6 + * Copyright (c) 2000, 2001 Fabrice Bellard
    80.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
    80.8 + *
    80.9 + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   80.10 + *
   80.11 + * This file is part of FFmpeg.
   80.12 + *
   80.13 + * FFmpeg is free software; you can redistribute it and/or
   80.14 + * modify it under the terms of the GNU Lesser General Public
   80.15 + * License as published by the Free Software Foundation; either
   80.16 + * version 2.1 of the License, or (at your option) any later version.
   80.17 + *
   80.18 + * FFmpeg is distributed in the hope that it will be useful,
   80.19 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   80.20 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   80.21 + * Lesser General Public License for more details.
   80.22 + *
   80.23 + * You should have received a copy of the GNU Lesser General Public
   80.24 + * License along with FFmpeg; if not, write to the Free Software
   80.25 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   80.26 + */
   80.27 +
   80.28 +/**
   80.29 + * @file
   80.30 + * DSP utils
   80.31 + */
   80.32 +
   80.33 +#include "libavutil/log.h"
   80.34 +#include "dsputil.h"
   80.35 +#include "simple_idct.h"
   80.36 +#include "mathops.h"
   80.37 +#include "config.h"
   80.38 +
   80.39 +uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; // all zero here; presumably filled in by an init routine elsewhere -- TODO confirm
   80.40 +uint32_t ff_squareTbl[512] = {0, }; // all zero here; presumably filled in by an init routine elsewhere -- TODO confirm
   80.41 +
   80.42 +const uint8_t ff_zigzag_direct[64] = { // zigzag scan of an 8x8 block: entry k is the raster index (row*8 + column) of the k-th position in scan order
   80.43 +    0,   1,  8, 16,  9,  2,  3, 10,
   80.44 +    17, 24, 32, 25, 18, 11,  4,  5,
   80.45 +    12, 19, 26, 33, 40, 48, 41, 34,
   80.46 +    27, 20, 13,  6,  7, 14, 21, 28,
   80.47 +    35, 42, 49, 56, 57, 50, 43, 36,
   80.48 +    29, 22, 15, 23, 30, 37, 44, 51,
   80.49 +    58, 59, 52, 45, 38, 31, 39, 46,
   80.50 +    53, 60, 61, 54, 47, 55, 62, 63
   80.51 +};
   80.52 +
   80.53 +
   80.54 +#define PIXOP2(OPNAME, OP) \
   80.55 +static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
   80.56 +    int i;\
   80.57 +    for(i=0; i<h; i++){\
   80.58 +        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
   80.59 +        pixels+=line_size;\
   80.60 +        block +=line_size;\
   80.61 +    }\
   80.62 +}\
   80.63 +static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
   80.64 +    int i;\
   80.65 +    for(i=0; i<h; i++){\
   80.66 +        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
   80.67 +        pixels+=line_size;\
   80.68 +        block +=line_size;\
   80.69 +    }\
   80.70 +}\
   80.71 +static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
   80.72 +    int i;\
   80.73 +    for(i=0; i<h; i++){\
   80.74 +        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
   80.75 +        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
   80.76 +        pixels+=line_size;\
   80.77 +        block +=line_size;\
   80.78 +    }\
   80.79 +}\
   80.80 +static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
   80.81 +    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
   80.82 +}\
   80.83 +\
   80.84 +static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
   80.85 +                                                int src_stride1, int src_stride2, int h){\
   80.86 +    int i;\
   80.87 +    for(i=0; i<h; i++){\
   80.88 +        uint32_t a,b;\
   80.89 +        a= AV_RN32(&src1[i*src_stride1  ]);\
   80.90 +        b= AV_RN32(&src2[i*src_stride2  ]);\
   80.91 +        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
   80.92 +        a= AV_RN32(&src1[i*src_stride1+4]);\
   80.93 +        b= AV_RN32(&src2[i*src_stride2+4]);\
   80.94 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
   80.95 +    }\
   80.96 +}\
   80.97 +\
   80.98 +static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
   80.99 +                                                int src_stride1, int src_stride2, int h){\
  80.100 +    int i;\
  80.101 +    for(i=0; i<h; i++){\
  80.102 +        uint32_t a,b;\
  80.103 +        a= AV_RN32(&src1[i*src_stride1  ]);\
  80.104 +        b= AV_RN32(&src2[i*src_stride2  ]);\
  80.105 +        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
  80.106 +        a= AV_RN32(&src1[i*src_stride1+4]);\
  80.107 +        b= AV_RN32(&src2[i*src_stride2+4]);\
  80.108 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
  80.109 +    }\
  80.110 +}\
  80.111 +\
  80.112 +static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  80.113 +                                                int src_stride1, int src_stride2, int h){\
  80.114 +    int i;\
  80.115 +    for(i=0; i<h; i++){\
  80.116 +        uint32_t a,b;\
  80.117 +        a= AV_RN32(&src1[i*src_stride1  ]);\
  80.118 +        b= AV_RN32(&src2[i*src_stride2  ]);\
  80.119 +        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
  80.120 +    }\
  80.121 +}\
  80.122 +\
  80.123 +static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  80.124 +                                                int src_stride1, int src_stride2, int h){\
  80.125 +    int i;\
  80.126 +    for(i=0; i<h; i++){\
  80.127 +        uint32_t a,b;\
  80.128 +        a= AV_RN16(&src1[i*src_stride1  ]);\
  80.129 +        b= AV_RN16(&src2[i*src_stride2  ]);\
  80.130 +        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
  80.131 +    }\
  80.132 +}\
  80.133 +\
  80.134 +static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  80.135 +                                                int src_stride1, int src_stride2, int h){\
  80.136 +    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
  80.137 +    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
  80.138 +}\
  80.139 +\
  80.140 +static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
  80.141 +                                                int src_stride1, int src_stride2, int h){\
  80.142 +    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
  80.143 +    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
  80.144 +}\
  80.145 +\
  80.146 +static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  80.147 +    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
  80.148 +}\
  80.149 +\
  80.150 +static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  80.151 +    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
  80.152 +}\
  80.153 +\
  80.154 +static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  80.155 +    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
  80.156 +}\
  80.157 +\
  80.158 +static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  80.159 +    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
  80.160 +}\
  80.161 +\
  80.162 +static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  80.163 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* Four-source blend, 8 bytes per row for h rows: per byte (src1+src2+src3+src4+2)>>2, passed to OP. Each source has its own stride. */\
  80.164 +    int i;\
  80.165 +    for(i=0; i<h; i++){\
  80.166 +        uint32_t a, b, c, d, l0, l1, h0, h1;\
  80.167 +        a= AV_RN32(&src1[i*src_stride1]); /* first 4 bytes of the row, one 32-bit word per source */\
  80.168 +        b= AV_RN32(&src2[i*src_stride2]);\
  80.169 +        c= AV_RN32(&src3[i*src_stride3]);\
  80.170 +        d= AV_RN32(&src4[i*src_stride4]);\
  80.171 +        l0=  (a&0x03030303UL) /* low 2 bits of every byte lane */\
  80.172 +           + (b&0x03030303UL)\
  80.173 +           + 0x02020202UL; /* rounding bias of 2 per lane */\
  80.174 +        h0= ((a&0xFCFCFCFCUL)>>2) /* high 6 bits, shifted down first so lane sums cannot carry into the next byte */\
  80.175 +          + ((b&0xFCFCFCFCUL)>>2);\
  80.176 +        l1=  (c&0x03030303UL)\
  80.177 +           + (d&0x03030303UL);\
  80.178 +        h1= ((c&0xFCFCFCFCUL)>>2)\
  80.179 +          + ((d&0xFCFCFCFCUL)>>2);\
  80.180 +        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL)); /* low-bit sum is at most 14 per lane; the mask drops bits shifted in from the neighbouring lane */\
  80.181 +        a= AV_RN32(&src1[i*src_stride1+4]); /* same computation for bytes 4..7 of the row */\
  80.182 +        b= AV_RN32(&src2[i*src_stride2+4]);\
  80.183 +        c= AV_RN32(&src3[i*src_stride3+4]);\
  80.184 +        d= AV_RN32(&src4[i*src_stride4+4]);\
  80.185 +        l0=  (a&0x03030303UL)\
  80.186 +           + (b&0x03030303UL)\
  80.187 +           + 0x02020202UL;\
  80.188 +        h0= ((a&0xFCFCFCFCUL)>>2)\
  80.189 +          + ((b&0xFCFCFCFCUL)>>2);\
  80.190 +        l1=  (c&0x03030303UL)\
  80.191 +           + (d&0x03030303UL);\
  80.192 +        h1= ((c&0xFCFCFCFCUL)>>2)\
  80.193 +          + ((d&0xFCFCFCFCUL)>>2);\
  80.194 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.195 +    }\
  80.196 +}\
  80.197 +\
  80.198 +static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 4-wide, h rows: pairs each source byte with its right-hand neighbour (pixels+1) via _pixels4_l2, which is defined outside this view. */\
  80.199 +    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
  80.200 +}\
  80.201 +\
  80.202 +static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 4-wide, h rows: pairs each source row with the row line_size bytes below it via _pixels4_l2, which is defined outside this view. */\
  80.203 +    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
  80.204 +}\
  80.205 +\
  80.206 +static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 2-wide, h rows: pairs each source byte with its right-hand neighbour (pixels+1) via _pixels2_l2, which is defined outside this view. */\
  80.207 +    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
  80.208 +}\
  80.209 +\
  80.210 +static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){ /* 2-wide, h rows: pairs each source row with the row line_size bytes below it via _pixels2_l2, which is defined outside this view. */\
  80.211 +    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
  80.212 +}\
  80.213 +\
  80.214 +static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  80.215 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* Same four-source blend as _pixels8_l4 but with a bias of 1 instead of 2: per byte (src1+src2+src3+src4+1)>>2, passed to OP. */\
  80.216 +    int i;\
  80.217 +    for(i=0; i<h; i++){\
  80.218 +        uint32_t a, b, c, d, l0, l1, h0, h1;\
  80.219 +        a= AV_RN32(&src1[i*src_stride1]); /* first 4 bytes of the row, one 32-bit word per source */\
  80.220 +        b= AV_RN32(&src2[i*src_stride2]);\
  80.221 +        c= AV_RN32(&src3[i*src_stride3]);\
  80.222 +        d= AV_RN32(&src4[i*src_stride4]);\
  80.223 +        l0=  (a&0x03030303UL) /* low 2 bits of every byte lane */\
  80.224 +           + (b&0x03030303UL)\
  80.225 +           + 0x01010101UL; /* bias of 1 per lane (the rounding variant adds 2) */\
  80.226 +        h0= ((a&0xFCFCFCFCUL)>>2) /* high 6 bits, shifted down first so lane sums cannot carry into the next byte */\
  80.227 +          + ((b&0xFCFCFCFCUL)>>2);\
  80.228 +        l1=  (c&0x03030303UL)\
  80.229 +           + (d&0x03030303UL);\
  80.230 +        h1= ((c&0xFCFCFCFCUL)>>2)\
  80.231 +          + ((d&0xFCFCFCFCUL)>>2);\
  80.232 +        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.233 +        a= AV_RN32(&src1[i*src_stride1+4]); /* same computation for bytes 4..7 of the row */\
  80.234 +        b= AV_RN32(&src2[i*src_stride2+4]);\
  80.235 +        c= AV_RN32(&src3[i*src_stride3+4]);\
  80.236 +        d= AV_RN32(&src4[i*src_stride4+4]);\
  80.237 +        l0=  (a&0x03030303UL)\
  80.238 +           + (b&0x03030303UL)\
  80.239 +           + 0x01010101UL;\
  80.240 +        h0= ((a&0xFCFCFCFCUL)>>2)\
  80.241 +          + ((b&0xFCFCFCFCUL)>>2);\
  80.242 +        l1=  (c&0x03030303UL)\
  80.243 +           + (d&0x03030303UL);\
  80.244 +        h1= ((c&0xFCFCFCFCUL)>>2)\
  80.245 +          + ((d&0xFCFCFCFCUL)>>2);\
  80.246 +        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.247 +    }\
  80.248 +}\
  80.249 +static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  80.250 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* 16-wide four-source blend built from two 8-wide _pixels8_l4 calls: columns 0..7, then columns 8..15. */\
  80.251 +    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  80.252 +    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  80.253 +}\
  80.254 +static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
  80.255 +                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){ /* 16-wide bias-1 four-source blend built from two 8-wide _no_rnd_pixels8_l4 calls: columns 0..7, then columns 8..15. */\
  80.256 +    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  80.257 +    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
  80.258 +}\
  80.259 +\
  80.260 +static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h) /* 2-wide 2x2 box filter: each output is (p[x]+p[x+1]+q[x]+q[x+1]+2)>>2 with q the row below p. Results are stored directly, not through OP (see the FIXME), so the OPNAME variant does not affect the output here. */\
  80.261 +{\
  80.262 +        int i, a0, b0, a1, b1;\
  80.263 +        a0= pixels[0];\
  80.264 +        b0= pixels[1] + 2; /* the +2 rounding bias is carried in the a0/b0 row sums only */\
  80.265 +        a0 += b0; /* a0 = p[0]+p[1]+2 */\
  80.266 +        b0 += pixels[2]; /* b0 = p[1]+p[2]+2 */\
  80.267 +\
  80.268 +        pixels+=line_size;\
  80.269 +        for(i=0; i<h; i+=2){ /* two output rows per pass, so an odd h would write h+1 rows */\
  80.270 +            a1= pixels[0];\
  80.271 +            b1= pixels[1];\
  80.272 +            a1 += b1; /* unbiased horizontal sums of the next source row */\
  80.273 +            b1 += pixels[2];\
  80.274 +\
  80.275 +            block[0]= (a1+a0)>>2; /* FIXME non put */\
  80.276 +            block[1]= (b1+b0)>>2;\
  80.277 +\
  80.278 +            pixels+=line_size;\
  80.279 +            block +=line_size; /* destination uses the same stride as the source */\
  80.280 +\
  80.281 +            a0= pixels[0]; /* refresh the biased sums from the following row; a1/b1 are reused as the upper row */\
  80.282 +            b0= pixels[1] + 2;\
  80.283 +            a0 += b0;\
  80.284 +            b0 += pixels[2];\
  80.285 +\
  80.286 +            block[0]= (a1+a0)>>2;\
  80.287 +            block[1]= (b1+b0)>>2;\
  80.288 +            pixels+=line_size;\
  80.289 +            block +=line_size;\
  80.290 +        }\
  80.291 +}\
  80.292 +\
  80.293 +static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h) /* 4-wide 2x2 box filter done on one 32-bit word per row: per byte (p[x]+p[x+1]+q[x]+q[x+1]+2)>>2 with q the row below p, passed to OP. */\
  80.294 +{\
  80.295 +        int i;\
  80.296 +        const uint32_t a= AV_RN32(pixels  ); /* bytes x..x+3 of the first row */\
  80.297 +        const uint32_t b= AV_RN32(pixels+1); /* the same row shifted one byte right */\
  80.298 +        uint32_t l0=  (a&0x03030303UL) /* low 2 bits per lane plus rounding bias 2 */\
  80.299 +                    + (b&0x03030303UL)\
  80.300 +                    + 0x02020202UL;\
  80.301 +        uint32_t h0= ((a&0xFCFCFCFCUL)>>2) /* high 6 bits per lane, pre-shifted to avoid cross-lane carries */\
  80.302 +                   + ((b&0xFCFCFCFCUL)>>2);\
  80.303 +        uint32_t l1,h1;\
  80.304 +\
  80.305 +        pixels+=line_size;\
  80.306 +        for(i=0; i<h; i+=2){ /* two output rows per pass, so an odd h would write h+1 rows */\
  80.307 +            uint32_t a= AV_RN32(pixels  ); /* shadows the const a/b declared above */\
  80.308 +            uint32_t b= AV_RN32(pixels+1);\
  80.309 +            l1=  (a&0x03030303UL) /* unbiased sums for this row */\
  80.310 +               + (b&0x03030303UL);\
  80.311 +            h1= ((a&0xFCFCFCFCUL)>>2)\
  80.312 +              + ((b&0xFCFCFCFCUL)>>2);\
  80.313 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.314 +            pixels+=line_size;\
  80.315 +            block +=line_size;\
  80.316 +            a= AV_RN32(pixels  );\
  80.317 +            b= AV_RN32(pixels+1);\
  80.318 +            l0=  (a&0x03030303UL) /* biased sums for the following row; l1/h1 are reused as the upper row */\
  80.319 +               + (b&0x03030303UL)\
  80.320 +               + 0x02020202UL;\
  80.321 +            h0= ((a&0xFCFCFCFCUL)>>2)\
  80.322 +              + ((b&0xFCFCFCFCUL)>>2);\
  80.323 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.324 +            pixels+=line_size;\
  80.325 +            block +=line_size;\
  80.326 +        }\
  80.327 +}\
  80.328 +\
  80.329 +static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h) /* 8-wide 2x2 box filter, processed as two 4-byte columns: per byte (p[x]+p[x+1]+q[x]+q[x+1]+2)>>2 with q the row below p, passed to OP. */\
  80.330 +{\
  80.331 +    int j;\
  80.332 +    for(j=0; j<2; j++){ /* j selects the left or right 4-byte column */\
  80.333 +        int i;\
  80.334 +        const uint32_t a= AV_RN32(pixels  ); /* bytes x..x+3 of the first row */\
  80.335 +        const uint32_t b= AV_RN32(pixels+1); /* the same row shifted one byte right */\
  80.336 +        uint32_t l0=  (a&0x03030303UL) /* low 2 bits per lane plus rounding bias 2 */\
  80.337 +                    + (b&0x03030303UL)\
  80.338 +                    + 0x02020202UL;\
  80.339 +        uint32_t h0= ((a&0xFCFCFCFCUL)>>2) /* high 6 bits per lane, pre-shifted to avoid cross-lane carries */\
  80.340 +                   + ((b&0xFCFCFCFCUL)>>2);\
  80.341 +        uint32_t l1,h1;\
  80.342 +\
  80.343 +        pixels+=line_size;\
  80.344 +        for(i=0; i<h; i+=2){ /* two output rows per pass */\
  80.345 +            uint32_t a= AV_RN32(pixels  ); /* shadows the const a/b declared above */\
  80.346 +            uint32_t b= AV_RN32(pixels+1);\
  80.347 +            l1=  (a&0x03030303UL)\
  80.348 +               + (b&0x03030303UL);\
  80.349 +            h1= ((a&0xFCFCFCFCUL)>>2)\
  80.350 +              + ((b&0xFCFCFCFCUL)>>2);\
  80.351 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.352 +            pixels+=line_size;\
  80.353 +            block +=line_size;\
  80.354 +            a= AV_RN32(pixels  );\
  80.355 +            b= AV_RN32(pixels+1);\
  80.356 +            l0=  (a&0x03030303UL)\
  80.357 +               + (b&0x03030303UL)\
  80.358 +               + 0x02020202UL;\
  80.359 +            h0= ((a&0xFCFCFCFCUL)>>2)\
  80.360 +              + ((b&0xFCFCFCFCUL)>>2);\
  80.361 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.362 +            pixels+=line_size;\
  80.363 +            block +=line_size;\
  80.364 +        }\
  80.365 +        pixels+=4-line_size*(h+1); /* rewind the h+1 source rows consumed and step 4 bytes right (exact when h is even) */\
  80.366 +        block +=4-line_size*h; /* rewind the h destination rows written and step 4 bytes right */\
  80.367 +    }\
  80.368 +}\
  80.369 +\
  80.370 +static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h) /* 8-wide 2x2 box filter with a bias of 1 instead of 2, processed as two 4-byte columns: per byte (p[x]+p[x+1]+q[x]+q[x+1]+1)>>2 with q the row below p, passed to OP. */\
  80.371 +{\
  80.372 +    int j;\
  80.373 +    for(j=0; j<2; j++){ /* j selects the left or right 4-byte column */\
  80.374 +        int i;\
  80.375 +        const uint32_t a= AV_RN32(pixels  ); /* bytes x..x+3 of the first row */\
  80.376 +        const uint32_t b= AV_RN32(pixels+1); /* the same row shifted one byte right */\
  80.377 +        uint32_t l0=  (a&0x03030303UL) /* low 2 bits per lane plus bias 1 */\
  80.378 +                    + (b&0x03030303UL)\
  80.379 +                    + 0x01010101UL;\
  80.380 +        uint32_t h0= ((a&0xFCFCFCFCUL)>>2) /* high 6 bits per lane, pre-shifted to avoid cross-lane carries */\
  80.381 +                   + ((b&0xFCFCFCFCUL)>>2);\
  80.382 +        uint32_t l1,h1;\
  80.383 +\
  80.384 +        pixels+=line_size;\
  80.385 +        for(i=0; i<h; i+=2){ /* two output rows per pass */\
  80.386 +            uint32_t a= AV_RN32(pixels  ); /* shadows the const a/b declared above */\
  80.387 +            uint32_t b= AV_RN32(pixels+1);\
  80.388 +            l1=  (a&0x03030303UL)\
  80.389 +               + (b&0x03030303UL);\
  80.390 +            h1= ((a&0xFCFCFCFCUL)>>2)\
  80.391 +              + ((b&0xFCFCFCFCUL)>>2);\
  80.392 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.393 +            pixels+=line_size;\
  80.394 +            block +=line_size;\
  80.395 +            a= AV_RN32(pixels  );\
  80.396 +            b= AV_RN32(pixels+1);\
  80.397 +            l0=  (a&0x03030303UL)\
  80.398 +               + (b&0x03030303UL)\
  80.399 +               + 0x01010101UL;\
  80.400 +            h0= ((a&0xFCFCFCFCUL)>>2)\
  80.401 +              + ((b&0xFCFCFCFCUL)>>2);\
  80.402 +            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
  80.403 +            pixels+=line_size;\
  80.404 +            block +=line_size;\
  80.405 +        }\
  80.406 +        pixels+=4-line_size*(h+1); /* rewind the h+1 source rows consumed and step 4 bytes right (exact when h is even) */\
  80.407 +        block +=4-line_size*h; /* rewind the h destination rows written and step 4 bytes right */\
  80.408 +    }\
  80.409 +}\
  80.410 +\
  80.411 +CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8) /* CALL_2X_PIXELS is defined outside this view; presumably it defines _pixels16_c as two _pixels8_c calls 8 bytes apart - confirm. Last line of the enclosing macro body. */\
  80.412 +
  80.413 +#define op_avg(a, b) a = rnd_avg32(a, b) /* store rnd_avg32 of the current destination word and the new value (rnd_avg32 is defined elsewhere; by its name a rounded per-byte average - confirm) */
  80.414 +
  80.415 +#define op_put(a, b) a = b /* plain overwrite of the destination */
  80.416 +
  80.417 +PIXOP2(avg, op_avg) /* instantiate the avg_* pixel functions */
  80.418 +PIXOP2(put, op_put) /* instantiate the put_* pixel functions */
  80.419 +#undef op_avg /* the same macro names are redefined further down for the chroma functions */
  80.420 +#undef op_put
  80.421 +
  80.422 +
  80.423 +#define H264_CHROMA_MC(OPNAME, OP) /* Template for the 2-, 4- and 8-wide bilinear chroma functions; OPNAME prefixes the function names and OP(dst, sum) receives the unnormalised weighted sum. */\
  80.424 +static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ /* 2-wide, h rows; x and y are eighth-sample offsets in 0..7 (asserted below). */\
  80.425 +    const int A=(8-x)*(8-y); /* bilinear weights of the four neighbours; A+B+C+D == 64 */\
  80.426 +    const int B=(  x)*(8-y);\
  80.427 +    const int C=(8-x)*(  y);\
  80.428 +    const int D=(  x)*(  y);\
  80.429 +    int i;\
  80.430 +    \
  80.431 +    assert(x<8 && y<8 && x>=0 && y>=0);\
  80.432 +\
  80.433 +    if(D){ /* both offsets non-zero: full 2-D interpolation */\
  80.434 +        for(i=0; i<h; i++){\
  80.435 +            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
  80.436 +            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
  80.437 +            dst+= stride;\
  80.438 +            src+= stride;\
  80.439 +        }\
  80.440 +    }else{ /* x==0 or y==0: at most one of B and C is non-zero, so a two-tap filter suffices */\
  80.441 +        const int E= B+C;\
  80.442 +        const int step= C ? stride : 1; /* second tap is the pixel below when C is non-zero, otherwise the pixel to the right */\
  80.443 +        for(i=0; i<h; i++){\
  80.444 +            OP(dst[0], (A*src[0] + E*src[step+0]));\
  80.445 +            OP(dst[1], (A*src[1] + E*src[step+1]));\
  80.446 +            dst+= stride;\
  80.447 +            src+= stride;\
  80.448 +        }\
  80.449 +    }\
  80.450 +}\
  80.451 +\
  80.452 +static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ /* 4-wide bilinear chroma interpolation over h rows; x and y are eighth-sample offsets in 0..7 (asserted below). OP receives the unnormalised weighted sum. */\
  80.453 +    const int A=(8-x)*(8-y); /* bilinear weights of the four neighbours; A+B+C+D == 64 */\
  80.454 +    const int B=(  x)*(8-y);\
  80.455 +    const int C=(8-x)*(  y);\
  80.456 +    const int D=(  x)*(  y);\
  80.457 +    int i;\
  80.458 +    \
  80.459 +    assert(x<8 && y<8 && x>=0 && y>=0);\
  80.460 +\
  80.461 +    if(D){ /* both offsets non-zero: full 2-D interpolation */\
  80.462 +        for(i=0; i<h; i++){\
  80.463 +            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
  80.464 +            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
  80.465 +            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
  80.466 +            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
  80.467 +            dst+= stride;\
  80.468 +            src+= stride;\
  80.469 +        }\
  80.470 +    }else{ /* x==0 or y==0: at most one of B and C is non-zero, so a two-tap filter suffices */\
  80.471 +        const int E= B+C;\
  80.472 +        const int step= C ? stride : 1; /* second tap is the pixel below when C is non-zero, otherwise the pixel to the right */\
  80.473 +        for(i=0; i<h; i++){\
  80.474 +            OP(dst[0], (A*src[0] + E*src[step+0]));\
  80.475 +            OP(dst[1], (A*src[1] + E*src[step+1]));\
  80.476 +            OP(dst[2], (A*src[2] + E*src[step+2]));\
  80.477 +            OP(dst[3], (A*src[3] + E*src[step+3]));\
  80.478 +            dst+= stride;\
  80.479 +            src+= stride;\
  80.480 +        }\
  80.481 +    }\
  80.482 +}\
  80.483 +\
  80.484 +static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ /* 8-wide bilinear chroma interpolation over h rows; x and y are eighth-sample offsets in 0..7 (asserted below). OP receives the unnormalised weighted sum. */\
  80.485 +    const int A=(8-x)*(8-y); /* bilinear weights of the four neighbours; A+B+C+D == 64 */\
  80.486 +    const int B=(  x)*(8-y);\
  80.487 +    const int C=(8-x)*(  y);\
  80.488 +    const int D=(  x)*(  y);\
  80.489 +    int i;\
  80.490 +    \
  80.491 +    assert(x<8 && y<8 && x>=0 && y>=0);\
  80.492 +\
  80.493 +    if(D){ /* both offsets non-zero: full 2-D interpolation; reads one extra column (src[8]) and one extra row */\
  80.494 +        for(i=0; i<h; i++){\
  80.495 +            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
  80.496 +            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
  80.497 +            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
  80.498 +            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
  80.499 +            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
  80.500 +            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
  80.501 +            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
  80.502 +            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
  80.503 +            dst+= stride;\
  80.504 +            src+= stride;\
  80.505 +        }\
  80.506 +    }else{ /* x==0 or y==0: at most one of B and C is non-zero, so a two-tap filter suffices */\
  80.507 +        const int E= B+C;\
  80.508 +        const int step= C ? stride : 1; /* second tap is the pixel below when C is non-zero, otherwise the pixel to the right */\
  80.509 +        for(i=0; i<h; i++){\
  80.510 +            OP(dst[0], (A*src[0] + E*src[step+0]));\
  80.511 +            OP(dst[1], (A*src[1] + E*src[step+1]));\
  80.512 +            OP(dst[2], (A*src[2] + E*src[step+2]));\
  80.513 +            OP(dst[3], (A*src[3] + E*src[step+3]));\
  80.514 +            OP(dst[4], (A*src[4] + E*src[step+4]));\
  80.515 +            OP(dst[5], (A*src[5] + E*src[step+5]));\
  80.516 +            OP(dst[6], (A*src[6] + E*src[step+6]));\
  80.517 +            OP(dst[7], (A*src[7] + E*src[step+7]));\
  80.518 +            dst+= stride;\
  80.519 +            src+= stride;\
  80.520 +        }\
  80.521 +    }\
  80.522 +}
  80.523 +
  80.524 +#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1) /* normalise the weighted sum b by 64 with rounding, then take the rounded mean with the existing destination value */
  80.525 +#define op_put(a, b) a = (((b) + 32)>>6) /* normalise the weighted sum b by 64 with rounding and store it */
  80.526 +
  80.527 +H264_CHROMA_MC(put_       , op_put) /* instantiate put_h264_chroma_mc{2,4,8}_c */
  80.528 +H264_CHROMA_MC(avg_       , op_avg) /* instantiate avg_h264_chroma_mc{2,4,8}_c */
  80.529 +#undef op_avg
  80.530 +#undef op_put
  80.531 +
  80.532 +
  80.533 +#define H264_LOWPASS(OPNAME, OP, OP2) /* Template for the 6-tap (1,-5,20,20,-5,1) low-pass filters at sizes 2, 4, 8 and 16. OP receives single-pass sums, OP2 receives the two-pass (hv) sums; both are supplied at the instantiation site, which is outside this view. */ \
  80.534 +static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Horizontal filter, 2x2 outputs; reads src[-2..4] on each row and passes the unnormalised sum to OP. */\
  80.535 +    const int h=2;\
  80.536 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP for clipping - confirm */\
  80.537 +    int i;\
  80.538 +    for(i=0; i<h; i++)\
  80.539 +    {\
  80.540 +        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
  80.541 +        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
  80.542 +        dst+=dstStride;\
  80.543 +        src+=srcStride;\
  80.544 +    }\
  80.545 +}\
  80.546 +\
  80.547 +static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Vertical 6-tap (1,-5,20,20,-5,1) filter, 2x2 outputs; walks column by column, reads rows -2..4 of src and passes the unnormalised sum to OP. */\
  80.548 +    const int w=2;\
  80.549 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP for clipping - confirm */\
  80.550 +    int i;\
  80.551 +    for(i=0; i<w; i++)\
  80.552 +    {\
  80.553 +        const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above row 0 */\
  80.554 +        const int srcA= src[-1*srcStride];\
  80.555 +        const int src0= src[0 *srcStride];\
  80.556 +        const int src1= src[1 *srcStride];\
  80.557 +        const int src2= src[2 *srcStride];\
  80.558 +        const int src3= src[3 *srcStride];\
  80.559 +        const int src4= src[4 *srcStride];\
  80.560 +        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  80.561 +        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  80.562 +        dst++;\
  80.563 +        src++;\
  80.564 +    }\
  80.565 +}\
  80.566 +\
  80.567 +static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* Two-pass 2x2 filter: horizontal 6-tap sums for h+5 rows go into the 16-bit tmp buffer, then a vertical 6-tap runs down each tmp column and the result goes to OP2. */\
  80.568 +    const int h=2;\
  80.569 +    const int w=2;\
  80.570 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP2 for clipping - confirm */\
  80.571 +    int i;\
  80.572 +    src -= 2*srcStride; /* start two rows above so the vertical taps have context */\
  80.573 +    for(i=0; i<h+5; i++)\
  80.574 +    {\
  80.575 +        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
  80.576 +        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
  80.577 +        tmp+=tmpStride;\
  80.578 +        src+=srcStride;\
  80.579 +    }\
  80.580 +    tmp -= tmpStride*(h+5-2); /* back to tmp row 2, the row that corresponds to the original src row 0 */\
  80.581 +    for(i=0; i<w; i++)\
  80.582 +    {\
  80.583 +        const int tmpB= tmp[-2*tmpStride];\
  80.584 +        const int tmpA= tmp[-1*tmpStride];\
  80.585 +        const int tmp0= tmp[0 *tmpStride];\
  80.586 +        const int tmp1= tmp[1 *tmpStride];\
  80.587 +        const int tmp2= tmp[2 *tmpStride];\
  80.588 +        const int tmp3= tmp[3 *tmpStride];\
  80.589 +        const int tmp4= tmp[4 *tmpStride];\
  80.590 +        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
  80.591 +        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
  80.592 +        dst++;\
  80.593 +        tmp++;\
  80.594 +    }\
  80.595 +}\
  80.596 +static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Horizontal 6-tap (1,-5,20,20,-5,1) filter, 4x4 outputs; reads src[-2..6] on each row and passes the unnormalised sum to OP. */\
  80.597 +    const int h=4;\
  80.598 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP for clipping - confirm */\
  80.599 +    int i;\
  80.600 +    for(i=0; i<h; i++)\
  80.601 +    {\
  80.602 +        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
  80.603 +        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
  80.604 +        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
  80.605 +        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
  80.606 +        dst+=dstStride;\
  80.607 +        src+=srcStride;\
  80.608 +    }\
  80.609 +}\
  80.610 +\
  80.611 +static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Vertical 6-tap (1,-5,20,20,-5,1) filter, 4x4 outputs; walks column by column, reads rows -2..6 of src and passes the unnormalised sum to OP. */\
  80.612 +    const int w=4;\
  80.613 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP for clipping - confirm */\
  80.614 +    int i;\
  80.615 +    for(i=0; i<w; i++)\
  80.616 +    {\
  80.617 +        const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above row 0 */\
  80.618 +        const int srcA= src[-1*srcStride];\
  80.619 +        const int src0= src[0 *srcStride];\
  80.620 +        const int src1= src[1 *srcStride];\
  80.621 +        const int src2= src[2 *srcStride];\
  80.622 +        const int src3= src[3 *srcStride];\
  80.623 +        const int src4= src[4 *srcStride];\
  80.624 +        const int src5= src[5 *srcStride];\
  80.625 +        const int src6= src[6 *srcStride];\
  80.626 +        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  80.627 +        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  80.628 +        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  80.629 +        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  80.630 +        dst++;\
  80.631 +        src++;\
  80.632 +    }\
  80.633 +}\
  80.634 +\
  80.635 +static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* Two-pass 4x4 filter: horizontal 6-tap sums for h+5 rows go into the 16-bit tmp buffer, then a vertical 6-tap runs down each tmp column and the result goes to OP2. */\
  80.636 +    const int h=4;\
  80.637 +    const int w=4;\
  80.638 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP2 for clipping - confirm */\
  80.639 +    int i;\
  80.640 +    src -= 2*srcStride; /* start two rows above so the vertical taps have context */\
  80.641 +    for(i=0; i<h+5; i++)\
  80.642 +    {\
  80.643 +        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
  80.644 +        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
  80.645 +        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
  80.646 +        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
  80.647 +        tmp+=tmpStride;\
  80.648 +        src+=srcStride;\
  80.649 +    }\
  80.650 +    tmp -= tmpStride*(h+5-2); /* back to tmp row 2, the row that corresponds to the original src row 0 */\
  80.651 +    for(i=0; i<w; i++)\
  80.652 +    {\
  80.653 +        const int tmpB= tmp[-2*tmpStride];\
  80.654 +        const int tmpA= tmp[-1*tmpStride];\
  80.655 +        const int tmp0= tmp[0 *tmpStride];\
  80.656 +        const int tmp1= tmp[1 *tmpStride];\
  80.657 +        const int tmp2= tmp[2 *tmpStride];\
  80.658 +        const int tmp3= tmp[3 *tmpStride];\
  80.659 +        const int tmp4= tmp[4 *tmpStride];\
  80.660 +        const int tmp5= tmp[5 *tmpStride];\
  80.661 +        const int tmp6= tmp[6 *tmpStride];\
  80.662 +        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
  80.663 +        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
  80.664 +        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
  80.665 +        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
  80.666 +        dst++;\
  80.667 +        tmp++;\
  80.668 +    }\
  80.669 +}\
  80.670 +\
  80.671 +static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Horizontal 6-tap (1,-5,20,20,-5,1) filter, 8x8 outputs; reads src[-2..10] on each row and passes the unnormalised sum to OP. */\
  80.672 +    const int h=8;\
  80.673 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP for clipping - confirm */\
  80.674 +    int i;\
  80.675 +    for(i=0; i<h; i++)\
  80.676 +    {\
  80.677 +        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
  80.678 +        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
  80.679 +        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
  80.680 +        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
  80.681 +        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
  80.682 +        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
  80.683 +        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
  80.684 +        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
  80.685 +        dst+=dstStride;\
  80.686 +        src+=srcStride;\
  80.687 +    }\
  80.688 +}\
  80.689 +\
  80.690 +static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* Vertical 6-tap (1,-5,20,20,-5,1) filter, 8x8 outputs; walks column by column, reads rows -2..10 of src and passes the unnormalised sum to OP. */\
  80.691 +    const int w=8;\
  80.692 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP for clipping - confirm */\
  80.693 +    int i;\
  80.694 +    for(i=0; i<w; i++)\
  80.695 +    {\
  80.696 +        const int srcB= src[-2*srcStride]; /* srcB and srcA are the two rows above row 0 */\
  80.697 +        const int srcA= src[-1*srcStride];\
  80.698 +        const int src0= src[0 *srcStride];\
  80.699 +        const int src1= src[1 *srcStride];\
  80.700 +        const int src2= src[2 *srcStride];\
  80.701 +        const int src3= src[3 *srcStride];\
  80.702 +        const int src4= src[4 *srcStride];\
  80.703 +        const int src5= src[5 *srcStride];\
  80.704 +        const int src6= src[6 *srcStride];\
  80.705 +        const int src7= src[7 *srcStride];\
  80.706 +        const int src8= src[8 *srcStride];\
  80.707 +        const int src9= src[9 *srcStride];\
  80.708 +        const int src10=src[10*srcStride];\
  80.709 +        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
  80.710 +        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
  80.711 +        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
  80.712 +        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
  80.713 +        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
  80.714 +        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
  80.715 +        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
  80.716 +        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
  80.717 +        dst++;\
  80.718 +        src++;\
  80.719 +    }\
  80.720 +}\
  80.721 +\
  80.722 +static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* Two-pass 8x8 filter: horizontal 6-tap sums for h+5 rows go into the 16-bit tmp buffer, then a vertical 6-tap runs down each tmp column and the result goes to OP2. */\
  80.723 +    const int h=8;\
  80.724 +    const int w=8;\
  80.725 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* not referenced by name in this function; presumably used inside OP2 for clipping - confirm */\
  80.726 +    int i;\
  80.727 +    src -= 2*srcStride; /* start two rows above so the vertical taps have context */\
  80.728 +    for(i=0; i<h+5; i++)\
  80.729 +    {\
  80.730 +        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
  80.731 +        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
  80.732 +        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
  80.733 +        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
  80.734 +        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
  80.735 +        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
  80.736 +        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
  80.737 +        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
  80.738 +        tmp+=tmpStride;\
  80.739 +        src+=srcStride;\
  80.740 +    }\
  80.741 +    tmp -= tmpStride*(h+5-2); /* back to tmp row 2, the row that corresponds to the original src row 0 */\
  80.742 +    for(i=0; i<w; i++)\
  80.743 +    {\
  80.744 +        const int tmpB= tmp[-2*tmpStride];\
  80.745 +        const int tmpA= tmp[-1*tmpStride];\
  80.746 +        const int tmp0= tmp[0 *tmpStride];\
  80.747 +        const int tmp1= tmp[1 *tmpStride];\
  80.748 +        const int tmp2= tmp[2 *tmpStride];\
  80.749 +        const int tmp3= tmp[3 *tmpStride];\
  80.750 +        const int tmp4= tmp[4 *tmpStride];\
  80.751 +        const int tmp5= tmp[5 *tmpStride];\
  80.752 +        const int tmp6= tmp[6 *tmpStride];\
  80.753 +        const int tmp7= tmp[7 *tmpStride];\
  80.754 +        const int tmp8= tmp[8 *tmpStride];\
  80.755 +        const int tmp9= tmp[9 *tmpStride];\
  80.756 +        const int tmp10=tmp[10*tmpStride];\
  80.757 +        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
  80.758 +        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
  80.759 +        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
  80.760 +        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
  80.761 +        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
  80.762 +        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
  80.763 +        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
  80.764 +        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
  80.765 +        dst++;\
  80.766 +        tmp++;\
  80.767 +    }\
  80.768 +}\
  80.769 +\
  80.770 +static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 vertical filter assembled from four 8x8 h264_qpel8_v_lowpass calls. */\
  80.771 +    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride); /* top-left, then top-right quadrant */\
  80.772 +    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
  80.773 +    src += 8*srcStride; /* move both pointers down 8 rows for the bottom half */\
  80.774 +    dst += 8*dstStride;\
  80.775 +    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
  80.776 +    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
  80.777 +}\
  80.778 +\
  80.779 +static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* 16x16 horizontal filter assembled from four 8x8 h264_qpel8_h_lowpass calls. */\
  80.780 +    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride); /* top-left, then top-right quadrant */\
  80.781 +    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
  80.782 +    src += 8*srcStride; /* move both pointers down 8 rows for the bottom half */\
  80.783 +    dst += 8*dstStride;\
  80.784 +    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
  80.785 +    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
  80.786 +}\
  80.787 +\
  80.788 +static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* 16x16 two-pass filter assembled from four 8x8 h264_qpel8_hv_lowpass calls. */\
  80.789 +    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride); /* top-left, then top-right quadrant */\
  80.790 +    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
  80.791 +    src += 8*srcStride; /* move src and dst down 8 rows; tmp is not advanced, so the bottom half reuses the same scratch rows */\
  80.792 +    dst += 8*dstStride;\
  80.793 +    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
  80.794 +    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
  80.795 +}\
  80.796 +
  80.797 +#define H264_MC(OPNAME, SIZE) \
  80.798 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* No filtering: forwards the SIZE x SIZE block to OPNAME pixels SIZE _c (defined outside this view) with the same stride for source and destination. */\
  80.799 +    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
  80.800 +}\
  80.801 +\
  80.802 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* Combines the source block with its horizontally low-passed version through the _l2 helper (defined outside this view; presumably a two-source average). */\
  80.803 +    uint8_t half[SIZE*SIZE]; /* packed scratch block, row stride SIZE */\
  80.804 +    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride); /* always the put_ variant: the scratch block is written, never blended */\
  80.805 +    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
  80.806 +}\
  80.807 +\
  80.808 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* Horizontal low-pass only, applied straight from src to dst with the OPNAME store operation. */\
  80.809 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
  80.810 +}\
  80.811 +\
  80.812 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* Like mc10, but the low-passed block is combined with the source shifted one pixel to the right (src+1). */\
  80.813 +    uint8_t half[SIZE*SIZE]; /* packed scratch block, row stride SIZE */\
  80.814 +    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
  80.815 +    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
  80.816 +}\
  80.817 +\
  80.818 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* Combines the source block with its vertically low-passed version through the _l2 helper (defined outside this view; presumably a two-source average). */\
  80.819 +    uint8_t full[SIZE*(SIZE+5)]; /* packed copy of the source with row stride SIZE: SIZE rows plus 2 above and 3 below for the filter taps */\
  80.820 +    uint8_t * const full_mid= full + SIZE*2; /* row in the copy that corresponds to src row 0 */\
  80.821 +    uint8_t half[SIZE*SIZE];\
  80.822 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5); /* copy_block is defined outside this view; presumably (dst, src, dstStride, srcStride, rows) - confirm */\
  80.823 +    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
  80.824 +    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
  80.825 +}\
  80.826 +\
  80.827 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* Vertical low-pass only: filters a packed copy of the source straight into dst with the OPNAME store operation. */\
  80.828 +    uint8_t full[SIZE*(SIZE+5)]; /* packed copy of the source with row stride SIZE: SIZE rows plus 2 above and 3 below for the filter taps */\
  80.829 +    uint8_t * const full_mid= full + SIZE*2; /* row in the copy that corresponds to src row 0 */\
  80.830 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
  80.831 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
  80.832 +}\
  80.833 +\
  80.834 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* Like mc01, but the vertically low-passed block is combined with the source shifted one row down (full_mid+SIZE). */\
  80.835 +    uint8_t full[SIZE*(SIZE+5)]; /* packed copy of the source with row stride SIZE: SIZE rows plus 2 above and 3 below for the filter taps */\
  80.836 +    uint8_t * const full_mid= full + SIZE*2; /* row in the copy that corresponds to src row 0 */\
  80.837 +    uint8_t half[SIZE*SIZE];\
  80.838 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
  80.839 +    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
  80.840 +    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
  80.841 +}\
  80.842 +\
  80.843 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
  80.844 +    uint8_t full[SIZE*(SIZE+5)];\
  80.845 +    uint8_t * const full_mid= full + SIZE*2;\
  80.846 +    uint8_t halfH[SIZE*SIZE];\
  80.847 +    uint8_t halfV[SIZE*SIZE];\
  80.848 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
  80.849 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
  80.850 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
  80.851 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  80.852 +}\
  80.853 +\
  80.854 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
  80.855 +    uint8_t full[SIZE*(SIZE+5)];\
  80.856 +    uint8_t * const full_mid= full + SIZE*2;\
  80.857 +    uint8_t halfH[SIZE*SIZE];\
  80.858 +    uint8_t halfV[SIZE*SIZE];\
  80.859 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
  80.860 +    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
  80.861 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
  80.862 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  80.863 +}\
  80.864 +\
  80.865 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
  80.866 +    uint8_t full[SIZE*(SIZE+5)];\
  80.867 +    uint8_t * const full_mid= full + SIZE*2;\
  80.868 +    uint8_t halfH[SIZE*SIZE];\
  80.869 +    uint8_t halfV[SIZE*SIZE];\
  80.870 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
  80.871 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
  80.872 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
  80.873 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  80.874 +}\
  80.875 +\
  80.876 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
  80.877 +    uint8_t full[SIZE*(SIZE+5)];\
  80.878 +    uint8_t * const full_mid= full + SIZE*2;\
  80.879 +    uint8_t halfH[SIZE*SIZE];\
  80.880 +    uint8_t halfV[SIZE*SIZE];\
  80.881 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
  80.882 +    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
  80.883 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
  80.884 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
  80.885 +}\
  80.886 +\
  80.887 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
  80.888 +    int16_t tmp[SIZE*(SIZE+5)];\
  80.889 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
  80.890 +}\
  80.891 +\
  80.892 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
  80.893 +    int16_t tmp[SIZE*(SIZE+5)];\
  80.894 +    uint8_t halfH[SIZE*SIZE];\
  80.895 +    uint8_t halfHV[SIZE*SIZE];\
  80.896 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
  80.897 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
  80.898 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  80.899 +}\
  80.900 +\
  80.901 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
  80.902 +    int16_t tmp[SIZE*(SIZE+5)];\
  80.903 +    uint8_t halfH[SIZE*SIZE];\
  80.904 +    uint8_t halfHV[SIZE*SIZE];\
  80.905 +    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
  80.906 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
  80.907 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
  80.908 +}\
  80.909 +\
  80.910 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
  80.911 +    uint8_t full[SIZE*(SIZE+5)];\
  80.912 +    uint8_t * const full_mid= full + SIZE*2;\
  80.913 +    int16_t tmp[SIZE*(SIZE+5)];\
  80.914 +    uint8_t halfV[SIZE*SIZE];\
  80.915 +    uint8_t halfHV[SIZE*SIZE];\
  80.916 +    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
  80.917 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
  80.918 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
  80.919 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  80.920 +}\
  80.921 +\
  80.922 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
  80.923 +    uint8_t full[SIZE*(SIZE+5)];\
  80.924 +    uint8_t * const full_mid= full + SIZE*2;\
  80.925 +    int16_t tmp[SIZE*(SIZE+5)];\
  80.926 +    uint8_t halfV[SIZE*SIZE];\
  80.927 +    uint8_t halfHV[SIZE*SIZE];\
  80.928 +    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
  80.929 +    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
  80.930 +    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
  80.931 +    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
  80.932 +}\
  80.933 +
  80.934 +#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
  80.935 +#define op_put(a, b)  a = cm[((b) + 16)>>5]
  80.936 +#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
  80.937 +#define op2_put(a, b)  a = cm[((b) + 512)>>10]
  80.938 +
  80.939 +H264_LOWPASS(put_       , op_put, op2_put)
  80.940 +H264_LOWPASS(avg_       , op_avg, op2_avg)
  80.941 +H264_MC(put_, 2)
  80.942 +H264_MC(put_, 4)
  80.943 +H264_MC(put_, 8)
  80.944 +H264_MC(put_, 16)
  80.945 +H264_MC(avg_, 4)
  80.946 +H264_MC(avg_, 8)
  80.947 +H264_MC(avg_, 16)
  80.948 +
  80.949 +#undef op_avg
  80.950 +#undef op_put
  80.951 +#undef op2_avg
  80.952 +#undef op2_put
  80.953 +
  80.954 +static void clear_block_c(DCTELEM *block)
  80.955 +{
  80.956 +    memset(block, 0, sizeof(DCTELEM)*64);
  80.957 +}
  80.958 +
  80.959 +/**
  80.960 + * memset(blocks, 0, sizeof(DCTELEM)*6*64)
  80.961 + */
  80.962 +static void clear_blocks_c(DCTELEM *blocks)
  80.963 +{
  80.964 +    memset(blocks, 0, sizeof(DCTELEM)*6*64);
  80.965 +}
  80.966 +
  80.967 +static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
  80.968 +
  80.969 +/* init static data */
  80.970 +av_cold void dsputil_static_init(void)
  80.971 +{
  80.972 +    int i;
  80.973 +
  80.974 +    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
  80.975 +    for(i=0;i<MAX_NEG_CROP;i++) {
  80.976 +        ff_cropTbl[i] = 0;
  80.977 +        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
  80.978 +    }
  80.979 +
  80.980 +    for(i=0;i<512;i++) {
  80.981 +        ff_squareTbl[i] = (i - 256) * (i - 256);
  80.982 +    }
  80.983 +}
  80.984 +
  80.985 +int ff_check_alignment(void){
  80.986 +    static int did_fail=0;
  80.987 +    DECLARE_ALIGNED(16, int, aligned);
  80.988 +
  80.989 +    if((intptr_t)&aligned & 15){
  80.990 +        if(!did_fail){
  80.991 +#if HAVE_MMX || HAVE_ALTIVEC
  80.992 +            av_log(AV_LOG_ERROR,
  80.993 +                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
  80.994 +                "and may be very slow or crash. This is not a bug in libavcodec,\n"
  80.995 +                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
  80.996 +                "Do not report crashes to FFmpeg developers.\n");
  80.997 +#endif
  80.998 +            did_fail=1;
  80.999 +        }
 80.1000 +        return -1;
 80.1001 +    }
 80.1002 +    return 0;
 80.1003 +}
 80.1004 +
 80.1005 +av_cold void dsputil_init(DSPContext* c)
 80.1006 +{
 80.1007 +    (void) avg_pixels2_c; // kill a warning, avg_pixels2_c is a macro created function.
 80.1008 +    ff_check_alignment();
 80.1009 +    dsputil_static_init();
 80.1010 + 
 80.1011 +    c->idct_put= ff_simple_idct_put;
 80.1012 +    c->idct_add= ff_simple_idct_add;
 80.1013 +    c->idct    = ff_simple_idct;
 80.1014 +
 80.1015 +    c->clear_block = clear_block_c;
 80.1016 +    c->clear_blocks = clear_blocks_c;
 80.1017 +
 80.1018 +#define dspfunc(PFX, IDX, NUM) \
 80.1019 +    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
 80.1020 +    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
 80.1021 +    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
 80.1022 +    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
 80.1023 +    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
 80.1024 +    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
 80.1025 +    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
 80.1026 +    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
 80.1027 +    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
 80.1028 +    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
 80.1029 +    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
 80.1030 +    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
 80.1031 +    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
 80.1032 +    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
 80.1033 +    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
 80.1034 +    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
 80.1035 +
 80.1036 +
 80.1037 +    dspfunc(put_h264_qpel, 0, 16);
 80.1038 +    dspfunc(put_h264_qpel, 1, 8);
 80.1039 +    dspfunc(put_h264_qpel, 2, 4);
 80.1040 +    dspfunc(put_h264_qpel, 3, 2);
 80.1041 +    dspfunc(avg_h264_qpel, 0, 16);
 80.1042 +    dspfunc(avg_h264_qpel, 1, 8);
 80.1043 +    dspfunc(avg_h264_qpel, 2, 4);
 80.1044 +
 80.1045 +#undef dspfunc
 80.1046 +    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
 80.1047 +    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
 80.1048 +    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
 80.1049 +    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
 80.1050 +    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
 80.1051 +    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
 80.1052 +
 80.1053 +
 80.1054 +    c->prefetch= just_return;
 80.1055 +
 80.1056 +    if (HAVE_MMX)        dsputil_init_mmx   (c);
 80.1057 +    if (ARCH_ARM)        dsputil_init_arm   (c);
 80.1058 +    if (HAVE_ALTIVEC)    dsputil_init_ppc   (c); //fixme PPC prefetch
 80.1059 +}
 80.1060 +

    81.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    81.2 +++ b/ffmpeg_smp/h264dec/libavcodec/dsputil.h	Mon Aug 27 12:09:56 2012 +0200
    81.3 @@ -0,0 +1,465 @@
    81.4 +/*
    81.5 + * DSP utils
    81.6 + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
    81.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
    81.8 + *
    81.9 + * This file is part of FFmpeg.
   81.10 + *
   81.11 + * FFmpeg is free software; you can redistribute it and/or
   81.12 + * modify it under the terms of the GNU Lesser General Public
   81.13 + * License as published by the Free Software Foundation; either
   81.14 + * version 2.1 of the License, or (at your option) any later version.
   81.15 + *
   81.16 + * FFmpeg is distributed in the hope that it will be useful,
   81.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   81.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   81.19 + * Lesser General Public License for more details.
   81.20 + *
   81.21 + * You should have received a copy of the GNU Lesser General Public
   81.22 + * License along with FFmpeg; if not, write to the Free Software
   81.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   81.24 + */
   81.25 +
   81.26 +/**
   81.27 + * @file
   81.28 + * DSP utils.
   81.29 + * Note: many functions in here may use MMX, which trashes the FPU state; it is
   81.30 + * absolutely necessary to call emms_c() between dsp & float/double code.
   81.31 + */
   81.32 +
   81.33 +#ifndef AVCODEC_DSPUTIL_H
   81.34 +#define AVCODEC_DSPUTIL_H
   81.35 +
   81.36 +#include "libavutil/intreadwrite.h"
   81.37 +#include "avcodec.h"
   81.38 +#include "h264_idct.h"
   81.39 +// 
   81.40 +void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
   81.41 +                             const float *win, float add_bias, int len);
   81.42 +void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
   81.43 +void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);
   81.44 +
   81.45 +/* encoding scans */
   81.46 +extern const uint8_t ff_alternate_horizontal_scan[64];
   81.47 +extern const uint8_t ff_alternate_vertical_scan[64];
   81.48 +extern const uint8_t ff_zigzag_direct[64];
   81.49 +extern const uint8_t ff_zigzag248_direct[64];
   81.50 +
   81.51 +/* pixel operations */
   81.52 +#define MAX_NEG_CROP 1024
   81.53 +
   81.54 +/* temporary */
   81.55 +extern uint32_t ff_squareTbl[512];
   81.56 +extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
   81.57 +
   81.58 +/* VP3 DSP functions */
   81.59 +void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
   81.60 +void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
   81.61 +void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
   81.62 +void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/);
   81.63 +
   81.64 +void ff_vp3_v_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
   81.65 +void ff_vp3_h_loop_filter_c(uint8_t *src, int stride, int *bounding_values);
   81.66 +
   81.67 +/* VP6 DSP functions */
   81.68 +void ff_vp6_filter_diag4_c(uint8_t *dst, uint8_t *src, int stride,
   81.69 +                           const int16_t *h_weights, const int16_t *v_weights);
   81.70 +
   81.71 +/* Bink functions */
   81.72 +void ff_bink_idct_c    (DCTELEM *block);
   81.73 +void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
   81.74 +void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
   81.75 +
   81.76 +/* CAVS functions */
   81.77 +void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
   81.78 +void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride);
   81.79 +void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
   81.80 +void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride);
   81.81 +
   81.82 +/* VC1 functions */
   81.83 +void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
   81.84 +void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd);
   81.85 +
   81.86 +/* EA functions */
   81.87 +void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
   81.88 +
   81.89 +/* 1/2^n downscaling functions from imgconvert.c */
   81.90 +void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
   81.91 +void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
   81.92 +void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
   81.93 +void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
   81.94 +
   81.95 +void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
   81.96 +              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
   81.97 +
   81.98 +/* minimum alignment rules ;)
   81.99 +If you notice errors in the align stuff, need more alignment for some ASM code
  81.100 +for some CPU or need to use a function with less aligned data then send a mail
  81.101 +to the ffmpeg-devel mailing list, ...
  81.102 +
  81.103 +!warning These alignments might not match reality (an attribute((align))
  81.104 +may be missing somewhere).
  81.105 +I (Michael) did not check them, these are just the alignments which I think
  81.106 +could be reached easily ...
  81.107 +
  81.108 +!future video codecs might need functions with less strict alignment
  81.109 +*/
  81.110 +
  81.111 +/*
  81.112 +void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
  81.113 +void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
  81.114 +void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
  81.115 +void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
  81.116 +void clear_blocks_c(DCTELEM *blocks);
  81.117 +*/
  81.118 +
  81.119 +/* add and put pixel (decoding) */
  81.120 +// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
  81.121 +//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
  81.122 +typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
  81.123 +typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
  81.124 +typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
  81.125 +typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
  81.126 +
  81.127 +typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
  81.128 +
  81.129 +#define DEF_OLD_QPEL(name)\
  81.130 +void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
  81.131 +void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
  81.132 +void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
  81.133 +
  81.134 +DEF_OLD_QPEL(qpel16_mc11_old_c)
  81.135 +DEF_OLD_QPEL(qpel16_mc31_old_c)
  81.136 +DEF_OLD_QPEL(qpel16_mc12_old_c)
  81.137 +DEF_OLD_QPEL(qpel16_mc32_old_c)
  81.138 +DEF_OLD_QPEL(qpel16_mc13_old_c)
  81.139 +DEF_OLD_QPEL(qpel16_mc33_old_c)
  81.140 +DEF_OLD_QPEL(qpel8_mc11_old_c)
  81.141 +DEF_OLD_QPEL(qpel8_mc31_old_c)
  81.142 +DEF_OLD_QPEL(qpel8_mc12_old_c)
  81.143 +DEF_OLD_QPEL(qpel8_mc32_old_c)
  81.144 +DEF_OLD_QPEL(qpel8_mc13_old_c)
  81.145 +DEF_OLD_QPEL(qpel8_mc33_old_c)
  81.146 +
  81.147 +#define CALL_2X_PIXELS(a, b, n)\
  81.148 +static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
  81.149 +    b(block  , pixels  , line_size, h);\
  81.150 +    b(block+n, pixels+n, line_size, h);\
  81.151 +}
  81.152 +
  81.153 +/* motion estimation */
  81.154 +// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
  81.155 +// although currently h<4 is not used as functions with width <8 are neither used nor implemented
  81.156 +typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
  81.157 +
  81.158 +/**
  81.159 + * Scantable.
  81.160 + */
  81.161 +typedef struct ScanTable{
  81.162 +    const uint8_t *scantable;
  81.163 +    uint8_t permutated[64];
  81.164 +    uint8_t raster_end[64];
  81.165 +#if ARCH_PPC
  81.166 +                /** Used by dct_quantize_altivec to find last-non-zero */
  81.167 +    DECLARE_ALIGNED(16, uint8_t, inverse)[64];
  81.168 +#endif
  81.169 +} ScanTable;
  81.170 +
  81.171 +void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
  81.172 +
  81.173 +void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
  81.174 +                         int block_w, int block_h,
  81.175 +                         int src_x, int src_y, int w, int h);
  81.176 +
  81.177 +
  81.178 +/**
  81.179 + * DSPContext.
  81.180 + */
  81.181 +typedef struct DSPContext {
  81.182 +    /* pixel ops : interface with DCT */
  81.183 +    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
  81.184 +    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
  81.185 +    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
  81.186 +    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
  81.187 +    void (*put_pixels_nonclamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
  81.188 +    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
  81.189 +    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
  81.190 +    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
  81.191 +    
  81.192 +    void (*clear_block)(DCTELEM *block/*align 16*/);
  81.193 +    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
  81.194 +
  81.195 +
  81.196 +    /**
  81.197 +     * Halfpel motion compensation with rounding (a+b+1)>>1.
  81.198 +     * this is an array[4][4] of motion compensation functions for 4
  81.199 +     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
  81.200 +     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
  81.201 +     * @param block destination where the result is stored
  81.202 +     * @param pixels source
  81.203 +     * @param line_size number of bytes in a horizontal line of block
  81.204 +     * @param h height
  81.205 +     */
  81.206 +    op_pixels_func put_pixels_tab[4][4];
  81.207 +
  81.208 +    /**
  81.209 +     * Halfpel motion compensation with rounding (a+b+1)>>1.
  81.210 +     * This is an array[4][4] of motion compensation functions for 4
  81.211 +     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
  81.212 +     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
  81.213 +     * @param block destination into which the result is averaged (a+b+1)>>1
  81.214 +     * @param pixels source
  81.215 +     * @param line_size number of bytes in a horizontal line of block
  81.216 +     * @param h height
  81.217 +     */
  81.218 +    op_pixels_func avg_pixels_tab[4][4];
  81.219 +
  81.220 +    /**
  81.221 +     * Halfpel motion compensation with no rounding (a+b)>>1.
  81.222 +     * this is an array[2][4] of motion compensation functions for 2
  81.223 +     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
  81.224 +     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
  81.225 +     * @param block destination where the result is stored
  81.226 +     * @param pixels source
  81.227 +     * @param line_size number of bytes in a horizontal line of block
  81.228 +     * @param h height
  81.229 +     */
  81.230 +    op_pixels_func put_no_rnd_pixels_tab[4][4];
  81.231 +
  81.232 +    /**
  81.233 +     * Halfpel motion compensation with no rounding (a+b)>>1.
  81.234 +     * this is an array[2][4] of motion compensation functions for 2
  81.235 +     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
  81.236 +     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
  81.237 +     * @param block destination into which the result is averaged (a+b)>>1
  81.238 +     * @param pixels source
  81.239 +     * @param line_size number of bytes in a horizontal line of block
  81.240 +     * @param h height
  81.241 +     */
  81.242 +    op_pixels_func avg_no_rnd_pixels_tab[4][4];
  81.243 +
  81.244 +    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
  81.245 +
  81.246 +
  81.247 +    qpel_mc_func put_qpel_pixels_tab[2][16];
  81.248 +    qpel_mc_func avg_qpel_pixels_tab[2][16];
  81.249 +    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
  81.250 +    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
  81.251 +    qpel_mc_func put_mspel_pixels_tab[8];
  81.252 +
  81.253 +    /**
  81.254 +     * h264 Chroma MC
  81.255 +     */
  81.256 +    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
  81.257 +    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
  81.258 +    /* This is really one func used in VC-1 decoding */
  81.259 +    h264_chroma_mc_func put_no_rnd_vc1_chroma_pixels_tab[3];
  81.260 +    h264_chroma_mc_func avg_no_rnd_vc1_chroma_pixels_tab[3];
  81.261 +
  81.262 +    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
  81.263 +    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
  81.264 +
  81.265 +    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
  81.266 +    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];
  81.267 +
  81.268 +   
  81.269 +    /* (I)DCT */
  81.270 +    void (*fdct)(DCTELEM *block/* align 16*/);
  81.271 +    void (*fdct248)(DCTELEM *block/* align 16*/);
  81.272 +
  81.273 +    /* IDCT really*/
  81.274 +    void (*idct)(DCTELEM *block/* align 16*/);
  81.275 +
  81.276 +    /**
  81.277 +     * block -> idct -> clip to unsigned 8 bit -> dest.
  81.278 +     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
  81.279 +     * @param line_size size in bytes of a horizontal line of dest
  81.280 +     */
  81.281 +    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
  81.282 +
  81.283 +    /**
  81.284 +     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
  81.285 +     * @param line_size size in bytes of a horizontal line of dest
  81.286 +     */
  81.287 +    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
  81.288 +
  81.289 +    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
  81.290 +#define EDGE_WIDTH 32
  81.291 +
  81.292 +    void (*prefetch)(void *mem, int stride, int h);
  81.293 +
  81.294 +} DSPContext;
  81.295 +
  81.296 +void dsputil_static_init(void);
  81.297 +void dsputil_init(DSPContext* p);
  81.298 +
  81.299 +int ff_check_alignment(void);
  81.300 +
  81.301 +/**
  81.302 + * Permute block according to permutation.
  81.303 + * @param last last non zero element in scantable order
  81.304 + */
  81.305 +void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
  81.306 +
  81.307 +void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
  81.308 +
  81.309 +#define         BYTE_VEC32(c)   ((c)*0x01010101UL)
  81.310 +
  81.311 +static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
  81.312 +{
  81.313 +    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
  81.314 +}
  81.315 +
  81.316 +static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
  81.317 +{
  81.318 +    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
  81.319 +}
  81.320 +
  81.321 +
  81.322 +/**
  81.323 + * Empty mmx state.
  81.324 + * This must be called between any dsp function and float/double code,
  81.325 + * for example: sin(); dsp->idct_put(); emms_c(); cos()
  81.326 + */
  81.327 +#define emms_c()
  81.328 +
  81.329 +/* should be defined by architectures supporting
  81.330 +   one or more MultiMedia extension */
  81.331 +int mm_support(void);
  81.332 +extern int mm_flags;
  81.333 +
  81.334 +void dsputil_init_arm(DSPContext* c);
  81.335 +void dsputil_init_mmx(DSPContext* c);
  81.336 +void dsputil_init_ppc(DSPContext* c);
  81.337 +
  81.338 +void ff_dsputil_init_dwt(DSPContext *c);
  81.339 +
  81.340 +#if HAVE_MMX
  81.341 +
  81.342 +#undef emms_c
  81.343 +
  81.344 +static inline void emms(void)
  81.345 +{
  81.346 +    __asm__ volatile ("emms;":::"memory");
  81.347 +}
  81.348 +
  81.349 +
  81.350 +#define emms_c() \
  81.351 +{\
  81.352 +    if (mm_flags & FF_MM_MMX)\
  81.353 +        emms();\
  81.354 +}
  81.355 +
  81.356 +#elif ARCH_ARM
  81.357 +
  81.358 +#if HAVE_NEON
  81.359 +#   define STRIDE_ALIGN 16
  81.360 +#endif
  81.361 +
  81.362 +#elif ARCH_PPC || ARCH_PPC64 || ARCH_CELL
  81.363 +
  81.364 +#define STRIDE_ALIGN 16
  81.365 +
  81.366 +#endif
  81.367 +
  81.368 +#ifndef STRIDE_ALIGN
  81.369 +#   define STRIDE_ALIGN 8
  81.370 +#endif
  81.371 +
  81.372 +#define WRAPPER8_16(name8, name16)\
  81.373 +static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
  81.374 +    return name8(s, dst           , src           , stride, h)\
  81.375 +          +name8(s, dst+8         , src+8         , stride, h);\
  81.376 +}
  81.377 +
  81.378 +#define WRAPPER8_16_SQ(name8, name16)\
  81.379 +static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
  81.380 +    int score=0;\
  81.381 +    score +=name8(s, dst           , src           , stride, 8);\
  81.382 +    score +=name8(s, dst+8         , src+8         , stride, 8);\
  81.383 +    if(h==16){\
  81.384 +        dst += 8*stride;\
  81.385 +        src += 8*stride;\
  81.386 +        score +=name8(s, dst           , src           , stride, 8);\
  81.387 +        score +=name8(s, dst+8         , src+8         , stride, 8);\
  81.388 +    }\
  81.389 +    return score;\
  81.390 +}
  81.391 +
  81.392 +static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
  81.393 +{
  81.394 +    int i;
  81.395 +    for(i=0; i<h; i++)
  81.396 +    {
  81.397 +        AV_WN16(dst   , AV_RN16(src   ));
  81.398 +        dst+=dstStride;
  81.399 +        src+=srcStride;
  81.400 +    }
  81.401 +}
  81.402 +
  81.403 +static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
  81.404 +{
  81.405 +    int i;
  81.406 +    for(i=0; i<h; i++)
  81.407 +    {
  81.408 +        AV_WN32(dst   , AV_RN32(src   ));
  81.409 +        dst+=dstStride;
  81.410 +        src+=srcStride;
  81.411 +    }
  81.412 +}
  81.413 +
  81.414 +static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
  81.415 +{
  81.416 +    int i;
  81.417 +    for(i=0; i<h; i++)
  81.418 +    {
  81.419 +        AV_WN32(dst   , AV_RN32(src   ));
  81.420 +        AV_WN32(dst+4 , AV_RN32(src+4 ));
  81.421 +        dst+=dstStride;
  81.422 +        src+=srcStride;
  81.423 +    }
  81.424 +}
  81.425 +
  81.426 +static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
  81.427 +{
  81.428 +    int i;
  81.429 +    for(i=0; i<h; i++)
  81.430 +    {
  81.431 +        AV_WN32(dst   , AV_RN32(src   ));
  81.432 +        AV_WN32(dst+4 , AV_RN32(src+4 ));
  81.433 +        dst[8]= src[8];
  81.434 +        dst+=dstStride;
  81.435 +        src+=srcStride;
  81.436 +    }
  81.437 +}
  81.438 +
  81.439 +static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
  81.440 +{
  81.441 +    int i;
  81.442 +    for(i=0; i<h; i++)
  81.443 +    {
  81.444 +        AV_WN32(dst   , AV_RN32(src   ));
  81.445 +        AV_WN32(dst+4 , AV_RN32(src+4 ));
  81.446 +        AV_WN32(dst+8 , AV_RN32(src+8 ));
  81.447 +        AV_WN32(dst+12, AV_RN32(src+12));
  81.448 +        dst+=dstStride;
  81.449 +        src+=srcStride;
  81.450 +    }
  81.451 +}
  81.452 +
  81.453 +static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h) /* copy h rows of 17 bytes from src to dst */
  81.454 +{
  81.455 +    int i; /* row counter */
  81.456 +    for(i=0; i<h; i++)
  81.457 +    {
  81.458 +        AV_WN32(dst   , AV_RN32(src   )); /* bytes 0..3 of the row */
  81.459 +        AV_WN32(dst+4 , AV_RN32(src+4 )); /* bytes 4..7 */
  81.460 +        AV_WN32(dst+8 , AV_RN32(src+8 )); /* bytes 8..11 */
  81.461 +        AV_WN32(dst+12, AV_RN32(src+12)); /* bytes 12..15 */
  81.462 +        dst[16]= src[16]; /* the odd seventeenth byte is copied on its own */
  81.463 +        dst+=dstStride; /* next row; strides are independent */
  81.464 +        src+=srcStride;
  81.465 +    }
  81.466 +}
  81.467 +
  81.468 +#endif /* AVCODEC_DSPUTIL_H */

    82.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    82.2 +++ b/ffmpeg_smp/h264dec/libavcodec/get_bits.h	Mon Aug 27 12:09:56 2012 +0200
    82.3 @@ -0,0 +1,325 @@
    82.4 +/*
    82.5 + * copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
    82.6 + *
    82.7 + * This file is part of FFmpeg.
    82.8 + *
    82.9 + * FFmpeg is free software; you can redistribute it and/or
   82.10 + * modify it under the terms of the GNU Lesser General Public
   82.11 + * License as published by the Free Software Foundation; either
   82.12 + * version 2.1 of the License, or (at your option) any later version.
   82.13 + *
   82.14 + * FFmpeg is distributed in the hope that it will be useful,
   82.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   82.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   82.17 + * Lesser General Public License for more details.
   82.18 + *
   82.19 + * You should have received a copy of the GNU Lesser General Public
   82.20 + * License along with FFmpeg; if not, write to the Free Software
   82.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   82.22 + */
   82.23 +
   82.24 +/**
   82.25 + * @file
   82.26 + * bitstream reader API header.
   82.27 + */
   82.28 +
   82.29 +#ifndef AVCODEC_GET_BITS_H
   82.30 +#define AVCODEC_GET_BITS_H
   82.31 +
   82.32 +#include <stdint.h>
   82.33 +#include <stdlib.h>
   82.34 +#include <assert.h>
   82.35 +#include "libavutil/bswap.h"
   82.36 +#include "libavutil/common.h"
   82.37 +#include "libavutil/intreadwrite.h"
   82.38 +#include "libavutil/log.h"
   82.39 +#include "mathops.h"
   82.40 +
   82.41 +
   82.42 +typedef struct GetBitContext { /* state of the word-based bitstream reader used by the macros and helpers below */
   82.43 +    uint8_t *rbsp; /* not referenced in this header; NOTE(review): presumably the unescaped NAL payload - confirm against the users */
   82.44 +    unsigned int rbsp_size; /* not referenced in this header */
   82.45 +    uint8_t *raw; /* not referenced in this header */
   82.46 +    const uint8_t *buffer, *buffer_end; /* start and one-past-end of the bitstream, set by init_get_bits() */
   82.47 +    unsigned int alloc_size; /* not referenced in this header */
   82.48 +    unsigned int buf_size; /* not referenced in this header */
   82.49 +    uint32_t *buffer_ptr; /* next 32-bit word to load in UPDATE_CACHE; starts at buffer rounded down to a 4-byte boundary */
   82.50 +    uint32_t cache0; /* bits handed out by GET_CACHE/SHOW_UBITS, next bit in the MSB */
   82.51 +    uint32_t cache1; /* bits that SKIP_CACHE shifts into cache0 */
   82.52 +    int bit_count; /* bit offset bookkeeping; UPDATE_CACHE loads a new word whenever this is > 0 and then subtracts 32 */
   82.53 +    int size_in_bits; /* bit_size passed to init_get_bits(), used by get_bits_left() */
   82.54 +} GetBitContext;
   82.55 +
   82.56 +/* Bitstream reader API docs:
   82.57 +name
   82.58 +    arbitrary name which is used as prefix for the internal variables
   82.59 +
   82.60 +gb
   82.61 +    getbitcontext
   82.62 +
   82.63 +OPEN_READER(name, gb)
   82.64 +    loads gb into local variables
   82.65 +
   82.66 +CLOSE_READER(name, gb)
   82.67 +    stores local vars in gb
   82.68 +
   82.69 +UPDATE_CACHE(name, gb)
   82.70 +    refills the internal cache from the bitstream;
   82.71 +    after this call at least MIN_CACHE_BITS bits will be available.
   82.72 +
   82.73 +GET_CACHE(name, gb)
   82.74 +    will output the contents of the internal cache; the next bit is the MSB of the 32-bit cache word
   82.75 +
   82.76 +SHOW_UBITS(name, gb, num)
   82.77 +    will return the next num bits
   82.78 +
   82.79 +SHOW_SBITS(name, gb, num)
   82.80 +    will return the next num bits and do sign extension
   82.81 +
   82.82 +SKIP_BITS(name, gb, num)
   82.83 +    will skip over the next num bits
   82.84 +    note, this is equivalent to SKIP_CACHE; SKIP_COUNTER
   82.85 +
   82.86 +SKIP_CACHE(name, gb, num)
   82.87 +    will remove the next num bits from the cache (note SKIP_COUNTER MUST be called before UPDATE_CACHE / CLOSE_READER)
   82.88 +
   82.89 +SKIP_COUNTER(name, gb, num)
   82.90 +    will increment the internal bit counter (see SKIP_CACHE & SKIP_BITS)
   82.91 +
   82.92 +LAST_SKIP_CACHE(name, gb, num)
   82.93 +    will remove the next num bits from the cache if that is needed for UPDATE_CACHE, otherwise it may do nothing (in this reader it is simply an alias for SKIP_CACHE)
   82.94 +
   82.95 +LAST_SKIP_BITS(name, gb, num)
   82.96 +    is equivalent to LAST_SKIP_CACHE; SKIP_COUNTER
   82.97 +
   82.98 +for examples see get_bits, show_bits, skip_bits, get_vlc
   82.99 +*/
  82.100 +
  82.101 +#define MIN_CACHE_BITS 32
  82.102 +
  82.103 +#define OPEN_READER(name, gb) /* declares locals name##_* holding a copy of the reader state of gb */\
  82.104 +	int name##_bit_count=(gb)->bit_count;\
  82.105 +	uint32_t name##_cache0= (gb)->cache0;\
  82.106 +	uint32_t name##_cache1= (gb)->cache1;\
  82.107 +	uint32_t * name##_buffer_ptr=(gb)->buffer_ptr;\
  82.108 +
  82.109 +#define CLOSE_READER(name, gb) /* writes the name##_* locals created by OPEN_READER back into gb */\
  82.110 +	(gb)->bit_count= name##_bit_count;\
  82.111 +	(gb)->cache0= name##_cache0;\
  82.112 +	(gb)->cache1= name##_cache1;\
  82.113 +	(gb)->buffer_ptr= name##_buffer_ptr;\
  82.114 +
  82.115 +#define UPDATE_CACHE(name, gb) /* loads at most one more 32-bit word into the two cache words */\
  82.116 +	if(name##_bit_count > 0){ /* a positive count means a whole new word is due */\
  82.117 +		const uint32_t next= be2me_32( *name##_buffer_ptr ); /* be2me_32 presumably converts the big-endian stream word to host order */\
  82.118 +		name##_cache0 |= NEG_USR32(next,name##_bit_count); /* part of the new word that belongs in cache0 */\
  82.119 +		name##_cache1 |= next<<name##_bit_count; /* remainder goes to cache1; NOTE(review): a count of 32 would make this a full-width shift - confirm it cannot occur */\
  82.120 +		name##_buffer_ptr++;\
  82.121 +		name##_bit_count-= 32;\
  82.122 +	}\
  82.123 +
  82.124 +#if ARCH_X86
  82.125 +#   define SKIP_CACHE(name, gb, num) /* x86: shift the cache0:cache1 pair left by num bits with shld/shl */\
  82.126 +        __asm__(\
  82.127 +            "shldl %2, %1, %0          \n\t"\
  82.128 +            "shll %2, %1               \n\t"\
  82.129 +            : "+r" (name##_cache0), "+r" (name##_cache1)\
  82.130 +            : "Ic" ((uint8_t)(num))\
  82.131 +           );
  82.132 +#else
  82.133 +#   define SKIP_CACHE(name, gb, num) /* generic: drop num bits from cache0 and pull the top num bits of cache1 in behind them */\
  82.134 +        name##_cache0 <<= (num);\
  82.135 +        name##_cache0 |= NEG_USR32(name##_cache1,num);\
  82.136 +        name##_cache1 <<= (num);
  82.137 +#endif
  82.138 +
  82.139 +#define SKIP_COUNTER(name, gb, num) /* accounts for num consumed bits in the local bit counter */\
  82.140 +	name##_bit_count += (num);\
  82.141 +
  82.142 +#define SKIP_BITS(name, gb, num) /* consumes num bits: SKIP_CACHE followed by SKIP_COUNTER, wrapped in one block */\
  82.143 +	{\
  82.144 +		SKIP_CACHE(name, gb, num)\
  82.145 +		SKIP_COUNTER(name, gb, num)\
  82.146 +	}\
  82.147 +
  82.148 +#define LAST_SKIP_BITS(name, gb, num) SKIP_BITS(name, gb, num) /* plain alias in this reader */
  82.149 +#define LAST_SKIP_CACHE(name, gb, num) SKIP_CACHE(name, gb, num) /* plain alias in this reader */
  82.150 +
  82.151 +#define SHOW_UBITS(name, gb, num) /* next num bits of cache0 as an unsigned value; does not consume them */\
  82.152 +	NEG_USR32(name##_cache0, num)
  82.153 +
  82.154 +#define SHOW_SBITS(name, gb, num) /* like SHOW_UBITS but via NEG_SSR32, presumably sign-extending the result */\
  82.155 +        NEG_SSR32(name##_cache0, num)
  82.156 +
  82.157 +#define GET_CACHE(name, gb) /* the whole first cache word, next bit in the MSB */\
  82.158 +	(name##_cache0)
  82.159 +
  82.160 +static inline int get_bits_count(const GetBitContext *s){ /* number of bits consumed so far, measured from s->buffer */
  82.161 +    return ((uint8_t*)s->buffer_ptr - s->buffer)*8 - 32 + s->bit_count; /* bits up to the load pointer, minus one word, plus the bit_count correction */
  82.162 +}
  82.163 +
  82.164 +static inline void skip_bits_long(GetBitContext *s, int n){ /* skips n bits by repositioning the reader and rebuilding both cache words from memory */
  82.165 +    OPEN_READER(re, s)
  82.166 +    re_bit_count += n;
  82.167 +    re_buffer_ptr += re_bit_count>>5; /* advance by whole 32-bit words */
  82.168 +    re_bit_count &= 31; /* keep only the bit offset inside the word */
  82.169 +    re_cache0 = be2me_32( re_buffer_ptr[-1] ) << re_bit_count; /* reload from the word just before the load pointer, dropping the already consumed bits */
  82.170 +    re_cache1 = 0;
  82.171 +    UPDATE_CACHE(re, s) /* tops the cache up with the following word when the offset is non-zero */
  82.172 +    CLOSE_READER(re, s)
  82.173 +}
  82.174 +
  82.175 +/**
  82.176 + * read mpeg1 dc style vlc (sign bit + mantissa with no MSB).
  82.177 + * if MSB not set it is negative
  82.178 + * @param n length in bits
  82.179 + * @author BERO
  82.180 + */
  82.181 +static inline int get_xbits(GetBitContext *s, int n){
  82.182 +    register int sign;
  82.183 +    register int32_t cache;
  82.184 +    OPEN_READER(re, s)
  82.185 +    UPDATE_CACHE(re, s)
  82.186 +    cache = GET_CACHE(re,s);
  82.187 +    sign=(~cache)>>31; /* all ones when the first bit is 0, otherwise 0 (assumes arithmetic right shift of a negative int32_t) */
  82.188 +    LAST_SKIP_BITS(re, s, n)
  82.189 +    CLOSE_READER(re, s)
  82.190 +    return (NEG_USR32(sign ^ cache, n) ^ sign) - sign; /* conditionally invert the n-bit field, then undo the mask and add 1 for the negative case */
  82.191 +}
  82.192 +
  82.193 +static inline int get_sbits(GetBitContext *s, int n){ /* reads n bits through SHOW_SBITS (signed variant) and consumes them */
  82.194 +    register int tmp;
  82.195 +    OPEN_READER(re, s)
  82.196 +    UPDATE_CACHE(re, s)
  82.197 +    tmp= SHOW_SBITS(re, s, n); /* peek first ... */
  82.198 +    LAST_SKIP_BITS(re, s, n) /* ... then consume */
  82.199 +    CLOSE_READER(re, s)
  82.200 +    return tmp;
  82.201 +}
  82.202 +
  82.203 +/**
  82.204 + * Reads n bits (MSB first) and advances the reader; the historical contract is 1-17 bits.
  82.205 + * NOTE(review): get_bits_long() in this file forwards any n <= MIN_CACHE_BITS (32) here, so this reader presumably copes with up to 32 bits - confirm. The alt/libmpeg2 readers the old note mentioned are not defined in this header.
  82.206 + */
  82.207 +static inline unsigned int get_bits(GetBitContext *s, int n){
  82.208 +    register int tmp;
  82.209 +    OPEN_READER(re, s)
  82.210 +    UPDATE_CACHE(re, s)
  82.211 +    tmp= SHOW_UBITS(re, s, n); /* peek first ... */
  82.212 +    LAST_SKIP_BITS(re, s, n) /* ... then consume */
  82.213 +    CLOSE_READER(re, s)
  82.214 +    return tmp;
  82.215 +}
  82.216 +
  82.217 +/**
  82.218 + * Returns the next n bits without consuming them; the historical contract is 1-17 bits.
  82.219 + * NOTE(review): show_bits_long() in this file forwards any n <= MIN_CACHE_BITS (32) here - confirm the real upper limit of this reader.
  82.220 + */
  82.221 +static inline unsigned int show_bits(GetBitContext *s, int n){
  82.222 +    register int tmp;
  82.223 +    OPEN_READER(re, s)
  82.224 +    UPDATE_CACHE(re, s) /* refills only the local copy of the state */
  82.225 +    tmp= SHOW_UBITS(re, s, n);
  82.226 +//    CLOSE_READER(re, s)  (left out: the local state is discarded, so s is not modified by a peek)
  82.227 +    return tmp;
  82.228 +}
  82.229 +
  82.230 +static inline void skip_bits(GetBitContext *s, int n){ /* consumes n bits through the cache (see skip_bits_long for the pointer-based variant) */
  82.231 + //Note gcc seems to optimize this to s->index+=n for the ALT_READER :)) -- NOTE(review): no ALT_READER or s->index exists in this header; this remark looks inherited
  82.232 +    OPEN_READER(re, s)
  82.233 +    UPDATE_CACHE(re, s)
  82.234 +    LAST_SKIP_BITS(re, s, n)
  82.235 +    CLOSE_READER(re, s)
  82.236 +}
  82.237 +
  82.238 +static inline unsigned int get_bits1(GetBitContext *s){ /* reads and consumes a single bit */
  82.239 +    return get_bits(s, 1);
  82.240 +}
  82.241 +
  82.242 +static inline unsigned int show_bits1(GetBitContext *s){ /* peeks at the next bit without consuming it */
  82.243 +    return show_bits(s, 1);
  82.244 +}
  82.245 +
  82.246 +static inline void skip_bits1(GetBitContext *s){ /* discards a single bit */
  82.247 +    skip_bits(s, 1);
  82.248 +}
  82.249 +
  82.250 +/**
  82.251 + * reads 0-32 bits; n == 0 returns 0 and leaves the reader untouched.
  82.252 + */
  82.253 +static inline unsigned int get_bits_long(GetBitContext *s, int n){
  82.254 +    if(!n) return 0; if(n<=MIN_CACHE_BITS) return get_bits(s, n); /* n==0 is answered here: get_bits(s, 0) would go through NEG_USR32(x, 0), presumably a shift by the full 32-bit width */
  82.255 +    else{ /* only reached for n > MIN_CACHE_BITS, i.e. outside the documented 0-32 range */
  82.256 +        int ret= get_bits(s, 16) << (n-16);
  82.257 +        return ret | get_bits(s, n-16);
  82.258 +    }
  82.259 +}
  82.260 +
  82.261 +/**
  82.262 + * reads 0-32 bits as a signed integer: the n bits come from get_bits_long() and are passed to sign_extend(value, n).
  82.263 + */
  82.264 +static inline int get_sbits_long(GetBitContext *s, int n) {
  82.265 +    return sign_extend(get_bits_long(s, n), n); /* sign_extend is defined outside this header */
  82.266 +}
  82.267 +
  82.268 +/**
  82.269 + * shows 0-32 bits without consuming them; n == 0 returns 0.
  82.270 + */
  82.271 +static inline unsigned int show_bits_long(GetBitContext *s, int n){
  82.272 +    if(!n) return 0; if(n<=MIN_CACHE_BITS) return show_bits(s, n); /* n==0 is answered here: show_bits(s, 0) would go through NEG_USR32(x, 0), presumably a shift by the full 32-bit width */
  82.273 +    else{ /* only reached for n > MIN_CACHE_BITS, i.e. outside the documented 0-32 range */
  82.274 +        GetBitContext gb= *s; /* read from a copy so the caller's reader is not advanced */
  82.275 +        return get_bits_long(&gb, n);
  82.276 +    }
  82.277 +}
  82.278 +
  82.279 +static inline int check_marker(GetBitContext *s, const char *msg) /* consumes one bit and logs "Marker bit missing <msg>" when it is 0; returns the bit */
  82.280 +{
  82.281 +    int bit= get_bits1(s);
  82.282 +    if(!bit)
  82.283 +        av_log(AV_LOG_INFO, "Marker bit missing %s\n", msg); /* av_log is called with the level as first argument here - NOTE(review): confirm this matches the project's av_log prototype */
  82.284 +
  82.285 +    return bit;
  82.286 +}
  82.287 +
  82.288 +/**
  82.289 + * init GetBitContext.
  82.290 + * @param buffer bitstream buffer, must be FF_INPUT_BUFFER_PADDING_SIZE bytes larger than the actual read bits
  82.291 + * because some optimized bitstream readers read 32 or 64 bit at once and could read over the end
  82.292 + * @param bit_size the size of the buffer in bits
  82.293 + *
  82.294 + * While GetBitContext stores the buffer size, for performance reasons you are
  82.295 + * responsible for checking for the buffer end yourself (take advantage of the padding)!
  82.296 + */
  82.297 +static inline void init_get_bits(GetBitContext *s,
  82.298 +                   const uint8_t *buffer, int bit_size)
  82.299 +{
  82.300 +    int buffer_size= (bit_size+7)>>3; /* size in bytes, rounded up */
  82.301 +    if(buffer_size < 0 || bit_size < 0) { /* invalid size: fall back to an empty, NULL buffer */
  82.302 +        buffer_size = bit_size = 0;
  82.303 +        buffer = NULL;
  82.304 +    }
  82.305 +
  82.306 +    s->buffer= buffer;
  82.307 +    s->size_in_bits= bit_size;
  82.308 +    s->buffer_end= buffer + buffer_size;
  82.309 +
  82.310 +    s->buffer_ptr = (uint32_t*)((intptr_t)buffer&(~3)); /* round down to a 4-byte boundary; the first word load can therefore start up to 3 bytes before buffer */
  82.311 +    s->bit_count = 32 + 8*((intptr_t)buffer&3); /* one word plus the bits of the skipped alignment bytes */
  82.312 +    skip_bits_long(s, 0); /* primes cache0/cache1 from memory; NOTE(review): with the NULL fallback above this still reads through buffer_ptr - confirm callers never pass a negative bit_size */
  82.313 +}
  82.314 +
  82.315 +static inline void align_get_bits(GetBitContext *s) /* skips forward to the next multiple of 8 in get_bits_count(); does nothing when already aligned */
  82.316 +{
  82.317 +    int n= (-get_bits_count(s)) & 7; /* bits missing to the next multiple of 8 */
  82.318 +    if(n) skip_bits(s, n);
  82.319 +}
  82.320 +
  82.321 +#define tprintf(p, ...) {}
  82.322 +
  82.323 +static inline int get_bits_left(GetBitContext *gb) /* bits remaining before size_in_bits is reached; negative once the reader has run past it */
  82.324 +{
  82.325 +    return gb->size_in_bits - get_bits_count(gb);
  82.326 +}
  82.327 +
  82.328 +#endif /* AVCODEC_GET_BITS_H */

    83.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    83.2 +++ b/ffmpeg_smp/h264dec/libavcodec/golomb.c	Mon Aug 27 12:09:56 2012 +0200
    83.3 @@ -0,0 +1,184 @@
    83.4 +/*
    83.5 + * exp golomb vlc stuff
    83.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    83.7 + *
    83.8 + * This file is part of FFmpeg.
    83.9 + *
   83.10 + * FFmpeg is free software; you can redistribute it and/or
   83.11 + * modify it under the terms of the GNU Lesser General Public
   83.12 + * License as published by the Free Software Foundation; either
   83.13 + * version 2.1 of the License, or (at your option) any later version.
   83.14 + *
   83.15 + * FFmpeg is distributed in the hope that it will be useful,
   83.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   83.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   83.18 + * Lesser General Public License for more details.
   83.19 + *
   83.20 + * You should have received a copy of the GNU Lesser General Public
   83.21 + * License along with FFmpeg; if not, write to the Free Software
   83.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   83.23 + */
   83.24 +
   83.25 +/**
   83.26 + * @file
   83.27 + * @brief
   83.28 + *     exp golomb vlc stuff
   83.29 + * @author Michael Niedermayer <michaelni@gmx.at>
   83.30 + */
   83.31 +
   83.32 +#include "libavutil/common.h"
   83.33 +
   83.34 +const uint8_t ff_log2_tab[256]={
   83.35 +    0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
   83.36 +    5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
   83.37 +    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
   83.38 +    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
   83.39 +    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   83.40 +    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   83.41 +    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   83.42 +	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
   83.43 +};
   83.44 +
   83.45 +const uint8_t ff_golomb_vlc_len[512]={
   83.46 +14,13,12,12,11,11,11,11,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   83.47 +7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   83.48 +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
   83.49 +5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
   83.50 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   83.51 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   83.52 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   83.53 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
   83.54 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.55 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.56 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.57 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.58 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.59 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.60 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   83.61 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
   83.62 +};
   83.63 +
   83.64 +const uint8_t ff_ue_golomb_vlc_code[512]={
   83.65 +31,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,
   83.66 + 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,
   83.67 + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
   83.68 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
   83.69 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   83.70 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   83.71 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   83.72 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   83.73 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.74 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.75 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.76 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.77 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.78 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.79 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   83.80 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
   83.81 +};
   83.82 +
   83.83 +const int8_t ff_se_golomb_vlc_code[512]={
   83.84 + 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,  8, -8,  9, -9, 10,-10, 11,-11, 12,-12, 13,-13, 14,-14, 15,-15,
   83.85 +  4,  4,  4,  4, -4, -4, -4, -4,  5,  5,  5,  5, -5, -5, -5, -5,  6,  6,  6,  6, -6, -6, -6, -6,  7,  7,  7,  7, -7, -7, -7, -7,
   83.86 +  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
   83.87 +  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
   83.88 +  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
   83.89 +  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
   83.90 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
   83.91 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
   83.92 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.93 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.94 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.95 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.96 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.97 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.98 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
   83.99 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.100 +};
  83.101 +
  83.102 +
  83.103 +const uint8_t ff_ue_golomb_len[256]={
  83.104 + 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,11,
  83.105 +11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,13,
  83.106 +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
  83.107 +13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,15,
  83.108 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  83.109 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  83.110 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
  83.111 +15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,17,
  83.112 +};
  83.113 +
  83.114 +const uint8_t ff_interleaved_golomb_vlc_len[256]={
  83.115 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
  83.116 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
  83.117 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  83.118 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  83.119 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
  83.120 +9,9,7,7,9,9,7,7,5,5,5,5,5,5,5,5,
  83.121 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  83.122 +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
  83.123 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.124 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.125 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.126 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.127 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.128 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.129 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.130 +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  83.131 +};
  83.132 +
  83.133 +const uint8_t ff_interleaved_ue_golomb_vlc_code[256]={
  83.134 + 15,16,7, 7, 17,18,8, 8, 3, 3, 3, 3, 3, 3, 3, 3,
  83.135 + 19,20,9, 9, 21,22,10,10,4, 4, 4, 4, 4, 4, 4, 4,
  83.136 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  83.137 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  83.138 + 23,24,11,11,25,26,12,12,5, 5, 5, 5, 5, 5, 5, 5,
  83.139 + 27,28,13,13,29,30,14,14,6, 6, 6, 6, 6, 6, 6, 6,
  83.140 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  83.141 + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  83.142 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.143 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.144 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.145 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.146 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.147 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.148 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.149 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.150 +};
  83.151 +
  83.152 +const int8_t ff_interleaved_se_golomb_vlc_code[256]={
  83.153 +  8, -8,  4,  4,  9, -9, -4, -4,  2,  2,  2,  2,  2,  2,  2,  2,
  83.154 + 10,-10,  5,  5, 11,-11, -5, -5, -2, -2, -2, -2, -2, -2, -2, -2,
  83.155 +  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  83.156 +  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  83.157 + 12,-12,  6,  6, 13,-13, -6, -6,  3,  3,  3,  3,  3,  3,  3,  3,
  83.158 + 14,-14,  7,  7, 15,-15, -7, -7, -3, -3, -3, -3, -3, -3, -3, -3,
  83.159 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  83.160 + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
  83.161 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.162 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.163 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.164 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.165 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.166 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.167 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.168 +  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  83.169 +};
  83.170 +
  83.171 +const uint8_t ff_interleaved_dirac_golomb_vlc_code[256]={
  83.172 +0, 1, 0, 0, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
  83.173 +4, 5, 2, 2, 6, 7, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
  83.174 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.175 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.176 +8, 9, 4, 4, 10,11,5, 5, 2, 2, 2, 2, 2, 2, 2, 2,
  83.177 +12,13,6, 6, 14,15,7, 7, 3, 3, 3, 3, 3, 3, 3, 3,
  83.178 +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  83.179 +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  83.180 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.181 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.182 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.183 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.184 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.185 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.186 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  83.187 +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,};

    84.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    84.2 +++ b/ffmpeg_smp/h264dec/libavcodec/golomb.h	Mon Aug 27 12:09:56 2012 +0200
    84.3 @@ -0,0 +1,410 @@
    84.4 +/*
    84.5 + * exp golomb vlc stuff
    84.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    84.7 + * Copyright (c) 2004 Alex Beregszaszi
    84.8 + *
    84.9 + * This file is part of FFmpeg.
   84.10 + *
   84.11 + * FFmpeg is free software; you can redistribute it and/or
   84.12 + * modify it under the terms of the GNU Lesser General Public
   84.13 + * License as published by the Free Software Foundation; either
   84.14 + * version 2.1 of the License, or (at your option) any later version.
   84.15 + *
   84.16 + * FFmpeg is distributed in the hope that it will be useful,
   84.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   84.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   84.19 + * Lesser General Public License for more details.
   84.20 + *
   84.21 + * You should have received a copy of the GNU Lesser General Public
   84.22 + * License along with FFmpeg; if not, write to the Free Software
   84.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   84.24 + */
   84.25 +
   84.26 +/**
   84.27 + * @file
   84.28 + * @brief
   84.29 + *     exp golomb vlc stuff
   84.30 + * @author Michael Niedermayer <michaelni@gmx.at> and Alex Beregszaszi
   84.31 + */
   84.32 +
   84.33 +#ifndef AVCODEC_GOLOMB_H
   84.34 +#define AVCODEC_GOLOMB_H
   84.35 +
   84.36 +#include <stdint.h>
   84.37 +#include "get_bits.h"
   84.38 +
   84.39 +#define INVALID_VLC           0x80000000
   84.40 +
   84.41 +extern const uint8_t ff_golomb_vlc_len[512];
   84.42 +extern const uint8_t ff_ue_golomb_vlc_code[512];
   84.43 +extern const  int8_t ff_se_golomb_vlc_code[512];
   84.44 +extern const uint8_t ff_ue_golomb_len[256];
   84.45 +
   84.46 +extern const uint8_t ff_interleaved_golomb_vlc_len[256];
   84.47 +extern const uint8_t ff_interleaved_ue_golomb_vlc_code[256];
   84.48 +extern const  int8_t ff_interleaved_se_golomb_vlc_code[256];
   84.49 +extern const uint8_t ff_interleaved_dirac_golomb_vlc_code[256];
   84.50 +
   84.51 +
   84.52 + /**
   84.53 + * read unsigned exp golomb code: short codes via lookup table, longer ones computed from the leading-zero count.
   84.54 + */
   84.55 +static inline int get_ue_golomb(GetBitContext *gb){
   84.56 +    unsigned int buf;
   84.57 +    int log;
   84.58 +
   84.59 +    OPEN_READER(re, gb);
   84.60 +    UPDATE_CACHE(re, gb);
   84.61 +    buf=GET_CACHE(re, gb);
   84.62 +
   84.63 +    if(buf >= (1<<27)){ /* a 1 within the first 5 bits: the code fits in 9 bits */
   84.64 +        buf >>= 32 - 9; /* top 9 bits index both tables */
   84.65 +        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
   84.66 +        CLOSE_READER(re, gb);
   84.67 +
   84.68 +        return ff_ue_golomb_vlc_code[buf];
   84.69 +    }else{
   84.70 +        log= 2*av_log2_c(buf) - 31; /* NOTE(review): if the cache holds no 1 bit, log presumably becomes negative and the shift below is undefined - confirm av_log2_c(0) and what callers guarantee */
   84.71 +        buf>>= log; /* keep prefix + suffix, i.e. value+1 */
   84.72 +        buf--;
   84.73 +        LAST_SKIP_BITS(re, gb, 32 - log); /* code length in bits */
   84.74 +        CLOSE_READER(re, gb);
   84.75 +
   84.76 +        return buf;
   84.77 +    }
   84.78 +}
   84.79 +
   84.80 + /**
   84.81 + * read unsigned exp golomb code, constrained to a max of 31 (table lookup only).
   84.82 + * the return value is undefined if the stored value exceeds 31.
   84.83 + */
   84.84 +static inline int get_ue_golomb_31(GetBitContext *gb){
   84.85 +    unsigned int buf;
   84.86 +
   84.87 +    OPEN_READER(re, gb);
   84.88 +    UPDATE_CACHE(re, gb);
   84.89 +    buf=GET_CACHE(re, gb);
   84.90 +
   84.91 +    buf >>= 32 - 9; /* top 9 bits index both tables */
   84.92 +    LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]); /* consume the code length from the table */
   84.93 +    CLOSE_READER(re, gb);
   84.94 +
   84.95 +    return ff_ue_golomb_vlc_code[buf];
   84.96 +}
   84.97 +
   84.98 +static inline int svq3_get_ue_golomb(GetBitContext *gb){ /* reads an unsigned interleaved exp golomb code: table lookup for short codes, byte-wise loop otherwise */
   84.99 +    uint32_t buf;
  84.100 +
  84.101 +    OPEN_READER(re, gb);
  84.102 +    UPDATE_CACHE(re, gb);
  84.103 +    buf=GET_CACHE(re, gb);
  84.104 +
  84.105 +    if(buf&0xAA800000){ /* mask selects bits 31, 29, 27, 25 and 23: one of them set means the code ends within the first 9 bits */
  84.106 +        buf >>= 32 - 8; /* top 8 bits index the interleaved tables */
  84.107 +        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
  84.108 +        CLOSE_READER(re, gb);
  84.109 +
  84.110 +        return ff_interleaved_ue_golomb_vlc_code[buf];
  84.111 +    }else{
  84.112 +        int ret = 1; /* accumulates value+1, starting from the implicit leading 1 */
  84.113 +
  84.114 +        while (1) {
  84.115 +            buf >>= 32 - 8;
  84.116 +            LAST_SKIP_BITS(re, gb, FFMIN(ff_interleaved_golomb_vlc_len[buf], 8)); /* never consume more than the byte just examined */
  84.117 +
  84.118 +            if (ff_interleaved_golomb_vlc_len[buf] != 9){ /* length 9 is the table's marker for "no end within this byte" */
  84.119 +                ret <<= (ff_interleaved_golomb_vlc_len[buf] - 1) >> 1; /* one data bit per bit pair */
  84.120 +                ret |= ff_interleaved_dirac_golomb_vlc_code[buf];
  84.121 +                break;
  84.122 +            }
  84.123 +            ret = (ret << 4) | ff_interleaved_dirac_golomb_vlc_code[buf]; /* a full byte contributes 4 data bits */
  84.124 +            UPDATE_CACHE(re, gb);
  84.125 +            buf = GET_CACHE(re, gb);
  84.126 +        }
  84.127 +
  84.128 +        CLOSE_READER(re, gb);
  84.129 +        return ret - 1;
  84.130 +    }
  84.131 +}
  84.132 +
  84.133 +/**
  84.134 + * read unsigned truncated exp golomb code; range 1 yields 0 without reading, range 2 reads one inverted bit.
  84.135 + */
  84.136 +static inline int get_te0_golomb(GetBitContext *gb, int range){
  84.137 +    assert(range >= 1);
  84.138 +
  84.139 +    if(range==1)      return 0; /* nothing is read from the stream */
  84.140 +    else if(range==2) return get_bits1(gb)^1; /* single bit, inverted */
  84.141 +    else              return get_ue_golomb(gb);
  84.142 +}
  84.143 +
  84.144 +/**
  84.145 + * read unsigned truncated exp golomb code; unlike get_te0_golomb, range 1 is not special-cased and reads a ue code.
  84.146 + */
  84.147 +static inline int get_te_golomb(GetBitContext *gb, int range){
  84.148 +    assert(range >= 1);
  84.149 +
  84.150 +    if(range==2) return get_bits1(gb)^1; /* single bit, inverted */
  84.151 +    else         return get_ue_golomb(gb);
  84.152 +}
  84.153 +
  84.154 +
  84.155 +/**
  84.156 + * read signed exp golomb code: short codes via lookup table, longer ones computed from the leading-zero count.
  84.157 + */
  84.158 +static inline int get_se_golomb(GetBitContext *gb){
  84.159 +    unsigned int buf;
  84.160 +    int log;
  84.161 +
  84.162 +    OPEN_READER(re, gb);
  84.163 +    UPDATE_CACHE(re, gb);
  84.164 +    buf=GET_CACHE(re, gb);
  84.165 +
  84.166 +    if(buf >= (1<<27)){ /* a 1 within the first 5 bits: the code fits in 9 bits */
  84.167 +        buf >>= 32 - 9; /* top 9 bits index both tables */
  84.168 +        LAST_SKIP_BITS(re, gb, ff_golomb_vlc_len[buf]);
  84.169 +        CLOSE_READER(re, gb);
  84.170 +
  84.171 +        return ff_se_golomb_vlc_code[buf];
  84.172 +    }else{
  84.173 +        log= 2*av_log2_c(buf) - 31; /* NOTE(review): if the cache holds no 1 bit, log presumably becomes negative and the shift below is undefined - confirm */
  84.174 +        buf>>= log; /* keep prefix + suffix */
  84.175 +
  84.176 +        LAST_SKIP_BITS(re, gb, 32 - log); /* code length in bits */
  84.177 +        CLOSE_READER(re, gb);
  84.178 +
  84.179 +        if(buf&1) buf= -(buf>>1); /* odd code number maps to a negative value */
  84.180 +        else      buf=  (buf>>1); /* even code number maps to a positive value */
  84.181 +
  84.182 +        return buf;
  84.183 +    }
  84.184 +}
  84.185 +
  84.186 +static inline int svq3_get_se_golomb(GetBitContext *gb){ /* reads a signed interleaved exp golomb code; returns INVALID_VLC when no terminator bit is found */
  84.187 +    unsigned int buf;
  84.188 +    int log;
  84.189 +
  84.190 +    OPEN_READER(re, gb);
  84.191 +    UPDATE_CACHE(re, gb);
  84.192 +    buf=GET_CACHE(re, gb);
  84.193 +
  84.194 +    if(buf&0xAA800000){ /* mask selects bits 31, 29, 27, 25 and 23: the code ends within the first 9 bits */
  84.195 +        buf >>= 32 - 8; /* top 8 bits index the interleaved tables */
  84.196 +        LAST_SKIP_BITS(re, gb, ff_interleaved_golomb_vlc_len[buf]);
  84.197 +        CLOSE_READER(re, gb);
  84.198 +
  84.199 +        return ff_interleaved_se_golomb_vlc_code[buf];
  84.200 +    }else{
  84.201 +        LAST_SKIP_BITS(re, gb, 8);
  84.202 +        UPDATE_CACHE(re, gb);
  84.203 +        buf |= 1 | (GET_CACHE(re, gb) >> 8); /* widen the window by the bits following the 8 just skipped; the forced low bit is the last candidate position */
  84.204 +
  84.205 +        if((buf & 0xAAAAAAAA) == 0) /* no bit set at any odd position */
  84.206 +            return INVALID_VLC; /* returns without CLOSE_READER, so the 8 bits skipped above are not committed to gb */
  84.207 +
  84.208 +        for(log=31; (buf & 0x80000000) == 0; log--){ /* runs until the MSB of buf is set, counting log down once per iteration */
  84.209 +            buf = (buf << 2) - ((buf << log) >> (log - 1)) + (buf >> 30);
  84.210 +        }
  84.211 +
  84.212 +        LAST_SKIP_BITS(re, gb, 63 - 2*log - 8); /* remaining code length beyond the 8 bits skipped earlier */
  84.213 +        CLOSE_READER(re, gb);
  84.214 +
  84.215 +        return (signed) (((((buf << log) >> log) - 1) ^ -(buf & 0x1)) + 1) >> 1; /* lowest bit of buf selects the sign */
  84.216 +    }
  84.217 +}
  84.218 +
  84.219 +static inline int dirac_get_se_golomb(GetBitContext *gb){
  84.220 +    uint32_t buf;
  84.221 +    uint32_t ret;
  84.222 +
  84.223 +    ret = svq3_get_ue_golomb(gb);
  84.224 +
  84.225 +    if (ret) {
  84.226 +        OPEN_READER(re, gb);
  84.227 +        UPDATE_CACHE(re, gb);
  84.228 +        buf = SHOW_SBITS(re, gb, 1);
  84.229 +        LAST_SKIP_BITS(re, gb, 1);
  84.230 +        ret = (ret ^ buf) - buf;
  84.231 +        CLOSE_READER(re, gb);
  84.232 +    }
  84.233 +
  84.234 +    return ret;
  84.235 +}
  84.236 +
  84.237 +/**
  84.238 + * read unsigned golomb rice code (ffv1).
  84.239 + */
  84.240 +static inline int get_ur_golomb(GetBitContext *gb, int k, int limit, int esc_len){
  84.241 +    unsigned int buf;
  84.242 +    int log;
  84.243 +
  84.244 +    OPEN_READER(re, gb);
  84.245 +    UPDATE_CACHE(re, gb);
  84.246 +    buf=GET_CACHE(re, gb);
  84.247 +
  84.248 +    log= av_log2_c(buf);
  84.249 +
  84.250 +    if(log > 31-limit){
  84.251 +        buf >>= log - k;
  84.252 +        buf += (30-log)<<k;
  84.253 +        LAST_SKIP_BITS(re, gb, 32 + k - log);
  84.254 +        CLOSE_READER(re, gb);
  84.255 +
  84.256 +        return buf;
  84.257 +    }else{
  84.258 +        LAST_SKIP_BITS(re, gb, limit);
  84.259 +        UPDATE_CACHE(re, gb);
  84.260 +
  84.261 +        buf = SHOW_UBITS(re, gb, esc_len);
  84.262 +
  84.263 +        LAST_SKIP_BITS(re, gb, esc_len);
  84.264 +        CLOSE_READER(re, gb);
  84.265 +
  84.266 +        return buf + limit - 1;
  84.267 +    }
  84.268 +}
  84.269 +
  84.270 +/**
  84.271 + * read unsigned golomb rice code (jpegls).
  84.272 + */
  84.273 +static inline int get_ur_golomb_jpegls(GetBitContext *gb, int k, int limit, int esc_len){
  84.274 +    unsigned int buf;
  84.275 +    int log;
  84.276 +
  84.277 +    OPEN_READER(re, gb);
  84.278 +    UPDATE_CACHE(re, gb);
  84.279 +    buf=GET_CACHE(re, gb);
  84.280 +
  84.281 +    log= av_log2_c(buf);
  84.282 +
  84.283 +    if(log - k >= 32-MIN_CACHE_BITS+(MIN_CACHE_BITS==32) && 32-log < limit){
  84.284 +        buf >>= log - k;
  84.285 +        buf += (30-log)<<k;
  84.286 +        LAST_SKIP_BITS(re, gb, 32 + k - log);
  84.287 +        CLOSE_READER(re, gb);
  84.288 +
  84.289 +        return buf;
  84.290 +    }else{
  84.291 +        int i;
  84.292 +        for(i=0; SHOW_UBITS(re, gb, 1) == 0; i++){
  84.293 +            LAST_SKIP_BITS(re, gb, 1);
  84.294 +            UPDATE_CACHE(re, gb);
  84.295 +        }
  84.296 +        SKIP_BITS(re, gb, 1);
  84.297 +
  84.298 +        if(i < limit - 1){
  84.299 +            if(k){
  84.300 +                buf = SHOW_UBITS(re, gb, k);
  84.301 +                LAST_SKIP_BITS(re, gb, k);
  84.302 +            }else{
  84.303 +                buf=0;
  84.304 +            }
  84.305 +
  84.306 +            CLOSE_READER(re, gb);
  84.307 +            return buf + (i<<k);
  84.308 +        }else if(i == limit - 1){
  84.309 +            buf = SHOW_UBITS(re, gb, esc_len);
  84.310 +            LAST_SKIP_BITS(re, gb, esc_len);
  84.311 +            CLOSE_READER(re, gb);
  84.312 +
  84.313 +            return buf + 1;
  84.314 +        }else
  84.315 +            return -1;
  84.316 +    }
  84.317 +}
  84.318 +
  84.319 +/**
  84.320 + * read signed golomb rice code (ffv1).
  84.321 + */
  84.322 +static inline int get_sr_golomb(GetBitContext *gb, int k, int limit, int esc_len){
  84.323 +    int v= get_ur_golomb(gb, k, limit, esc_len);
  84.324 +
  84.325 +    v++;
  84.326 +    if (v&1) return v>>1;
  84.327 +    else return -(v>>1);
  84.328 +
  84.329 +//    return (v>>1) ^ -(v&1);
  84.330 +}
  84.331 +
  84.332 +/**
  84.333 + * read signed golomb rice code (flac).
  84.334 + */
  84.335 +static inline int get_sr_golomb_flac(GetBitContext *gb, int k, int limit, int esc_len){
  84.336 +    int v= get_ur_golomb_jpegls(gb, k, limit, esc_len);
  84.337 +    return (v>>1) ^ -(v&1);
  84.338 +}
  84.339 +
  84.340 +/**
  84.341 + * read unsigned golomb rice code (shorten).
  84.342 + */
  84.343 +static inline unsigned int get_ur_golomb_shorten(GetBitContext *gb, int k){
  84.344 +        return get_ur_golomb_jpegls(gb, k, INT_MAX, 0);
  84.345 +}
  84.346 +
  84.347 +/**
  84.348 + * read signed golomb rice code (shorten).
  84.349 + */
  84.350 +static inline int get_sr_golomb_shorten(GetBitContext* gb, int k)
  84.351 +{
  84.352 +    int uvar = get_ur_golomb_jpegls(gb, k + 1, INT_MAX, 0);
  84.353 +    if (uvar & 1)
  84.354 +        return ~(uvar >> 1);
  84.355 +    else
  84.356 +        return uvar >> 1;
  84.357 +}
  84.358 +
  84.359 +
  84.360 +
  84.361 +#ifdef TRACE
  84.362 +
  84.363 +static inline int get_ue(GetBitContext *s, char *file, const char *func, int line){
  84.364 +    int show= show_bits(s, 24);
  84.365 +    int pos= get_bits_count(s);
  84.366 +    int i= get_ue_golomb(s);
  84.367 +    int len= get_bits_count(s) - pos;
  84.368 +    int bits= show>>(24-len);
  84.369 +
  84.370 +    print_bin(bits, len);
  84.371 +
  84.372 +    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d ue  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
  84.373 +
  84.374 +    return i;
  84.375 +}
  84.376 +
  84.377 +static inline int get_se(GetBitContext *s, char *file, const char *func, int line){
  84.378 +    int show= show_bits(s, 24);
  84.379 +    int pos= get_bits_count(s);
  84.380 +    int i= get_se_golomb(s);
  84.381 +    int len= get_bits_count(s) - pos;
  84.382 +    int bits= show>>(24-len);
  84.383 +
  84.384 +    print_bin(bits, len);
  84.385 +
  84.386 +    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d se  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
  84.387 +
  84.388 +    return i;
  84.389 +}
  84.390 +
  84.391 +static inline int get_te(GetBitContext *s, int r, char *file, const char *func, int line){
  84.392 +    int show= show_bits(s, 24);
  84.393 +    int pos= get_bits_count(s);
  84.394 +    int i= get_te0_golomb(s, r);
  84.395 +    int len= get_bits_count(s) - pos;
  84.396 +    int bits= show>>(24-len);
  84.397 +
  84.398 +    print_bin(bits, len);
  84.399 +
  84.400 +    av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d te  @%5d in %s %s:%d\n", bits, len, i, pos, file, func, line);
  84.401 +
  84.402 +    return i;
  84.403 +}
  84.404 +
  84.405 +#define get_ue_golomb(a) get_ue(a, __FILE__, __PRETTY_FUNCTION__, __LINE__)
  84.406 +#define get_se_golomb(a) get_se(a, __FILE__, __PRETTY_FUNCTION__, __LINE__)
  84.407 +#define get_te_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__)
  84.408 +#define get_te0_golomb(a, r) get_te(a, r, __FILE__, __PRETTY_FUNCTION__, __LINE__)
  84.409 +
  84.410 +#endif
  84.411 +
  84.412 +
  84.413 +#endif /* AVCODEC_GOLOMB_H */

    85.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    85.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264.c	Mon Aug 27 12:09:56 2012 +0200
    85.3 @@ -0,0 +1,215 @@
    85.4 +#include "config.h"
    85.5 +#include "h264.h"
    85.6 +#include "h264_misc.h"
    85.7 +#include <math.h>
    85.8 +
    85.9 +H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int width, int height, h264_options *opts){
   85.10 +    int i;
   85.11 +    const int mb_height = (height + 15) / 16;
   85.12 +    const int mb_width  = (width  + 15) / 16;
   85.13 +    const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16
   85.14 +
   85.15 +    ff_init_cabac_states();
   85.16 +
   85.17 +    H264Context *h= av_mallocz(sizeof(H264Context));
   85.18 +
   85.19 +    start_timer(h, TOTAL);
   85.20 +    h->file_name = file_name;
   85.21 +    h->profile = opts->profile;
   85.22 +    for (i=0; i<PROFILE_STAGES; i++)
   85.23 +        h->total_time[i]=0;
   85.24 +
   85.25 +    h->ifile=ifile;
   85.26 +    h->ofile =ofile;
   85.27 +
   85.28 +    h->verbose =opts->verbose;
   85.29 +    h->no_mbd =opts->no_mbd;
   85.30 +    h->static_3d =opts->static_3d;
   85.31 +    h->pipe_bufs = opts->pipe_bufs;
   85.32 +    h->slice_bufs = opts->slice_bufs;
   85.33 +
   85.34 +    h->ed_ppe_threads =0;
   85.35 +    if (opts->ppe_ed){
   85.36 +        h->ed_ppe_threads = (opts->threads >opts->ppe_ed)? opts->ppe_ed :opts->threads;
   85.37 +    }
   85.38 +
   85.39 +    h->threads = opts->threads - h->ed_ppe_threads;
   85.40 +    h->smt = opts->smt;
   85.41 +    if (h->smt){
   85.42 +        h->threads *= 2;
   85.43 +    }
   85.44 +
   85.45 +    h->num_frames = opts->numframes;
   85.46 +
   85.47 +    h->frame_width = width;
   85.48 +    h->frame_height = height;
   85.49 +
   85.50 +    while ((width/2) %STRIDE_ALIGN)
   85.51 +        width+=STRIDE_ALIGN;
   85.52 +    h->width = width;
   85.53 +    h->height = mb_height*16;
   85.54 +
   85.55 +    h->mb_height = mb_height;
   85.56 +    h->mb_width = mb_width;
   85.57 +    h->mb_stride = mb_stride;
   85.58 +    h->b4_stride = mb_width*4 + 1;
   85.59 +    h->b_stride = mb_width*4;
   85.60 +
   85.61 +    h->smb_width = opts->smb_size[0];
   85.62 +    h->smb_height = opts->smb_size[1] < h->smb_width ?  opts->smb_size[1]  : h->smb_width;
   85.63 +    h->smbc = getSuperMBContext(h, h->smb_width, h->smb_height);    
   85.64 +
   85.65 +    h->wave_order = opts->wave_order;
   85.66 +
   85.67 +    h->pipe_bufs = opts->pipe_bufs;
   85.68 +
   85.69 +    h->max_dpb_cnt = DPB_SIZE + opts->pipe_bufs;
   85.70 +    h->free_dpb_cnt = h->max_dpb_cnt;
   85.71 +    h->dpb = av_mallocz (h->max_dpb_cnt* sizeof (DecodedPicture));
   85.72 +    
   85.73 +
   85.74 +    h->free_sb_cnt = h->threads*opts->slice_bufs + (h->no_mbd != 0) ;  //one extra to overlap some latency of signaling/freeing slicebuffers in entropy only mode
   85.75 +    h->sb_size = h->free_sb_cnt;
   85.76 +    h->sb = av_mallocz(h->sb_size* sizeof(SliceBufferEntry));
   85.77 +
   85.78 +    h->rl_q.size = FFMAX(1, FFMIN( (h->height-3 - 512)/16, h->mb_width/2)) +1;
   85.79 +    h->rl_q.free = h->rl_q.size -1;
   85.80 +    h->rl_q.ready=0;
   85.81 +    h->rl_q.fi = h->rl_q.fo= 0;
   85.82 +    h->rl_q.queue = av_malloc(h->rl_q.size* sizeof(RingLineEntry*));
   85.83 +    for (i=0; i<h->rl_q.size; i++){
   85.84 +        if( posix_memalign((void**)&h->rl_q.queue[i],64,sizeof(RingLineEntry)))
   85.85 +            h->rl_q.queue[i]=NULL;
   85.86 +        h->rl_q.queue[i]->top = av_malloc(h->mb_width*sizeof(TopBorder));
   85.87 +    }
   85.88 +
   85.89 +    h->rl_q.queue[0]->prev_line = h->rl_q.queue[h->rl_q.size-1];
   85.90 +    for (i=1; i<h->rl_q.size; i++){
   85.91 +        h->rl_q.queue[i]->prev_line = h->rl_q.queue[i-1];
   85.92 +    }
   85.93 +
   85.94 +    if( HAVE_MMX | HAVE_ALTIVEC| HAVE_NEON ){
   85.95 +        for(i=0; i<16; i++){
   85.96 +            #define T(x) (x>>2) | ((x<<2) & 0xF)
   85.97 +            h->zigzag_scan[i] = T(zigzag_scan[i]);
   85.98 +            #undef T
   85.99 +        }
  85.100 +        for(i=0; i<64; i++){
  85.101 +            #define T(x) (x>>3) | ((x&7)<<3)
  85.102 +            h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
  85.103 +            #undef T
  85.104 +        }
  85.105 +    }else{
  85.106 +        memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
  85.107 +        memcpy(h->zigzag_scan8x8, ff_zigzag_direct, 64*sizeof(uint8_t));
  85.108 +    }
  85.109 +
  85.110 +    pthread_mutex_init(&h->smb_lock, NULL);
  85.111 +    pthread_mutex_init(&h->sdl_lock, NULL);
  85.112 +    pthread_cond_init(&h->sdl_cond, NULL);
  85.113 +
  85.114 +    ///pthread initialization
  85.115 +    pthread_mutex_init(&h->ilock, NULL);
  85.116 +    pthread_cond_init(&h->icond, NULL);
  85.117 +    pthread_mutex_init(&h->slock, NULL);
  85.118 +    pthread_cond_init(&h->scond, NULL);
  85.119 +    pthread_mutex_init(&h->tlock, NULL);
  85.120 +    pthread_cond_init(&h->tcond, NULL);
  85.121 +    pthread_mutex_init(&h->tdlock, NULL);
  85.122 +    pthread_cond_init(&h->tdcond, NULL);
  85.123 +    h->start =!opts->numamap; //default dont wait for start signal
  85.124 +    h->statmbd = opts->statmbd;
  85.125 +    h->rl_side_touch= opts->numamap;
  85.126 +    h->touch_start=0;
  85.127 +    h->setaff =opts->statsched;
  85.128 +    h->init_threads=0;
  85.129 +
  85.130 +    pthread_mutex_init(&h->task_lock, NULL);
  85.131 +    pthread_cond_init(&h->task_cond, NULL);
  85.132 +    for (i=0; i<STAGES; i++){
  85.133 +        pthread_mutex_init (&h->lock[i], NULL);
  85.134 +        pthread_cond_init (&h->cond[i], NULL);
  85.135 +
  85.136 +        pthread_mutex_init (&h->sb_q[i].lock, NULL);
  85.137 +        pthread_cond_init (&h->sb_q[i].cond, NULL);
  85.138 +        h->sb_q[i].size = h->free_sb_cnt; //change to num threads later
  85.139 +        h->sb_q[i].queue = av_malloc(h->free_sb_cnt* sizeof(SliceBufferEntry*));
  85.140 +        h->sb_q[i].cnt = h->sb_q[i].fi = h->sb_q[i].fo =0;
  85.141 +    }
  85.142 +
  85.143 +#if HAVE_LIBSDL2
  85.144 +    h->sdlq.size=2;
  85.145 +    h->sdlq.ready=2;
  85.146 +    h->sdlq.queue = av_malloc(2* sizeof(SDL_Texture*));
  85.147 +    pthread_mutex_init (&h->sdlq.sdl_lock, NULL);
  85.148 +    pthread_cond_init (&h->sdlq.sdl_cond, NULL);
  85.149 +#endif
  85.150 +
  85.151 +    h->display=opts->display;
  85.152 +    h->fullscreen=opts->fullscreen;
  85.153 +
  85.154 +    return h;
  85.155 +}
  85.156 +
  85.157 +
  85.158 +void free_h264dec_context(H264Context *h) {
  85.159 +    int i;
  85.160 +
  85.161 +    for(i=0; i<h->max_dpb_cnt; i++)
  85.162 +        free_dp(&h->dpb[i]);
  85.163 +    av_free (h->dpb);
  85.164 +
  85.165 +    for(i=0; i<h->sb_size; i++){
  85.166 +        if (h->sb[i].initialized){
  85.167 +            free_sb_entry(&h->sb[i]);
  85.168 +        }
  85.169 +    }
  85.170 +    av_freep(&h->sb);
  85.171 +
  85.172 +    for (i=0; i<h->rl_q.size; i++){
  85.173 +        av_freep(&h->rl_q.queue[i]->top);
  85.174 +        av_freep(&h->rl_q.queue[i]);
  85.175 +    }
  85.176 +    av_freep(&h->rl_q.queue);
  85.177 +
  85.178 +    ///pthread cleanup
  85.179 +    pthread_mutex_destroy (&h->task_lock);
  85.180 +    pthread_cond_destroy (&h->task_cond);
  85.181 +    for (i=0; i<STAGES; i++){
  85.182 +        pthread_mutex_destroy (&h->lock[i]);
  85.183 +        pthread_cond_destroy (&h->cond[i]);
  85.184 +
  85.185 +        pthread_mutex_destroy (&h->sb_q[i].lock);
  85.186 +        pthread_cond_destroy (&h->sb_q[i].cond);
  85.187 +        av_freep( &h->sb_q[i].queue);
  85.188 +    }
  85.189 +    pthread_mutex_destroy (&h->slock);
  85.190 +    pthread_cond_destroy (&h->scond);
  85.191 +    pthread_mutex_destroy (&h->ilock);
  85.192 +    pthread_cond_destroy (&h->icond);
  85.193 +
  85.194 +    pthread_mutex_destroy(&h->smb_lock);
  85.195 +    pthread_mutex_destroy (&h->sdl_lock);
  85.196 +    pthread_cond_destroy (&h->sdl_cond);
  85.197 +#if HAVE_LIBSDL2
  85.198 +    av_free(h->sdlq.queue);
  85.199 +    pthread_mutex_destroy (&h->sdlq.sdl_lock);
  85.200 +    pthread_cond_destroy (&h->sdlq.sdl_cond);
  85.201 +#endif
  85.202 +
  85.203 +    stop_timer(h, TOTAL);
  85.204 +    if (h->threads==0){
  85.205 +        for (i=0; i<PROFILE_STAGES; i++)
  85.206 +            h->total_time[i] /= h->num_frames;
  85.207 +        double others = h->total_time[TOTAL];
  85.208 +        for (i=1; i<PROFILE_STAGES; i++)
  85.209 +            others-=h->total_time[i];
  85.210 +        if (h->profile == 1){
  85.211 +            printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [MBREC %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED], h->total_time[REC], others);
  85.212 +        }else if (h->profile ==2){
  85.213 +            printf("\n[FRAME %.3fms] [FRONT %.3fms] [ENTROPY %.3fms] [PRED  %.3fms] [OTHERS %.3fms]\n", h->total_time[TOTAL], h->total_time[FRONT], h->total_time[ED],h->total_time[REC], others);
  85.214 +        }
  85.215 +    }
  85.216 +
  85.217 +    av_free(h);
  85.218 +}
  85.219 \ No newline at end of file

    86.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    86.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264.h	Mon Aug 27 12:09:56 2012 +0200
    86.3 @@ -0,0 +1,76 @@
    86.4 +/*
    86.5 +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    86.6 +* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    86.7 +*
    86.8 +* This file is part of FFmpeg.
    86.9 +*
   86.10 +* FFmpeg is free software; you can redistribute it and/or
   86.11 +* modify it under the terms of the GNU Lesser General Public
   86.12 +* License as published by the Free Software Foundation; either
   86.13 +* version 2.1 of the License, or (at your option) any later version.
   86.14 +*
   86.15 +* FFmpeg is distributed in the hope that it will be useful,
   86.16 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
   86.17 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   86.18 +* Lesser General Public License for more details.
   86.19 +*
   86.20 +* You should have received a copy of the GNU Lesser General Public
   86.21 +* License along with FFmpeg; if not, write to the Free Software
   86.22 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   86.23 +*/
   86.24 +
   86.25 +/**
   86.26 +* @file
   86.27 +* H.264 / AVC / MPEG4 part10 codec.
   86.28 +* @author Michael Niedermayer <michaelni@gmx.at>
   86.29 +*/
   86.30 +
   86.31 +#ifndef H264_H
   86.32 +#define H264_H
   86.33 +
   86.34 +#include "h264_entropy.h"
   86.35 +#include "h264_data.h"
   86.36 +#include "h264_mc.h"
   86.37 +#include "h264_misc.h"
   86.38 +#include "h264_dsp.h"
   86.39 +#include "h264_pred.h"
   86.40 +#include "h264_parser.h"
   86.41 +#include "h264_nal.h"
   86.42 +#include "h264_rec.h"
   86.43 +#include "h264_deblock.h"
   86.44 +#include "h264_types.h"
   86.45 +
   86.46 +typedef struct h264_options{ // decoder options; get_h264dec_context() copies them into the H264Context
   86.47 +    int statsched;      // stored as H264Context.setaff
   86.48 +    int statmbd;        // stored as H264Context.statmbd
   86.49 +    int numamap;        // stored as rl_side_touch; H264Context.start is set to !numamap
   86.50 +    int no_mbd;         // non-zero adds one extra slice buffer (entropy-only mode)
   86.51 +    int numframes;      // stored as num_frames; divisor of the per-frame timing report
   86.52 +    int display;        // stored as H264Context.display
   86.53 +    int fullscreen;     // stored as H264Context.fullscreen
   86.54 +    int verbose;        // stored as H264Context.verbose
   86.55 +    int ppe_ed;         // only useful for Cell; ed_ppe_threads = min(threads, ppe_ed) when non-zero
   86.56 +    int profile;        // 1 or 2 selects the timing report printed by free_h264dec_context()
   86.57 +    int threads;        // total thread count; ed_ppe_threads is subtracted to get H264Context.threads
   86.58 +    int smb_size[2];    // only useful for OmpSs; [0] = smb_width, [1] = smb_height (clamped to smb_width)
   86.59 +    int wave_order;     // stored as H264Context.wave_order
   86.60 +    int static_3d;      // stored as H264Context.static_3d
   86.61 +    int pipe_bufs;      // added to DPB_SIZE to size the decoded-picture buffer
   86.62 +    int slice_bufs;     // slice buffers per thread
   86.63 +    int smt;            // non-zero doubles H264Context.threads
   86.64 +}h264_options;
   86.65 +
   86.66 +int h264_decode_cell(H264Context *h);
   86.67 +int h264_decode_cell_seq(H264Context *h);
   86.68 +
   86.69 +int h264_decode_ompss(H264Context *h);
   86.70 +
   86.71 +int h264_decode_pthread(H264Context *h);
   86.72 +int h264_decode_seq(H264Context *h);
   86.73 +
   86.74 +
   86.75 +H264Context *get_h264dec_context(const char *file_name, int ifile, int ofile, int frame_width, int frame_height, h264_options *opts);
   86.76 +void free_h264dec_context(H264Context *h);
   86.77 +
   86.78 +
   86.79 +#endif /* H264_H */

    87.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    87.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_cell.c	Mon Aug 27 12:09:56 2012 +0200
    87.3 @@ -0,0 +1,1242 @@
    87.4 +
    87.5 +#include "h264_types.h"
    87.6 +#include "h264_parser.h"
    87.7 +#include "h264_nal.h"
    87.8 +#include "h264_entropy.h"
    87.9 +#include "h264_rec.h"
   87.10 +#include "h264_misc.h"
   87.11 +#include "cell/h264_types_spu.h"
   87.12 +#include "h264_pthread.h"
   87.13 +
   87.14 +#include <pthread.h>
   87.15 +#include <assert.h>
   87.16 +#include <unistd.h>
   87.17 +
   87.18 +#include <libspe2.h>
   87.19 +#include <ppu_intrinsics.h>
   87.20 +#include <cbe_mfc.h>
   87.21 +#include <libsync.h>
   87.22 +
   87.23 +// spe global variables
   87.24 +unsigned rl_cnt_var, rl_mutex_var, rl_cond_var;
   87.25 +atomic_ea_t rl_cnt;
   87.26 +cond_ea_t rl_cond;
   87.27 +mutex_ea_t rl_lock;
   87.28 +
   87.29 +H264spe * spe_params;
   87.30 +unsigned mutex_var[16];
   87.31 +unsigned cond_var[16];
   87.32 +unsigned atomic_var[16];
   87.33 +
   87.34 +pthread_t * spe_tid;
   87.35 +spe_context_ptr_t *spe_context;
   87.36 +void** spe_control_area;
   87.37 +void** spe_ls_area;
   87.38 +H264slice **spe_slice_buf;
   87.39 +
   87.40 +H264spe * spe_ed_params; // per-worker parameter blocks, allocated in create_spe_ED_threads()
   87.41 +unsigned mutex_ed_var[16]; // backing words for spe_ed_params[i].lock; indexed by worker, so at most 16 workers
   87.42 +unsigned cond_ed_var[16]; // backing words for spe_ed_params[i].cond
   87.43 +unsigned atomic_ed_var[16]; // backing words for spe_ed_params[i].cnt
   87.44 +
   87.45 +pthread_t * spe_ed_tid; // one PPE thread per ED worker, each running spe_ed_thread()
   87.46 +spe_context_ptr_t *spe_ed_context; // SPE context of each ED worker
   87.47 +void** spe_ed_control_area; // control area of each ED worker, used for mailbox access
   87.48 +void** spe_ed_ls_area; // local-store area of each ED worker, handed out as src_spe/tgt_spe
   87.49 +EDSlice_spu **spe_ed_slice_buf; // slice buffer address each worker reports through its outbound mailbox
   87.50 +
   87.51 +//structs to propagate stop signal
   87.52 +MBSlice last_slice;
   87.53 +EDSlice last_ed_slice;
   87.54 +DecodedPicture last_pic;
   87.55 +RawFrame last_frm;
   87.56 +
   87.57 +static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){
   87.58 +    int i;
   87.59 +    int cnt = *poc_cnt;
   87.60 +    for(i=0; i<cnt; i++){
   87.61 +        if (poc_list[i]==s->ref_list[1][0]->poc){
   87.62 +            *poc_cnt=i+1;
   87.63 +            while(++i<cnt)
   87.64 +                poc_list[i]=0;
   87.65 +            return 1;
   87.66 +        }
   87.67 +    }
   87.68 +    return 0;
   87.69 +}
   87.70 +
   87.71 +static void update_IP_poc_list(int *poc_list, int *poc_cnt, int poc) {
   87.72 +    int i=0;
   87.73 +    int cnt = *poc_cnt;
   87.74 +
   87.75 +    while (poc_list[i] > poc) { i++;}
   87.76 +    if ( i< cnt)
   87.77 +        memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int));
   87.78 +
   87.79 +    poc_list[i]=poc;
   87.80 +    (*poc_cnt)++;
   87.81 +}
   87.82 +
   87.83 +static void *spe_ed_thread(void *arg){
   87.84 +    H264spe *params = (H264spe *)arg;
   87.85 +    unsigned int idx = params->idx;
   87.86 +    unsigned int runflags = 0;
   87.87 +    unsigned int entry = SPE_DEFAULT_ENTRY;
   87.88 +    // run SPE context
   87.89 +    spe_context_run(spe_ed_context[idx],  &entry, runflags, (void*) params, NULL, NULL);
   87.90 +    // done - now exit thread
   87.91 +    pthread_exit(NULL);
   87.92 +}
   87.93 +
   87.94 +static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) {
   87.95 +    int i;
   87.96 +    int num_threads = ip_threads+b_threads;
   87.97 +    spe_program_handle_t * spe_program = spe_image_open("spe_ed");
   87.98 +    // reserve memory for spe thread id, context and argument addresses
   87.99 +    spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t));
  87.100 +    spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
  87.101 +    spe_ed_params = av_malloc(num_threads * sizeof (H264spe));
  87.102 +    spe_ed_control_area = av_malloc(num_threads * sizeof (void*));
  87.103 +    spe_ed_ls_area = av_malloc(num_threads * sizeof (void*));
  87.104 +    spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*));
  87.105 +
  87.106 +    if (spe_program == NULL)
  87.107 +        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
  87.108 +
  87.109 +    for (i = 0; i < num_threads; i++) {
  87.110 +        // create context for spe program
  87.111 +        spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL);
  87.112 +        if (spe_ed_context[i] == NULL)
  87.113 +            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
  87.114 +        // load SPE program into main memory
  87.115 +        if ((spe_program_load(spe_ed_context[i], spe_program)) == -1)
  87.116 +            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
  87.117 +        //get the control_area for fast mailboxing
  87.118 +        if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL)
  87.119 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
  87.120 +        //get ls area for inter spe communication
  87.121 +        if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL)
  87.122 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
  87.123 +    }
  87.124 +
  87.125 +    for (i = 0; i < ip_threads; i++) {
  87.126 +        spe_ed_params[i].mb_width = h->mb_width;
  87.127 +        spe_ed_params[i].mb_stride = h->mb_stride;
  87.128 +        spe_ed_params[i].mb_height = h->mb_height;
  87.129 +        spe_ed_params[i].type = EDIP;
  87.130 +        spe_ed_params[i].spe_id = i;
  87.131 +        spe_ed_params[i].idx = i;
  87.132 +        //spe_ed_params[i].spe_total = ip_threads; //not used
  87.133 +        //spe_params[i].slice_params= &slice_params;
  87.134 +        spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
  87.135 +        spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads];
  87.136 +
  87.137 +        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
  87.138 +        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
  87.139 +        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
  87.140 +
  87.141 +        mutex_init(spe_ed_params[i].lock);
  87.142 +        cond_init(spe_ed_params[i].cond);
  87.143 +        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
  87.144 +            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
  87.145 +
  87.146 +        //slicebufaddr
  87.147 +        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
  87.148 +        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
  87.149 +    }
  87.150 +    for (int j = 0; j < b_threads; j++) {
  87.151 +        i = j+ip_threads;
  87.152 +        spe_ed_params[i].mb_width = h->mb_width;
  87.153 +        spe_ed_params[i].mb_stride = h->mb_stride;
  87.154 +        spe_ed_params[i].mb_height = h->mb_height;
  87.155 +        spe_ed_params[i].type = EDB;
  87.156 +        spe_ed_params[i].idx = i;
  87.157 +        spe_ed_params[i].spe_id = j;
  87.158 +        spe_ed_params[i].spe_total = b_threads;
  87.159 +        //spe_params[i].slice_params= &slice_params;
  87.160 +        //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
  87.161 +        spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads];
  87.162 +
  87.163 +        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
  87.164 +        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
  87.165 +        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
  87.166 +
  87.167 +        mutex_init(spe_ed_params[i].lock);
  87.168 +        cond_init(spe_ed_params[i].cond);
  87.169 +        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
  87.170 +            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
  87.171 +
  87.172 +        //slicebufaddr
  87.173 +        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
  87.174 +        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
  87.175 +    }
  87.176 +    spe_image_close(spe_program);
  87.177 +
  87.178 +}
  87.179 +
  87.180 +static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){
  87.181 +    dst->pps 	= src->pps;
  87.182 +    dst->mbs 	= src->mbs;
  87.183 +    dst->state 	= src->state;
  87.184 +    dst->qp_thresh = src->qp_thresh;
  87.185 +    dst->pic	= *src->current_picture;
  87.186 +
  87.187 +    dst->ref_count[0] = src->ref_count[0];
  87.188 +    dst->ref_count[1] = src->ref_count[1];
  87.189 +    dst->slice_type	  = src->slice_type;
  87.190 +    dst->slice_type_nos = src->slice_type_nos;
  87.191 +    dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag;
  87.192 +    dst->list_count = src->list_count;
  87.193 +    dst->coded_pic_num = src->coded_pic_num;
  87.194 +
  87.195 +    GetBitContext *gb = &src->gb;
  87.196 +    align_get_bits( gb);
  87.197 +    dst->bytestream_start = gb->buffer + get_bits_count(gb)/8;
  87.198 +    dst->byte_bufsize = (get_bits_left(gb) + 7)/8;
  87.199 +
  87.200 +    dst->transform_bypass = src->transform_bypass;
  87.201 +    dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred;
  87.202 +    memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int));
  87.203 +    memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int));
  87.204 +    dst->cabac_init_idc = src->cabac_init_idc;
  87.205 +    memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int));
  87.206 +    dst->chroma_qp[0]= src->chroma_qp[0];
  87.207 +    dst->chroma_qp[1]= src->chroma_qp[1];
  87.208 +    dst->qscale = src->qscale;
  87.209 +    dst->last_qscale_diff = src->last_qscale_diff;
  87.210 +
  87.211 +    if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0];
  87.212 +}
  87.213 +
  87.214 +static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){
  87.215 +    unsigned status;
  87.216 +
  87.217 +    spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0);
  87.218 +    spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, SPE_TAG_ALL, &status);
  87.219 +
  87.220 +
  87.221 +    _spe_in_mbox_write(spe_ed_control_area[id], 0);
  87.222 +
  87.223 +    while (!spe_out_mbox_status(spe_ed_context[id])){
  87.224 +        //pthread_yield();
  87.225 +        usleep(1000);
  87.226 +    }
  87.227 +    _spe_out_mbox_read(spe_ed_control_area[id]);
  87.228 +}
  87.229 +
  87.230 +static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){
  87.231 +    int i,j;
  87.232 +
  87.233 +    if( !s->pps.cabac ){
  87.234 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
  87.235 +        return -1;
  87.236 +    }
  87.237 +    DECLARE_ALIGNED(16, EDSlice_spu, slice);
  87.238 +    fill_EDSlice_spu(&slice, s);
  87.239 +
  87.240 +    send_slice_to_spe_and_wait(&slice, id);
  87.241 +
  87.242 +    return 0;
  87.243 +}
  87.244 +
  87.245 +static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){
  87.246 +    int i,j;
  87.247 +
  87.248 +    if( !s->pps.cabac ){
  87.249 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
  87.250 +        return -1;
  87.251 +    }
  87.252 +    DECLARE_ALIGNED(16, EDSlice_spu, slice);
  87.253 +    fill_EDSlice_spu(&slice, s);
  87.254 +
  87.255 +    send_slice_to_spe_and_wait(&slice, 0);
  87.256 +    
  87.257 +    if (s->release_cnt>0) {
  87.258 +        for (int i=0; i<s->release_cnt; i++){
  87.259 +            release_pib_entry(h, s->release_ref[i], 2);
  87.260 +        }
  87.261 +        s->release_cnt=0;
  87.262 +    }
  87.263 +
  87.264 +    release_pib_entry(h, s->current_picture, 1);
  87.265 +    av_freep(&s->gb.raw);
  87.266 +    if (s->gb.rbsp)
  87.267 +        av_freep(&s->gb.rbsp);
  87.268 +
  87.269 +    return 0;
  87.270 +}
  87.271 +
  87.272 +static void *entr_IP_spe_thread(void *arg){
  87.273 +    EDThreadContext *eip = (EDThreadContext *) arg;
  87.274 +    H264Context *h = eip->h;
  87.275 +// 	printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid));
  87.276 +    for (int i=0; i<SLICE_BUFS; i++){
  87.277 +        eip->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb));
  87.278 +    }
  87.279 +
  87.280 +    EntropyContext *ec = get_entropy_context(h);
  87.281 +    EDSlice *s;
  87.282 +
  87.283 +    for(;;){
  87.284 +        {
  87.285 +            pthread_mutex_lock(&eip->ed_lock);
  87.286 +            while (eip->ed_cnt <= 0)
  87.287 +                pthread_cond_wait(&eip->ed_cond, &eip->ed_lock);
  87.288 +            s = &eip->ed_q[eip->ed_fo];
  87.289 +            eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT;
  87.290 +            pthread_mutex_unlock(&eip->ed_lock);
  87.291 +        }
  87.292 +
  87.293 +        if (s->state<0)
  87.294 +            break;
  87.295 +        {
  87.296 +            pthread_mutex_lock(&eip->mbs_lock);
  87.297 +            while (eip->mbs_cnt <= 0)
  87.298 +                pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock);
  87.299 +
  87.300 +            s->mbs = eip->mbs[eip->mbs_fo];
  87.301 +            s->ed = eip;
  87.302 +            eip->mbs_cnt--;
  87.303 +            eip->mbs_fo++; eip->mbs_fo%=SLICE_BUFS;
  87.304 +            pthread_mutex_unlock(&eip->mbs_lock);
  87.305 +        }
  87.306 +        if (eip->cell){
  87.307 +            decode_slice_entropy_cell(ec, s, eip->thread_num);
  87.308 +        }else{
  87.309 +            decode_slice_entropy(ec, s);
  87.310 +        }
  87.311 +
  87.312 +//         {
  87.313 +//             pthread_mutex_lock(&h->lock[ENTROPY2]);
  87.314 +//             h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc;
  87.315 +//             while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT)
  87.316 +//                 h->ed_poc_fo++;
  87.317 +//
  87.318 +//             pthread_cond_signal(&h->cond[ENTROPY2]);
  87.319 +//             pthread_mutex_unlock(&h->lock[ENTROPY2]);
  87.320 +//         }
  87.321 +
  87.322 +        {
  87.323 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
  87.324 +            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
  87.325 +                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
  87.326 +            h->ed_reorder_q[h->ed_reorder_fi] = *s;
  87.327 +            h->ed_reorder_cnt++;
  87.328 +            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
  87.329 +            pthread_cond_signal(&h->cond[ENTROPY4]);
  87.330 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
  87.331 +        }
  87.332 +
  87.333 +        {
  87.334 +            pthread_mutex_lock(&eip->ed_lock);
  87.335 +            eip->ed_cnt--;
  87.336 +            pthread_cond_signal(&eip->ed_cond);
  87.337 +            pthread_mutex_unlock(&eip->ed_lock);
  87.338 +        }
  87.339 +    }
  87.340 +
  87.341 +    free_entropy_context(ec);
  87.342 +
  87.343 +    pthread_exit(NULL);
  87.344 +    return NULL;
  87.345 +}
  87.346 +
  87.347 +static void *entr_B_spe_thread(void *arg){
  87.348 +    EDThreadContext *eb = (EDThreadContext *) arg;
  87.349 +    H264Context *h = eb->h;
  87.350 +// 	printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid));
  87.351 +    for (int i=0; i<SLICE_BUFS; i++){
  87.352 +        eb->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb));
  87.353 +    }
  87.354 +
  87.355 +    EntropyContext *ec = get_entropy_context(h);
  87.356 +    EDSlice *s;
  87.357 +
  87.358 +    for(;;){
  87.359 +        {
  87.360 +            pthread_mutex_lock(&eb->ed_lock);
  87.361 +            while (eb->ed_cnt <= 0)
  87.362 +                pthread_cond_wait(&eb->ed_cond, &eb->ed_lock);
  87.363 +            s = &eb->ed_q[eb->ed_fo];
  87.364 +            eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT;
  87.365 +            pthread_mutex_unlock(&eb->ed_lock);
  87.366 +        }
  87.367 +
  87.368 +        if (s->state<0)
  87.369 +            break;
  87.370 +        {
  87.371 +            pthread_mutex_lock(&eb->mbs_lock);
  87.372 +            while (eb->mbs_cnt <= 0)
  87.373 +                pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock);
  87.374 +            s->mbs = eb->mbs[eb->mbs_fo];
  87.375 +            s->ed = eb;
  87.376 +            eb->mbs_cnt--;
  87.377 +            eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS;
  87.378 +            pthread_mutex_unlock(&eb->mbs_lock);
  87.379 +        }
  87.380 +        //decode_B_slice_entropy(&hcabac, &cabac, s, eb, eb->prev_ed);
  87.381 +        decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads);
  87.382 +
  87.383 +        {
  87.384 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
  87.385 +            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
  87.386 +                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
  87.387 +            h->ed_reorder_q[h->ed_reorder_fi] = *s;
  87.388 +            h->ed_reorder_cnt++;
  87.389 +            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
  87.390 +            pthread_cond_signal(&h->cond[ENTROPY4]);
  87.391 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
  87.392 +
  87.393 +        }
  87.394 +
  87.395 +        {
  87.396 +            pthread_mutex_lock(&eb->ed_lock);
  87.397 +            eb->ed_cnt--;
  87.398 +            pthread_cond_signal(&eb->ed_cond);
  87.399 +            pthread_mutex_unlock(&eb->ed_lock);
  87.400 +        }
  87.401 +    }
  87.402 +    eb->lines_cnt++;
  87.403 +
  87.404 +    free_entropy_context(ec);
  87.405 +
  87.406 +    pthread_exit(NULL);
  87.407 +    return NULL;
  87.408 +}
  87.409 +
  87.410 +static void *entr_B_distribute(void *arg){
  87.411 +    H264Context *h = (H264Context *) arg;
  87.412 +    EDSlice *s;
  87.413 +
  87.414 +    int i, n=0, poc;
  87.415 +
  87.416 +// 	printf("eb dist, pid %d\n", syscall(SYS_gettid));
  87.417 +
  87.418 +    for(i=0; i<h->edb_threads; i++){
  87.419 +        h->b[i].h =h;
  87.420 +        h->b[i].thread_num =i;
  87.421 +        h->b[i].thread_total =h->edb_threads;
  87.422 +        pthread_mutex_init(&h->b[i].mbs_lock, NULL);
  87.423 +        pthread_cond_init(&h->b[i].mbs_cond, NULL);
  87.424 +        h->b[i].mbs_fo = 0;
  87.425 +        h->b[i].mbs_cnt = SLICE_BUFS;
  87.426 +        h->b[i].ed_fi =0;
  87.427 +        h->b[i].ed_fo =0;
  87.428 +        h->b[i].ed_cnt =0;
  87.429 +        h->b[i].lines_cnt =0;
  87.430 +        h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads];
  87.431 +        pthread_mutex_init(&h->b[i].ed_lock, NULL);
  87.432 +        pthread_cond_init(&h->b[i].ed_cond, NULL);
  87.433 +        pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]);
  87.434 +    }
  87.435 +
  87.436 +    for(;;){
  87.437 +        {
  87.438 +            pthread_mutex_lock(&h->lock[ENTROPY3B]);
  87.439 +            while (h->ed_B_cnt<=0)
  87.440 +                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
  87.441 +            s= &h->ed_B_q[h->ed_B_fo];
  87.442 +            h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT;
  87.443 +            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
  87.444 +
  87.445 +        }
  87.446 +        if (s->state<0)
  87.447 +            break;
  87.448 +
  87.449 +        if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){
  87.450 +            while (poc < s->ref_list[1][0]->poc){
  87.451 +                pthread_mutex_lock(&h->lock[ENTROPY2]);
  87.452 +                while (poc == h->ed_poc)
  87.453 +                    pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]);
  87.454 +                poc = h->ed_poc;
  87.455 +                pthread_mutex_unlock(&h->lock[ENTROPY2]);
  87.456 +            }
  87.457 +        }
  87.458 +        {
  87.459 +            pthread_mutex_lock(&h->b[n].ed_lock);
  87.460 +            while (h->b[n].ed_cnt >= MAX_SLICE_COUNT)
  87.461 +                pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock);
  87.462 +            h->b[n].ed_q[ h->b[n].ed_fi] = *s;
  87.463 +            h->b[n].ed_cnt++;
  87.464 +            h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT;
  87.465 +            pthread_cond_signal(&h->b[n].ed_cond);
  87.466 +            pthread_mutex_unlock(&h->b[n].ed_lock);
  87.467 +
  87.468 +            n++; n%=h->edb_threads;
  87.469 +        }
  87.470 +        {
  87.471 +            pthread_mutex_lock(&h->lock[ENTROPY3B]);
  87.472 +            h->ed_B_cnt--;
  87.473 +            pthread_cond_signal(&h->cond[ENTROPY3B]);
  87.474 +            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
  87.475 +
  87.476 +        }
  87.477 +
  87.478 +    }
  87.479 +
  87.480 +    for (i=0; i<h->edb_threads; i++){
  87.481 +        pthread_mutex_lock(&h->b[i].ed_lock);
  87.482 +        while (h->b[i].ed_cnt >= MAX_SLICE_COUNT)
  87.483 +            pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock);
  87.484 +        h->b[i].ed_q[ h->b[i].ed_fi] = *s;
  87.485 +        h->b[i].ed_cnt++;
  87.486 +        h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT;
  87.487 +        pthread_cond_signal(&h->b[i].ed_cond);
  87.488 +        pthread_mutex_unlock(&h->b[i].ed_lock);
  87.489 +
  87.490 +    }
  87.491 +    for(int i=0; i<h->edb_threads; i++){
  87.492 +        pthread_join(h->ed_B_thr[i], NULL);
  87.493 +    }
  87.494 +    pthread_exit(NULL);
  87.495 +    return NULL;
  87.496 +}
  87.497 +
  87.498 +
  87.499 +static void *entr_IPB_distribute(void *arg){
  87.500 +    H264Context *h = (H264Context *) arg;
  87.501 +    EDSlice *s;
  87.502 +    int i,n=0;
  87.503 +
  87.504 +    create_spe_ED_threads(h, h->edip_threads, h->edb_threads);
  87.505 +    pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h);
  87.506 +    for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
  87.507 +        h->ip[i].h =h;
  87.508 +        h->ip[i].cell = (i >= h->edip_ppe_threads);
  87.509 +        pthread_mutex_init(&h->ip[i].mbs_lock, NULL);
  87.510 +        pthread_cond_init(&h->ip[i].mbs_cond, NULL);
  87.511 +        h->ip[i].thread_num = i - h->edip_ppe_threads;
  87.512 +        h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads;
  87.513 +        h->ip[i].mbs_fo = 0;
  87.514 +        h->ip[i].mbs_cnt = SLICE_BUFS;
  87.515 +        h->ip[i].ed_fi =0;
  87.516 +        h->ip[i].ed_fo =0;
  87.517 +        pthread_mutex_init(&h->ip[i].ed_lock, NULL);
  87.518 +        pthread_cond_init(&h->ip[i].ed_cond, NULL);
  87.519 +        pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]);
  87.520 +    }
  87.521 +
  87.522 +    for(;;){
  87.523 +        {
  87.524 +            pthread_mutex_lock(&h->lock[ENTROPY]);
  87.525 +            while (h->ed_cnt<=0)
  87.526 +                pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
  87.527 +            s= &h->ed_q[h->ed_fo];
  87.528 +
  87.529 +            pthread_mutex_unlock(&h->lock[ENTROPY]);
  87.530 +            h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT;
  87.531 +        }
  87.532 +        if (s->state<0)
  87.533 +            break;
  87.534 +
  87.535 +        assert(s->current_picture);
  87.536 +        if (s->slice_type_nos == FF_B_TYPE )
  87.537 +        {
  87.538 +            pthread_mutex_lock(&h->lock[ENTROPY3B]);
  87.539 +            while (h->ed_B_cnt>=MAX_SLICE_COUNT)
  87.540 +                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
  87.541 +            h->ed_B_q[h->ed_B_fi] = *s;
  87.542 +            h->ed_B_cnt++;
  87.543 +            h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
  87.544 +            pthread_cond_signal(&h->cond[ENTROPY3B]);
  87.545 +            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
  87.546 +        }else
  87.547 +        {
  87.548 +            ///round robin now, change to based on rawframes size.
  87.549 +            pthread_mutex_lock(&h->ip[n].ed_lock);
  87.550 +            while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT)
  87.551 +                pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock);
  87.552 +            h->ip[n].ed_q[ h->ip[n].ed_fi] = *s;
  87.553 +            h->ip[n].ed_cnt++;
  87.554 +            h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT;
  87.555 +            pthread_cond_signal(&h->ip[n].ed_cond);
  87.556 +            pthread_mutex_unlock(&h->ip[n].ed_lock);
  87.557 +
  87.558 +            n++; n %=(h->edip_threads+h->edip_ppe_threads);
  87.559 +        }
  87.560 +        {
  87.561 +            pthread_mutex_lock(&h->lock[ENTROPY]);
  87.562 +            h->ed_cnt--;
  87.563 +            pthread_cond_signal(&h->cond[ENTROPY]);
  87.564 +            pthread_mutex_unlock(&h->lock[ENTROPY]);
  87.565 +
  87.566 +        }
  87.567 +    }
  87.568 +
  87.569 +    {
  87.570 +        pthread_mutex_lock(&h->lock[ENTROPY3B]);
  87.571 +        while (h->ed_B_cnt>=MAX_SLICE_COUNT)
  87.572 +            pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
  87.573 +        h->ed_B_q[h->ed_B_fi] = *s;
  87.574 +        h->ed_B_cnt++;
  87.575 +        h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
  87.576 +        pthread_cond_signal(&h->cond[ENTROPY3B]);
  87.577 +        pthread_mutex_unlock(&h->lock[ENTROPY3B]);
  87.578 +    }
  87.579 +    {
  87.580 +        for (i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
  87.581 +            pthread_mutex_lock(&h->ip[i].ed_lock);
  87.582 +            while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT)
  87.583 +                pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock);
  87.584 +            h->ip[i].ed_q[ h->ip[i].ed_fi] = *s;
  87.585 +            h->ip[i].ed_cnt++;
  87.586 +            h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT;
  87.587 +            pthread_cond_signal(&h->ip[i].ed_cond);
  87.588 +            pthread_mutex_unlock(&h->ip[i].ed_lock);
  87.589 +        }
  87.590 +    }
  87.591 +    {
  87.592 +        pthread_mutex_lock(&h->lock[ENTROPY4]);
  87.593 +        while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
  87.594 +            pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
  87.595 +        h->ed_reorder_q[h->ed_reorder_fi] = *s;
  87.596 +        h->ed_reorder_cnt++;
  87.597 +        h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
  87.598 +        pthread_cond_signal(&h->cond[ENTROPY4]);
  87.599 +        pthread_mutex_unlock(&h->lock[ENTROPY4]);
  87.600 +
  87.601 +    }
  87.602 +    pthread_join(h->ed_B_dist, NULL);
  87.603 +    for(i=0; i<h->edip_threads; i++){
  87.604 +        pthread_join(h->ed_IP_thr[i], NULL);
  87.605 +    }
  87.606 +    pthread_exit(NULL);
  87.607 +    return NULL;
  87.608 +}
  87.609 +
  87.610 +static pthread_t ed_IPB_dist;
  87.611 +static void *entropy_IPB_cell_thread(void *arg){
  87.612 +    H264Context *h = (H264Context *) arg;
  87.613 +    int i;
  87.614 +    EDSlice reorder[MAX_SLICE_COUNT];
  87.615 +    int ip_poc[MAX_SLICE_COUNT][2]={0,};
  87.616 +    int next_ip_id=0;
  87.617 +    int ip_poc_cnt=0;
  87.618 +    EDSlice *s;
  87.619 +    int reorder_cnt=0;
  87.620 +    unsigned next_pic_num=0;
  87.621 +
  87.622 +    pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h);
  87.623 +    int count =0;
  87.624 +    for(;;){
  87.625 +        //signals received from the entropy decoders
  87.626 +        {
  87.627 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
  87.628 +            while (h->ed_reorder_cnt<=0)
  87.629 +                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
  87.630 +            s= &h->ed_reorder_q[h->ed_reorder_fo];
  87.631 +            h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT;
  87.632 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
  87.633 +        }
  87.634 +
  87.635 +        if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){
  87.636 +            for (i=0; i<ip_poc_cnt; i++){
  87.637 +                if (s->ip_id < ip_poc[i][0]){
  87.638 +                    memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int));
  87.639 +                    break;
  87.640 +                }
  87.641 +            }
  87.642 +            ip_poc[i][0]= s->ip_id;
  87.643 +            ip_poc[i][1]= s->current_picture->poc;
  87.644 +            ip_poc_cnt++;
  87.645 +
  87.646 +            while (next_ip_id == ip_poc[0][0]){
  87.647 +                pthread_mutex_lock(&h->lock[ENTROPY2]);
  87.648 +                h->ed_poc = ip_poc[0][1];
  87.649 +
  87.650 +                pthread_cond_signal(&h->cond[ENTROPY2]);
  87.651 +                pthread_mutex_unlock(&h->lock[ENTROPY2]);
  87.652 +                memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int));
  87.653 +                ip_poc_cnt--;
  87.654 +                next_ip_id++;
  87.655 +            }
  87.656 +        }
  87.657 +
  87.658 +        for(i=reorder_cnt; i>0; i--){
  87.659 +            if (s->coded_pic_num < reorder[i-1].coded_pic_num)
  87.660 +                break;
  87.661 +            reorder[i]=reorder[i-1];
  87.662 +        }
  87.663 +        reorder[i]=*s;
  87.664 +
  87.665 +        while(reorder_cnt>=0){
  87.666 +            if (next_pic_num!=reorder[reorder_cnt].coded_pic_num){
  87.667 +                break;
  87.668 +            }
  87.669 +            EDSlice *es = &reorder[reorder_cnt];
  87.670 +
  87.671 +            {
  87.672 +                pthread_mutex_lock(&h->lock[MBDEC]);
  87.673 +                while (h->mbdec_cnt >= MAX_SLICE_COUNT)
  87.674 +                    pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
  87.675 +                copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es);
  87.676 +
  87.677 +                h->mbdec_cnt++;
  87.678 +                h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
  87.679 +                pthread_cond_signal(&h->cond[MBDEC]);
  87.680 +                pthread_mutex_unlock(&h->lock[MBDEC]);
  87.681 +
  87.682 +            }
  87.683 +
  87.684 +            if (es->state<0)
  87.685 +                goto end;
  87.686 +
  87.687 +            assert(es->current_picture);
  87.688 +            for (int i=0; i<es->release_cnt; i++){
  87.689 +                release_pib_entry(h, es->release_ref[i], 2);
  87.690 +            }
  87.691 +            release_pib_entry(h, es->current_picture, 1);
  87.692 +            av_freep(&es->gb.raw);
  87.693 +            if (es->gb.rbsp)
  87.694 +                av_freep(&es->gb.rbsp);
  87.695 +
  87.696 +            next_pic_num++;
  87.697 +            reorder_cnt--;
  87.698 +        }
  87.699 +        reorder_cnt++;
  87.700 +
  87.701 +        {
  87.702 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
  87.703 +            h->ed_reorder_cnt--;
  87.704 +            pthread_cond_signal(&h->cond[ENTROPY4]);
  87.705 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
  87.706 +        }
  87.707 +    }
  87.708 +
  87.709 +end:
  87.710 +    pthread_join(ed_IPB_dist, NULL);
  87.711 +    pthread_exit(NULL);
  87.712 +    return NULL;
  87.713 +}
  87.714 +
  87.715 +
  87.716 +static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){
  87.717 +    dst->deblocking_filter =1;
  87.718 +    dst->linesize = src->current_picture->linesize[0];
  87.719 +    dst->uvlinesize = src->current_picture->linesize[1];
  87.720 +    dst->mb_width = h->mb_width;
  87.721 +    dst->mb_height = h->mb_height;
  87.722 +    dst->use_weight = src->use_weight;
  87.723 +    dst->use_weight_chroma = src->use_weight_chroma;
  87.724 +    dst->luma_log2_weight_denom = src->luma_log2_weight_denom;
  87.725 +    dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom;
  87.726 +
  87.727 +    //weights later
  87.728 +    memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t));
  87.729 +    memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t));
  87.730 +    memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t));
  87.731 +
  87.732 +    for(int list=0; list<2; list++){
  87.733 +        for (int i=0; i<src->ref_count[list]; i++){
  87.734 +            Picture_spu *p_dst = &dst->ref_list[list][i];
  87.735 +            DecodedPicture *p_src = src->ref_list[list][i];
  87.736 +            if (p_src){
  87.737 +                p_dst->data[0] = p_src->data[0];
  87.738 +                p_dst->data[1] = p_src->data[1];
  87.739 +                p_dst->data[2] = p_src->data[2];
  87.740 +            }
  87.741 +        }
  87.742 +    }
  87.743 +    dst->state = src->state;
  87.744 +
  87.745 +    dst->emu_edge_width  =32;
  87.746 +    dst->emu_edge_height =32;
  87.747 +    dst->slice_type = src->slice_type;
  87.748 +    dst->slice_type_nos = src->slice_type_nos;
  87.749 +    dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset;
  87.750 +    dst->slice_beta_offset = src->slice_beta_offset;
  87.751 +
  87.752 +    memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64);
  87.753 +
  87.754 +    dst->blocks = src->mbs;
  87.755 +    dst->dst_y = src->current_picture->data[0];
  87.756 +    dst->dst_cb = src->current_picture->data[1];
  87.757 +    dst->dst_cr = src->current_picture->data[2];
  87.758 +}
  87.759 +
  87.760 +static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){
  87.761 +    static int rl_fi=0;
  87.762 +
  87.763 +    DECLARE_ALIGNED(16, H264slice, spe_slice);
  87.764 +    H264spe *p=&spe_params[0];
  87.765 +    unsigned status;
  87.766 +    uint8_t *dst_y, *dst_cb, *dst_cr;
  87.767 +
  87.768 +    DecodedPicture *dp;
  87.769 +
  87.770 +    for (int i=0; i<2; i++){
  87.771 +        for(int j=0; j< s->ref_count[i]; j++){
  87.772 +            if (s->ref_list_cpn[i][j] ==-1)
  87.773 +                continue;
  87.774 +            int k;
  87.775 +            for (k=0; k<DPB_SIZE; k++){
  87.776 +                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
  87.777 +                    s->ref_list[i][j] = &h->dpb[k];
  87.778 +                    break;
  87.779 +                }
  87.780 +            }
  87.781 +        }
  87.782 +    }
  87.783 +
  87.784 +    dp = get_dpb_entry(h);
  87.785 +    init_dpb_entry(dp, s, d->width, d->height);
  87.786 +
  87.787 +    if (h->no_mbd)
  87.788 +        return;
  87.789 +
  87.790 +
  87.791 +    fill_spe_slice(&spe_slice, s, h);
  87.792 +    spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0);
  87.793 +    spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status);
  87.794 +    rl_fi++; rl_fi %= 2;
  87.795 +
  87.796 +    _spe_in_mbox_write(spe_control_area[0], 0);
  87.797 +    while (atomic_read(rl_cnt)<=0){
  87.798 +        //pthread_yield();
  87.799 +        usleep(1000);
  87.800 +    }
  87.801 +    atomic_dec(rl_cnt);
  87.802 +
  87.803 +
  87.804 +/** This is error free, no visual artifacts, however, md5sum fails.... (WTF) **/
  87.805 +// 	memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16);
  87.806 +// 	memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8);
  87.807 +// 	memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8);
  87.808 +//
  87.809 +// 	memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16);
  87.810 +// 	memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8);
  87.811 +// 	memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8);
  87.812 +//
  87.813 +// 	decode_slice_mb_seq(d, s);
  87.814 +//
  87.815 +// 	for (int i=0; i<h->mb_height*16; i++){
  87.816 +// 		for (int j=0; j<h->width; j++){
  87.817 +// 			if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){
  87.818 +// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]);
  87.819 +// 				return;
  87.820 +// 			}
  87.821 +// 		}
  87.822 +// 	}
  87.823 +//
  87.824 +// 	for (int i=0; i<h->mb_height*8; i++){
  87.825 +// 		for (int j=0; j<h->width/2; j++){
  87.826 +// 			if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){
  87.827 +// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]);
  87.828 +// 				return;
  87.829 +// 			}
  87.830 +// 		}
  87.831 +// 	}
  87.832 +//
  87.833 +// 	for (int i=0; i<h->mb_height*8; i++){
  87.834 +// 		for (int j=0; j<h->width/2; j++){
  87.835 +// 			if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){
  87.836 +// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]);
  87.837 +// 				return;
  87.838 +// 			}
  87.839 +// 		}
  87.840 +// 	}
  87.841 +
  87.842 +
  87.843 +    //printf("dst_y %p\n", dst_y);
  87.844 +
  87.845 +
  87.846 +     for (int i=0; i<s->release_cnt; i++){
  87.847 +        for(int j=0; j<DPB_SIZE; j++){
  87.848 +            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
  87.849 +                release_dpb_entry(h, &h->dpb[j], 2);
  87.850 +                break;
  87.851 +            }
  87.852 +        }
  87.853 +    }
  87.854 +    s->release_cnt=0;
  87.855 +
  87.856 +}
  87.857 +
  87.858 +static void *h264_spe_thread(void * thread_args ) {
  87.859 +    H264spe *params = (H264spe *)thread_args;
  87.860 +    unsigned int spe_id = params->spe_id;
  87.861 +    unsigned int runflags = 0;
  87.862 +    unsigned int entry = SPE_DEFAULT_ENTRY;
  87.863 +    // run SPE context
  87.864 +    spe_context_run(spe_context[spe_id],  &entry, runflags, (void*) params, NULL, NULL);
  87.865 +    // done - now exit thread
  87.866 +    pthread_exit(NULL);
  87.867 +}
  87.868 +
  87.869 +static int create_spe_MBR_threads(H264Context *h, int num_threads) {
  87.870 +    int i;
  87.871 +
  87.872 +    // reserve memory for spe thread id, context and argument addresses
  87.873 +    spe_tid = av_malloc(num_threads * sizeof (pthread_t));
  87.874 +    spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
  87.875 +    spe_params = av_malloc(num_threads * sizeof (H264spe));
  87.876 +    spe_control_area = av_malloc(num_threads * sizeof (void*));
  87.877 +    spe_ls_area = av_malloc(num_threads * sizeof (void*));
  87.878 +    spe_slice_buf = av_malloc(num_threads * sizeof (void*));
  87.879 +
  87.880 +    spe_program_handle_t *spe_program = spe_image_open("spe_mbd");
  87.881 +
  87.882 +    if (spe_program == NULL)
  87.883 +        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
  87.884 +
  87.885 +    for (i = 0; i < num_threads; i++) {
  87.886 +        // create context for spe program
  87.887 +        spe_context[i] = spe_context_create(SPE_MAP_PS, NULL);
  87.888 +        if (spe_context[i] == NULL)
  87.889 +            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
  87.890 +        // load SPE program into main memory
  87.891 +        if ((spe_program_load(spe_context[i], spe_program)) == -1)
  87.892 +            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
  87.893 +        //get the control_area for fast mailboxing
  87.894 +        if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL)
  87.895 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
  87.896 +        //get ls area for inter spe communication
  87.897 +        if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL)
  87.898 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
  87.899 +    }
  87.900 +
  87.901 +    for (i = 0; i < num_threads; i++) {
  87.902 +        spe_params[i].mb_width = h->mb_width;
  87.903 +        spe_params[i].mb_height = h->mb_height;
  87.904 +        spe_params[i].mb_stride = h->mb_stride;
  87.905 +        spe_params[i].spe_id = i;
  87.906 +        spe_params[i].spe_total = num_threads;
  87.907 +        //spe_params[i].slice_params= &slice_params;
  87.908 +        spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads];
  87.909 +        spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads];
  87.910 +
  87.911 +        spe_params[i].rl_lock = rl_lock;
  87.912 +        spe_params[i].rl_cond = rl_cond;
  87.913 +        spe_params[i].rl_cnt = rl_cnt;
  87.914 +        spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i];
  87.915 +        spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i];
  87.916 +        spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0);
  87.917 +
  87.918 +        mutex_init(spe_params[i].lock);
  87.919 +        cond_init(spe_params[i].cond);
  87.920 +        if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i]))
  87.921 +            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
  87.922 +
  87.923 +        //slicebufaddr
  87.924 +        spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]);
  87.925 +
  87.926 +        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
  87.927 +    }
  87.928 +    spe_image_close(spe_program);
  87.929 +    return 0;
  87.930 +}
  87.931 +
  87.932 +//_spe_out_mbox_read(spe_control_area[i]);
  87.933 +/**
  87.934 +* joins all the spe worker threads.
  87.935 +*/
  87.936 +static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) {
  87.937 +    int i;
  87.938 +    ///just to keep coding consistency.
  87.939 +    {
  87.940 +        for (i=0; i<num_threads; i++){
  87.941 +            H264spe *p=&spe_params[i];
  87.942 +            unsigned status;
  87.943 +
  87.944 +            while (atomic_read(p->cnt)>=2) {//double buffered
  87.945 +                usleep(1000);//cond_wait(p->cond, p->lock);
  87.946 +            }
  87.947 +
  87.948 +            spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0);
  87.949 +            spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
  87.950 +            //mutex_unlock(p->lock);
  87.951 +            _spe_in_mbox_write(spe_control_area[i], 0);
  87.952 +        }
  87.953 +    }
  87.954 +
  87.955 +    for (i=0; i<num_threads; i++){
  87.956 +        pthread_join(spe_tid[i], NULL);
  87.957 +    }
  87.958 +
  87.959 +    for (i=0; i<num_threads; i++){
  87.960 +        spe_context_destroy(spe_context[i]);
  87.961 +    }
  87.962 +    atomic_inc(rl_cnt);
  87.963 +
  87.964 +    // destroy memory reserved for spe thread id, context and argument addresses
  87.965 +    av_freep(&spe_tid);
  87.966 +    av_freep(&spe_context);
  87.967 +    av_freep(&spe_params);
  87.968 +    av_freep(&spe_control_area);
  87.969 +    av_freep(&spe_slice_buf);
  87.970 +}
  87.971 +
  87.972 +
  87.973 +static void *rl_dist_thread(void *arg){
  87.974 +    int i;
  87.975 +    H264Context *h = (H264Context *) arg;
  87.976 +    MBSlice *s;
  87.977 +    DecodedPicture *dp;
  87.978 +    int rl_fi[16]={0,};
  87.979 +    DECLARE_ALIGNED(16, H264slice, spe_slice);
  87.980 +
  87.981 +    create_spe_MBR_threads(h, h->rl_threads);
  87.982 +    for(;;){
  87.983 +        {
  87.984 +            pthread_mutex_lock(&h->lock[MBDEC]);
  87.985 +            while (h->mbdec_cnt<=0)
  87.986 +                pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
  87.987 +            s= &h->mbdec_q[h->mbdec_fo];
  87.988 +            h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT;
  87.989 +            pthread_mutex_unlock(&h->lock[MBDEC]);
  87.990 +        }
  87.991 +
  87.992 +        if (s->state<0){
  87.993 +            break;
  87.994 +        }
  87.995 +        for (int i=0; i<2; i++){
  87.996 +            for(int j=0; j< s->ref_count[i]; j++){
  87.997 +                if (s->ref_list_cpn[i][j] ==-1)
  87.998 +                    continue;
  87.999 +                int k;
 87.1000 +                for (k=0; k<DPB_SIZE; k++){
 87.1001 +                    if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
 87.1002 +                        s->ref_list[i][j] = &h->dpb[k];
 87.1003 +                        break;
 87.1004 +                    }
 87.1005 +                }
 87.1006 +
 87.1007 +            }
 87.1008 +        }
 87.1009 +        dp = get_dpb_entry(h);
 87.1010 +        init_dpb_entry(dp, s, h->width, h->height);
 87.1011 +        assert(s->current_picture);
 87.1012 +        {
 87.1013 +            while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
 87.1014 +                usleep(1000);
 87.1015 +            }
 87.1016 +            h->mbrel_q[h->mbrel_fi] = *s;
 87.1017 +
 87.1018 +            h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
 87.1019 +        }
 87.1020 +        {
 87.1021 +            if(h->no_mbd){
 87.1022 +                atomic_inc(rl_cnt);
 87.1023 +            }else {
 87.1024 +                fill_spe_slice(&spe_slice, s, h);
 87.1025 +                for (i=0; i<h->rl_threads; i++){
 87.1026 +                    H264spe *p=&spe_params[i];
 87.1027 +                    unsigned status;
 87.1028 +                    while (atomic_read(p->cnt)>=2){ //double buffered
 87.1029 +                        usleep(1000);
 87.1030 +                        //cond_wait(p->cond, p->lock);
 87.1031 +                    }
 87.1032 +                    spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0);
 87.1033 +                    spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
 87.1034 +                    rl_fi[i]++; rl_fi[i] %= 2;
 87.1035 +                    atomic_inc(p->cnt);
 87.1036 +
 87.1037 +                    _spe_in_mbox_write(spe_control_area[i], 0);
 87.1038 +                }
 87.1039 +            }
 87.1040 +        }
 87.1041 +
 87.1042 +        {
 87.1043 +            pthread_mutex_lock(&h->lock[MBDEC]);
 87.1044 +            h->mbdec_cnt--;
 87.1045 +            pthread_cond_signal(&h->cond[MBDEC]);
 87.1046 +            pthread_mutex_unlock(&h->lock[MBDEC]);
 87.1047 +        }
 87.1048 +
 87.1049 +    }
 87.1050 +
 87.1051 +    {
 87.1052 +        while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
 87.1053 +            usleep(1000);
 87.1054 +        }
 87.1055 +        h->mbrel_q[h->mbrel_fi] = *s;
 87.1056 +
 87.1057 +        h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
 87.1058 +    }
 87.1059 +    spe_slice.state=-1;
 87.1060 +    join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi);
 87.1061 +    pthread_exit(NULL);
 87.1062 +    return NULL;
 87.1063 +}
 87.1064 +
 87.1065 +static void *mbdec_cell_thread(void *arg){
 87.1066 +    H264Context *h = (H264Context *) arg;
 87.1067 +
 87.1068 +    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
 87.1069 +    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
 87.1070 +    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
 87.1071 +    atomic_set(rl_cnt, 0);
 87.1072 +    mutex_init(rl_lock);
 87.1073 +    cond_init(rl_cond);
 87.1074 +// 	printf("mbdec, pid %d\n", syscall(SYS_gettid));
 87.1075 +    pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h);
 87.1076 +
 87.1077 +    for(;;){
 87.1078 +        MBSlice *s=NULL;
 87.1079 +        {
 87.1080 +            while (atomic_read(rl_cnt)<=0){
 87.1081 +                usleep(1000);
 87.1082 +            }
 87.1083 +            s= &h->mbrel_q[h->mbrel_fo];
 87.1084 +            h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT;
 87.1085 +        }
 87.1086 +
 87.1087 +        if (s->state<0)
 87.1088 +            break;
 87.1089 +
 87.1090 +        for (int i=0; i<s->release_cnt; i++){
 87.1091 +            for(int j=0; j<DPB_SIZE; j++){
 87.1092 +                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
 87.1093 +                    release_dpb_entry(h, &h->dpb[j], 2);
 87.1094 +                    break;
 87.1095 +                }
 87.1096 +            }
 87.1097 +        }
 87.1098 +
 87.1099 +        {
 87.1100 +            EDThreadContext *ed = s->ed;
 87.1101 +            pthread_mutex_lock(&ed->mbs_lock);
 87.1102 +            ed->mbs_cnt++;
 87.1103 +            pthread_cond_signal(&ed->mbs_cond);
 87.1104 +            pthread_mutex_unlock(&ed->mbs_lock);
 87.1105 +        }
 87.1106 +
 87.1107 +        {
 87.1108 +            pthread_mutex_lock(&h->lock[WRITE]);
 87.1109 +            while (h->write_cnt>= DPB_SIZE)
 87.1110 +                pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
 87.1111 +            assert(s);
 87.1112 +            assert(s->current_picture);
 87.1113 +            h->write_q[h->write_fi]= s->current_picture;
 87.1114 +            h->write_cnt++;
 87.1115 +            h->write_fi++; h->write_fi %= DPB_SIZE;
 87.1116 +            pthread_cond_signal(&h->cond[WRITE]);
 87.1117 +            pthread_mutex_unlock(&h->lock[WRITE]);
 87.1118 +
 87.1119 +        }
 87.1120 +        {
 87.1121 +            atomic_dec(rl_cnt);
 87.1122 +        }
 87.1123 +
 87.1124 +    }
 87.1125 +
 87.1126 +    {//propagate exit
 87.1127 +        pthread_mutex_lock(&h->lock[WRITE]);
 87.1128 +        while (h->write_cnt>= DPB_SIZE)
 87.1129 +            pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
 87.1130 +        last_pic.reference = -1;
 87.1131 +        h->write_q[h->write_fi] = &last_pic;
 87.1132 +        h->write_cnt++;
 87.1133 +        h->write_fi++; h->write_fi %= DPB_SIZE;
 87.1134 +        pthread_cond_signal(&h->cond[WRITE]);
 87.1135 +        pthread_mutex_unlock(&h->lock[WRITE]);
 87.1136 +
 87.1137 +    }
 87.1138 +    pthread_join(h->rl_dist_thr, NULL);
 87.1139 +    pthread_exit(NULL);
 87.1140 +    return NULL;
 87.1141 +}
 87.1142 +
 87.1143 +/*
 87.1144 +* The following code is the main loop of the file converter
 87.1145 +*/
 87.1146 +int h264_decode_cell(H264Context *h) {
 87.1147 +
 87.1148 +    pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;   
 87.1149 +
 87.1150 +    start_timer();
 87.1151 +
 87.1152 +    pthread_create(&read_thr, NULL, read_thread, h);
 87.1153 +    pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
 87.1154 +    pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h);
 87.1155 +    pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h);
 87.1156 +    pthread_create(&write_thr, NULL, write_thread, h);
 87.1157 +
 87.1158 +    pthread_join(read_thr, NULL);
 87.1159 +    pthread_join(parsenal_thr, NULL);
 87.1160 +    pthread_join(entropy_thr, NULL);
 87.1161 +    pthread_join(mbdec_thr, NULL);
 87.1162 +    pthread_join(write_thr, NULL);
 87.1163 +
 87.1164 +    return 0;
 87.1165 +}
 87.1166 +
 87.1167 +/*
 87.1168 +* The following code is the main loop of the file converter
 87.1169 +*/
 87.1170 +int h264_decode_cell_seq(H264Context *h) {
 87.1171 +ParserContext *pc;
 87.1172 +    NalContext *nc;
 87.1173 +    EntropyContext *ec;
 87.1174 +    MBRecContext *rc;
 87.1175 +    OutputContext *oc;
 87.1176 +
 87.1177 +    RawFrame frm;
 87.1178 +    EDSlice slice, *s=&slice;
 87.1179 +    MBSlice mbslice, *s2=&mbslice;
 87.1180 +    PictureInfo *pic=NULL;
 87.1181 +    DecodedPicture *out;
 87.1182 +    int size;
 87.1183 +    int frames=0;
 87.1184 +    
 87.1185 +    pc = get_parse_context(h->ifile);
 87.1186 +    nc = get_nal_context(h->width, h->height);
 87.1187 +    ec = get_entropy_context( h );
 87.1188 +    rc = get_mbrec_context(h);
 87.1189 +    oc = get_output_context( h );
 87.1190 +
 87.1191 +    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
 87.1192 +    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
 87.1193 +    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
 87.1194 +    atomic_set(rl_cnt, 0);
 87.1195 +    mutex_init(rl_lock);
 87.1196 +    cond_init(rl_cond);
 87.1197 +
 87.1198 +    memset(s, 0, sizeof(EDSlice));
 87.1199 +    ff_init_slice(nc, s);
 87.1200 +    s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb));
 87.1201 +
 87.1202 +    DecodedPicture tmp;
 87.1203 +    tmp.base[0]=0;
 87.1204 +    ///fix this when want to debug the Cell errors
 87.1205 +    //init_dpb_entry(&tmp, h->width, h->height);
 87.1206 +
 87.1207 +    create_spe_ED_threads(h, 1, 0);
 87.1208 +    create_spe_MBR_threads(h, 1);
 87.1209 +    
 87.1210 +    start_timer();
 87.1211 +
 87.1212 +    while(!pc->final_frame && frames++ < h->num_frames){
 87.1213 +
 87.1214 +        av_read_frame_internal(pc, &frm);
 87.1215 +        
 87.1216 +        PictureInfo *pic=get_pib_entry(h);
 87.1217 +        ff_alloc_picture_info(nc, s, pic);
 87.1218 +        decode_nal_units(nc, s, &frm);
 87.1219 +
 87.1220 +        copyEDtoMBSlice(s2, s);
 87.1221 +        decode_slice_entropy_cell_seq(h, ec, s);
 87.1222 +        
 87.1223 +        decode_slice_mb_seq_cell(h, rc, s2, &tmp);
 87.1224 +
 87.1225 +        out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height);
 87.1226 +        
 87.1227 +        if (out){
 87.1228 +            release_dpb_entry(h, out, 1);
 87.1229 +        }
 87.1230 +        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
 87.1231 +    }
 87.1232 +    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;
 87.1233 +
 87.1234 +    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
 87.1235 +
 87.1236 +    /* finished ! */
 87.1237 +    av_freep(&s->mbs);
 87.1238 +
 87.1239 +    free_parse_context(pc);
 87.1240 +    free_nal_context  (nc);
 87.1241 +    free_entropy_context(ec);
 87.1242 +    free_mbrec_context(rc);
 87.1243 +    free_output_context(oc);                
 87.1244 +    return 0;
 87.1245 +}

    88.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    88.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_data.h	Mon Aug 27 12:09:56 2012 +0200
    88.3 @@ -0,0 +1,243 @@
    88.4 +/*
    88.5 + * H26L/H264/AVC/JVT/14496-10/... encoder/decoder
    88.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    88.7 + *
    88.8 + * This file is part of FFmpeg.
    88.9 + *
   88.10 + * FFmpeg is free software; you can redistribute it and/or
   88.11 + * modify it under the terms of the GNU Lesser General Public
   88.12 + * License as published by the Free Software Foundation; either
   88.13 + * version 2.1 of the License, or (at your option) any later version.
   88.14 + *
   88.15 + * FFmpeg is distributed in the hope that it will be useful,
   88.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   88.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   88.18 + * Lesser General Public License for more details.
   88.19 + *
   88.20 + * You should have received a copy of the GNU Lesser General Public
   88.21 + * License along with FFmpeg; if not, write to the Free Software
   88.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   88.23 + */
   88.24 +
   88.25 +/**
   88.26 + * @file
   88.27 + * @brief
   88.28 + *     H264 / AVC / MPEG4 part10 codec data table
   88.29 + * @author Michael Niedermayer <michaelni@gmx.at>
   88.30 + */
   88.31 +
   88.32 +#ifndef AVCODEC_H264DATA_H
   88.33 +#define AVCODEC_H264DATA_H
   88.34 +
   88.35 +#include <stdint.h>
   88.36 +#include "avcodec.h"
   88.37 +//#include "h264.h"
   88.38 +
   88.39 +/*
   88.40 +o-o o-o
   88.41 + / / /
   88.42 +o-o o-o
   88.43 + ,---'
   88.44 +o-o o-o
   88.45 + / / /
   88.46 +o-o o-o
   88.47 +*/
//This table must be here because scan8[constant] must be known at compiletime
// Every entry is written as x + y*8, i.e. a position in a grid that is 8
// entries wide. The array holds 16 entries followed by 2*4 entries
// (presumably the 16 luma 4x4 blocks and 2x4 chroma blocks - confirm against
// the users of scan8).
static const uint8_t scan8[16 + 2*4]={
 4+1*8, 5+1*8, 4+2*8, 5+2*8,
 6+1*8, 7+1*8, 6+2*8, 7+2*8,
 4+3*8, 5+3*8, 4+4*8, 5+4*8,
 6+3*8, 7+3*8, 6+4*8, 7+4*8,
 1+1*8, 2+1*8,
 1+2*8, 2+2*8,
 1+4*8, 2+4*8,
 1+5*8, 2+5*8,
};
   88.59 +
// Maps an index 0..4 to a picture type constant (P, B, I, SP, SI in that
// order); presumably indexed by the Exp-Golomb coded slice type - verify
// against the slice header parser.
static const uint8_t golomb_to_pict_type[5]=
{FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
   88.62 +
// 48-entry lookup from a code number to a coded-block-pattern value
// (presumably used for intra 4x4 macroblocks, as the name says - confirm
// against the macroblock parser).
static const uint8_t golomb_to_intra4x4_cbp[48]={
 47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46,
 16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4,
  8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41
};
   88.68 +
// 48-entry lookup from a code number to a coded-block-pattern value
// (presumably the inter-macroblock counterpart of golomb_to_intra4x4_cbp -
// confirm against the macroblock parser).
static const uint8_t golomb_to_inter_cbp[48]={
  0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13,
 14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46,
 17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41
};
   88.74 +
// Scan order for a 4x4 block: entry n gives the position of the n-th
// coefficient, written as x + y*4 (column x, row y of the block).
static const uint8_t zigzag_scan[16]={
 0+0*4, 1+0*4, 0+1*4, 0+2*4,
 1+1*4, 2+0*4, 3+0*4, 2+1*4,
 1+2*4, 0+3*4, 1+3*4, 2+2*4,
 3+1*4, 3+2*4, 2+3*4, 3+3*4,
};
   88.81 +
// Alternative 4x4 scan order in the same x + y*4 notation as zigzag_scan
// (presumably the one used for field-coded macroblocks - confirm at the
// call sites).
static const uint8_t field_scan[16]={
 0+0*4, 0+1*4, 1+0*4, 0+2*4,
 0+3*4, 1+1*4, 1+2*4, 1+3*4,
 2+0*4, 2+1*4, 2+2*4, 2+3*4,
 3+0*4, 3+1*4, 3+2*4, 3+3*4,
};
   88.88 +
// Scan order for the 16 luma DC coefficients. Each entry is an offset
// written as a*16 + b*64 with a, b in 0..3; the largest value (3*16 + 3*64
// = 240) still fits in uint8_t.
static const uint8_t luma_dc_zigzag_scan[16]={
 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
};
   88.95 +
// Alternative order for the 16 luma DC coefficients, using the same
// a*16 + b*64 offset notation as luma_dc_zigzag_scan (presumably the
// field-coding variant - confirm at the call sites).
static const uint8_t luma_dc_field_scan[16]={
 0*16 + 0*64, 2*16 + 0*64, 1*16 + 0*64, 0*16 + 2*64,
 2*16 + 2*64, 3*16 + 0*64, 1*16 + 2*64, 3*16 + 2*64,
 0*16 + 1*64, 2*16 + 1*64, 0*16 + 3*64, 2*16 + 3*64,
 1*16 + 1*64, 3*16 + 1*64, 1*16 + 3*64, 3*16 + 3*64,
};
  88.102 +
// Offsets of the four chroma DC coefficients, written as (x + y*2)*16 for a
// 2x2 arrangement. The FIXME below is from the original author and does not
// say what needs fixing.
static const uint8_t chroma_dc_scan[4]={
 (0+0*2)*16, (1+0*2)*16,
 (0+1*2)*16, (1+1*2)*16,  //FIXME
};
  88.107 +
  88.108 +
// Scan order for an 8x8 block: entry n gives the position of the n-th
// coefficient, written as x + y*8 (presumably the field-coding order for
// 8x8 transform blocks - confirm at the call sites).
static const uint8_t field_scan8x8[64]={
 0+0*8, 0+1*8, 0+2*8, 1+0*8,
 1+1*8, 0+3*8, 0+4*8, 1+2*8,
 2+0*8, 1+3*8, 0+5*8, 0+6*8,
 0+7*8, 1+4*8, 2+1*8, 3+0*8,
 2+2*8, 1+5*8, 1+6*8, 1+7*8,
 2+3*8, 3+1*8, 4+0*8, 3+2*8,
 2+4*8, 2+5*8, 2+6*8, 2+7*8,
 3+3*8, 4+1*8, 5+0*8, 4+2*8,
 3+4*8, 3+5*8, 3+6*8, 3+7*8,
 4+3*8, 5+1*8, 6+0*8, 5+2*8,
 4+4*8, 4+5*8, 4+6*8, 4+7*8,
 5+3*8, 6+1*8, 6+2*8, 5+4*8,
 5+5*8, 5+6*8, 5+7*8, 6+3*8,
 7+0*8, 7+1*8, 6+4*8, 6+5*8,
 6+6*8, 6+7*8, 7+2*8, 7+3*8,
 7+4*8, 7+5*8, 7+6*8, 7+7*8,
};
  88.127 +
// Row type of i_mb_type_info: properties attached to one intra macroblock
// type code.
typedef struct IMbInfo{
    uint16_t type;      // MB_TYPE_* flags
    uint8_t pred_mode;  // prediction mode number; 255 (written as -1) where the table gives none
    uint8_t cbp;        // coded block pattern; 255 (written as -1) where the table gives none
} IMbInfo;
  88.133 +
// Intra macroblock type table with 26 rows: row 0 is INTRA4x4, rows 1..24
// are INTRA16x16 with every combination of pred_mode {2,1,0,3} and cbp
// {0,16,32,15,31,47}, and row 25 is INTRA_PCM. The -1 initialisers land in
// uint8_t fields and are therefore stored as 255.
static const IMbInfo i_mb_type_info[26]={
{MB_TYPE_INTRA4x4  , -1, -1},
{MB_TYPE_INTRA16x16,  2,  0},
{MB_TYPE_INTRA16x16,  1,  0},
{MB_TYPE_INTRA16x16,  0,  0},
{MB_TYPE_INTRA16x16,  3,  0},
{MB_TYPE_INTRA16x16,  2,  16},
{MB_TYPE_INTRA16x16,  1,  16},
{MB_TYPE_INTRA16x16,  0,  16},
{MB_TYPE_INTRA16x16,  3,  16},
{MB_TYPE_INTRA16x16,  2,  32},
{MB_TYPE_INTRA16x16,  1,  32},
{MB_TYPE_INTRA16x16,  0,  32},
{MB_TYPE_INTRA16x16,  3,  32},
{MB_TYPE_INTRA16x16,  2,  15+0},
{MB_TYPE_INTRA16x16,  1,  15+0},
{MB_TYPE_INTRA16x16,  0,  15+0},
{MB_TYPE_INTRA16x16,  3,  15+0},
{MB_TYPE_INTRA16x16,  2,  15+16},
{MB_TYPE_INTRA16x16,  1,  15+16},
{MB_TYPE_INTRA16x16,  0,  15+16},
{MB_TYPE_INTRA16x16,  3,  15+16},
{MB_TYPE_INTRA16x16,  2,  15+32},
{MB_TYPE_INTRA16x16,  1,  15+32},
{MB_TYPE_INTRA16x16,  0,  15+32},
{MB_TYPE_INTRA16x16,  3,  15+32},
{MB_TYPE_INTRA_PCM , -1, -1},
};
  88.162 +
// Row type shared by the P/B macroblock and sub-macroblock type tables.
typedef struct PMbInfo{
    uint16_t type;            // MB_TYPE_* flags (partition shape | prediction lists used)
    uint8_t partition_count;  // number of partitions for this type
} PMbInfo;
  88.167 +
// P macroblock types: 16x16, 16x8, 8x16, 8x8 and 8x8 with MB_TYPE_REF0,
// each paired with its partition count. All entries use list-0 flags only.
static const PMbInfo p_mb_type_info[5]={
{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P1L0, 2},
{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0, 4},
{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_REF0, 4},
};
  88.175 +
// P sub-macroblock types. The MB_TYPE_16x16/16x8/8x16/8x8 shape flags are
// reused here with partition counts 1, 2, 2 and 4 (presumably describing the
// split of an 8x8 sub-block - confirm against the parser).
static const PMbInfo p_sub_mb_type_info[4]={
{MB_TYPE_16x16|MB_TYPE_P0L0             , 1},
{MB_TYPE_16x8 |MB_TYPE_P0L0             , 2},
{MB_TYPE_8x16 |MB_TYPE_P0L0             , 2},
{MB_TYPE_8x8  |MB_TYPE_P0L0             , 4},
};
  88.182 +
// B macroblock types (23 rows). The columns of the initialiser are aligned
// so that each MB_TYPE_PxLy flag keeps its own column: P0L0, P0L1, P1L0,
// P1L1. Row 0 is the direct type; the last row is 8x8 with 4 partitions.
static const PMbInfo b_mb_type_info[23]={
{MB_TYPE_DIRECT2|MB_TYPE_L0L1                                      , 1, },
{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
{MB_TYPE_16x8 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0                          |MB_TYPE_P1L1, 2, },
{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_16x8              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16              |MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0             , 2, },
{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
};
  88.208 +
// B sub-macroblock types (13 rows), laid out like b_mb_type_info: row 0 is
// the direct type, the remaining rows combine a shape flag with the
// prediction-list flags and give the partition count.
static const PMbInfo b_sub_mb_type_info[13]={
{MB_TYPE_DIRECT2                                                   , 1, },
{MB_TYPE_16x16|MB_TYPE_P0L0                                       , 1, },
{MB_TYPE_16x16             |MB_TYPE_P0L1                          , 1, },
{MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1                          , 1, },
{MB_TYPE_16x8 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 2, },
{MB_TYPE_16x8              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16              |MB_TYPE_P0L1             |MB_TYPE_P1L1, 2, },
{MB_TYPE_16x8 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_8x16 |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 2, },
{MB_TYPE_8x8  |MB_TYPE_P0L0             |MB_TYPE_P1L0             , 4, },
{MB_TYPE_8x8               |MB_TYPE_P0L1             |MB_TYPE_P1L1, 4, },
{MB_TYPE_8x8  |MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_P1L0|MB_TYPE_P1L1, 4, },
};
  88.224 +
// Base dequantisation factors for 4x4 blocks: 6 rows of 3 values
// (presumably one row per qp % 6 and one column per coefficient position
// class - confirm where the dequant tables are built).
static const uint8_t dequant4_coeff_init[6][3]={
  {10,13,16},
  {11,14,18},
  {13,16,20},
  {14,18,23},
  {16,20,25},
  {18,23,29},
};
  88.233 +
// 16 values in the range 0..5 (presumably column selectors into the 6
// columns of dequant8_coeff_init for a 4x4 position pattern - confirm where
// the dequant tables are built).
static const uint8_t dequant8_coeff_init_scan[16] = {
  0,3,4,3, 3,1,5,1, 4,5,2,5, 3,1,5,1
};
// Base dequantisation factors for 8x8 blocks: 6 rows of 6 values
// (presumably one row per qp % 6, with columns chosen through
// dequant8_coeff_init_scan - confirm where the dequant tables are built).
static const uint8_t dequant8_coeff_init[6][6]={
  {20,18,32,19,25,24},
  {22,19,35,21,28,26},
  {26,23,42,24,33,31},
  {28,25,45,26,35,33},
  {32,28,51,30,40,38},
  {36,32,58,34,46,43},
};
  88.245 +
  88.246 +#endif /* AVCODEC_H264DATA_H */

    89.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    89.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_deblock.c	Mon Aug 27 12:09:56 2012 +0200
    89.3 @@ -0,0 +1,507 @@
    89.4 +/*
    89.5 + * H.26L/H.264/AVC/JVT/14496-10/... loop filter
    89.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    89.7 + *
    89.8 + * This file is part of FFmpeg.
    89.9 + *
   89.10 + * FFmpeg is free software; you can redistribute it and/or
   89.11 + * modify it under the terms of the GNU Lesser General Public
   89.12 + * License as published by the Free Software Foundation; either
   89.13 + * version 2.1 of the License, or (at your option) any later version.
   89.14 + *
   89.15 + * FFmpeg is distributed in the hope that it will be useful,
   89.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   89.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   89.18 + * Lesser General Public License for more details.
   89.19 + *
   89.20 + * You should have received a copy of the GNU Lesser General Public
   89.21 + * License along with FFmpeg; if not, write to the Free Software
   89.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   89.23 + */
   89.24 +
   89.25 +/**
   89.26 + * @file
   89.27 + * H.264 / AVC / MPEG4 part10 loop filter.
   89.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   89.29 + */
   89.30 +
   89.31 +#include "dsputil.h"
   89.32 +#include "mathops.h"
   89.33 +#include "rectangle.h"
   89.34 +#include "h264_types.h"
   89.35 +#include "h264_misc.h"
   89.36 +#include "h264_data.h"
   89.37 +//#undef NDEBUG
   89.38 +#include <assert.h>
   89.39 +
/* Deblocking filter (p153) */
/* Alpha thresholds, 52*3 entries: the middle 52 values are the actual table,
 * preceded by 52 zeros and followed by 52 copies of 255. The filter helpers
 * in this file index it with qp + s->slice_alpha_c0_offset (presumably the
 * padding keeps that sum in range without clipping - confirm how the offset
 * is stored in the slice). */
static const uint8_t alpha_table[52*3] = {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    80, 90,101,113,127,144,162,182,203,226,
   255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,255,
   255,255,255,255,255,255,255,255,255,255,255,255,255,
};
/* Beta thresholds, laid out like alpha_table: 52 zeros, the 52 actual
 * values, then 52 copies of the last value (18). The filter helpers in this
 * file index it with qp + s->slice_beta_offset. */
static const uint8_t beta_table[52*3] = {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
/* tc0 clipping values, indexed [index_a][bS] by the filter helpers in this
 * file. Same three-part layout as alpha_table: 52 rows of padding equal to
 * the first real row, the 52 actual rows, then 52 copies of the last row.
 * Column 0 is written as -1 in every row; because the element type is
 * uint8_t it is stored as 255 and only turns back into -1 when the helpers
 * assign it to an int8_t. */
static const uint8_t tc0_table[52*3][4] = {
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
  89.102 +
  89.103 +av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) {
  89.104 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
  89.105 +    const int alpha = alpha_table[index_a];
  89.106 +    const int beta  = beta_table[qp + s->slice_beta_offset];
  89.107 +    if (alpha ==0 || beta == 0) return;
  89.108 +
  89.109 +    if( bS[0] < 4 ) {
  89.110 +        int8_t tc[4];
  89.111 +        tc[0] = tc0_table[index_a][bS[0]];
  89.112 +        tc[1] = tc0_table[index_a][bS[1]];
  89.113 +        tc[2] = tc0_table[index_a][bS[2]];
  89.114 +        tc[3] = tc0_table[index_a][bS[3]];
  89.115 +        mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
  89.116 +    } else {
  89.117 +        mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
  89.118 +    }
  89.119 +}
  89.120 +
  89.121 +av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
  89.122 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
  89.123 +    const int alpha = alpha_table[index_a];
  89.124 +    const int beta  = beta_table[qp + s->slice_beta_offset];
  89.125 +    if (alpha ==0 || beta == 0) return;
  89.126 +
  89.127 +    if( bS[0] < 4 ) {
  89.128 +        int8_t tc[4];
  89.129 +        tc[0] = tc0_table[index_a][bS[0]]+1;
  89.130 +        tc[1] = tc0_table[index_a][bS[1]]+1;
  89.131 +        tc[2] = tc0_table[index_a][bS[2]]+1;
  89.132 +        tc[3] = tc0_table[index_a][bS[3]]+1;
  89.133 +        mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
  89.134 +    } else {
  89.135 +        mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
  89.136 +    }
  89.137 +}
  89.138 +
  89.139 +
  89.140 +av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
  89.141 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
  89.142 +    const int alpha = alpha_table[index_a];
  89.143 +    const int beta  = beta_table[qp + s->slice_beta_offset];
  89.144 +    if (alpha ==0 || beta == 0) return;
  89.145 +
  89.146 +    if( bS[0] < 4 ) {
  89.147 +        int8_t tc[4];
  89.148 +        tc[0] = tc0_table[index_a][bS[0]];
  89.149 +        tc[1] = tc0_table[index_a][bS[1]];
  89.150 +        tc[2] = tc0_table[index_a][bS[2]];
  89.151 +        tc[3] = tc0_table[index_a][bS[3]];
  89.152 +        mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
  89.153 +    } else {
  89.154 +        mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
  89.155 +    }
  89.156 +}
  89.157 +
  89.158 +av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
  89.159 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset;
  89.160 +    const int alpha = alpha_table[index_a];
  89.161 +    const int beta  = beta_table[qp + s->slice_beta_offset];
  89.162 +    if (alpha ==0 || beta == 0) return;
  89.163 +
  89.164 +    if( bS[0] < 4 ) {
  89.165 +        int8_t tc[4];
  89.166 +        tc[0] = tc0_table[index_a][bS[0]]+1;
  89.167 +        tc[1] = tc0_table[index_a][bS[1]]+1;
  89.168 +        tc[2] = tc0_table[index_a][bS[2]]+1;
  89.169 +        tc[3] = tc0_table[index_a][bS[3]]+1;
  89.170 +        mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
  89.171 +    } else {
  89.172 +        mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
  89.173 +    }
  89.174 +}
  89.175 +
  89.176 +static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, int dir) {
  89.177 +    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
  89.178 +    const int qp_xy= m->qscale_mb_xy;
  89.179 +    const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy;
  89.180 +    const int linesize = mrc->linesize;
  89.181 +    const int uvlinesize = mrc->uvlinesize;
  89.182 +    const int mb_type = m->mb_type;
  89.183 +    int edge;
  89.184 +    const int edges = mrs->edges[dir];
  89.185 +
  89.186 +    if(mbm_type){
  89.187 +        int16_t* bS=mrs->bS[dir][0];
  89.188 +        /* Filter edge */
  89.189 +        // Do not use s->qscale as luma quantizer because it has not the same
  89.190 +        // value in IPCM macroblocks.
  89.191 +        if(bS[0]+bS[1]+bS[2]+bS[3]){
  89.192 +            int qp = ( qp_xy + qp_dir + 1 ) >> 1;
  89.193 +            if( dir == 0 ) {
  89.194 +                filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s );
  89.195 +                {
  89.196 +                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
  89.197 +                    filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp, mrc, s);
  89.198 +                    filter_mb_edgecv( &img_cr[0], uvlinesize, bS, qp, mrc, s);
  89.199 +                }
  89.200 +            } else {
  89.201 +                filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s );
  89.202 +                {
  89.203 +                    int qp= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
  89.204 +                    filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp, mrc, s);
  89.205 +                    filter_mb_edgech( &img_cr[0], uvlinesize, bS, qp, mrc, s);
  89.206 +                }
  89.207 +            }
  89.208 +        }
  89.209 +    }
  89.210 +
  89.211 +    for( edge = 1; edge < edges; edge++ ) {
  89.212 +        int16_t* bS=mrs->bS[dir][edge];
  89.213 +        int qp = qp_xy;
  89.214 +
  89.215 +        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
  89.216 +            continue;
  89.217 +
  89.218 +        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
  89.219 +            continue;
  89.220 +
  89.221 +        /* Filter edge */
  89.222 +        // Do not use s->qscale as luma quantizer because it has not the same
  89.223 +        // value in IPCM macroblocks.
  89.224 +
  89.225 +        if( dir == 0 ) {
  89.226 +            filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s);
  89.227 +            if( (edge&1) == 0 ) {
  89.228 +                filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
  89.229 +                filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
  89.230 +            }
  89.231 +        } else {
  89.232 +            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s );
  89.233 +            if( (edge&1) == 0 ) {
  89.234 +                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
  89.235 +                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
  89.236 +            }
  89.237 +        }
  89.238 +    }
  89.239 +}
  89.240 +
  89.241 +static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){
  89.242 +    int v;
  89.243 +    v= mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx];
  89.244 +    if(!v && mrs->ref_cache[0][b_idx]!=-1)
  89.245 +        // absolute value >= 7 | ...
  89.246 +        v= ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
  89.247 +        ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
  89.248 +
  89.249 +    if(s->list_count==2){
  89.250 +        if(!v)
  89.251 +            v = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) |
  89.252 +            ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
  89.253 +            ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit);
  89.254 +
  89.255 +        if(v){
  89.256 +            if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) |
  89.257 +                (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx]))
  89.258 +                return 1;
  89.259 +            return
  89.260 +            ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
  89.261 +            ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) |
  89.262 +            ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
  89.263 +            ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
  89.264 +        }
  89.265 +    }
  89.266 +
  89.267 +    return v;
  89.268 +}
  89.269 +
  89.270 +static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) {
  89.271 +    int mb_type = m->mb_type;
  89.272 +    int edge;
  89.273 +    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
  89.274 +
  89.275 +    // how often to recheck mv-based bS when iterating between edges
  89.276 +    static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
  89.277 +    {0,3,1,1,3,3,3,3}};
  89.278 +    const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
  89.279 +    const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4;
  89.280 +    // how often to recheck mv-based bS when iterating along each edge
  89.281 +    const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
  89.282 +
  89.283 +    mrs->edges[dir]= edges;
  89.284 +
  89.285 +    if(mbm_type){
  89.286 +        int16_t* bS=mrs->bS[dir][0];
  89.287 +        if( IS_INTRA(mb_type|mbm_type)) {
  89.288 +            AV_WN64A(bS, 0x0004000400040004ULL);
  89.289 +        } else {
  89.290 +            int i;
  89.291 +            int mv_done;
  89.292 +            if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
  89.293 +                int b_idx= 8 + 4;
  89.294 +                int bn_idx= b_idx - (dir ? 8:1);
  89.295 +
  89.296 +                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, 8 + 4, bn_idx, mvy_limit);
  89.297 +                mv_done = 1;
  89.298 +            }
  89.299 +            else
  89.300 +                mv_done = 0;
  89.301 +
  89.302 +            for( i = 0; i < 4; i++ ) {
  89.303 +                int x = dir == 0 ? 0 : i;
  89.304 +                int y = dir == 0 ? i    : 0;
  89.305 +                int b_idx= 8 + 4 + x + 8*y;
  89.306 +                int bn_idx= b_idx - (dir ? 8:1);
  89.307 +
  89.308 +                if( mrs->non_zero_count_cache[b_idx] |
  89.309 +                    mrs->non_zero_count_cache[bn_idx] ) {
  89.310 +                    bS[i] = 2;
  89.311 +                }
  89.312 +                else if(!mv_done)
  89.313 +                {
  89.314 +                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
  89.315 +                }
  89.316 +            }
  89.317 +        }
  89.318 +    }
  89.319 +
  89.320 +    /* Calculate bS */
  89.321 +    for( edge = 1; edge < edges; edge++ ) {
  89.322 +        int16_t* bS=mrs->bS[dir][edge];
  89.323 +
  89.324 +        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
  89.325 +            continue;
  89.326 +
  89.327 +        if( IS_INTRA(mb_type)) {
  89.328 +            AV_WN64A(bS, 0x0003000300030003ULL);
  89.329 +        } else {
  89.330 +            int i;
  89.331 +            int mv_done;
  89.332 +
  89.333 +            if( edge & mask_edge ) {
  89.334 +                AV_ZERO64(bS);
  89.335 +                mv_done = 1;
  89.336 +            }
  89.337 +            else if( mask_par0 ) {
  89.338 +                int b_idx= 8 + 4 + edge * (dir ? 8:1);
  89.339 +                int bn_idx= b_idx - (dir ? 8:1);
  89.340 +
  89.341 +                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
  89.342 +                mv_done = 1;
  89.343 +            }
  89.344 +            else
  89.345 +                mv_done = 0;
  89.346 +
  89.347 +            for( i = 0; i < 4; i++ ) {
  89.348 +                int x = dir == 0 ? edge : i;
  89.349 +                int y = dir == 0 ? i    : edge;
  89.350 +                int b_idx= 8 + 4 + x + 8*y;
  89.351 +                int bn_idx= b_idx - (dir ? 8:1);
  89.352 +
  89.353 +                if( mrs->non_zero_count_cache[b_idx] |
  89.354 +                    mrs->non_zero_count_cache[bn_idx] ) {
  89.355 +                    bS[i] = 2;
  89.356 +                }
  89.357 +                else if(!mv_done)
  89.358 +                {
  89.359 +                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
  89.360 +                }
  89.361 +            }
  89.362 +
  89.363 +            if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
  89.364 +                continue;
  89.365 +        }
  89.366 +
  89.367 +    }
  89.368 +}
  89.369 +
  89.370 +
  89.371 +/**
  89.372 +*
  89.373 +* @return zero if the loop filter can be skiped
  89.374 +*/
  89.375 +static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
  89.376 +    H264Mb *m_top = m - mrc->mb_width;
  89.377 +    H264Mb *m_left = m - 1;
  89.378 +    const int mb_x = m->mb_x;
  89.379 +    const int mb_y = m->mb_y;
  89.380 +    int top_type, left_type;
  89.381 +    int qp, top_qp, left_qp;
  89.382 +    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
  89.383 +
  89.384 +    qp = m->qscale_mb_xy ;
  89.385 +    left_qp = m->qscale_left_mb_xy ;
  89.386 +    top_qp  = m->qscale_top_mb_xy ;
  89.387 +
  89.388 +    //for sufficiently low qp, filtering wouldn't do anything
  89.389 +    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
  89.390 +    if(qp <= qp_thresh
  89.391 +        && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
  89.392 +        && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
  89.393 +        return 0;
  89.394 +    }
  89.395 +
  89.396 +    if(IS_INTRA(mb_type)){
  89.397 +        return 1;
  89.398 +    }
  89.399 +
  89.400 +    {
  89.401 +        int list;
  89.402 +        for(list=0; list<s->list_count; list++){
  89.403 +            int8_t *ref;
  89.404 +
  89.405 +            if(!USES_LIST(mb_type, list)){
  89.406 +                fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
  89.407 +                fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
  89.408 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
  89.409 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
  89.410 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
  89.411 +                AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
  89.412 +                continue;
  89.413 +            }
  89.414 +
  89.415 +            ref = &mrs->ref_index[list][4*mb_x];
  89.416 +            {
  89.417 +                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2);
  89.418 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
  89.419 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
  89.420 +                ref += 2;
  89.421 +
  89.422 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
  89.423 +                AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
  89.424 +            }
  89.425 +        }
  89.426 +    }
  89.427 +
  89.428 +    /*
  89.429 +    0 . T T. T T T T
  89.430 +    1 L . .L . . . .
  89.431 +    2 L . .L . . . .
  89.432 +    3 . T TL . . . .
  89.433 +    4 L . .L . . . .
  89.434 +    5 L . .. . . . .
  89.435 +    */
  89.436 +
  89.437 +    if (IS_SKIP(mb_type)){
  89.438 +        memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
  89.439 +    }
  89.440 +
  89.441 +    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
  89.442 +    top_type  = mrs->top_type;
  89.443 +    left_type = mrs->left_type;
  89.444 +    if(top_type){
  89.445 +        AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]);
  89.446 +    }
  89.447 +
  89.448 +    if(left_type){
  89.449 +        mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4];
  89.450 +        mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4];
  89.451 +        mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4];
  89.452 +        mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4];
  89.453 +    }
  89.454 +
  89.455 +    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
  89.456 +        int list;
  89.457 +        for(list=0; list<s->list_count; list++){
  89.458 +            if(USES_LIST(top_type, list)){
  89.459 +                const int b_xy= 4*mb_x + 3*mrc->b_stride;
  89.460 +                const int b8_x= 4*mb_x + 2;
  89.461 +                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
  89.462 +                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
  89.463 +
  89.464 +                mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
  89.465 +                mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]];
  89.466 +                mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
  89.467 +                mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]];
  89.468 +            }else{
  89.469 +                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
  89.470 +                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
  89.471 +            }
  89.472 +
  89.473 +            if(USES_LIST(left_type, list)){
  89.474 +                const int b_x = 4*(mb_x-1) + 3;
  89.475 +                const int b8_x= 4*(mb_x-1) + 1;
  89.476 +                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
  89.477 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + mrc->b_stride*0]);
  89.478 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]);
  89.479 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]);
  89.480 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]);
  89.481 +
  89.482 +                mrs->ref_cache[list][scan8[0] - 1 + 0 ]=
  89.483 +                mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]];
  89.484 +                mrs->ref_cache[list][scan8[0] - 1 +16 ]=
  89.485 +                mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]];
  89.486 +
  89.487 +            }else{
  89.488 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]);
  89.489 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]);
  89.490 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]);
  89.491 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]);
  89.492 +
  89.493 +                mrs->ref_cache[list][scan8[0] - 1 + 0  ]=
  89.494 +                mrs->ref_cache[list][scan8[0] - 1 + 8  ]=
  89.495 +                mrs->ref_cache[list][scan8[0] - 1 + 16 ]=
  89.496 +                mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
  89.497 +            }
  89.498 +        }
  89.499 +    }
  89.500 +    return 1;
  89.501 +}
  89.502 +
  89.503 +void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) {
  89.504 +    if (fill_filter_caches(mrc, mrs, s, m, m->mb_type)){
  89.505 +        calc_bS_values(mrc, mrs, s, m, 4, 0);
  89.506 +        calc_bS_values(mrc, mrs, s, m, 4, 1);
  89.507 +        filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0);
  89.508 +        filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1);
  89.509 +    }
  89.510 +}

    90.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    90.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_deblock.h	Mon Aug 27 12:09:56 2012 +0200
    90.3 @@ -0,0 +1,8 @@
    90.4 +#ifndef H264_LOOPFILTER_H
    90.5 +#define H264_LOOPFILTER_H
    90.6 +
    90.7 +#include "h264_types.h"
    90.8 +
    90.9 +void ff_h264_filter_mb(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr);
   90.10 +
   90.11 +#endif

    91.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    91.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_dsp.c	Mon Aug 27 12:09:56 2012 +0200
    91.3 @@ -0,0 +1,320 @@
    91.4 +/*
    91.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
    91.6 + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
    91.7 + *
    91.8 + * This file is part of FFmpeg.
    91.9 + *
   91.10 + * FFmpeg is free software; you can redistribute it and/or
   91.11 + * modify it under the terms of the GNU Lesser General Public
   91.12 + * License as published by the Free Software Foundation; either
   91.13 + * version 2.1 of the License, or (at your option) any later version.
   91.14 + *
   91.15 + * FFmpeg is distributed in the hope that it will be useful,
   91.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   91.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   91.18 + * Lesser General Public License for more details.
   91.19 + *
   91.20 + * You should have received a copy of the GNU Lesser General Public
   91.21 + * License along with FFmpeg; if not, write to the Free Software
   91.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   91.23 + */
   91.24 +
   91.25 +/**
   91.26 + * @file
   91.27 + * H.264 / AVC / MPEG4 part10 DSP functions.
   91.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   91.29 + */
   91.30 +
   91.31 +#include <stdint.h>
   91.32 +#include "avcodec.h"
   91.33 +#include "h264_dsp.h"
   91.34 +
   91.35 +#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
   91.36 +#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
   91.37 +#define H264_WEIGHT(W,H) \
   91.38 +static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
   91.39 +    int y; \
   91.40 +    offset <<= log2_denom; \
   91.41 +    if(log2_denom) offset += 1<<(log2_denom-1); \
   91.42 +    for(y=0; y<H; y++, block += stride){ \
   91.43 +        op_scale1(0); \
   91.44 +        op_scale1(1); \
   91.45 +        if(W==2) continue; \
   91.46 +        op_scale1(2); \
   91.47 +        op_scale1(3); \
   91.48 +        if(W==4) continue; \
   91.49 +        op_scale1(4); \
   91.50 +        op_scale1(5); \
   91.51 +        op_scale1(6); \
   91.52 +        op_scale1(7); \
   91.53 +        if(W==8) continue; \
   91.54 +        op_scale1(8); \
   91.55 +        op_scale1(9); \
   91.56 +        op_scale1(10); \
   91.57 +        op_scale1(11); \
   91.58 +        op_scale1(12); \
   91.59 +        op_scale1(13); \
   91.60 +        op_scale1(14); \
   91.61 +        op_scale1(15); \
   91.62 +    } \
   91.63 +} \
   91.64 +static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
   91.65 +    int y; \
   91.66 +    offset = ((offset + 1) | 1) << log2_denom; \
   91.67 +    for(y=0; y<H; y++, dst += stride, src += stride){ \
   91.68 +        op_scale2(0); \
   91.69 +        op_scale2(1); \
   91.70 +        if(W==2) continue; \
   91.71 +        op_scale2(2); \
   91.72 +        op_scale2(3); \
   91.73 +        if(W==4) continue; \
   91.74 +        op_scale2(4); \
   91.75 +        op_scale2(5); \
   91.76 +        op_scale2(6); \
   91.77 +        op_scale2(7); \
   91.78 +        if(W==8) continue; \
   91.79 +        op_scale2(8); \
   91.80 +        op_scale2(9); \
   91.81 +        op_scale2(10); \
   91.82 +        op_scale2(11); \
   91.83 +        op_scale2(12); \
   91.84 +        op_scale2(13); \
   91.85 +        op_scale2(14); \
   91.86 +        op_scale2(15); \
   91.87 +    } \
   91.88 +}
   91.89 +
   91.90 +H264_WEIGHT(16,16)
   91.91 +H264_WEIGHT(16,8)
   91.92 +H264_WEIGHT(8,16)
   91.93 +H264_WEIGHT(8,8)
   91.94 +H264_WEIGHT(8,4)
   91.95 +H264_WEIGHT(4,8)
   91.96 +H264_WEIGHT(4,4)
   91.97 +H264_WEIGHT(4,2)
   91.98 +H264_WEIGHT(2,4)
   91.99 +H264_WEIGHT(2,2)
  91.100 +
  91.101 +#undef op_scale1
  91.102 +#undef op_scale2
  91.103 +#undef H264_WEIGHT
  91.104 +
  91.105 +static av_always_inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
  91.106 +{
  91.107 +    int i, d;
  91.108 +    for( i = 0; i < 4; i++ ) {
  91.109 +        if( tc0[i] < 0 ) {
  91.110 +            pix += 4*ystride;
  91.111 +            continue;
  91.112 +        }
  91.113 +        for( d = 0; d < 4; d++ ) {
  91.114 +            const int p0 = pix[-1*xstride];
  91.115 +            const int p1 = pix[-2*xstride];
  91.116 +            const int p2 = pix[-3*xstride];
  91.117 +            const int q0 = pix[0];
  91.118 +            const int q1 = pix[1*xstride];
  91.119 +            const int q2 = pix[2*xstride];
  91.120 +
  91.121 +            if( FFABS( p0 - q0 ) < alpha &&
  91.122 +                FFABS( p1 - p0 ) < beta &&
  91.123 +                FFABS( q1 - q0 ) < beta ) {
  91.124 +
  91.125 +                int tc = tc0[i];
  91.126 +                int i_delta;
  91.127 +
  91.128 +                if( FFABS( p2 - p0 ) < beta ) {
  91.129 +                    if(tc0[i])
  91.130 +                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
  91.131 +                    tc++;
  91.132 +                }
  91.133 +                if( FFABS( q2 - q0 ) < beta ) {
  91.134 +                    if(tc0[i])
  91.135 +                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
  91.136 +                    tc++;
  91.137 +                }
  91.138 +
  91.139 +                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  91.140 +                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
  91.141 +                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
  91.142 +            }
  91.143 +            pix += ystride;
  91.144 +        }
  91.145 +    }
  91.146 +}
  91.147 +static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  91.148 +{
  91.149 +    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
  91.150 +}
  91.151 +static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  91.152 +{
  91.153 +    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
  91.154 +}
  91.155 +
  91.156 +static av_always_inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
  91.157 +{
  91.158 +    int d;
  91.159 +    for( d = 0; d < 16; d++ ) {
  91.160 +        const int p2 = pix[-3*xstride];
  91.161 +        const int p1 = pix[-2*xstride];
  91.162 +        const int p0 = pix[-1*xstride];
  91.163 +
  91.164 +        const int q0 = pix[ 0*xstride];
  91.165 +        const int q1 = pix[ 1*xstride];
  91.166 +        const int q2 = pix[ 2*xstride];
  91.167 +
  91.168 +        if( FFABS( p0 - q0 ) < alpha &&
  91.169 +            FFABS( p1 - p0 ) < beta &&
  91.170 +            FFABS( q1 - q0 ) < beta ) {
  91.171 +
  91.172 +            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
  91.173 +                if( FFABS( p2 - p0 ) < beta)
  91.174 +                {
  91.175 +                    const int p3 = pix[-4*xstride];
  91.176 +                    /* p0', p1', p2' */
  91.177 +                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
  91.178 +                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
  91.179 +                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
  91.180 +                } else {
  91.181 +                    /* p0' */
  91.182 +                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
  91.183 +                }
  91.184 +                if( FFABS( q2 - q0 ) < beta)
  91.185 +                {
  91.186 +                    const int q3 = pix[3*xstride];
  91.187 +                    /* q0', q1', q2' */
  91.188 +                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
  91.189 +                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
  91.190 +                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
  91.191 +                } else {
  91.192 +                    /* q0' */
  91.193 +                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
  91.194 +                }
  91.195 +            }else{
  91.196 +                /* p0', q0' */
  91.197 +                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
  91.198 +                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
  91.199 +            }
  91.200 +        }
  91.201 +        pix += ystride;
  91.202 +    }
  91.203 +}
  91.204 +static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  91.205 +{
  91.206 +    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
  91.207 +}
  91.208 +static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  91.209 +{
  91.210 +    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
  91.211 +}
  91.212 +
  91.213 +static av_always_inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
  91.214 +{
  91.215 +    int i, d;
  91.216 +    for( i = 0; i < 4; i++ ) {
  91.217 +        const int tc = tc0[i];
  91.218 +        if( tc <= 0 ) {
  91.219 +            pix += 2*ystride;
  91.220 +            continue;
  91.221 +        }
  91.222 +        for( d = 0; d < 2; d++ ) {
  91.223 +            const int p0 = pix[-1*xstride];
  91.224 +            const int p1 = pix[-2*xstride];
  91.225 +            const int q0 = pix[0];
  91.226 +            const int q1 = pix[1*xstride];
  91.227 +
  91.228 +            if( FFABS( p0 - q0 ) < alpha &&
  91.229 +                FFABS( p1 - p0 ) < beta &&
  91.230 +                FFABS( q1 - q0 ) < beta ) {
  91.231 +
  91.232 +                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
  91.233 +
  91.234 +                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
  91.235 +                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
  91.236 +            }
  91.237 +            pix += ystride;
  91.238 +        }
  91.239 +    }
  91.240 +}
  91.241 +static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  91.242 +{
  91.243 +    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
  91.244 +}
  91.245 +static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
  91.246 +{
  91.247 +    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
  91.248 +}
  91.249 +
  91.250 +static av_always_inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
  91.251 +{
  91.252 +    int d;
  91.253 +    for( d = 0; d < 8; d++ ) {
  91.254 +        const int p0 = pix[-1*xstride];
  91.255 +        const int p1 = pix[-2*xstride];
  91.256 +        const int q0 = pix[0];
  91.257 +        const int q1 = pix[1*xstride];
  91.258 +
  91.259 +        if( FFABS( p0 - q0 ) < alpha &&
  91.260 +            FFABS( p1 - p0 ) < beta &&
  91.261 +            FFABS( q1 - q0 ) < beta ) {
  91.262 +
  91.263 +            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
  91.264 +            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
  91.265 +        }
  91.266 +        pix += ystride;
  91.267 +    }
  91.268 +}
  91.269 +static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  91.270 +{
  91.271 +    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
  91.272 +}
  91.273 +static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
  91.274 +{
  91.275 +    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
  91.276 +}
  91.277 +
  91.278 +void ff_h264dsp_init(H264DSPContext *c)
  91.279 +{
  91.280 +    c->h264_idct_add= ff_h264_idct_add_c;
  91.281 +    c->h264_idct8_add= ff_h264_idct8_add_c;
  91.282 +    c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
  91.283 +    c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
  91.284 +    c->h264_idct_add16     = ff_h264_idct_add16_c;
  91.285 +    c->h264_idct8_add4     = ff_h264_idct8_add4_c;
  91.286 +    c->h264_idct_add8      = ff_h264_idct_add8_c;
  91.287 +    c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
  91.288 +
  91.289 +    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
  91.290 +    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
  91.291 +    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
  91.292 +    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
  91.293 +    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
  91.294 +    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
  91.295 +    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
  91.296 +    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
  91.297 +    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
  91.298 +    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
  91.299 +    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
  91.300 +    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
  91.301 +    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
  91.302 +    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
  91.303 +    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
  91.304 +    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
  91.305 +    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
  91.306 +    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
  91.307 +    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
  91.308 +    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
  91.309 +
  91.310 +    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
  91.311 +    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
  91.312 +    c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
  91.313 +    c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
  91.314 +    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
  91.315 +    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
  91.316 +    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
  91.317 +    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
  91.318 +    c->h264_loop_filter_strength= NULL;
  91.319 +
  91.320 +    if (ARCH_ARM) ff_h264dsp_init_arm(c);
  91.321 +    if (HAVE_ALTIVEC) ff_h264dsp_init_ppc(c);
  91.322 +    if (HAVE_MMX) ff_h264dsp_init_x86(c);
  91.323 +}

    92.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    92.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_dsp.h	Mon Aug 27 12:09:56 2012 +0200
    92.3 @@ -0,0 +1,83 @@
    92.4 +/*
    92.5 + * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
    92.6 + *
    92.7 + * This file is part of FFmpeg.
    92.8 + *
    92.9 + * FFmpeg is free software; you can redistribute it and/or
   92.10 + * modify it under the terms of the GNU Lesser General Public
   92.11 + * License as published by the Free Software Foundation; either
   92.12 + * version 2.1 of the License, or (at your option) any later version.
   92.13 + *
   92.14 + * FFmpeg is distributed in the hope that it will be useful,
   92.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   92.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   92.17 + * Lesser General Public License for more details.
   92.18 + *
   92.19 + * You should have received a copy of the GNU Lesser General Public
   92.20 + * License along with FFmpeg; if not, write to the Free Software
   92.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   92.22 + */
   92.23 +
   92.24 +/**
   92.25 + * @file
   92.26 + * H.264 DSP functions.
   92.27 + * @author Michael Niedermayer <michaelni@gmx.at>
   92.28 + */
   92.29 +
   92.30 +#ifndef AVCODEC_H264DSP_H
   92.31 +#define AVCODEC_H264DSP_H
   92.32 +
   92.33 +#include <stdint.h>
   92.34 +#include "dsputil.h"
   92.35 +
   92.36 +//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
   92.37 +typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); /* element type of H264DSPContext.weight_h264_pixels_tab */
   92.38 +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); /* element type of H264DSPContext.biweight_h264_pixels_tab */
   92.39 +
   92.40 +/**
   92.41 + * Context for storing H.264 DSP function pointers (weighted MC, loop filter, IDCT, qpel).
   92.42 + */
   92.43 +typedef struct H264DSPContext{
   92.44 +    /* weighted MC: ten entries each; ff_h264dsp_init() fills biweight slots 0..9 with the 16x16, 16x8, 8x16, 8x8, 8x4, 4x8, 4x4, 4x2, 2x4, 2x2 C versions */
   92.45 +    h264_weight_func weight_h264_pixels_tab[10]; /* slots 8 and 9 get the 2x4 and 2x2 C versions; presumably the same ordering as biweight -- confirm */
   92.46 +    h264_biweight_func biweight_h264_pixels_tab[10];
   92.47 +
   92.48 +    /* loop filter: v_ and h_ variants for luma and chroma; the non-intra ones take a tc0 array, the _intra ones do not */
   92.49 +    void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
   92.50 +    void (*h264_h_loop_filter_luma)(uint8_t *pix/*align 4 */, int stride, int alpha, int beta, int8_t *tc0);
   92.51 +    /* v/h_loop_filter_luma_intra: align 16 */
   92.52 +    void (*h264_v_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
   92.53 +    void (*h264_h_loop_filter_luma_intra)(uint8_t *pix, int stride, int alpha, int beta);
   92.54 +    void (*h264_v_loop_filter_chroma)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta, int8_t *tc0);
   92.55 +    void (*h264_h_loop_filter_chroma)(uint8_t *pix/*align 4*/, int stride, int alpha, int beta, int8_t *tc0);
   92.56 +    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
   92.57 +    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix/*align 8*/, int stride, int alpha, int beta);
   92.58 +    // h264_loop_filter_strength: SIMD only; ff_h264dsp_init() sets it to NULL. Per the original note, the C version is inlined in h264.c
   92.59 +    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
   92.60 +                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
   92.61 +
   92.62 +    /* IDCT */
   92.63 +    /* NOTE!!! if you implement any of h264_idct8_add, h264_idct8_add4 then you must implement all of them
   92.64 +       NOTE!!! if you implement any of h264_idct_add, h264_idct_add16, h264_idct_add16intra, h264_idct_add8 then you must implement all of them
   92.65 +        The reason for the above is that no two functions from the same list may use different permutations.
   92.66 +    */
   92.67 +    void (*h264_idct_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
   92.68 +    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
   92.69 +    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
   92.70 +    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
   92.71 +    void (*h264_dct)(DCTELEM block[4][4]);
   92.72 +    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
   92.73 +    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
   92.74 +    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); /* note: dst is uint8_t ** here, unlike the other idct_add entries */
   92.75 +    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
   92.76 +
   92.77 +    qpel_mc_func (*qpel_put)[16]; /* pointer to array(s) of 16 qpel_mc_func; the type is not declared in this header, presumably it comes from dsputil.h */
   92.78 +    qpel_mc_func (*qpel_avg)[16]; /* same shape as qpel_put */
   92.79 +}H264DSPContext;
   92.80 +
   92.81 +void ff_h264dsp_init(H264DSPContext *c); /* installs the C (_c) implementations, then calls the arch-specific init below when ARCH_ARM / HAVE_ALTIVEC / HAVE_MMX is set */
   92.82 +void ff_h264dsp_init_arm(H264DSPContext *c); /* called from ff_h264dsp_init() if ARCH_ARM */
   92.83 +void ff_h264dsp_init_ppc(H264DSPContext *c); /* called from ff_h264dsp_init() if HAVE_ALTIVEC */
   92.84 +void ff_h264dsp_init_x86(H264DSPContext *c); /* called from ff_h264dsp_init() if HAVE_MMX */
   92.85 +
   92.86 +#endif /* AVCODEC_H264DSP_H */

    93.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    93.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_entropy.c	Mon Aug 27 12:09:56 2012 +0200
    93.3 @@ -0,0 +1,2065 @@
    93.4 +/*
    93.5 + * H.26L/H.264/AVC/JVT/14496-10/... cabac decoding
    93.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
    93.7 + *
    93.8 + * This file is part of FFmpeg.
    93.9 + *
   93.10 + * FFmpeg is free software; you can redistribute it and/or
   93.11 + * modify it under the terms of the GNU Lesser General Public
   93.12 + * License as published by the Free Software Foundation; either
   93.13 + * version 2.1 of the License, or (at your option) any later version.
   93.14 + *
   93.15 + * FFmpeg is distributed in the hope that it will be useful,
   93.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   93.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   93.18 + * Lesser General Public License for more details.
   93.19 + *
   93.20 + * You should have received a copy of the GNU Lesser General Public
   93.21 + * License along with FFmpeg; if not, write to the Free Software
   93.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   93.23 + */
   93.24 +
   93.25 +/**
   93.26 + * @file
   93.27 + * H.264 / AVC / MPEG4 part10 cabac decoding.
   93.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   93.29 + */
   93.30 +
   93.31 +#include "avcodec.h"
   93.32 +#include "h264_types.h"
   93.33 +#include "h264_data.h"
   93.34 +#include "cabac.h"
   93.35 +#include "rectangle.h"
   93.36 +#include "h264_misc.h"
   93.37 +
   93.38 +// #undef NDEBUG
   93.39 +#include <assert.h>
   93.40 +
   93.41 +/* CABAC pre-state tables: one int8_t pair per entry; the comments inside give the entry index range of each section */
   93.42 +
   93.43 +static const int8_t cabac_context_init_I[460][2] =
   93.44 +{
   93.45 +    /* 0 - 10 */
   93.46 +    { 20, -15 }, {  2, 54 },  {  3,  74 }, { 20, -15 },
   93.47 +    {  2,  54 }, {  3, 74 },  { -28,127 }, { -23, 104 },
   93.48 +    { -6,  53 }, { -1, 54 },  {  7,  51 },
   93.49 +
   93.50 +    /* 11 - 23 unused for I (all zero) */
   93.51 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.52 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.53 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.54 +    { 0, 0 },
   93.55 +
   93.56 +    /* 24 - 39 (all zero in this table) */
   93.57 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.58 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.59 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.60 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.61 +
   93.62 +    /* 40 - 53 (all zero in this table) */
   93.63 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.64 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.65 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.66 +    { 0, 0 },    { 0, 0 },
   93.67 +
   93.68 +    /* 54 - 59 (all zero in this table) */
   93.69 +    { 0, 0 },    { 0, 0 },    { 0, 0 },      { 0, 0 },
   93.70 +    { 0, 0 },    { 0, 0 },
   93.71 +
   93.72 +    /* 60 - 69 */
   93.73 +    { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
   93.74 +    { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
   93.75 +    { 13, 41 },  { 3, 62 },
   93.76 +
   93.77 +    /* 70 -> 87 */
   93.78 +    { 0, 11 },   { 1, 55 },   { 0, 69 },     { -17, 127 },
   93.79 +    { -13, 102 },{ 0, 82 },   { -7, 74 },    { -21, 107 },
   93.80 +    { -27, 127 },{ -31, 127 },{ -24, 127 },  { -18, 95 },
   93.81 +    { -27, 127 },{ -21, 114 },{ -30, 127 },  { -17, 123 },
   93.82 +    { -12, 115 },{ -16, 122 },
   93.83 +
   93.84 +    /* 88 -> 104 */
   93.85 +    { -11, 115 },{ -12, 63 }, { -2, 68 },    { -15, 84 },
   93.86 +    { -13, 104 },{ -3, 70 },  { -8, 93 },    { -10, 90 },
   93.87 +    { -30, 127 },{ -1, 74 },  { -6, 97 },    { -7, 91 },
   93.88 +    { -20, 127 },{ -4, 56 },  { -5, 82 },    { -7, 76 },
   93.89 +    { -22, 125 },
   93.90 +
   93.91 +    /* 105 -> 135 */
   93.92 +    { -7, 93 },  { -11, 87 }, { -3, 77 },    { -5, 71 },
   93.93 +    { -4, 63 },  { -4, 68 },  { -12, 84 },   { -7, 62 },
   93.94 +    { -7, 65 },  { 8, 61 },   { 5, 56 },     { -2, 66 },
   93.95 +    { 1, 64 },   { 0, 61 },   { -2, 78 },    { 1, 50 },
   93.96 +    { 7, 52 },   { 10, 35 },  { 0, 44 },     { 11, 38 },
   93.97 +    { 1, 45 },   { 0, 46 },   { 5, 44 },     { 31, 17 },
   93.98 +    { 1, 51 },   { 7, 50 },   { 28, 19 },    { 16, 33 },
   93.99 +    { 14, 62 },  { -13, 108 },{ -15, 100 },
  93.100 +
  93.101 +    /* 136 -> 165 */
  93.102 +    { -13, 101 },{ -13, 91 }, { -12, 94 },   { -10, 88 },
  93.103 +    { -16, 84 }, { -10, 86 }, { -7, 83 },    { -13, 87 },
  93.104 +    { -19, 94 }, { 1, 70 },   { 0, 72 },     { -5, 74 },
  93.105 +    { 18, 59 },  { -8, 102 }, { -15, 100 },  { 0, 95 },
  93.106 +    { -4, 75 },  { 2, 72 },   { -11, 75 },   { -3, 71 },
  93.107 +    { 15, 46 },  { -13, 69 }, { 0, 62 },     { 0, 65 },
  93.108 +    { 21, 37 },  { -15, 72 }, { 9, 57 },     { 16, 54 },
  93.109 +    { 0, 62 },   { 12, 72 },
  93.110 +
  93.111 +    /* 166 -> 196 */
  93.112 +    { 24, 0 },   { 15, 9 },   { 8, 25 },     { 13, 18 },
  93.113 +    { 15, 9 },   { 13, 19 },  { 10, 37 },    { 12, 18 },
  93.114 +    { 6, 29 },   { 20, 33 },  { 15, 30 },    { 4, 45 },
  93.115 +    { 1, 58 },   { 0, 62 },   { 7, 61 },     { 12, 38 },
  93.116 +    { 11, 45 },  { 15, 39 },  { 11, 42 },    { 13, 44 },
  93.117 +    { 16, 45 },  { 12, 41 },  { 10, 49 },    { 30, 34 },
  93.118 +    { 18, 42 },  { 10, 55 },  { 17, 51 },    { 17, 46 },
  93.119 +    { 0, 89 },   { 26, -19 }, { 22, -17 },
  93.120 +
  93.121 +    /* 197 -> 226 */
  93.122 +    { 26, -17 }, { 30, -25 }, { 28, -20 },   { 33, -23 },
  93.123 +    { 37, -27 }, { 33, -23 }, { 40, -28 },   { 38, -17 },
  93.124 +    { 33, -11 }, { 40, -15 }, { 41, -6 },    { 38, 1 },
  93.125 +    { 41, 17 },  { 30, -6 },  { 27, 3 },     { 26, 22 },
  93.126 +    { 37, -16 }, { 35, -4 },  { 38, -8 },    { 38, -3 },
  93.127 +    { 37, 3 },   { 38, 5 },   { 42, 0 },     { 35, 16 },
  93.128 +    { 39, 22 },  { 14, 48 },  { 27, 37 },    { 21, 60 },
  93.129 +    { 12, 68 },  { 2, 97 },
  93.130 +
  93.131 +    /* 227 -> 251 */
  93.132 +    { -3, 71 },  { -6, 42 },  { -5, 50 },    { -3, 54 },
  93.133 +    { -2, 62 },  { 0, 58 },   { 1, 63 },     { -2, 72 },
  93.134 +    { -1, 74 },  { -9, 91 },  { -5, 67 },    { -5, 27 },
  93.135 +    { -3, 39 },  { -2, 44 },  { 0, 46 },     { -16, 64 },
  93.136 +    { -8, 68 },  { -10, 78 }, { -6, 77 },    { -10, 86 },
  93.137 +    { -12, 92 }, { -15, 55 }, { -10, 60 },   { -6, 62 },
  93.138 +    { -4, 65 },
  93.139 +
  93.140 +    /* 252 -> 275 */
  93.141 +    { -12, 73 }, { -8, 76 },  { -7, 80 },    { -9, 88 },
  93.142 +    { -17, 110 },{ -11, 97 }, { -20, 84 },   { -11, 79 },
  93.143 +    { -6, 73 },  { -4, 74 },  { -13, 86 },   { -13, 96 },
  93.144 +    { -11, 97 }, { -19, 117 },{ -8, 78 },    { -5, 33 },
  93.145 +    { -4, 48 },  { -2, 53 },  { -3, 62 },    { -13, 71 },
  93.146 +    { -10, 79 }, { -12, 86 }, { -13, 90 },   { -14, 97 },
  93.147 +
  93.148 +    /* 276 a bit special (not used, bypass is used instead) */
  93.149 +    { 0, 0 },
  93.150 +
  93.151 +    /* 277 -> 307 */
  93.152 +    { -6, 93 },  { -6, 84 },  { -8, 79 },    { 0, 66 },
  93.153 +    { -1, 71 },  { 0, 62 },   { -2, 60 },    { -2, 59 },
  93.154 +    { -5, 75 },  { -3, 62 },  { -4, 58 },    { -9, 66 },
  93.155 +    { -1, 79 },  { 0, 71 },   { 3, 68 },     { 10, 44 },
  93.156 +    { -7, 62 },  { 15, 36 },  { 14, 40 },    { 16, 27 },
  93.157 +    { 12, 29 },  { 1, 44 },   { 20, 36 },    { 18, 32 },
  93.158 +    { 5, 42 },   { 1, 48 },   { 10, 62 },    { 17, 46 },
  93.159 +    { 9, 64 },   { -12, 104 },{ -11, 97 },
  93.160 +
  93.161 +    /* 308 -> 337 */
  93.162 +    { -16, 96 }, { -7, 88 },  { -8, 85 },    { -7, 85 },
  93.163 +    { -9, 85 },  { -13, 88 }, { 4, 66 },     { -3, 77 },
  93.164 +    { -3, 76 },  { -6, 76 },  { 10, 58 },    { -1, 76 },
  93.165 +    { -1, 83 },  { -7, 99 },  { -14, 95 },   { 2, 95 },
  93.166 +    { 0, 76 },   { -5, 74 },  { 0, 70 },     { -11, 75 },
  93.167 +    { 1, 68 },   { 0, 65 },   { -14, 73 },   { 3, 62 },
  93.168 +    { 4, 62 },   { -1, 68 },  { -13, 75 },   { 11, 55 },
  93.169 +    { 5, 64 },   { 12, 70 },
  93.170 +
  93.171 +    /* 338 -> 368 */
  93.172 +    { 15, 6 },   { 6, 19 },   { 7, 16 },     { 12, 14 },
  93.173 +    { 18, 13 },  { 13, 11 },  { 13, 15 },    { 15, 16 },
  93.174 +    { 12, 23 },  { 13, 23 },  { 15, 20 },    { 14, 26 },
  93.175 +    { 14, 44 },  { 17, 40 },  { 17, 47 },    { 24, 17 },
  93.176 +    { 21, 21 },  { 25, 22 },  { 31, 27 },    { 22, 29 },
  93.177 +    { 19, 35 },  { 14, 50 },  { 10, 57 },    { 7, 63 },
  93.178 +    { -2, 77 },  { -4, 82 },  { -3, 94 },    { 9, 69 },
  93.179 +    { -12, 109 },{ 36, -35 }, { 36, -34 },
  93.180 +
  93.181 +    /* 369 -> 398 */
  93.182 +    { 32, -26 }, { 37, -30 }, { 44, -32 },   { 34, -18 },
  93.183 +    { 34, -15 }, { 40, -15 }, { 33, -7 },    { 35, -5 },
  93.184 +    { 33, 0 },   { 38, 2 },   { 33, 13 },    { 23, 35 },
  93.185 +    { 13, 58 },  { 29, -3 },  { 26, 0 },     { 22, 30 },
  93.186 +    { 31, -7 },  { 35, -15 }, { 34, -3 },    { 34, 3 },
  93.187 +    { 36, -1 },  { 34, 5 },   { 32, 11 },    { 35, 5 },
  93.188 +    { 34, 12 },  { 39, 11 },  { 30, 29 },    { 34, 26 },
  93.189 +    { 29, 39 },  { 19, 66 },
  93.190 +
  93.191 +    /* 399 -> 435 */
  93.192 +    {  31,  21 }, {  31,  31 }, {  25,  50 },
  93.193 +    { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11,  85 },
  93.194 +    { -15,  92 }, { -14,  89 }, { -26,  71 }, { -15,  81 },
  93.195 +    { -14,  80 }, {   0,  68 }, { -14,  70 }, { -24,  56 },
  93.196 +    { -23,  68 }, { -24,  50 }, { -11,  74 }, {  23, -13 },
  93.197 +    {  26, -13 }, {  40, -15 }, {  49, -14 }, {  44,   3 },
  93.198 +    {  45,   6 }, {  44,  34 }, {  33,  54 }, {  19,  82 },
  93.199 +    {  -3,  75 }, {  -1,  23 }, {   1,  34 }, {   1,  43 },
  93.200 +    {   0,  54 }, {  -2,  55 }, {   0,  61 }, {   1,  64 },
  93.201 +    {   0,  68 }, {  -9,  92 },
  93.202 +
  93.203 +    /* 436 -> 459 */
  93.204 +    { -14, 106 }, { -13,  97 }, { -15,  90 }, { -12,  90 },
  93.205 +    { -18,  88 }, { -10,  73 }, {  -9,  79 }, { -14,  86 },
  93.206 +    { -10,  73 }, { -10,  70 }, { -10,  69 }, {  -5,  66 },
  93.207 +    {  -9,  64 }, {  -5,  58 }, {   2,  59 }, {  21, -10 },
  93.208 +    {  24, -11 }, {  28,  -8 }, {  28,  -1 }, {  29,   3 },
  93.209 +    {  29,   9 }, {  35,  20 }, {  29,  36 }, {  14,  67 }
  93.210 +};
  93.211 +
  93.212 +static const int8_t cabac_context_init_PB[3][460][2] =
  93.213 +{
  93.214 +    /* i_cabac_init_idc == 0 */
  93.215 +    {
  93.216 +        /* 0 - 10 */
  93.217 +        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
  93.218 +        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
  93.219 +        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
  93.220 +
  93.221 +        /* 11 - 23 */
  93.222 +        {  23,  33 }, {  23,   2 }, {  21,   0 }, {   1,   9 },
  93.223 +        {   0,  49 }, { -37, 118 }, {   5,  57 }, { -13,  78 },
  93.224 +        { -11,  65 }, {   1,  62 }, {  12,  49 }, {  -4,  73 },
  93.225 +        {  17,  50 },
  93.226 +
  93.227 +        /* 24 - 39 */
  93.228 +        {  18,  64 }, {   9,  43 }, {  29,   0 }, {  26,  67 },
  93.229 +        {  16,  90 }, {   9, 104 }, { -46, 127 }, { -20, 104 },
  93.230 +        {   1,  67 }, { -13,  78 }, { -11,  65 }, {   1,  62 },
  93.231 +        {  -6,  86 }, { -17,  95 }, {  -6,  61 }, {   9,  45 },
  93.232 +
  93.233 +        /* 40 - 53 */
  93.234 +        {  -3,  69 }, {  -6,  81 }, { -11,  96 }, {   6,  55 },
  93.235 +        {   7,  67 }, {  -5,  86 }, {   2,  88 }, {   0,  58 },
  93.236 +        {  -3,  76 }, { -10,  94 }, {   5,  54 }, {   4,  69 },
  93.237 +        {  -3,  81 }, {   0,  88 },
  93.238 +
  93.239 +        /* 54 - 59 */
  93.240 +        {  -7,  67 }, {  -5,  74 }, {  -4,  74 }, {  -5,  80 },
  93.241 +        {  -7,  72 }, {   1,  58 },
  93.242 +
  93.243 +        /* 60 - 69 */
  93.244 +        {   0,  41 }, {   0,  63 }, {   0,  63 }, { 0, 63 },
  93.245 +        {  -9,  83 }, {   4,  86 }, {   0,  97 }, { -7, 72 },
  93.246 +        {  13,  41 }, {   3,  62 },
  93.247 +
  93.248 +        /* 70 - 104 */
  93.249 +        {   0,  45 }, {  -4,  78 }, {  -3,  96 }, { -27,  126 },
  93.250 +        { -28,  98 }, { -25, 101 }, { -23,  67 }, { -28,  82 },
  93.251 +        { -20,  94 }, { -16,  83 }, { -22, 110 }, { -21,  91 },
  93.252 +        { -18, 102 }, { -13,  93 }, { -29, 127 }, {  -7,  92 },
  93.253 +        {  -5,  89 }, {  -7,  96 }, { -13, 108 }, {  -3,  46 },
  93.254 +        {  -1,  65 }, {  -1,  57 }, {  -9,  93 }, {  -3,  74 },
  93.255 +        {  -9,  92 }, {  -8,  87 }, { -23, 126 }, {   5,  54 },
  93.256 +        {   6,  60 }, {   6,  59 }, {   6,  69 }, {  -1,  48 },
  93.257 +        {   0,  68 }, {  -4,  69 }, {  -8,  88 },
  93.258 +
  93.259 +        /* 105 -> 165 */
  93.260 +        {  -2,  85 }, {  -6,  78 }, {  -1,  75 }, {  -7,  77 },
  93.261 +        {   2,  54 }, {   5,  50 }, {  -3,  68 }, {   1,  50 },
  93.262 +        {   6,  42 }, {  -4,  81 }, {   1,  63 }, {  -4,  70 },
  93.263 +        {   0,  67 }, {   2,  57 }, {  -2,  76 }, {  11,  35 },
  93.264 +        {   4,  64 }, {   1,  61 }, {  11,  35 }, {  18,  25 },
  93.265 +        {  12,  24 }, {  13,  29 }, {  13,  36 }, { -10,  93 },
  93.266 +        {  -7,  73 }, {  -2,  73 }, {  13,  46 }, {   9,  49 },
  93.267 +        {  -7, 100 }, {   9,  53 }, {   2,  53 }, {   5,  53 },
  93.268 +        {  -2,  61 }, {   0,  56 }, {   0,  56 }, { -13,  63 },
  93.269 +        {  -5,  60 }, {  -1,  62 }, {   4,  57 }, {  -6,  69 },
  93.270 +        {   4,  57 }, {  14,  39 }, {   4,  51 }, {  13,  68 },
  93.271 +        {   3,  64 }, {   1,  61 }, {   9,  63 }, {   7,  50 },
  93.272 +        {  16,  39 }, {   5,  44 }, {   4,  52 }, {  11,  48 },
  93.273 +        {  -5,  60 }, {  -1,  59 }, {   0,  59 }, {  22,  33 },
  93.274 +        {   5,  44 }, {  14,  43 }, {  -1,  78 }, {   0,  60 },
  93.275 +        {   9,  69 },
  93.276 +
  93.277 +        /* 166 - 226 */
  93.278 +        {  11,  28 }, {   2,  40 }, {   3,  44 }, {   0,  49 },
  93.279 +        {   0,  46 }, {   2,  44 }, {   2,  51 }, {   0,  47 },
  93.280 +        {   4,  39 }, {   2,  62 }, {   6,  46 }, {   0,  54 },
  93.281 +        {   3,  54 }, {   2,  58 }, {   4,  63 }, {   6,  51 },
  93.282 +        {   6,  57 }, {   7,  53 }, {   6,  52 }, {   6,  55 },
  93.283 +        {  11,  45 }, {  14,  36 }, {   8,  53 }, {  -1,  82 },
  93.284 +        {   7,  55 }, {  -3,  78 }, {  15,  46 }, {  22,  31 },
  93.285 +        {  -1,  84 }, {  25,   7 }, {  30,  -7 }, {  28,   3 },
  93.286 +        {  28,   4 }, {  32,   0 }, {  34,  -1 }, {  30,   6 },
  93.287 +        {  30,   6 }, {  32,   9 }, {  31,  19 }, {  26,  27 },
  93.288 +        {  26,  30 }, {  37,  20 }, {  28,  34 }, {  17,  70 },
  93.289 +        {   1,  67 }, {   5,  59 }, {   9,  67 }, {  16,  30 },
  93.290 +        {  18,  32 }, {  18,  35 }, {  22,  29 }, {  24,  31 },
  93.291 +        {  23,  38 }, {  18,  43 }, {  20,  41 }, {  11,  63 },
  93.292 +        {   9,  59 }, {   9,  64 }, {  -1,  94 }, {  -2,  89 },
  93.293 +        {  -9, 108 },
  93.294 +
  93.295 +        /* 227 - 275 */
  93.296 +        {  -6,  76 }, {  -2,  44 }, {   0,  45 }, {   0,  52 },
  93.297 +        {  -3,  64 }, {  -2,  59 }, {  -4,  70 }, {  -4,  75 },
  93.298 +        {  -8,  82 }, { -17, 102 }, {  -9,  77 }, {   3,  24 },
  93.299 +        {   0,  42 }, {   0,  48 }, {   0,  55 }, {  -6,  59 },
  93.300 +        {  -7,  71 }, { -12,  83 }, { -11,  87 }, { -30, 119 },
  93.301 +        {   1,  58 }, {  -3,  29 }, {  -1,  36 }, {   1,  38 },
  93.302 +        {   2,  43 }, {  -6,  55 }, {   0,  58 }, {   0,  64 },
  93.303 +        {  -3,  74 }, { -10,  90 }, {   0,  70 }, {  -4,  29 },
  93.304 +        {   5,  31 }, {   7,  42 }, {   1,  59 }, {  -2,  58 },
  93.305 +        {  -3,  72 }, {  -3,  81 }, { -11,  97 }, {   0,  58 },
  93.306 +        {   8,   5 }, {  10,  14 }, {  14,  18 }, {  13,  27 },
  93.307 +        {   2,  40 }, {   0,  58 }, {  -3,  70 }, {  -6,  79 },
  93.308 +        {  -8,  85 },
  93.309 +
  93.310 +        /* 276 a bit special (not used, bypass is used instead) */
  93.311 +        { 0, 0 },
  93.312 +
  93.313 +        /* 277 - 337 */
  93.314 +        { -13, 106 }, { -16, 106 }, { -10,  87 }, { -21, 114 },
  93.315 +        { -18, 110 }, { -14,  98 }, { -22, 110 }, { -21, 106 },
  93.316 +        { -18, 103 }, { -21, 107 }, { -23, 108 }, { -26, 112 },
  93.317 +        { -10,  96 }, { -12,  95 }, {  -5,  91 }, {  -9,  93 },
  93.318 +        { -22,  94 }, {  -5,  86 }, {   9,  67 }, {  -4,  80 },
  93.319 +        { -10,  85 }, {  -1,  70 }, {   7,  60 }, {   9,  58 },
  93.320 +        {   5,  61 }, {  12,  50 }, {  15,  50 }, {  18,  49 },
  93.321 +        {  17,  54 }, {  10,  41 }, {   7,  46 }, {  -1,  51 },
  93.322 +        {   7,  49 }, {   8,  52 }, {   9,  41 }, {   6,  47 },
  93.323 +        {   2,  55 }, {  13,  41 }, {  10,  44 }, {   6,  50 },
  93.324 +        {   5,  53 }, {  13,  49 }, {   4,  63 }, {   6,  64 },
  93.325 +        {  -2,  69 }, {  -2,  59 }, {   6,  70 }, {  10,  44 },
  93.326 +        {   9,  31 }, {  12,  43 }, {   3,  53 }, {  14,  34 },
  93.327 +        {  10,  38 }, {  -3,  52 }, {  13,  40 }, {  17,  32 },
  93.328 +        {   7,  44 }, {   7,  38 }, {  13,  50 }, {  10,  57 },
  93.329 +        {  26,  43 },
  93.330 +
  93.331 +        /* 338 - 398 */
  93.332 +        {  14,  11 }, {  11,  14 }, {   9,  11 }, {  18,  11 },
  93.333 +        {  21,   9 }, {  23,  -2 }, {  32, -15 }, {  32, -15 },
  93.334 +        {  34, -21 }, {  39, -23 }, {  42, -33 }, {  41, -31 },
  93.335 +        {  46, -28 }, {  38, -12 }, {  21,  29 }, {  45, -24 },
  93.336 +        {  53, -45 }, {  48, -26 }, {  65, -43 }, {  43, -19 },
  93.337 +        {  39, -10 }, {  30,   9 }, {  18,  26 }, {  20,  27 },
  93.338 +        {   0,  57 }, { -14,  82 }, {  -5,  75 }, { -19,  97 },
  93.339 +        { -35, 125 }, {  27,   0 }, {  28,   0 }, {  31,  -4 },
  93.340 +        {  27,   6 }, {  34,   8 }, {  30,  10 }, {  24,  22 },
  93.341 +        {  33,  19 }, {  22,  32 }, {  26,  31 }, {  21,  41 },
  93.342 +        {  26,  44 }, {  23,  47 }, {  16,  65 }, {  14,  71 },
  93.343 +        {   8,  60 }, {   6,  63 }, {  17,  65 }, {  21,  24 },
  93.344 +        {  23,  20 }, {  26,  23 }, {  27,  32 }, {  28,  23 },
  93.345 +        {  28,  24 }, {  23,  40 }, {  24,  32 }, {  28,  29 },
  93.346 +        {  23,  42 }, {  19,  57 }, {  22,  53 }, {  22,  61 },
  93.347 +        {  11,  86 },
  93.348 +
  93.349 +        /* 399 - 435 */
  93.350 +        {  12,  40 }, {  11,  51 }, {  14,  59 },
  93.351 +        {  -4,  79 }, {  -7,  71 }, {  -5,  69 }, {  -9,  70 },
  93.352 +        {  -8,  66 }, { -10,  68 }, { -19,  73 }, { -12,  69 },
  93.353 +        { -16,  70 }, { -15,  67 }, { -20,  62 }, { -19,  70 },
  93.354 +        { -16,  66 }, { -22,  65 }, { -20,  63 }, {   9,  -2 },
  93.355 +        {  26,  -9 }, {  33,  -9 }, {  39,  -7 }, {  41,  -2 },
  93.356 +        {  45,   3 }, {  49,   9 }, {  45,  27 }, {  36,  59 },
  93.357 +        {  -6,  66 }, {  -7,  35 }, {  -7,  42 }, {  -8,  45 },
  93.358 +        {  -5,  48 }, { -12,  56 }, {  -6,  60 }, {  -5,  62 },
  93.359 +        {  -8,  66 }, {  -8,  76 },
  93.360 +
  93.361 +        /* 436 - 459 */
  93.362 +        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
  93.363 +        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
  93.364 +        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
  93.365 +        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  21, -13 },
  93.366 +        {  33, -14 }, {  39,  -7 }, {  46,  -2 }, {  51,   2 },
  93.367 +        {  60,   6 }, {  61,  17 }, {  55,  34 }, {  42,  62 },
  93.368 +    },
  93.369 +
  93.370 +    /* i_cabac_init_idc == 1 */
  93.371 +    {
  93.372 +        /* 0 - 10 */
  93.373 +        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
  93.374 +        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
  93.375 +        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
  93.376 +
  93.377 +        /* 11 - 23 */
  93.378 +        {  22,  25 }, {  34,   0 }, {  16,   0 }, {  -2,   9 },
  93.379 +        {   4,  41 }, { -29, 118 }, {   2,  65 }, {  -6,  71 },
  93.380 +        { -13,  79 }, {   5,  52 }, {   9,  50 }, {  -3,  70 },
  93.381 +        {  10,  54 },
  93.382 +
  93.383 +        /* 24 - 39 */
  93.384 +        {  26,  34 }, {  19,  22 }, {  40,   0 }, {  57,   2 },
  93.385 +        {  41,  36 }, {  26,  69 }, { -45, 127 }, { -15, 101 },
  93.386 +        {  -4,  76 }, {  -6,  71 }, { -13,  79 }, {   5,  52 },
  93.387 +        {   6,  69 }, { -13,  90 }, {   0,  52 }, {   8,  43 },
  93.388 +
  93.389 +        /* 40 - 53 */
  93.390 +        {  -2,  69 },{  -5,  82 },{ -10,  96 },{   2,  59 },
  93.391 +        {   2,  75 },{  -3,  87 },{  -3,  100 },{   1,  56 },
  93.392 +        {  -3,  74 },{  -6,  85 },{   0,  59 },{  -3,  81 },
  93.393 +        {  -7,  86 },{  -5,  95 },
  93.394 +
  93.395 +        /* 54 - 59 */
  93.396 +        {  -1,  66 },{  -1,  77 },{   1,  70 },{  -2,  86 },
  93.397 +        {  -5,  72 },{   0,  61 },
  93.398 +
  93.399 +        /* 60 - 69 */
  93.400 +        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
  93.401 +        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
  93.402 +        { 13, 41 },  { 3, 62 },
  93.403 +
  93.404 +        /* 70 - 104 */
  93.405 +        {  13,  15 }, {   7,  51 }, {   2,  80 }, { -39, 127 },
  93.406 +        { -18,  91 }, { -17,  96 }, { -26,  81 }, { -35,  98 },
  93.407 +        { -24, 102 }, { -23,  97 }, { -27, 119 }, { -24,  99 },
  93.408 +        { -21, 110 }, { -18, 102 }, { -36, 127 }, {   0,  80 },
  93.409 +        {  -5,  89 }, {  -7,  94 }, {  -4,  92 }, {   0,  39 },
  93.410 +        {   0,  65 }, { -15,  84 }, { -35, 127 }, {  -2,  73 },
  93.411 +        { -12, 104 }, {  -9,  91 }, { -31, 127 }, {   3,  55 },
  93.412 +        {   7,  56 }, {   7,  55 }, {   8,  61 }, {  -3,  53 },
  93.413 +        {   0,  68 }, {  -7,  74 }, {  -9,  88 },
  93.414 +
  93.415 +        /* 105 -> 165 */
  93.416 +        { -13, 103 }, { -13,  91 }, {  -9,  89 }, { -14,  92 },
  93.417 +        {  -8,  76 }, { -12,  87 }, { -23, 110 }, { -24, 105 },
  93.418 +        { -10,  78 }, { -20, 112 }, { -17,  99 }, { -78, 127 },
  93.419 +        { -70, 127 }, { -50, 127 }, { -46, 127 }, {  -4,  66 },
  93.420 +        {  -5,  78 }, {  -4,  71 }, {  -8,  72 }, {   2,  59 },
  93.421 +        {  -1,  55 }, {  -7,  70 }, {  -6,  75 }, {  -8,  89 },
  93.422 +        { -34, 119 }, {  -3,  75 }, {  32,  20 }, {  30,  22 },
  93.423 +        { -44, 127 }, {   0,  54 }, {  -5,  61 }, {   0,  58 },
  93.424 +        {  -1,  60 }, {  -3,  61 }, {  -8,  67 }, { -25,  84 },
  93.425 +        { -14,  74 }, {  -5,  65 }, {   5,  52 }, {   2,  57 },
  93.426 +        {   0,  61 }, {  -9,  69 }, { -11,  70 }, {  18,  55 },
  93.427 +        {  -4,  71 }, {   0,  58 }, {   7,  61 }, {   9,  41 },
  93.428 +        {  18,  25 }, {   9,  32 }, {   5,  43 }, {   9,  47 },
  93.429 +        {   0,  44 }, {   0,  51 }, {   2,  46 }, {  19,  38 },
  93.430 +        {  -4,  66 }, {  15,  38 }, {  12,  42 }, {   9,  34 },
  93.431 +        {   0,  89 },
  93.432 +
  93.433 +        /* 166 - 226 */
  93.434 +        {   4,  45 }, {  10,  28 }, {  10,  31 }, {  33, -11 },
  93.435 +        {  52, -43 }, {  18,  15 }, {  28,   0 }, {  35, -22 },
  93.436 +        {  38, -25 }, {  34,   0 }, {  39, -18 }, {  32, -12 },
  93.437 +        { 102, -94 }, {   0,   0 }, {  56, -15 }, {  33,  -4 },
  93.438 +        {  29,  10 }, {  37,  -5 }, {  51, -29 }, {  39,  -9 },
  93.439 +        {  52, -34 }, {  69, -58 }, {  67, -63 }, {  44,  -5 },
  93.440 +        {  32,   7 }, {  55, -29 }, {  32,   1 }, {   0,   0 },
  93.441 +        {  27,  36 }, {  33, -25 }, {  34, -30 }, {  36, -28 },
  93.442 +        {  38, -28 }, {  38, -27 }, {  34, -18 }, {  35, -16 },
  93.443 +        {  34, -14 }, {  32,  -8 }, {  37,  -6 }, {  35,   0 },
  93.444 +        {  30,  10 }, {  28,  18 }, {  26,  25 }, {  29,  41 },
  93.445 +        {   0,  75 }, {   2,  72 }, {   8,  77 }, {  14,  35 },
  93.446 +        {  18,  31 }, {  17,  35 }, {  21,  30 }, {  17,  45 },
  93.447 +        {  20,  42 }, {  18,  45 }, {  27,  26 }, {  16,  54 },
  93.448 +        {   7,  66 }, {  16,  56 }, {  11,  73 }, {  10,  67 },
  93.449 +        { -10, 116 },
  93.450 +
  93.451 +        /* 227 - 275 */
  93.452 +        { -23, 112 }, { -15,  71 }, {  -7,  61 }, {   0,  53 },
  93.453 +        {  -5,  66 }, { -11,  77 }, {  -9,  80 }, {  -9,  84 },
  93.454 +        { -10,  87 }, { -34, 127 }, { -21, 101 }, {  -3,  39 },
  93.455 +        {  -5,  53 }, {  -7,  61 }, { -11,  75 }, { -15,  77 },
  93.456 +        { -17,  91 }, { -25, 107 }, { -25, 111 }, { -28, 122 },
  93.457 +        { -11,  76 }, { -10,  44 }, { -10,  52 }, { -10,  57 },
  93.458 +        {  -9,  58 }, { -16,  72 }, {  -7,  69 }, {  -4,  69 },
  93.459 +        {  -5,  74 }, {  -9,  86 }, {   2,  66 }, {  -9,  34 },
  93.460 +        {   1,  32 }, {  11,  31 }, {   5,  52 }, {  -2,  55 },
  93.461 +        {  -2,  67 }, {   0,  73 }, {  -8,  89 }, {   3,  52 },
  93.462 +        {   7,   4 }, {  10,   8 }, {  17,   8 }, {  16,  19 },
  93.463 +        {   3,  37 }, {  -1,  61 }, {  -5,  73 }, {  -1,  70 },
  93.464 +        {  -4,  78 },
  93.465 +
  93.466 +        /* 276 a bit special (not used, bypass is used instead) */
  93.467 +        { 0, 0 },
  93.468 +
  93.469 +        /* 277 - 337 */
  93.470 +        { -21, 126 }, { -23, 124 }, { -20, 110 }, { -26, 126 },
  93.471 +        { -25, 124 }, { -17, 105 }, { -27, 121 }, { -27, 117 },
  93.472 +        { -17, 102 }, { -26, 117 }, { -27, 116 }, { -33, 122 },
  93.473 +        { -10,  95 }, { -14, 100 }, {  -8,  95 }, { -17, 111 },
  93.474 +        { -28, 114 }, {  -6,  89 }, {  -2,  80 }, {  -4,  82 },
  93.475 +        {  -9,  85 }, {  -8,  81 }, {  -1,  72 }, {   5,  64 },
  93.476 +        {   1,  67 }, {   9,  56 }, {   0,  69 }, {   1,  69 },
  93.477 +        {   7,  69 }, {  -7,  69 }, {  -6,  67 }, { -16,  77 },
  93.478 +        {  -2,  64 }, {   2,  61 }, {  -6,  67 }, {  -3,  64 },
  93.479 +        {   2,  57 }, {  -3,  65 }, {  -3,  66 }, {   0,  62 },
  93.480 +        {   9,  51 }, {  -1,  66 }, {  -2,  71 }, {  -2,  75 },
  93.481 +        {  -1,  70 }, {  -9,  72 }, {  14,  60 }, {  16,  37 },
  93.482 +        {   0,  47 }, {  18,  35 }, {  11,  37 }, {  12,  41 },
  93.483 +        {  10,  41 }, {   2,  48 }, {  12,  41 }, {  13,  41 },
  93.484 +        {   0,  59 }, {   3,  50 }, {  19,  40 }, {   3,  66 },
  93.485 +        {  18,  50 },
  93.486 +
  93.487 +        /* 338 - 398 */
  93.488 +        {  19,  -6 }, {  18,  -6 }, {  14,   0 }, {  26, -12 },
  93.489 +        {  31, -16 }, {  33, -25 }, {  33, -22 }, {  37, -28 },
  93.490 +        {  39, -30 }, {  42, -30 }, {  47, -42 }, {  45, -36 },
  93.491 +        {  49, -34 }, {  41, -17 }, {  32,   9 }, {  69, -71 },
  93.492 +        {  63, -63 }, {  66, -64 }, {  77, -74 }, {  54, -39 },
  93.493 +        {  52, -35 }, {  41, -10 }, {  36,   0 }, {  40,  -1 },
  93.494 +        {  30,  14 }, {  28,  26 }, {  23,  37 }, {  12,  55 },
  93.495 +        {  11,  65 }, {  37, -33 }, {  39, -36 }, {  40, -37 },
  93.496 +        {  38, -30 }, {  46, -33 }, {  42, -30 }, {  40, -24 },
  93.497 +        {  49, -29 }, {  38, -12 }, {  40, -10 }, {  38,  -3 },
  93.498 +        {  46,  -5 }, {  31,  20 }, {  29,  30 }, {  25,  44 },
  93.499 +        {  12,  48 }, {  11,  49 }, {  26,  45 }, {  22,  22 },
  93.500 +        {  23,  22 }, {  27,  21 }, {  33,  20 }, {  26,  28 },
  93.501 +        {  30,  24 }, {  27,  34 }, {  18,  42 }, {  25,  39 },
  93.502 +        {  18,  50 }, {  12,  70 }, {  21,  54 }, {  14,  71 },
  93.503 +        {  11,  83 },
  93.504 +
  93.505 +        /* 399 - 435 */
  93.506 +        {  25,  32 }, {  21,  49 }, {  21,  54 },
  93.507 +        {  -5,  85 }, {  -6,  81 }, { -10,  77 }, {  -7,  81 },
  93.508 +        { -17,  80 }, { -18,  73 }, {  -4,  74 }, { -10,  83 },
  93.509 +        {  -9,  71 }, {  -9,  67 }, {  -1,  61 }, {  -8,  66 },
  93.510 +        { -14,  66 }, {   0,  59 }, {   2,  59 }, {  17, -10 },
  93.511 +        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
  93.512 +        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
  93.513 +        {  -5,  71 }, {   0,  24 }, {  -1,  36 }, {  -2,  42 },
  93.514 +        {  -2,  52 }, {  -9,  57 }, {  -6,  63 }, {  -4,  65 },
  93.515 +        {  -4,  67 }, {  -7,  82 },
  93.516 +
  93.517 +        /* 436 - 459 */
  93.518 +        {  -3,  81 }, {  -3,  76 }, {  -7,  72 }, {  -6,  78 },
  93.519 +        { -12,  72 }, { -14,  68 }, {  -3,  70 }, {  -6,  76 },
  93.520 +        {  -5,  66 }, {  -5,  62 }, {   0,  57 }, {  -4,  61 },
  93.521 +        {  -9,  60 }, {   1,  54 }, {   2,  58 }, {  17, -10 },
  93.522 +        {  32, -13 }, {  42,  -9 }, {  49,  -5 }, {  53,   0 },
  93.523 +        {  64,   3 }, {  68,  10 }, {  66,  27 }, {  47,  57 },
  93.524 +    },
  93.525 +
  93.526 +    /* i_cabac_init_idc == 2 */
  93.527 +    {
  93.528 +        /* 0 - 10 */
  93.529 +        {  20, -15 }, {   2,  54 }, {   3,  74 }, {  20, -15 },
  93.530 +        {   2,  54 }, {   3,  74 }, { -28, 127 }, { -23, 104 },
  93.531 +        {  -6,  53 }, {  -1,  54 }, {   7,  51 },
  93.532 +
  93.533 +        /* 11 - 23 */
  93.534 +        {  29,  16 }, {  25,   0 }, {  14,   0 }, { -10,  51 },
  93.535 +        {  -3,  62 }, { -27,  99 }, {  26,  16 }, {  -4,  85 },
  93.536 +        { -24, 102 }, {   5,  57 }, {   6,  57 }, { -17,  73 },
  93.537 +        {  14,  57 },
  93.538 +
  93.539 +        /* 24 - 39 */
  93.540 +        {  20,  40 }, {  20,  10 }, {  29,   0 }, {  54,   0 },
  93.541 +        {  37,  42 }, {  12,  97 }, { -32, 127 }, { -22, 117 },
  93.542 +        {  -2,  74 }, {  -4,  85 }, { -24, 102 }, {   5,  57 },
  93.543 +        {  -6,  93 }, { -14,  88 }, {  -6,  44 }, {   4,  55 },
  93.544 +
  93.545 +        /* 40 - 53 */
  93.546 +        { -11,  89 },{ -15,  103 },{ -21,  116 },{  19,  57 },
  93.547 +        {  20,  58 },{   4,  84 },{   6,  96 },{   1,  63 },
  93.548 +        {  -5,  85 },{ -13,  106 },{   5,  63 },{   6,  75 },
  93.549 +        {  -3,  90 },{  -1,  101 },
  93.550 +
  93.551 +        /* 54 - 59 */
  93.552 +        {   3,  55 },{  -4,  79 },{  -2,  75 },{ -12,  97 },
  93.553 +        {  -7,  50 },{   1,  60 },
  93.554 +
  93.555 +        /* 60 - 69 */
  93.556 +        { 0, 41 },   { 0, 63 },   { 0, 63 },     { 0, 63 },
  93.557 +        { -9, 83 },  { 4, 86 },   { 0, 97 },     { -7, 72 },
  93.558 +        { 13, 41 },  { 3, 62 },
  93.559 +
  93.560 +        /* 70 - 104 */
  93.561 +        {   7,  34 }, {  -9,  88 }, { -20, 127 }, { -36, 127 },
  93.562 +        { -17,  91 }, { -14,  95 }, { -25,  84 }, { -25,  86 },
  93.563 +        { -12,  89 }, { -17,  91 }, { -31, 127 }, { -14,  76 },
  93.564 +        { -18, 103 }, { -13,  90 }, { -37, 127 }, {  11,  80 },
  93.565 +        {   5,  76 }, {   2,  84 }, {   5,  78 }, {  -6,  55 },
  93.566 +        {   4,  61 }, { -14,  83 }, { -37, 127 }, {  -5,  79 },
  93.567 +        { -11, 104 }, { -11,  91 }, { -30, 127 }, {   0,  65 },
  93.568 +        {  -2,  79 }, {   0,  72 }, {  -4,  92 }, {  -6,  56 },
  93.569 +        {   3,  68 }, {  -8,  71 }, { -13,  98 },
  93.570 +
  93.571 +        /* 105 -> 165 */
  93.572 +        {  -4,  86 }, { -12,  88 }, {  -5,  82 }, {  -3,  72 },
  93.573 +        {  -4,  67 }, {  -8,  72 }, { -16,  89 }, {  -9,  69 },
  93.574 +        {  -1,  59 }, {   5,  66 }, {   4,  57 }, {  -4,  71 },
  93.575 +        {  -2,  71 }, {   2,  58 }, {  -1,  74 }, {  -4,  44 },
  93.576 +        {  -1,  69 }, {   0,  62 }, {  -7,  51 }, {  -4,  47 },
  93.577 +        {  -6,  42 }, {  -3,  41 }, {  -6,  53 }, {   8,  76 },
  93.578 +        {  -9,  78 }, { -11,  83 }, {   9,  52 }, {   0,  67 },
  93.579 +        {  -5,  90 }, {   1,  67 }, { -15,  72 }, {  -5,  75 },
  93.580 +        {  -8,  80 }, { -21,  83 }, { -21,  64 }, { -13,  31 },
  93.581 +        { -25,  64 }, { -29,  94 }, {   9,  75 }, {  17,  63 },
  93.582 +        {  -8,  74 }, {  -5,  35 }, {  -2,  27 }, {  13,  91 },
  93.583 +        {   3,  65 }, {  -7,  69 }, {   8,  77 }, { -10,  66 },
  93.584 +        {   3,  62 }, {  -3,  68 }, { -20,  81 }, {   0,  30 },
  93.585 +        {   1,   7 }, {  -3,  23 }, { -21,  74 }, {  16,  66 },
  93.586 +        { -23, 124 }, {  17,  37 }, {  44, -18 }, {  50, -34 },
  93.587 +        { -22, 127 },
  93.588 +
  93.589 +        /* 166 - 226 */
  93.590 +        {   4,  39 }, {   0,  42 }, {   7,  34 }, {  11,  29 },
  93.591 +        {   8,  31 }, {   6,  37 }, {   7,  42 }, {   3,  40 },
  93.592 +        {   8,  33 }, {  13,  43 }, {  13,  36 }, {   4,  47 },
  93.593 +        {   3,  55 }, {   2,  58 }, {   6,  60 }, {   8,  44 },
  93.594 +        {  11,  44 }, {  14,  42 }, {   7,  48 }, {   4,  56 },
  93.595 +        {   4,  52 }, {  13,  37 }, {   9,  49 }, {  19,  58 },
  93.596 +        {  10,  48 }, {  12,  45 }, {   0,  69 }, {  20,  33 },
  93.597 +        {   8,  63 }, {  35, -18 }, {  33, -25 }, {  28,  -3 },
  93.598 +        {  24,  10 }, {  27,   0 }, {  34, -14 }, {  52, -44 },
  93.599 +        {  39, -24 }, {  19,  17 }, {  31,  25 }, {  36,  29 },
  93.600 +        {  24,  33 }, {  34,  15 }, {  30,  20 }, {  22,  73 },
  93.601 +        {  20,  34 }, {  19,  31 }, {  27,  44 }, {  19,  16 },
  93.602 +        {  15,  36 }, {  15,  36 }, {  21,  28 }, {  25,  21 },
  93.603 +        {  30,  20 }, {  31,  12 }, {  27,  16 }, {  24,  42 },
  93.604 +        {   0,  93 }, {  14,  56 }, {  15,  57 }, {  26,  38 },
  93.605 +        { -24, 127 },
  93.606 +
  93.607 +        /* 227 - 275 */
  93.608 +        { -24, 115 }, { -22,  82 }, {  -9,  62 }, {   0,  53 },
  93.609 +        {   0,  59 }, { -14,  85 }, { -13,  89 }, { -13,  94 },
  93.610 +        { -11,  92 }, { -29, 127 }, { -21, 100 }, { -14,  57 },
  93.611 +        { -12,  67 }, { -11,  71 }, { -10,  77 }, { -21,  85 },
  93.612 +        { -16,  88 }, { -23, 104 }, { -15,  98 }, { -37, 127 },
  93.613 +        { -10,  82 }, {  -8,  48 }, {  -8,  61 }, {  -8,  66 },
  93.614 +        {  -7,  70 }, { -14,  75 }, { -10,  79 }, {  -9,  83 },
  93.615 +        { -12,  92 }, { -18, 108 }, {  -4,  79 }, { -22,  69 },
  93.616 +        { -16,  75 }, {  -2,  58 }, {   1,  58 }, { -13,  78 },
  93.617 +        {  -9,  83 }, {  -4,  81 }, { -13,  99 }, { -13,  81 },
  93.618 +        {  -6,  38 }, { -13,  62 }, {  -6,  58 }, {  -2,  59 },
  93.619 +        { -16,  73 }, { -10,  76 }, { -13,  86 }, {  -9,  83 },
  93.620 +        { -10,  87 },
  93.621 +
  93.622 +        /* 276 a bit special (not used, bypass is used instead) */
  93.623 +        { 0, 0 },
  93.624 +
  93.625 +        /* 277 - 337 */
  93.626 +        { -22, 127 }, { -25, 127 }, { -25, 120 }, { -27, 127 },
  93.627 +        { -19, 114 }, { -23, 117 }, { -25, 118 }, { -26, 117 },
  93.628 +        { -24, 113 }, { -28, 118 }, { -31, 120 }, { -37, 124 },
  93.629 +        { -10,  94 }, { -15, 102 }, { -10,  99 }, { -13, 106 },
  93.630 +        { -50, 127 }, {  -5,  92 }, {  17,  57 }, {  -5,  86 },
  93.631 +        { -13,  94 }, { -12,  91 }, {  -2,  77 }, {   0,  71 },
  93.632 +        {  -1,  73 }, {   4,  64 }, {  -7,  81 }, {   5,  64 },
  93.633 +        {  15,  57 }, {   1,  67 }, {   0,  68 }, { -10,  67 },
  93.634 +        {   1,  68 }, {   0,  77 }, {   2,  64 }, {   0,  68 },
  93.635 +        {  -5,  78 }, {   7,  55 }, {   5,  59 }, {   2,  65 },
  93.636 +        {  14,  54 }, {  15,  44 }, {   5,  60 }, {   2,  70 },
  93.637 +        {  -2,  76 }, { -18,  86 }, {  12,  70 }, {   5,  64 },
  93.638 +        { -12,  70 }, {  11,  55 }, {   5,  56 }, {   0,  69 },
  93.639 +        {   2,  65 }, {  -6,  74 }, {   5,  54 }, {   7,  54 },
  93.640 +        {  -6,  76 }, { -11,  82 }, {  -2,  77 }, {  -2,  77 },
  93.641 +        {  25,  42 },
  93.642 +
  93.643 +        /* 338 - 398 */
  93.644 +        {  17, -13 }, {  16,  -9 }, {  17, -12 }, {  27, -21 },
  93.645 +        {  37, -30 }, {  41, -40 }, {  42, -41 }, {  48, -47 },
  93.646 +        {  39, -32 }, {  46, -40 }, {  52, -51 }, {  46, -41 },
  93.647 +        {  52, -39 }, {  43, -19 }, {  32,  11 }, {  61, -55 },
  93.648 +        {  56, -46 }, {  62, -50 }, {  81, -67 }, {  45, -20 },
  93.649 +        {  35,  -2 }, {  28,  15 }, {  34,   1 }, {  39,   1 },
  93.650 +        {  30,  17 }, {  20,  38 }, {  18,  45 }, {  15,  54 },
  93.651 +        {   0,  79 }, {  36, -16 }, {  37, -14 }, {  37, -17 },
  93.652 +        {  32,   1 }, {  34,  15 }, {  29,  15 }, {  24,  25 },
  93.653 +        {  34,  22 }, {  31,  16 }, {  35,  18 }, {  31,  28 },
  93.654 +        {  33,  41 }, {  36,  28 }, {  27,  47 }, {  21,  62 },
  93.655 +        {  18,  31 }, {  19,  26 }, {  36,  24 }, {  24,  23 },
  93.656 +        {  27,  16 }, {  24,  30 }, {  31,  29 }, {  22,  41 },
  93.657 +        {  22,  42 }, {  16,  60 }, {  15,  52 }, {  14,  60 },
  93.658 +        {   3,  78 }, { -16, 123 }, {  21,  53 }, {  22,  56 },
  93.659 +        {  25,  61 },
  93.660 +
  93.661 +        /* 399 - 435 */
  93.662 +        {  21,  33 }, {  19,  50 }, {  17,  61 },
  93.663 +        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
  93.664 +        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
  93.665 +        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
  93.666 +        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
  93.667 +        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
  93.668 +        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
  93.669 +        {  -9,  71 }, {  -7,  37 }, {  -8,  44 }, { -11,  49 },
  93.670 +        { -10,  56 }, { -12,  59 }, {  -8,  63 }, {  -9,  67 },
  93.671 +        {  -6,  68 }, { -10,  79 },
  93.672 +
  93.673 +        /* 436 - 459 */
  93.674 +        {  -3,  78 }, {  -8,  74 }, {  -9,  72 }, { -10,  72 },
  93.675 +        { -18,  75 }, { -12,  71 }, { -11,  63 }, {  -5,  70 },
  93.676 +        { -17,  75 }, { -14,  72 }, { -16,  67 }, {  -8,  53 },
  93.677 +        { -14,  59 }, {  -9,  52 }, { -11,  68 }, {   9,  -2 },
  93.678 +        {  30, -10 }, {  31,  -4 }, {  33,  -1 }, {  33,   7 },
  93.679 +        {  31,  12 }, {  37,  23 }, {  31,  38 }, {  20,  64 },
  93.680 +    }
  93.681 +};
  93.682 +
          /*
           * Four alternative 16-entry index rows used when reading data that the
           * left neighbour macroblock stored.
           * In the code visible here only row 0 is selected (fill_decode_caches)
           * and only its entries 0..3 are read.
           * NOTE(review): the remaining rows and columns look like leftovers of
           * FFmpeg's field/MBAFF left-pair handling -- confirm before relying on
           * or removing them.
           */
  93.683 +static const uint8_t left_block_options[4][16]={
  93.684 +    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
  93.685 +    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8},
  93.686 +    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
  93.687 +    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
  93.688 +};
  93.689 +
          /*
           * Precomputed remainder table: rem6[q] == q % 6 for q = 0..51.
           * The dequant table builders index it with the quantiser value q to pick
           * one of the six base coefficient rows.
           */
  93.690 +static const uint8_t rem6[52]={
  93.691 +0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  93.692 +};
  93.693 +
          /*
           * Precomputed quotient table: div6[q] == q / 6 for q = 0..51.
           * The dequant table builders use it as the left-shift amount applied to
           * the base coefficient for quantiser value q.
           */
  93.694 +static const uint8_t div6[52]={
  93.695 +0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  93.696 +};
  93.697 +
          /**
           * Fill the 8x8 dequantisation tables of ec for all 52 quantiser values.
           *
           * For each of the two 8x8 scaling matrices in s->pps, entry x of the table
           * for quantiser q is
           *   (dequant8_coeff_init[q%6][scan(x)] * scaling_matrix8[i][x]) << (q/6).
           * If both scaling matrices are byte-identical, table 1 simply points at
           * the buffer of table 0 and is not computed again.
           *
           * @param s   slice whose pps.scaling_matrix8 is read
           * @param ec  entropy context receiving dequant8_coeff / dequant8_buffer
           */
  93.698 +static void init_dequant8_coeff_table(H264Slice *s, EntropyContext *ec){
  93.699 +    int i,q,x;
              /* Non-zero when any of the listed SIMD back-ends is compiled in.
               * NOTE(review): presumably those IDCTs expect transposed
               * coefficients -- confirm. */
  93.700 +    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
  93.701 +    ec->dequant8_coeff[0] = ec->dequant8_buffer[0];
  93.702 +    ec->dequant8_coeff[1] = ec->dequant8_buffer[1];
  93.703 +
  93.704 +    for(i=0; i<2; i++){
                  /* Second matrix equal to the first: share buffer 0 and stop. */
  93.705 +        if(i && !memcmp(s->pps.scaling_matrix8[0], s->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
  93.706 +            ec->dequant8_coeff[1] = ec->dequant8_buffer[0];
  93.707 +            break;
  93.708 +        }
  93.709 +
  93.710 +        for(q=0; q<52; q++){
  93.711 +            int shift = div6[q];
  93.712 +            int idx = rem6[q];
                      /* Destination index (x>>3)|((x&7)<<3) swaps row and column
                       * of the 8x8 block when transposing. */
  93.713 +            for(x=0; x<64; x++)
  93.714 +                ec->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
  93.715 +                    ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
  93.716 +                    s->pps.scaling_matrix8[i][x]) << shift;
  93.717 +        }
  93.718 +    }
  93.719 +}
  93.720 +
          /**
           * Fill the 4x4 dequantisation tables of ec for all 52 quantiser values.
           *
           * There are six 4x4 scaling matrices in s->pps. A matrix that is
           * byte-identical to an earlier one reuses that earlier buffer instead of
           * being computed again. Otherwise entry x for quantiser q is
           *   (dequant4_coeff_init[q%6][(x&1) + ((x>>2)&1)] * scaling_matrix4[i][x])
           *       << (q/6 + 2).
           *
           * @param s   slice whose pps.scaling_matrix4 is read
           * @param ec  entropy context receiving dequant4_coeff / dequant4_buffer
           */
  93.721 +static void init_dequant4_coeff_table(H264Slice *s, EntropyContext *ec){
  93.722 +    int i,j,q,x;
              /* Non-zero when any of the listed SIMD back-ends is compiled in. */
  93.723 +    const int transpose = HAVE_MMX | HAVE_ALTIVEC | HAVE_NEON;
  93.724 +    for(i=0; i<6; i++ ){
  93.725 +        ec->dequant4_coeff[i] = ec->dequant4_buffer[i];
                  /* Look for an earlier matrix with identical contents. */
  93.726 +        for(j=0; j<i; j++){
  93.727 +            if(!memcmp(s->pps.scaling_matrix4[j], s->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
  93.728 +                ec->dequant4_coeff[i] = ec->dequant4_buffer[j];
  93.729 +                break;
  93.730 +            }
  93.731 +        }
                  /* j<i means the search above hit a match: nothing to compute. */
  93.732 +        if(j<i)
  93.733 +            continue;
  93.734 +
  93.735 +        for(q=0; q<52; q++){
  93.736 +            int shift = div6[q] + 2;
  93.737 +            int idx = rem6[q];
                      /* Destination index (x>>2)|((x<<2)&0xF) swaps row and column
                       * of the 4x4 block when transposing. */
  93.738 +            for(x=0; x<16; x++)
  93.739 +                ec->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
  93.740 +                    ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
  93.741 +                    s->pps.scaling_matrix4[i][x]) << shift;
  93.742 +        }
  93.743 +    }
  93.744 +}
  93.745 +
          /**
           * Build all dequantisation tables needed for the slice.
           *
           * The 4x4 tables are always built; the 8x8 tables only when
           * s->pps.transform_8x8_mode is set. When s->transform_bypass is set, the
           * entries for quantiser 0 are afterwards overwritten with the constant
           * 1<<6 (64).
           *
           * @param s   slice providing pps and transform_bypass
           * @param ec  entropy context whose dequant tables are written
           */
  93.746 +void init_dequant_tables(H264Slice *s, EntropyContext *ec){
  93.747 +    int i,x;
  93.748 +
  93.749 +    init_dequant4_coeff_table(s, ec);
  93.750 +    if(s->pps.transform_8x8_mode)
  93.751 +        init_dequant8_coeff_table(s, ec);
  93.752 +    if(s->transform_bypass){
                  /* Tables may share a buffer (see the builders), in which case
                   * the same storage is simply written more than once. */
  93.753 +        for(i=0; i<6; i++)
  93.754 +            for(x=0; x<16; x++)
  93.755 +                ec->dequant4_coeff[i][0][x] = 1<<6;
  93.756 +        if(s->pps.transform_8x8_mode)
  93.757 +            for(i=0; i<2; i++)
  93.758 +                for(x=0; x<64; x++)
  93.759 +                    ec->dequant8_coeff[i][0][x] = 1<<6;
  93.760 +    }
  93.761 +}
  93.762 +
          /**
           * Initialise all 460 CABAC context states in c for the current slice.
           *
           * The (m, n) initialisation pairs come from cabac_context_init_I for I
           * slices and from cabac_context_init_PB[s->cabac_init_idc] otherwise.
           * Each state is derived from its pair and ec->curr_qscale.
           *
           * @param ec  entropy context; curr_qscale is read
           * @param s   slice; slice_type_nos and cabac_init_idc are read
           * @param c   CABAC context whose cabac_state[0..459] is written
           */
  93.763 +void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c) {
  93.764 +    int i;
  93.765 +    const int8_t (*tab)[2];
  93.766 +
  93.767 +    if( s->slice_type_nos == FF_I_TYPE ) tab = cabac_context_init_I;
  93.768 +    else                                 tab = cabac_context_init_PB[s->cabac_init_idc];
  93.769 +
  93.770 +    /* calculate pre-state */
  93.771 +    for( i= 0; i < 460; i++ ) {
  93.772 +        int pre = 2*(((tab[i][0] * ec->curr_qscale) >>4 ) + tab[i][1]) - 127;
  93.773 +
                  /* Fold negative values: for pre < 0 this yields ~pre.
                   * Relies on >> of a negative int being an arithmetic shift
                   * (implementation-defined in C) and on int being 32 bits wide. */
  93.774 +        pre^= pre>>31;
                  /* Clamp to 124 or 125, keeping the lowest bit of pre. */
  93.775 +        if(pre > 124)
  93.776 +            pre= 124 + (pre&1);
  93.777 +
  93.778 +        c->cabac_state[i] =  pre;
  93.779 +    }
  93.780 +}
  93.781 +
          /**
           * Record the types and quantiser values of the neighbouring macroblocks.
           *
           * Top, top-right and top-left types and the top quantiser are taken from
           * the *_top arrays when the current macroblock is not in row 0 (m->mb_y is
           * non-zero) and are set to 0 otherwise. The left type and left quantiser
           * are always read from index mb_x-1 of the current-row arrays.
           *
           * NOTE(review): indices mb_x-1 (also when mb_x == 0) and mb_x+1 are used
           * without a bounds check -- presumably the arrays carry padding entries
           * holding 0 at both ends; confirm at the allocation site.
           *
           * @param ec  entropy context; ec->m is the current macroblock
           * @param s   slice (not used by this function)
           */
  93.782 +static void fill_decode_neighbors(EntropyContext *ec, H264Slice *s){
  93.783 +    H264Mb *m = ec->m;
  93.784 +	const int mb_x = m->mb_x;
  93.785 +
  93.786 +    if (m->mb_y){
  93.787 +        ec->top_type     = ec->mb_type_top[mb_x];
  93.788 +        ec->topright_type= ec->mb_type_top[mb_x+1];
  93.789 +        ec->topleft_type = ec->mb_type_top[mb_x-1];
  93.790 +        m->qscale_top_mb_xy = ec->qscale_top[mb_x];
  93.791 +    } else {
                  /* First row: no neighbours above. */
  93.792 +        ec->top_type     = 0;
  93.793 +        ec->topright_type= 0;
  93.794 +        ec->topleft_type = 0;
  93.795 +        m->qscale_top_mb_xy = 0;
  93.796 +    }
  93.797 +
  93.798 +    ec->left_type    = ec->mb_type[mb_x-1] ;
  93.799 +    m->qscale_left_mb_xy = ec->qscale[mb_x-1];
  93.800 +
  93.801 +}
  93.802 +
          /**
           * Populate the per-macroblock neighbour caches in ec from the values
           * stored for the row above and for the macroblock to the left.
           *
           * Depending on mb_type this loads non-zero-count entries and coded block
           * patterns (skipped for SKIP macroblocks), reference indices, motion
           * vector differences and, in B slices, direct flags. A neighbour whose
           * type is 0 is treated as missing and replaced by default values.
           * Finally ec->neighbor_transform_size is set to the number of top/left
           * neighbours (0..2) for which IS_8x8DCT() holds.
           *
           * @param ec       entropy context; ec->m is the current macroblock and
           *                 ec->top_type, topright_type, topleft_type and left_type
           *                 are read (fill_decode_neighbors assigns them)
           * @param s        slice; direct_spatial_mv_pred, list_count and
           *                 slice_type_nos are read
           * @param mb_type  type flags of the current macroblock
           */
  93.803 +static void fill_decode_caches(EntropyContext *ec, H264Slice *s, int mb_type){
  93.804 +    H264Mb *m = ec->m;
  93.805 +    int topleft_type, top_type, topright_type, left_type;
              /* Always row 0, i.e. left_block[k] == k for k = 0..3. */
  93.806 +    const uint8_t * left_block= left_block_options[0];
  93.807 +	const int mb_x = m->mb_x;
  93.808 +    int i;
  93.809 +
  93.810 +    topleft_type = ec->topleft_type;
  93.811 +	top_type     = ec->top_type;
  93.812 +    topright_type= ec->topright_type;
  93.813 +	left_type    = ec->left_type;
  93.814 +
  93.815 +    if(!IS_SKIP(mb_type)){
                  /* Non-zero counts from the macroblock above: four bytes copied
                   * in one go plus four individually placed entries. */
  93.816 +        if(top_type){
  93.817 +            AV_COPY32(&ec->non_zero_count_cache[4+8*0], &ec->non_zero_count_top[mb_x][0]);
  93.818 +            ec->non_zero_count_cache[1+8*0]= ec->non_zero_count_top[mb_x][4];
  93.819 +            ec->non_zero_count_cache[2+8*0]= ec->non_zero_count_top[mb_x][5];
  93.820 +            ec->non_zero_count_cache[1+8*3]= ec->non_zero_count_top[mb_x][6];
  93.821 +            ec->non_zero_count_cache[2+8*3]= ec->non_zero_count_top[mb_x][7];
  93.822 +
  93.823 +        }else {
                      /* No top neighbour: every entry becomes 0 for non-intra and
                       * 0x40 for intra macroblocks. The chained assignment stores
                       * the 32-bit value truncated to one byte. */
  93.824 +            ec->non_zero_count_cache[1+8*0]=
  93.825 +            ec->non_zero_count_cache[2+8*0]=
  93.826 +            ec->non_zero_count_cache[1+8*3]=
  93.827 +            ec->non_zero_count_cache[2+8*3]=
  93.828 +            AV_WN32A(&ec->non_zero_count_cache[4+8*0], !IS_INTRA(mb_type) ? 0 : 0x40404040);
  93.829 +        }
  93.830 +
                  /* Non-zero counts from the macroblock to the left. */
  93.831 +        if(left_type){
  93.832 +            for (i=0; i<2; i++) {
  93.833 +                ec->non_zero_count_cache[3+8*1 + 2*8*i]= ec->non_zero_count_left[i*2+0];
  93.834 +                ec->non_zero_count_cache[3+8*2 + 2*8*i]= ec->non_zero_count_left[i*2+1];
  93.835 +                ec->non_zero_count_cache[0+8*1 + 3*8*i]= ec->non_zero_count_left[4+i*2+0];
  93.836 +                ec->non_zero_count_cache[0+8*2 + 3*8*i]= ec->non_zero_count_left[4+i*2+1];
  93.837 +            }
  93.838 +        }
  93.839 +        else{
                      /* No left neighbour: same defaults as for a missing top. */
  93.840 +            for (i=0; i<2; i++) {
  93.841 +                ec->non_zero_count_cache[3+8*1 + 2*8*i]=
  93.842 +                ec->non_zero_count_cache[3+8*2 + 2*8*i]=
  93.843 +                ec->non_zero_count_cache[0+8*1 + 3*8*i]=
  93.844 +                ec->non_zero_count_cache[0+8*2 + 3*8*i]= !IS_INTRA(mb_type) ? 0 : 64;
  93.845 +            }
  93.846 +        }
  93.847 +
  93.848 +		// top_cbp
  93.849 +		if(top_type) {
  93.850 +			ec->top_cbp = ec->cbp_top[mb_x];
  93.851 +		} else {
  93.852 +			ec->top_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
  93.853 +		}
  93.854 +		// left_cbp
                  /* With left_block[0]==0 and left_block[2]==2 this keeps bits 4..8,
                   * bit 1 and bit 3 of the left macroblock's cbp. */
  93.855 +		if (left_type) {
  93.856 +			ec->left_cbp = (ec->cbp[mb_x-1] & 0x1f0)
  93.857 +			|  ((ec->cbp[mb_x-1]>>(left_block[0]&(~1)))&2)
  93.858 +			| (((ec->cbp[mb_x-1]>>(left_block[2]&(~1)))&2) << 2);
  93.859 +		} else {
  93.860 +			ec->left_cbp = IS_INTRA(mb_type) ? 0x1CF : 0x00F;
  93.861 +		}
  93.862 +    }
  93.863 +
  93.864 +    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
  93.865 +        int list;
  93.866 +
                  /* Mark the cache cells right of blocks 5, 7 and 13 as
                   * unavailable in both lists. */
  93.867 +        ec->ref_cache[0][scan8[5 ]+1] = ec->ref_cache[0][scan8[7 ]+1] = ec->ref_cache[0][scan8[13]+1] =
  93.868 +        ec->ref_cache[1][scan8[5 ]+1] = ec->ref_cache[1][scan8[7 ]+1] = ec->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
  93.869 +
  93.870 +        for(list=0; list<s->list_count; list++){
  93.871 +            if(!USES_LIST(mb_type, list)){
  93.872 +                continue;
  93.873 +            }
  93.874 +            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
  93.875 +
                      /* Reference indices of the row above: entries 2 and 3 of the
                       * top macroblock, each duplicated into two cache cells. */
  93.876 +            if(USES_LIST(top_type, list)){
  93.877 +                ec->ref_cache[list][scan8[0] + 0 - 1*8]=
  93.878 +                ec->ref_cache[list][scan8[0] + 1 - 1*8]= ec->ref_index_top[list][4*mb_x + 2];
  93.879 +                ec->ref_cache[list][scan8[0] + 2 - 1*8]=
  93.880 +                ec->ref_cache[list][scan8[0] + 3 - 1*8]= ec->ref_index_top[list][4*mb_x + 3];
  93.881 +            }else{
  93.882 +                AV_WN32A(&ec->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
  93.883 +            }
  93.884 +
                      /* Reference indices of the left macroblock: both halves for
                       * 16x8 / 8x8 partitioning, only the first cell otherwise. */
  93.885 +            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
  93.886 +                for(i=0; i<2; i++){
  93.887 +                    int cache_idx = scan8[0] - 1 + i*2*8;
  93.888 +                    if(USES_LIST(left_type, list)){
  93.889 +                        const int b8_x= 4*(mb_x-1) + 1;
  93.890 +                        ec->ref_cache[list][cache_idx  ]= ec->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
  93.891 +                        ec->ref_cache[list][cache_idx+8]= ec->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
  93.892 +                    }else{
  93.893 +                        ec->ref_cache[list][cache_idx  ]=
  93.894 +                        ec->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
  93.895 +                    }
  93.896 +                }
  93.897 +            }else{
  93.898 +                if(USES_LIST(left_type, list)){
  93.899 +                    const int b8_x= 4*(mb_x-1) + 1;
  93.900 +                    ec->ref_cache[list][scan8[0] - 1]= ec->ref_index[list][b8_x + (left_block[0]&~1)];
  93.901 +                }else{
  93.902 +                    ec->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  93.903 +                }
  93.904 +            }
  93.905 +
                      /* Top-right reference index. */
  93.906 +            if(USES_LIST(topright_type, list)){
  93.907 +                ec->ref_cache[list][scan8[0] + 4 - 1*8]= ec->ref_index_top[list][4*(mb_x+1) + 2];
  93.908 +            }else{
  93.909 +                ec->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  93.910 +            }
                      /* The top-left cell is only loaded when the top-right
                       * reference came out negative. */
  93.911 +            if(ec->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
                          /* Constant -1 here, so (topleft_partition & 2) is 2. */
  93.912 +                int topleft_partition= -1;
  93.913 +                if(USES_LIST(topleft_type, list)){
  93.914 +                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
  93.915 +                    ec->ref_cache[list][scan8[0] - 1 - 1*8]= ec->ref_index_top[list][b8_x];
  93.916 +                }else{
  93.917 +                    ec->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  93.918 +                }
  93.919 +            }
  93.920 +
                      /* SKIP and DIRECT macroblocks need no mvd / direct caches. */
  93.921 +            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
  93.922 +                continue;
  93.923 +
                      /* NOTE(review): always true after the continue above. */
  93.924 +            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
  93.925 +                ec->ref_cache[list][scan8[4 ]] =
  93.926 +                ec->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
  93.927 +
  93.928 +				/* XXX beurk, Load mvd */
                                  /* Top: 8 bytes copied from the row above. */
  93.929 +				if(USES_LIST(top_type, list)){
  93.930 +					AV_COPY64(ec->mvd_cache[list][scan8[0] + 0 - 1*8], ec->mvd_top[list][8*mb_x + 0]);
  93.931 +				}else{
  93.932 +					AV_ZERO64(ec->mvd_cache[list][scan8[0] + 0 - 1*8]);
  93.933 +				}
                                  /* Left: with left_block[k]==k these read entries
                                   * 6, 5, 4 and 3 of the left macroblock's mvd.
                                   * The same left_type test is made twice, once
                                   * per pair of rows. */
  93.934 +				if(USES_LIST(left_type, list)){
  93.935 +					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 0*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[0]]);
  93.936 +					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 1*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[1]]);
  93.937 +				}else{
  93.938 +					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 0*8]);
  93.939 +					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 1*8]);
  93.940 +				}
  93.941 +				if(USES_LIST(left_type, list)){
  93.942 +					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 2*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[2]]);
  93.943 +					AV_COPY16(ec->mvd_cache[list][scan8[0] - 1 + 3*8], ec->mvd[list][8*(mb_x-1) + 6 - left_block[3]]);
  93.944 +				}else{
  93.945 +					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 2*8]);
  93.946 +					AV_ZERO16(ec->mvd_cache [list][scan8[0] - 1 + 3*8]);
  93.947 +				}
  93.948 +				AV_ZERO16(ec->mvd_cache [list][scan8[4 ]]);
  93.949 +				AV_ZERO16(ec->mvd_cache [list][scan8[12]]);
                                  /* B slices additionally track direct flags. */
  93.950 +				if(s->slice_type_nos == FF_B_TYPE){
  93.951 +					fill_rectangle(&ec->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);
  93.952 +
                                          /* Direct flags of the row above. */
  93.953 +					if(IS_DIRECT(top_type)){
  93.954 +						AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101u*(MB_TYPE_DIRECT2>>1));
  93.955 +					}else if(IS_8X8(top_type)){
  93.956 +						int b8_x = 4*mb_x;
  93.957 +						ec->direct_cache[scan8[0] + 0 - 1*8]= ec->direct_top[b8_x + 2];
  93.958 +						ec->direct_cache[scan8[0] + 2 - 1*8]= ec->direct_top[b8_x + 3];
  93.959 +					}else{
  93.960 +						AV_WN32A(&ec->direct_cache[scan8[0] - 1*8], 0x01010101*(MB_TYPE_16x16>>1));
  93.961 +					}
  93.962 +
                                          /* Direct flag left of cache row 0. */
  93.963 +					if(IS_DIRECT(left_type))
  93.964 +						ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_DIRECT2>>1;
  93.965 +					else if(IS_8X8(left_type))
  93.966 +						ec->direct_cache[scan8[0] - 1 + 0*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[0]&~1)];
  93.967 +					else
  93.968 +						ec->direct_cache[scan8[0] - 1 + 0*8]= MB_TYPE_16x16>>1;
  93.969 +
                                          /* Direct flag left of cache row 2. */
  93.970 +					if(IS_DIRECT(left_type))
  93.971 +						ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_DIRECT2>>1;
  93.972 +					else if(IS_8X8(left_type))
  93.973 +						ec->direct_cache[scan8[0] - 1 + 2*8]= ec->direct[4*(mb_x-1) + 1 + (left_block[2]&~1)];
  93.974 +					else
  93.975 +						ec->direct_cache[scan8[0] - 1 + 2*8]= MB_TYPE_16x16>>1;
  93.976 +				}
  93.977 +            }
  93.978 +        }
  93.979 +    }
              /* 0, 1 or 2: how many of the top/left neighbours use the 8x8 DCT. */
  93.980 +    ec->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type);
  93.981 +}
  93.982 +
          /**
           * Store the non-zero counts of the just-decoded macroblock from the cache.
           *
           * Three destinations are written:
           *  - ec->non_zero_count[mb_x][0..7]: cache row 4 plus four single entries;
           *  - ec->non_zero_count_left[0..7]: cache column 7 of rows 1..4 plus four
           *    single entries from column 2;
           *  - m->non_zero_count[0..23]: the 4x4 area at columns 4..7 of rows 1..4,
           *    followed by two groups of four entries from columns 1..2.
           * NOTE(review): the first two presumably serve the macroblock below and
           * the one to the right as their top/left inputs -- confirm how
           * non_zero_count relates to non_zero_count_top.
           *
           * @param ec  entropy context; ec->m is the current macroblock
           * @param s   slice (not used by this function)
           */
  93.983 +static inline void write_back_non_zero_count(EntropyContext *ec, H264Slice *s){
  93.984 +    H264Mb *m = ec->m;
  93.985 +    const int mb_x= m->mb_x;
  93.986 +
  93.987 +    //bottom nnz
  93.988 +    AV_COPY32(&ec->non_zero_count[mb_x][0], &ec->non_zero_count_cache[4+8*4] );
  93.989 +    ec->non_zero_count[mb_x][4] = ec->non_zero_count_cache[1+8*2];
  93.990 +    ec->non_zero_count[mb_x][5] = ec->non_zero_count_cache[2+8*2];
  93.991 +    ec->non_zero_count[mb_x][6] = ec->non_zero_count_cache[1+8*5];
  93.992 +    ec->non_zero_count[mb_x][7] = ec->non_zero_count_cache[2+8*5];
  93.993 +
              /* Right-hand edge: mirrors the positions fill_decode_caches reads. */
  93.994 +    for (int i=0; i<2; i++) {
  93.995 +        ec->non_zero_count_left[i*2+0]   = ec->non_zero_count_cache[7+8*1 + 2*8*i];
  93.996 +        ec->non_zero_count_left[i*2+1]   = ec->non_zero_count_cache[7+8*2 + 2*8*i];
  93.997 +        ec->non_zero_count_left[4+i*2+0] = ec->non_zero_count_cache[2+8*1 + 3*8*i];
  93.998 +        ec->non_zero_count_left[4+i*2+1] = ec->non_zero_count_cache[2+8*2 + 3*8*i];
  93.999 +    }
 93.1000 +
              /* Per-macroblock copy: four rows of four entries each. */
 93.1001 +    AV_COPY32(&m->non_zero_count[ 0], &ec->non_zero_count_cache[4+8*1]);
 93.1002 +    AV_COPY32(&m->non_zero_count[ 4], &ec->non_zero_count_cache[4+8*2]);
 93.1003 +    AV_COPY32(&m->non_zero_count[ 8], &ec->non_zero_count_cache[4+8*3]);
 93.1004 +    AV_COPY32(&m->non_zero_count[12], &ec->non_zero_count_cache[4+8*4]);
 93.1005 +
              /* Entries 16..19 come from rows 1..2, entries 20..23 from rows 4..5. */
 93.1006 +    for (int i=0; i<2; i++) {
 93.1007 +        m->non_zero_count[16 + i*2   ] = ec->non_zero_count_cache[8*1 + 8*i + 1];
 93.1008 +        m->non_zero_count[16 + i*2 +1] = ec->non_zero_count_cache[8*1 + 8*i + 2];
 93.1009 +        m->non_zero_count[20 + i*2   ] = ec->non_zero_count_cache[8*4 + 8*i + 1];
 93.1010 +        m->non_zero_count[20 + i*2 +1] = ec->non_zero_count_cache[8*4 + 8*i + 2];
 93.1011 +    }
 93.1012 +}
 93.1013 +
          /**
           * Store motion-related cache contents of the current macroblock so that
           * later macroblocks can read them as neighbour data.
           *
           * For every list used by mb_type this writes eight mvd entries at
           * ec->mvd[list][8*mb_x] and four reference indices at
           * ec->ref_index[list][4*mb_x]. In B slices with an 8x8-partitioned
           * macroblock it also stores sub_mb_type[1..3] >> 1 into ec->direct.
           *
           * @param ec       entropy context; ec->m is the current macroblock
           * @param s        slice; list_count and slice_type_nos are read
           * @param mb_type  type flags of the current macroblock
           */
 93.1014 +static inline void write_back_motion(EntropyContext *ec, H264Slice *s, int mb_type){
 93.1015 +    H264Mb *m = ec->m;
 93.1016 +	const int mb_x = m->mb_x;
 93.1017 +    const int b_x = 4*m->mb_x; //try mb2b(8)_xy
 93.1018 +    int list;
 93.1019 +
 93.1020 +    for(list=0; list<s->list_count; list++){
 93.1021 +        if(!USES_LIST(mb_type, list))
 93.1022 +            continue;
 93.1023 +
 93.1024 +        {
 93.1025 +            uint8_t (*mvd_dst)[2] = (void *) ec->mvd[list][8*mb_x];
 93.1026 +            uint8_t (*mvd_src)[2] = &ec->mvd_cache[list][scan8[0]];
                      /* SKIP: clear all 16 bytes (eight 2-byte entries). */
 93.1027 +            if(IS_SKIP(mb_type))
 93.1028 +                AV_ZERO128(mvd_dst);
 93.1029 +            else{
                          /* Entries 0..3 take cache row 3; entries 6, 5 and 4 take
                           * column 3 of cache rows 0, 1 and 2. Entry 7 is left
                           * untouched here. */
 93.1030 +				AV_COPY64(mvd_dst, mvd_src + 8*3);
 93.1031 +                AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
 93.1032 +                AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
 93.1033 +                AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
 93.1034 +            }
 93.1035 +        }
                  /* One reference index per group of four blocks (0, 4, 8, 12). */
 93.1036 +        int8_t *ref_index = &ec->ref_index[list][b_x];
 93.1037 +        {
 93.1038 +            ref_index[0+0*2]= ec->ref_cache[list][scan8[0]];
 93.1039 +            ref_index[1+0*2]= ec->ref_cache[list][scan8[4]];
 93.1040 +            ref_index[0+1*2]= ec->ref_cache[list][scan8[8]];
 93.1041 +            ref_index[1+1*2]= ec->ref_cache[list][scan8[12]];
 93.1042 +        }
 93.1043 +    }
 93.1044 +
 93.1045 +    if(s->slice_type_nos == FF_B_TYPE){
 93.1046 +        if(IS_8X8(mb_type)){
                      /* direct[0] is not written by this function. */
 93.1047 +            uint8_t *direct = &ec->direct[4*mb_x];
 93.1048 +            direct[1] = m->sub_mb_type[1]>>1;
 93.1049 +            direct[2] = m->sub_mb_type[2]>>1;
 93.1050 +            direct[3] = m->sub_mb_type[3]>>1;
 93.1051 +        }
 93.1052 +    }
 93.1053 +}
 93.1054 +
          /**
           * Tell whether the sub-macroblock types permit the 8x8 transform.
           *
           * Reads m->sub_mb_type as one 64-bit word and tests it against a mask that
           * repeats the forbidden type bits every 16 bits. The forbidden bits are
           * 16x8, 8x16 and 8x8, plus DIRECT2 when s->direct_8x8_inference_flag is
           * not set.
           * NOTE(review): assumes sub_mb_type holds four 16-bit entries with 8-byte
           * alignment (implied by AV_RN64A and the mask layout) -- confirm against
           * the H264Mb declaration.
           *
           * @return 1 if none of the forbidden bits is set, 0 otherwise
           */
 93.1055 +static inline int get_dct8x8_allowed(EntropyContext *ec, H264Slice *s){
 93.1056 +    H264Mb *m = ec->m;
 93.1057 +    if(s->direct_8x8_inference_flag)
 93.1058 +        return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
 93.1059 +    else
 93.1060 +        return !(AV_RN64A(m->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
 93.1061 +}
 93.1062 +
 93.1063 +/**
 93.1064 + * Decodes a P_SKIP or B_SKIP macroblock.
          *
          * Chooses the skip macroblock type for the slice type (B slices get the
          * DIRECT2 flag and both lists, other slices list 0 only), sets the list-0
          * reference cache of the macroblock to 0, writes motion data back, records
          * the macroblock type and current quantiser, and clears every non-zero
          * count belonging to this macroblock.
          *
          * @param ec  entropy context; ec->m is the current macroblock
          * @param s   slice; slice_type_nos is read
 93.1065 + */
 93.1066 +static void decode_mb_skip(EntropyContext *ec, H264Slice *s){
 93.1067 +    H264Mb *m = ec->m;
 93.1068 +	const int mb_x = m->mb_x;
 93.1069 +    int mb_type;
 93.1070 +
 93.1071 +    if( s->slice_type_nos == FF_B_TYPE )
 93.1072 +        mb_type= MB_TYPE_16x16|MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
 93.1073 +    else
 93.1074 +        mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
 93.1075 +
              /* Only list 0 is filled here; list 1 of the cache is left as is. */
 93.1076 +    fill_rectangle(&ec->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
 93.1077 +    write_back_motion(ec, s, mb_type);
 93.1078 +    m->mb_type = ec->mb_type[mb_x] = mb_type;
 93.1079 +    m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale;
 93.1080 +
              /* A skipped macroblock has no coefficients: 8 + 8 + 24 bytes cleared. */
 93.1081 +    AV_ZERO64(ec->non_zero_count[mb_x]);
 93.1082 +    AV_ZERO64(ec->non_zero_count_left);
 93.1083 +    memset(m->non_zero_count, 0, 24);
 93.1084 +}
 93.1085 +
          /**
           * Decode the type index of an intra macroblock from the CABAC stream.
           *
           * @param ec           entropy context; left_type and top_type are read
           *                     when intra_slice is non-zero
           * @param s            slice (not used by this function)
           * @param c            CABAC decoder and context states
           * @param ctx_base     index of the first context state to use
           * @param intra_slice  non-zero to select the first state from the
           *                     neighbour types; also used as an offset into the
           *                     state array, so callers should pass 0 or 1
           * @return 0 for I4x4, 25 for PCM, otherwise 1..24 for an I16x16 variant
           */
 93.1086 +static int decode_cabac_intra_mb_type(EntropyContext *ec, H264Slice *s, CABACContext *c, int ctx_base, int intra_slice) {
 93.1087 +    uint8_t *state= &c->cabac_state[ctx_base];
 93.1088 +    int mb_type;
 93.1089 +
 93.1090 +    if(intra_slice){
                  /* ctx counts neighbours (0..2) that are INTRA16x16 or PCM. */
 93.1091 +        int ctx=0;
 93.1092 +        if( ec->left_type & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
 93.1093 +            ctx++;
 93.1094 +        if( ec->top_type     & (MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM))
 93.1095 +            ctx++;
 93.1096 +        if( get_cabac_noinline( c, &state[ctx] ) == 0 )
 93.1097 +            return 0;   /* I4x4 */
                  /* Advance past the neighbour-dependent states for the bins below. */
 93.1098 +        state += 2;
 93.1099 +    }else{
 93.1100 +        if( get_cabac_noinline( c, state ) == 0 )
 93.1101 +            return 0;   /* I4x4 */
 93.1102 +    }
 93.1103 +
 93.1104 +    if( get_cabac_terminate( c ) )
 93.1105 +        return 25;  /* PCM */
 93.1106 +
              /* Result: 1 + 12*luma + {0,4,8} chroma + 2 further bins. */
 93.1107 +    mb_type = 1; /* I16x16 */
 93.1108 +    mb_type += 12 * get_cabac_noinline( c, &state[1] ); /* cbp_luma != 0 */
 93.1109 +    if( get_cabac_noinline(c, &state[2] ) ) /* cbp_chroma */
 93.1110 +        mb_type += 4 + 4 * get_cabac_noinline(c, &state[2+intra_slice] );
 93.1111 +    mb_type += 2 * get_cabac_noinline(c, &state[3+intra_slice] );
 93.1112 +    mb_type += 1 * get_cabac_noinline(c, &state[3+2*intra_slice] );
 93.1113 +    return mb_type;
 93.1114 +}
 93.1115 +/* Decodes the macroblock skip bin; the context counts the left (mb_x > 0) and top (mb_y > 0) neighbours that are not skipped. */
 93.1116 +static int decode_cabac_mb_skip(EntropyContext *ec, H264Slice *s, H264Mb *m, CABACContext *c) {
 93.1117 +    const int left_coded = m->mb_x>0 && !IS_SKIP( ec->left_type );
 93.1118 +    const int top_coded  = m->mb_y>0 && !IS_SKIP( ec->top_type );
 93.1119 +    /* non-B slices use states 11..13, B slices the same layout at 24..26 */
 93.1120 +    const int slice_base = s->slice_type_nos == FF_B_TYPE ? 11+13 : 11;
 93.1121 +
 93.1122 +    return get_cabac_noinline(c, &c->cabac_state[slice_base + left_coded + top_coded] );
 93.1123 +}
 93.1128 +/* Returns -1 when the bin coded with state 68 is set; otherwise a 3-bit value (0..7) read least-significant bit first with state 69. */
 93.1129 +static int decode_cabac_mb_intra4x4_pred_mode_delta( CABACContext *c) {
 93.1130 +    uint8_t *bit_state = &c->cabac_state[69];
 93.1131 +    int value = 0, weight;
 93.1132 +
 93.1133 +    if( get_cabac(c, &c->cabac_state[68] ) )
 93.1134 +        return -1;
 93.1135 +
 93.1136 +    /* three bins sharing state 69, weights 1, 2, 4 */
 93.1137 +    for( weight = 1; weight <= 4; weight *= 2 )
 93.1138 +        value += weight * get_cabac(c, bit_state );
 93.1139 +    return value;
 93.1140 +}
 93.1141 +/* Decodes the chroma prediction mode (0..3): the first bin's context counts present left/top neighbours with a non-zero chroma_pred_mode, the remaining bins share state 64+3. */
 93.1142 +static int decode_cabac_mb_chroma_pre_mode(EntropyContext *ec, H264Slice *s, CABACContext *c) {
 93.1143 +    const int col = ec->m->mb_x;
 93.1144 +    uint8_t *st = &c->cabac_state[64];
 93.1145 +    int first_ctx = 0;
 93.1146 +    int mode;
 93.1147 +
 93.1148 +    /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode to 0 */
 93.1149 +    if( ec->left_type && ec->chroma_pred_mode[col-1] != 0 )
 93.1150 +        first_ctx++;
 93.1151 +    if( ec->top_type  && ec->chroma_pred_mode_top[col] != 0 )
 93.1152 +        first_ctx++;
 93.1153 +
 93.1154 +    if( get_cabac_noinline(c, st + first_ctx ) == 0 )
 93.1155 +        return 0;
 93.1156 +
 93.1157 +    /* up to two more bins; the first zero bin ends the code */
 93.1158 +    for( mode = 1; mode < 3; mode++ ) {
 93.1159 +        if( get_cabac_noinline(c, st + 3 ) == 0 )
 93.1160 +            break;
 93.1161 +    }
 93.1162 +    return mode;
 93.1163 +}
 93.1165 +/* Decodes the four luma coded-block-pattern bits (returned in bits 0..3) with states 73..76; each bin's context comes from left_cbp/top_cbp bits or from bits decoded earlier in this call. */
 93.1166 +static int decode_cabac_mb_cbp_luma(EntropyContext *ec, CABACContext *c) {
 93.1167 +    uint8_t *st = &c->cabac_state[73];
 93.1168 +    const int left = ec->left_cbp;
 93.1169 +    const int top  = ec->top_cbp;
 93.1170 +    int cbp;
 93.1171 +
 93.1172 +    /* context = (first tested bit clear) + 2*(second tested bit clear) */
 93.1173 +    cbp  = get_cabac_noinline(c, st + !(left & 0x02) + 2 * !(top & 0x04));
 93.1174 +    cbp += get_cabac_noinline(c, st + !(cbp  & 0x01) + 2 * !(top & 0x08)) << 1;
 93.1175 +    cbp += get_cabac_noinline(c, st + !(left & 0x08) + 2 * !(cbp & 0x01)) << 2;
 93.1176 +    cbp += get_cabac_noinline(c, st + !(cbp  & 0x04) + 2 * !(cbp & 0x02)) << 3;
 93.1177 +    return cbp;
 93.1178 +}
 93.1182 +/* Decodes the chroma coded-block-pattern value (0, 1 or 2) with states 77..84; contexts come from bits 4..5 of left_cbp and top_cbp. */
 93.1183 +static int decode_cabac_mb_cbp_chroma(EntropyContext *ec, CABACContext *c) {
 93.1184 +    uint8_t *st = &c->cabac_state[77];
 93.1185 +    const int left = (ec->left_cbp>>4)&0x03;
 93.1186 +    const int top  = (ec-> top_cbp>>4)&0x03;
 93.1187 +
 93.1188 +    /* first bin: context from which neighbour values are non-zero */
 93.1189 +    if( get_cabac_noinline(c, st + (left > 0) + 2*(top > 0) ) == 0 )
 93.1190 +        return 0;
 93.1191 +
 93.1192 +    /* second bin: context from which neighbour values equal 2 */
 93.1193 +    return 1 + get_cabac_noinline(c, st + 4 + (left == 2) + 2*(top == 2) );
 93.1194 +}
 93.1200 +/* Decodes a P sub-macroblock type index with states 21..23: 0 = 8x8, 1 = 8x4, 2 = 4x8, 3 = 4x4. */
 93.1201 +static int decode_cabac_p_mb_sub_type( CABACContext *c) {
 93.1202 +    uint8_t *st = c->cabac_state;
 93.1203 +
 93.1204 +    if( get_cabac(c, st + 21 ) )
 93.1205 +        return 0;                                /* 8x8 */
 93.1206 +    if( get_cabac(c, st + 22 ) )
 93.1207 +        return get_cabac(c, st + 23 ) ? 2 : 3;   /* 4x8 : 4x4 */
 93.1208 +    return 1;                                    /* 8x4 */
 93.1209 +}
 93.1210 +/* Decodes a B sub-macroblock type index (0 = B_Direct_8x8, up to 12) with states 36..39. */
 93.1211 +static int decode_cabac_b_mb_sub_type(CABACContext *c) {
 93.1212 +    uint8_t *st = c->cabac_state;
 93.1213 +    int base = 3;
 93.1214 +    if( !get_cabac(c, st + 36 ) )
 93.1215 +        return 0;   /* B_Direct_8x8 */
 93.1216 +    if( !get_cabac(c, st + 37 ) )
 93.1217 +        return 1 + get_cabac(c, st + 39 ); /* B_L0_8x8, B_L1_8x8 */
 93.1218 +    if( get_cabac(c, st + 38 ) ) {
 93.1219 +        if( get_cabac(c, st + 39 ) )
 93.1220 +            return 11 + get_cabac(c, st + 39 ); /* B_L1_4x4, B_Bi_4x4 */
 93.1221 +        base = 7;
 93.1222 +    }
 93.1223 +    base += 2*get_cabac(c, st + 39 );   /* two more bins select within the group of four */
 93.1224 +    return base + get_cabac(c, st + 39 );
 93.1225 +}
 93.1226 +/* Decodes a unary-coded reference index for block n of the given list; returns -1 once the index reaches 32. The first context depends on the left/top ref_cache entries being > 0. */
 93.1227 +static int decode_cabac_mb_ref(EntropyContext *ec, H264Slice *s, CABACContext *c, int list, int n ) {
 93.1228 +    const int left_pos = scan8[n] - 1;
 93.1229 +    const int top_pos  = scan8[n] - 8;
 93.1230 +    int left_set = ec->ref_cache[list][left_pos] > 0;
 93.1231 +    int top_set  = ec->ref_cache[list][top_pos]  > 0;
 93.1232 +    int ref = 0;
 93.1233 +    int ctx;
 93.1234 +
 93.1235 +    if( s->slice_type_nos == FF_B_TYPE ) {
 93.1236 +        /* in B slices a neighbour whose direct_cache entry has the DIRECT2>>1 bit set does not count */
 93.1237 +        left_set = left_set && !(ec->direct_cache[left_pos]&(MB_TYPE_DIRECT2>>1));
 93.1238 +        top_set  = top_set  && !(ec->direct_cache[top_pos] &(MB_TYPE_DIRECT2>>1));
 93.1239 +    }
 93.1240 +    ctx = left_set + 2*top_set;
 93.1241 +
 93.1242 +    /* every set bin bumps the index; the second bin uses state 54+4, all later ones 54+5 */
 93.1243 +    while( get_cabac(c, &c->cabac_state[54+ctx] ) ) {
 93.1244 +        ref++;
 93.1245 +        ctx = (ctx>>2)+4;
 93.1246 +        if(ref >= 32 /*h->ref_list[list]*/){
 93.1247 +            return -1;
 93.1248 +        }
 93.1249 +    }
 93.1250 +    return ref;
 93.1251 +}
 93.1254 +/* Decodes one motion vector difference component. ctxbase: first cabac_state index for the component; amvd: sum of the two neighbouring absolute mvds; *mvda always receives the absolute value clamped to 70. Returns the value produced by get_cabac_bypass_sign(c, -mvd), 0 for a zero mvd, or INT_MIN (after logging) when the escape prefix is too long. */
 93.1255 +static int decode_cabac_mb_mvd( CABACContext *c, int ctxbase, int amvd, int *mvda) {
 93.1256 +    int mvd;
 93.1257 +    /* first bin: context 0, 1 or 2 for amvd < 3, 3..32, > 32 (plain comparisons: right-shifting a negative int is implementation-defined) */
 93.1258 +    if(!get_cabac(c, &c->cabac_state[ctxbase + (amvd > 2) + (amvd > 32)])){
 93.1259 +        *mvda= 0;
 93.1260 +        return 0;
 93.1261 +    }
 93.1262 +
 93.1263 +    mvd= 1;
 93.1264 +    ctxbase+= 3;
 93.1265 +    while( mvd < 9 && get_cabac(c, &c->cabac_state[ctxbase] ) ) {
 93.1266 +        if( mvd < 4 )
 93.1267 +            ctxbase++;
 93.1268 +        mvd++;
 93.1269 +    }
 93.1270 +    if( mvd >= 9 ) {
 93.1271 +        int k = 3; /* escape: bypass-coded unary prefix with growing exponent k, then k suffix bits */
 93.1272 +        while( get_cabac_bypass(c ) ) {
 93.1273 +            mvd += 1 << k;
 93.1274 +            k++;
 93.1275 +            if(k>24){
 93.1276 +                av_log(AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
 93.1277 +                *mvda= 70; /* callers store *mvda without checking the return value: never leave it unset */
 93.1278 +                return INT_MIN;
 93.1279 +            }
 93.1280 +        }
 93.1281 +        while( k-- ) {
 93.1282 +            mvd += get_cabac_bypass(c )<<k;
 93.1283 +        }
 93.1284 +        *mvda=mvd < 70 ? mvd : 70;
 93.1285 +    }else
 93.1286 +        *mvda=mvd;
 93.1287 +    return get_cabac_bypass_sign(c, -mvd );
 93.1288 +}
 93.1289 +/* Decodes the x and y mvd of block n into m->mvd[list][mp][0..1] (context bases 40 and 47) and advances mp; the context input is the sum of the left (scan8[n]-1) and top (scan8[n]-8) mvd_cache entries. Expects m, mp, mpx and mpy in the expanding scope (mpx/mpy receive the absolute values); expands to a braced block, so call sites add no semicolon. */
 93.1290 +#define DECODE_CABAC_MB_MVD( ec, c, list,  n )\
 93.1291 +{\
 93.1292 +    int amvd0 = ec->mvd_cache[list][scan8[n] - 1][0] +\
 93.1293 +                ec->mvd_cache[list][scan8[n] - 8][0];\
 93.1294 +    int amvd1 = ec->mvd_cache[list][scan8[n] - 1][1] +\
 93.1295 +                ec->mvd_cache[list][scan8[n] - 8][1];\
 93.1296 +\
 93.1297 +    m->mvd[list][mp][0] = decode_cabac_mb_mvd( c, 40, amvd0, &mpx ); \
 93.1298 +    m->mvd[list][mp][1] = decode_cabac_mb_mvd( c, 47, amvd1, &mpy ); \
 93.1299 +    mp++; \
 93.1300 +}
 93.1301 +/* Returns the coded-block-flag context offset for block idx of category cat: (left neighbour non-zero) + 2*(top neighbour non-zero) + 4*cat. */
 93.1302 +static av_always_inline int get_cabac_cbf_ctx(EntropyContext *ec, H264Slice *s, int cat, int idx, int is_dc ) {
 93.1303 +    int left_nz, top_nz;
 93.1304 +
 93.1305 +    if( !is_dc ) {
 93.1306 +        /* non-DC blocks: coefficient counts cached for the left and top neighbour blocks */
 93.1307 +        assert(cat == 1 || cat == 2 || cat == 4);
 93.1308 +        left_nz = ec->non_zero_count_cache[scan8[idx] - 1];
 93.1309 +        top_nz  = ec->non_zero_count_cache[scan8[idx] - 8];
 93.1310 +    } else if( cat == 0 ) {
 93.1311 +        /* DC, cat 0: bit 8 of the neighbouring cbp words */
 93.1312 +        left_nz = ec->left_cbp&0x100;
 93.1313 +        top_nz  = ec-> top_cbp&0x100;
 93.1314 +    } else {
 93.1315 +        /* DC, other categories: bit 6+idx of the neighbouring cbp words */
 93.1316 +        left_nz = (ec->left_cbp>>(6+idx))&0x01;
 93.1317 +        top_nz  = (ec-> top_cbp>>(6+idx))&0x01;
 93.1318 +    }
 93.1319 +
 93.1320 +    return (left_nz > 0) + 2*(top_nz > 0) + 4 * cat;
 93.1321 +}
 93.1328 +/* Per scan position (0..62), offset added to the last-coefficient context base; read as last_coeff_flag_offset_8x8[last] on the cat == 5 path of decode_cabac_residual_internal. */
 93.1329 +DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8)[63] = {
 93.1330 +    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 93.1331 +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 93.1332 +    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 93.1333 +    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
 93.1334 +};
 93.1335 +/* cabac_state index of the first significance-flag context, per block category (second index). decode_cabac_residual_internal reads row 0; row 1 is presumably for field-coded macroblocks - TODO confirm. */
 93.1336 +static const int significant_coeff_flag_offset[2][6] = {
 93.1337 +    { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
 93.1338 +    { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
 93.1339 +};
 93.1340 +static const int last_coeff_flag_offset[2][6] = { /* cabac_state index of the first last-coefficient-flag context, per block category; decode_cabac_residual_internal reads row 0 */
 93.1341 +    { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
 93.1342 +    { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
 93.1343 +};
 93.1344 +static const int coeff_abs_level_m1_offset[6] = { /* cabac_state index of the first absolute-level context, per block category */
 93.1345 +    227+0, 227+10, 227+20, 227+30, 227+39, 426
 93.1346 +};
 93.1347 +static const uint8_t significant_coeff_flag_offset_8x8[2][63] = { /* per scan position (0..62), offset added to the significance context base on the cat == 5 path; decode_cabac_residual_internal reads row 0 */
 93.1348 +    { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
 93.1349 +    4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
 93.1350 +    7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
 93.1351 +    12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
 93.1352 +    { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
 93.1353 +    6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
 93.1354 +    9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
 93.1355 +    9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
 93.1356 +};
 93.1357 +/* node ctx summarises the levels decoded so far in the current block (it starts at 0):
 93.1358 + * 0..3: number of level=1 coefficients seen (capped at 3), none with level>1 yet;
 93.1359 + * 4..7: number of level>1 coefficients seen (capped at 4) + 3. This table maps node ctx => cabac ctx for level=1 */
 93.1360 +static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
 93.1361 +/* map node ctx => cabac ctx for level>1 (offsets are relative to the coeff_abs_level_m1_offset base) */
 93.1362 +static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
 93.1363 +static const uint8_t coeff_abs_level_transition[2][8] = {
 93.1364 +    /* update node ctx after decoding a level=1: 0..3 count up to 3, 4..7 stay put */
 93.1365 +    { 1, 2, 3, 3, 4, 5, 6, 7 },
 93.1366 +    /* update node ctx after decoding a level>1: 0..3 jump to 4, 4..7 count up to 7 */
 93.1367 +    { 4, 4, 4, 4, 5, 6, 7, 7 }
 93.1368 +};
 93.1369 +/* Decodes the CABAC-coded residual of one block into block[].
 93.1370 + * cat selects the context tables (cat 5 takes the 8x8 significance tables), n is the block index, scantable maps a scan
 93.1371 + * position to an index in block[], qmul holds per-coefficient scale factors (not read when is_dc), max_coeff is the coefficient count. */
 93.1372 +static av_always_inline void decode_cabac_residual_internal(EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
 93.1373 +    H264Mb *m = ec->m;
 93.1374 +	const int mb_x = m->mb_x;
 93.1375 +    int index[64];
 93.1376 +    int av_unused last;
 93.1377 +    int coeff_count = 0;
 93.1378 +    int node_ctx = 0;
 93.1379 +    uint8_t *significant_coeff_ctx_base;
 93.1380 +    uint8_t *last_coeff_ctx_base;
 93.1381 +    uint8_t *abs_level_m1_ctx_base;
 93.1382 +
 93.1383 +    /* read coded block flag (not coded for non-DC blocks of cat 5) */
 93.1384 +    if( is_dc || cat != 5 ) {
 93.1385 +        if( get_cabac( c, &c->cabac_state[85 + get_cabac_cbf_ctx( ec, s, cat, n, is_dc ) ] ) == 0 ) {
 93.1386 +            if( !is_dc )
 93.1387 +                ec->non_zero_count_cache[scan8[n]] = 0;
 93.1388 +            return;
 93.1389 +        }
 93.1390 +    }
 93.1391 +    /* context bases for this category; row 0 of the two-row offset tables is always used here */
 93.1392 +    significant_coeff_ctx_base = c->cabac_state
 93.1393 +        + significant_coeff_flag_offset[0][cat];
 93.1394 +    last_coeff_ctx_base = c->cabac_state
 93.1395 +        + last_coeff_flag_offset[0][cat];
 93.1396 +    abs_level_m1_ctx_base = c->cabac_state
 93.1397 +        + coeff_abs_level_m1_offset[cat];
 93.1398 +    /* significance map: index[] collects the scan positions of the coded coefficients; the final position is implied when no "last" flag was read before it */
 93.1399 +    if( !is_dc && cat == 5 ) {
 93.1400 +#define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
 93.1401 +        for(last= 0; last < coefs; last++) { \
 93.1402 +            uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
 93.1403 +            if( get_cabac( c, sig_ctx )) { \
 93.1404 +                uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
 93.1405 +                index[coeff_count++] = last; \
 93.1406 +                if( get_cabac( c, last_ctx ) ) { \
 93.1407 +                    last= max_coeff; \
 93.1408 +                    break; \
 93.1409 +                } \
 93.1410 +            } \
 93.1411 +        }\
 93.1412 +        if( last == max_coeff -1 ) {\
 93.1413 +            index[coeff_count++] = last;\
 93.1414 +        }
 93.1415 +
 93.1416 +        const uint8_t *sig_off = significant_coeff_flag_offset_8x8[0];
 93.1417 +        DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
 93.1418 +    } else {
 93.1419 +        DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
 93.1420 +    }
 93.1421 +    assert(coeff_count > 0);
 93.1422 +    /* publish the result: a cbp bit for DC blocks, the coefficient count for all others */
 93.1423 +    if( is_dc ) {
 93.1424 +        if( cat == 0 )
 93.1425 +            ec->cbp[mb_x] |= 0x100;
 93.1426 +        else
 93.1427 +            ec->cbp[mb_x] |= 0x40 << n;
 93.1428 +    } else {
 93.1429 +        if( cat == 5 )
 93.1430 +            fill_rectangle(&ec->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
 93.1431 +        else {
 93.1432 +            assert( cat == 1 || cat == 2 || cat == 4 );
 93.1433 +            ec->non_zero_count_cache[scan8[n]] = coeff_count;
 93.1434 +        }
 93.1435 +    }
 93.1436 +    /* levels are decoded from the last recorded scan position back to the first */
 93.1437 +    do {
 93.1438 +        uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
 93.1439 +
 93.1440 +        int j= scantable[index[--coeff_count]];
 93.1441 +
 93.1442 +        if( get_cabac( c, ctx ) == 0 ) {
 93.1443 +            node_ctx = coeff_abs_level_transition[0][node_ctx];
 93.1444 +            if( is_dc ) {
 93.1445 +                block[j] = get_cabac_bypass_sign( c, -1);
 93.1446 +            }else{
 93.1447 +                block[j] = (get_cabac_bypass_sign( c, -qmul[j]) + 32) >> 6;
 93.1448 +            }
 93.1449 +        } else {
 93.1450 +            int coeff_abs = 2;
 93.1451 +            ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
 93.1452 +            node_ctx = coeff_abs_level_transition[1][node_ctx];
 93.1453 +
 93.1454 +            while( coeff_abs < 15 && get_cabac( c, ctx ) ) {
 93.1455 +                coeff_abs++;
 93.1456 +            }
 93.1457 +            /* escape: bypass-coded unary prefix, then as many suffix bits; the prefix is capped so coeff_abs cannot overflow int on a corrupt stream */
 93.1458 +            if( coeff_abs >= 15 ) {
 93.1459 +                int prefix = 0;
 93.1460 +                while( prefix < 16+7 && get_cabac_bypass( c ) ) {
 93.1461 +                    prefix++;
 93.1462 +                }
 93.1463 +                /* NOTE(review): the 16+7 cap is assumed to exceed any prefix a valid stream produces - confirm for the supported bit depths */
 93.1464 +                coeff_abs=1;
 93.1465 +                while( prefix-- ) {
 93.1466 +                    coeff_abs += coeff_abs + get_cabac_bypass( c );
 93.1467 +                }
 93.1468 +                coeff_abs+= 14;
 93.1469 +            }
 93.1470 +
 93.1471 +            if( is_dc ) {
 93.1472 +                block[j] = get_cabac_bypass_sign( c, -coeff_abs );
 93.1473 +            }else{
 93.1474 +                block[j] = (get_cabac_bypass_sign( c, -coeff_abs ) * qmul[j] + 32) >> 6;
 93.1475 +            }
 93.1476 +        }
 93.1477 +    } while( coeff_count );
 93.1478 +
 93.1479 +}
 93.1480 +/* DC entry point: forwards to decode_cabac_residual_internal with qmul = NULL and is_dc = 1, so the decoded levels are stored in block[] without scaling. */
 93.1481 +static void decode_cabac_residual_dc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, int max_coeff ) {
 93.1482 +    decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, NULL, max_coeff, 1);
 93.1483 +}
 93.1484 +/* Non-DC entry point: forwards to decode_cabac_residual_internal with is_dc = 0, so each level is multiplied by qmul[j], offset by 32 and shifted right by 6 before being stored. */
 93.1485 +static void decode_cabac_residual_nondc( EntropyContext *ec, H264Slice *s, CABACContext *c, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
 93.1486 +    decode_cabac_residual_internal( ec, s, c, block, cat, n, scantable, qmul, max_coeff, 0);
 93.1487 +}
 93.1488 +
 93.1489 +/**
 93.1490 + * decodes a macroblock
 93.1491 + * @return 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
 93.1492 + */
 93.1493 +int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c) {
 93.1494 +    H264Mb *m = ec->m;
 93.1495 +	int mb_x = m->mb_x;
 93.1496 +    int mb_type, partition_count, cbp = 0;
 93.1497 +    int dct8x8_allowed= s->pps.transform_8x8_mode;
 93.1498 +
 93.1499 +    fill_decode_neighbors(ec, s);
 93.1500 +
 93.1501 +    if( s->slice_type_nos != FF_I_TYPE ) {
 93.1502 +        int skip;
 93.1503 +        /* a skipped mb needs the aff flag from the following mb */
 93.1504 +        skip = decode_cabac_mb_skip( ec, s, m, c);
 93.1505 +
 93.1506 +        /* read skip flags */
 93.1507 +        if( skip ) {
 93.1508 +            decode_mb_skip(ec, s);
 93.1509 +            m->cbp = ec->cbp[mb_x] = 0;
 93.1510 +            ec->chroma_pred_mode[mb_x] = 0;
 93.1511 +            ec->last_qscale_diff = 0;
 93.1512 +            return 0;
 93.1513 +        }
 93.1514 +    }
 93.1515 +
 93.1516 +    if( s->slice_type_nos == FF_B_TYPE ) {
 93.1517 +        int ctx = 0;
 93.1518 +
 93.1519 +        if( !IS_DIRECT( ec->left_type-1 ) )
 93.1520 +            ctx++;
 93.1521 +        if( !IS_DIRECT( ec->top_type-1 ) )
 93.1522 +            ctx++;
 93.1523 +
 93.1524 +        if( !get_cabac_noinline(c, &c->cabac_state[27+ctx] ) ){
 93.1525 +            mb_type= 0; /* B_Direct_16x16 */
 93.1526 +        }else if( !get_cabac_noinline(c, &c->cabac_state[27+3] ) ) {
 93.1527 +            mb_type= 1 + get_cabac_noinline(c, &c->cabac_state[27+5] ); /* B_L[01]_16x16 */
 93.1528 +        }else{
 93.1529 +            int bits;
 93.1530 +            bits = get_cabac_noinline(c, &c->cabac_state[27+4] ) << 3;
 93.1531 +            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 2;
 93.1532 +            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] ) << 1;
 93.1533 +            bits+= get_cabac_noinline(c, &c->cabac_state[27+5] );
 93.1534 +            if( bits < 8 ){
 93.1535 +                mb_type= bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
 93.1536 +            }else if( bits == 13 ){
 93.1537 +                mb_type= decode_cabac_intra_mb_type(ec, s, c, 32, 0);
 93.1538 +                goto decode_intra_mb;
 93.1539 +            }else if( bits == 14 ){
 93.1540 +                mb_type= 11; /* B_L1_L0_8x16 */
 93.1541 +            }else if( bits == 15 ){
 93.1542 +                mb_type= 22; /* B_8x8 */
 93.1543 +            }else{
 93.1544 +                bits= ( bits<<1 ) + get_cabac_noinline(c, &c->cabac_state[27+5] );
 93.1545 +                mb_type= bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
 93.1546 +            }
 93.1547 +        }
 93.1548 +        partition_count= b_mb_type_info[mb_type].partition_count;
 93.1549 +        mb_type=         b_mb_type_info[mb_type].type;
 93.1550 +    } else if( s->slice_type_nos == FF_P_TYPE ) {
 93.1551 +        if( get_cabac_noinline(c, &c->cabac_state[14] ) == 0 ) {
 93.1552 +            /* P-type */
 93.1553 +            if( get_cabac_noinline(c, &c->cabac_state[15] ) == 0 ) {
 93.1554 +                /* P_L0_D16x16, P_8x8 */
 93.1555 +                mb_type= 3 * get_cabac_noinline(c, &c->cabac_state[16] );
 93.1556 +            } else {
 93.1557 +                /* P_L0_D8x16, P_L0_D16x8 */
 93.1558 +                mb_type= 2 - get_cabac_noinline(c, &c->cabac_state[17] );
 93.1559 +            }
 93.1560 +            partition_count= p_mb_type_info[mb_type].partition_count;
 93.1561 +            mb_type=         p_mb_type_info[mb_type].type;
 93.1562 +        } else {
 93.1563 +            mb_type= decode_cabac_intra_mb_type(ec, s, c, 17, 0);
 93.1564 +            goto decode_intra_mb;
 93.1565 +        }
 93.1566 +    } else {
 93.1567 +        mb_type= decode_cabac_intra_mb_type(ec, s ,c, 3, 1);
 93.1568 +        if(s->slice_type == FF_SI_TYPE && mb_type)
 93.1569 +            mb_type--;
 93.1570 +        assert(s->slice_type_nos == FF_I_TYPE);
 93.1571 +decode_intra_mb:
 93.1572 +        partition_count = 0;
 93.1573 +        cbp= i_mb_type_info[mb_type].cbp;
 93.1574 +        m->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
 93.1575 +        mb_type= i_mb_type_info[mb_type].type;
 93.1576 +    }
 93.1577 +
 93.1578 +    if(IS_INTRA_PCM(mb_type)) {
 93.1579 +        const uint8_t *ptr;
 93.1580 +        // We assume these blocks are very rare so we do not optimize it.
 93.1581 +        // FIXME The two following lines get the bitstream position in the cabac
 93.1582 +        // decode, I think it should be done by a function in cabac.h (or cabac.c).
 93.1583 +        ptr=c->bytestream;
 93.1584 +        if(c->low&0x1) ptr--;
 93.1585 +        if(CABAC_BITS==16){
 93.1586 +            if(c->low&0x1FF) ptr--;
 93.1587 +        }
 93.1588 +		//printf("pcm\n");
 93.1589 +        // The pixels are stored in the same order as levels in h->mb array.
 93.1590 +        memcpy(m->mb, ptr, 256); ptr+=256;
 93.1591 +		memcpy(m->mb+128, ptr, 128); ptr+=128;
 93.1592 +
 93.1593 +        ff_init_cabac_decoder(c, ptr, c->bytestream_end - ptr);
 93.1594 +
 93.1595 +        // All blocks are present
 93.1596 +        m->cbp= ec->cbp[mb_x] = 0x1ef;
 93.1597 +        ec->chroma_pred_mode[mb_x] = 0;
 93.1598 +        // In deblocking, the quantizer is 0
 93.1599 +        m->qscale_mb_xy = ec->qscale[mb_x]= 0;
 93.1600 +        // All coeffs are present
 93.1601 +        memset(ec->non_zero_count[mb_x], 16, 8);
 93.1602 +		m->mb_type = ec->mb_type[mb_x]=  mb_type;
 93.1603 +        ec->last_qscale_diff = 0;
 93.1604 +
 93.1605 +        return 0;
 93.1606 +    }
 93.1607 +
 93.1608 +    fill_decode_caches(ec, s, mb_type);
 93.1609 +
 93.1610 +    int mp = 0;
 93.1611 +    if( IS_INTRA( mb_type ) ) {
 93.1612 +        int i, pred_mode;
 93.1613 +        if( IS_INTRA4x4( mb_type ) ) {
 93.1614 +            if( dct8x8_allowed && get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] ) ) {
 93.1615 +                mb_type |= MB_TYPE_8x8DCT;
 93.1616 +                for( i = 0; i < 16; i+=4 ) {
 93.1617 +                    m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c);
 93.1618 +                }
 93.1619 +            } else {
 93.1620 +                for( i = 0; i < 16; i++ ) {
 93.1621 +                    m->intra4x4_pred_mode[i] = decode_cabac_mb_intra4x4_pred_mode_delta(c);
 93.1622 +                }
 93.1623 +            }
 93.1624 +        }
 93.1625 +
 93.1626 +        m->chroma_pred_mode= ec->chroma_pred_mode[mb_x] =
 93.1627 +		pred_mode = decode_cabac_mb_chroma_pre_mode( ec, s, c );
 93.1628 +
 93.1629 +    } else if( partition_count == 4 ) {
 93.1630 +        int i, j, sub_partition_count[4], list;
 93.1631 +
 93.1632 +        if( s->slice_type_nos == FF_B_TYPE ) {
 93.1633 +            for( i = 0; i < 4; i++ ) {
 93.1634 +                m->sub_mb_type[i] = decode_cabac_b_mb_sub_type( c );
 93.1635 +                sub_partition_count[i]= b_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
 93.1636 +                m->sub_mb_type[i]=      b_sub_mb_type_info[ m->sub_mb_type[i] ].type;
 93.1637 +            }
 93.1638 +            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
 93.1639 +                          m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
 93.1640 +                ec->ref_cache[0][scan8[4]] =
 93.1641 +                ec->ref_cache[1][scan8[4]] =
 93.1642 +                ec->ref_cache[0][scan8[12]] =
 93.1643 +                ec->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
 93.1644 +
 93.1645 +                for( i = 0; i < 4; i++ )
 93.1646 +                    fill_rectangle( &ec->direct_cache[scan8[4*i]], 2, 2, 8, (m->sub_mb_type[i]>>1)&0xFF, 1 );
 93.1647 +            }
 93.1648 +        } else {
 93.1649 +            for( i = 0; i < 4; i++ ) {
 93.1650 +                m->sub_mb_type[i] = decode_cabac_p_mb_sub_type( c );
 93.1651 +                sub_partition_count[i]= p_sub_mb_type_info[ m->sub_mb_type[i] ].partition_count;
 93.1652 +                m->sub_mb_type[i]=      p_sub_mb_type_info[ m->sub_mb_type[i] ].type;
 93.1653 +            }
 93.1654 +        }
 93.1655 +
 93.1656 +        for( list = 0; list < s->list_count; list++ ) {
 93.1657 +            for( i = 0; i < 4; i++ ) {
 93.1658 +                if(IS_DIRECT(m->sub_mb_type[i])) continue;
 93.1659 +                if(IS_DIR(m->sub_mb_type[i], 0, list)){
 93.1660 +                    if( s->ref_count[list] > 1 ){
 93.1661 +                        m->ref_index[list][i] = decode_cabac_mb_ref(ec, s, c, list, 4*i );
 93.1662 +                        if(m->ref_index[list][i] >= s->ref_count[list]){
 93.1663 +                            av_log(AV_LOG_ERROR, "Reference %d >= %d\n", m->ref_index[list][i], s->ref_count[list]);
 93.1664 +                            return -1;
 93.1665 +                        }
 93.1666 +                    }else
 93.1667 +                        m->ref_index[list][i] = 0;
 93.1668 +                } else {
 93.1669 +                    m->ref_index[list][i] = -1;
 93.1670 +                }
 93.1671 +                ec->ref_cache[list][ scan8[4*i]   ]=ec->ref_cache[list][ scan8[4*i]+1 ]=
 93.1672 +                ec->ref_cache[list][ scan8[4*i]+8 ]=ec->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i];
 93.1673 +            }
 93.1674 +        }
 93.1675 +
 93.1676 +        if(dct8x8_allowed){
 93.1677 +//             assert(0);
 93.1678 +            dct8x8_allowed = get_dct8x8_allowed(ec, s);
 93.1679 +        }
 93.1680 +
 93.1681 +        for(list=0; list<s->list_count; list++){
 93.1682 +            for(i=0; i<4; i++){
 93.1683 +//                 ec->ref_cache[list][ scan8[4*i]   ]=ec->ref_cache[list][ scan8[4*i]+1 ];
 93.1684 +                if(IS_DIRECT(m->sub_mb_type[i])){
 93.1685 +                    fill_rectangle(ec->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
 93.1686 +                    continue;
 93.1687 +                }
 93.1688 +
 93.1689 +                if(IS_DIR(m->sub_mb_type[i], 0, list) && !IS_DIRECT(m->sub_mb_type[i])){
 93.1690 +                    const int sub_mb_type= m->sub_mb_type[i];
 93.1691 +                    const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
 93.1692 +                    for(j=0; j<sub_partition_count[i]; j++){
 93.1693 +                        int mpx, mpy;
 93.1694 +                        const int index= 4*i + block_width*j;
 93.1695 +                        uint8_t (* mvd_cache)[2]= &ec->mvd_cache[list][ scan8[index]];
 93.1696 +
 93.1697 +                        DECODE_CABAC_MB_MVD( ec, c, list, index)
 93.1698 +
 93.1699 +                        if(IS_SUB_8X8(sub_mb_type)){
 93.1700 +                            mvd_cache[ 1 ][0]=
 93.1701 +                            mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mpx;
 93.1702 +                            mvd_cache[ 1 ][1]=
 93.1703 +                            mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= mpy;
 93.1704 +                        }else if(IS_SUB_8X4(sub_mb_type)){
 93.1705 +                            mvd_cache[ 1 ][0]=  mpx;
 93.1706 +                            mvd_cache[ 1 ][1]= mpy;
 93.1707 +                        }else if(IS_SUB_4X8(sub_mb_type)){
 93.1708 +                            mvd_cache[ 8 ][0]= mpx;
 93.1709 +                            mvd_cache[ 8 ][1]= mpy;
 93.1710 +                        }
 93.1711 +                        mvd_cache[ 0 ][0]= mpx;
 93.1712 +                        mvd_cache[ 0 ][1]= mpy;
 93.1713 +                    }
 93.1714 +                }else{
 93.1715 +                    fill_rectangle(ec->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
 93.1716 +                }
 93.1717 +            }
 93.1718 +        }
 93.1719 +    } else if( IS_DIRECT(mb_type) ) {
 93.1720 +        mb_type |= MB_TYPE_16x16;
 93.1721 +        fill_rectangle(ec->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
 93.1722 +        fill_rectangle(ec->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
 93.1723 +        dct8x8_allowed &= s->direct_8x8_inference_flag;
 93.1724 +    } else {
 93.1725 +        int list, i;
 93.1726 +        if(IS_16X16(mb_type)){
 93.1727 +            for(list=0; list<s->list_count; list++){
 93.1728 +                if(IS_DIR(mb_type, 0, list)){
 93.1729 +                    int ref;
 93.1730 +                    if(s->ref_count[list] > 1){
 93.1731 +                        ref= decode_cabac_mb_ref(ec, s, c, list, 0);
 93.1732 +                        if(ref >= s->ref_count[list]){
 93.1733 +                            av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
 93.1734 +                            return -1;
 93.1735 +                        }
 93.1736 +                    }else
 93.1737 +                        ref=0;
 93.1738 +                    m->ref_index[list][0]= ref;
 93.1739 +                    fill_rectangle(&ec->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
 93.1740 +                }
 93.1741 +            }
 93.1742 +            for(list=0; list<s->list_count; list++){
 93.1743 +                if(IS_DIR(mb_type, 0, list)){
 93.1744 +                    int mpx,mpy;
 93.1745 +                    DECODE_CABAC_MB_MVD( ec, c, list, 0)
 93.1746 +
 93.1747 +                    fill_rectangle(ec->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
 93.1748 +                }
 93.1749 +
 93.1750 +            }
 93.1751 +        }
 93.1752 +        else if(IS_16X8(mb_type)){
 93.1753 +            for(list=0; list<s->list_count; list++){
 93.1754 +                for(i=0; i<2; i++){
 93.1755 +                    if(IS_DIR(mb_type, i, list)){
 93.1756 +                        int ref;
 93.1757 +                        if(s->ref_count[list] > 1){
 93.1758 +                            ref= decode_cabac_mb_ref(ec, s, c, list, 8*i );
 93.1759 +                            if(ref >= s->ref_count[list]){
 93.1760 +                                av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
 93.1761 +                                return -1;
 93.1762 +                            }
 93.1763 +                        }else
 93.1764 +                            ref=0;
 93.1765 +                        m->ref_index[list][i]= ref;
 93.1766 +                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
 93.1767 +                    }else{
 93.1768 +                        m->ref_index[list][i]= LIST_NOT_USED;
 93.1769 +                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
 93.1770 +                    }
 93.1771 +                }
 93.1772 +            }
 93.1773 +            for(list=0; list<s->list_count; list++){
 93.1774 +                for(i=0; i<2; i++){
 93.1775 +                    if(IS_DIR(mb_type, i, list)){
 93.1776 +                        int mpx,mpy;
 93.1777 +                        DECODE_CABAC_MB_MVD( ec, c, list, 8*i)
 93.1778 +
 93.1779 +                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
 93.1780 +                    }else{
 93.1781 +                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
 93.1782 +                    }
 93.1783 +                }
 93.1784 +            }
 93.1785 +        }else{
 93.1786 +            assert(IS_8X16(mb_type));
 93.1787 +            for(list=0; list<s->list_count; list++){
 93.1788 +                for(i=0; i<2; i++){
 93.1789 +                    if(IS_DIR(mb_type, i, list)){ //FIXME optimize
 93.1790 +                        int ref;
 93.1791 +                        if(s->ref_count[list] > 1){
 93.1792 +                            ref= decode_cabac_mb_ref(ec, s, c, list, 4*i );
 93.1793 +                            if(ref >= s->ref_count[list]){
 93.1794 +                                av_log(AV_LOG_ERROR, "Reference %d >= %d\n", ref, s->ref_count[list]);
 93.1795 +                                return -1;
 93.1796 +                            }
 93.1797 +                        }else
 93.1798 +                            ref=0;
 93.1799 +                        m->ref_index[list][i]= ref;
 93.1800 +                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
 93.1801 +                    }else{
 93.1802 +                        m->ref_index[list][i]= LIST_NOT_USED;
 93.1803 +                        fill_rectangle(&ec->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
 93.1804 +                    }
 93.1805 +                }
 93.1806 +            }
 93.1807 +            for(list=0; list<s->list_count; list++){
 93.1808 +                for(i=0; i<2; i++){
 93.1809 +                    if(IS_DIR(mb_type, i, list)){
 93.1810 +                        int mpx,mpy;
 93.1811 +                        DECODE_CABAC_MB_MVD( ec, c, list, 4*i)
 93.1812 +
 93.1813 +                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
 93.1814 +                    }else{
 93.1815 +                        fill_rectangle(ec->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
 93.1816 +                    }
 93.1817 +                }
 93.1818 +            }
 93.1819 +        }
 93.1820 +    }
 93.1821 +
 93.1822 +    if( IS_INTER( mb_type ) ||(IS_DIRECT(mb_type))) {
 93.1823 +        ec->chroma_pred_mode[mb_x] = 0;
 93.1824 +        write_back_motion( ec, s, mb_type );
 93.1825 +    }
 93.1826 +
 93.1827 +    if( !IS_INTRA16x16( mb_type ) ) {
 93.1828 +        cbp  = decode_cabac_mb_cbp_luma( ec, c);
 93.1829 +		cbp |= decode_cabac_mb_cbp_chroma( ec, c ) << 4;
 93.1830 +    }
 93.1831 +
 93.1832 +    ec->cbp[mb_x] = m->cbp = cbp;
 93.1833 +
 93.1834 +    if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
 93.1835 +        int t = get_cabac_noinline(c, &c->cabac_state[399 + ec->neighbor_transform_size] );
 93.1836 +        mb_type |= MB_TYPE_8x8DCT * t;
 93.1837 +    }
 93.1838 +    m->mb_type = ec->mb_type[mb_x] = mb_type;
 93.1839 +
 93.1840 +    if( cbp || IS_INTRA16x16( mb_type ) ) {
 93.1841 +        const uint8_t *scan, *scan8x8, *dc_scan;
 93.1842 +        const uint32_t *qmul;
 93.1843 +
 93.1844 +
 93.1845 +        if (s->transform_bypass && ec->curr_qscale){
 93.1846 +            scan8x8= ff_zigzag_direct;
 93.1847 +            scan= zigzag_scan;
 93.1848 +        }else{
 93.1849 +            scan8x8= ec->zigzag_scan8x8;
 93.1850 +            scan= ec->zigzag_scan;
 93.1851 +        }
 93.1852 +        dc_scan= luma_dc_zigzag_scan;
 93.1853 +
 93.1854 +        // decode_cabac_mb_dqp
 93.1855 +        if(get_cabac_noinline(c, &c->cabac_state[60 + (ec->last_qscale_diff != 0)])){
 93.1856 +            int val = 1;
 93.1857 +            int ctx= 2;
 93.1858 +
 93.1859 +            while( get_cabac_noinline(c, &c->cabac_state[60 + ctx] ) ) {
 93.1860 +                ctx= 3;
 93.1861 +                val++;
 93.1862 +                if(val > 102){ //prevent infinite loop
 93.1863 +                    av_log(AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", m->mb_x, m->mb_y);
 93.1864 +                    return -1;
 93.1865 +                }
 93.1866 +            }
 93.1867 +
 93.1868 +            if( val&0x01 )
 93.1869 +                val=   (val + 1)>>1 ;
 93.1870 +            else
 93.1871 +                val= -((val + 1)>>1);
 93.1872 +            ec->last_qscale_diff = val;
 93.1873 +            ec->curr_qscale += val;
 93.1874 +            if(((unsigned)ec->curr_qscale) > 51){
 93.1875 +                if(ec->curr_qscale<0) ec->curr_qscale+= 52;
 93.1876 +                else            ec->curr_qscale-= 52;
 93.1877 +            }
 93.1878 +            ec->chroma_qp[0] = get_chroma_qp( s, 0, ec->curr_qscale);
 93.1879 +            ec->chroma_qp[1] = get_chroma_qp( s, 1, ec->curr_qscale);
 93.1880 +        }else
 93.1881 +            ec->last_qscale_diff=0;
 93.1882 +
 93.1883 +        memset(m->mb, 0, 16*16 * sizeof(DCTELEM));
 93.1884 +        if( IS_INTRA16x16( mb_type ) ) {
 93.1885 +            int i;
 93.1886 +
 93.1887 +            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
 93.1888 +            decode_cabac_residual_dc( ec, s, c, m->mb, 0, 0, dc_scan, 16);
 93.1889 +            qmul = ec->dequant4_coeff[0][ec->curr_qscale];
 93.1890 +            if( cbp&15 ) {
 93.1891 +                for( i = 0; i < 16; i++ ) {
 93.1892 +                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
 93.1893 +                    decode_cabac_residual_nondc( ec, s, c, m->mb + 16*i, 1, i, scan + 1, qmul, 15);
 93.1894 +                }
 93.1895 +            } else {
 93.1896 +                fill_rectangle(&ec->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
 93.1897 +            }
 93.1898 +            h264_luma_dc_dequant_idct_c(m->mb, qmul[0]);
 93.1899 +        } else {
 93.1900 +
 93.1901 +            int i8x8, i4x4;
 93.1902 +            for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
 93.1903 +                if( cbp & (1<<i8x8) ) {
 93.1904 +                    if( IS_8x8DCT(mb_type) ) {
 93.1905 +                        decode_cabac_residual_nondc(ec, s, c, m->mb + 64*i8x8, 5, 4*i8x8,
 93.1906 +                            scan8x8, ec->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][ec->curr_qscale], 64);
 93.1907 +                    } else {
 93.1908 +                        qmul = ec->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][ec->curr_qscale];
 93.1909 +                        for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
 93.1910 +                            const int index = 4*i8x8 + i4x4;
 93.1911 +                            //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
 93.1912 +//START_TIMER
 93.1913 +                            decode_cabac_residual_nondc(ec, s, c, m->mb + 16*index, 2, index, scan, qmul, 16);
 93.1914 +//STOP_TIMER("decode_residual")
 93.1915 +                        }
 93.1916 +                    }
 93.1917 +                } else {
 93.1918 +                    uint8_t * const nnz= &ec->non_zero_count_cache[ scan8[4*i8x8] ];
 93.1919 +                    nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
 93.1920 +                }
 93.1921 +            }
 93.1922 +        }
 93.1923 +
 93.1924 +        if( cbp&0x30 ){
 93.1925 +            memset(m->mb + 256, 0, 2*64 * sizeof(DCTELEM));
 93.1926 +            for( int i = 0; i < 2; i++ ) {
 93.1927 +                const uint32_t dequant4_coeff = ec->dequant4_coeff[IS_INTRA(mb_type) ? 1+i:4+i][ec->chroma_qp[i]][0];
 93.1928 +
 93.1929 +                //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
 93.1930 +                decode_cabac_residual_dc(ec, s, c, m->mb + 256 + 16*4*i, 3, i, chroma_dc_scan, 4);
 93.1931 +                chroma_dc_dequant_idct_c(m->mb + 256 + 16*4*i, dequant4_coeff);
 93.1932 +            }
 93.1933 +        }
 93.1934 +
 93.1935 +        if( cbp&0x20 ) {
 93.1936 +            int i, j;
 93.1937 +            for( i = 0; i < 2; i++ ) {
 93.1938 +                qmul = ec->dequant4_coeff[i+1+(IS_INTRA( mb_type ) ? 0:3)][ec->chroma_qp[i]];
 93.1939 +                for( j = 0; j < 4; j++ ) {
 93.1940 +                    const int index = 16 + 4 * i + j;
 93.1941 +                    //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
 93.1942 +                    decode_cabac_residual_nondc( ec, s, c, m->mb + 16*index, 4, index, scan + 1, qmul, 15);
 93.1943 +                }
 93.1944 +            }
 93.1945 +        } else {
 93.1946 +            uint8_t * const nnz= &ec->non_zero_count_cache[0];
 93.1947 +            nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
 93.1948 +            nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
 93.1949 +        }
 93.1950 +
 93.1951 +    } else {
 93.1952 +        uint8_t * const nnz= &ec->non_zero_count_cache[0];
 93.1953 +        fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
 93.1954 +        nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
 93.1955 +        nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
 93.1956 +        ec->last_qscale_diff = 0;
 93.1957 +    }
 93.1958 +
 93.1959 +    m->qscale_mb_xy = ec->qscale[mb_x]= ec->curr_qscale;
 93.1960 +    write_back_non_zero_count(ec, s);
 93.1961 +
 93.1962 +
 93.1963 +    return 0;
 93.1964 +}
 93.1965 +
 93.1966 +void free_entropy_context(EntropyContext *ec){
 93.1967 +    /* Releases every table owned by ec and then ec itself; passing NULL is a no-op. */
 93.1968 +    if (!ec)
 93.1969 +        return;
 93.1970 +    av_freep(&ec->non_zero_count_row[0]);
 93.1971 +    av_freep(&ec->non_zero_count_row[1]);
 93.1972 +    av_freep(&ec->mvd_table[0][0]);
 93.1973 +    av_freep(&ec->mvd_table[0][1]);
 93.1974 +    av_freep(&ec->mvd_table[1][0]);
 93.1975 +    av_freep(&ec->mvd_table[1][1]);
 93.1976 +
 93.1977 +    av_freep(&ec->direct_table[0]);
 93.1978 +    av_freep(&ec->direct_table[1]);
 93.1979 +    av_freep(&ec->chroma_pred_mode_table[0]);
 93.1980 +    av_freep(&ec->chroma_pred_mode_table[1]);
 93.1981 +    av_freep(&ec->cbp_table[0]);
 93.1982 +    av_freep(&ec->cbp_table[1]);
 93.1983 +    av_freep(&ec->qscale_table[0]);
 93.1984 +    av_freep(&ec->qscale_table[1]);
 93.1985 +    av_freep(&ec->mb_type_table[0]);
 93.1986 +    av_freep(&ec->mb_type_table[1]);
 93.1987 +    av_freep(&ec->ref_index_table[0][0]);
 93.1988 +    av_freep(&ec->ref_index_table[0][1]);
 93.1989 +    av_freep(&ec->ref_index_table[1][0]);
 93.1990 +    av_freep(&ec->ref_index_table[1][1]);
 93.1991 +    av_free(ec); /* the context struct goes last, after everything it points to */
 93.1992 +}
 93.1993 +
 93.1994 +EntropyContext *get_entropy_context(H264Context *h){
 93.1995 +    /* Allocates a zeroed EntropyContext plus two copies of every per-row table (init_entropy_buf() alternates them as current/top row); returns NULL on allocation failure. */
 93.1996 +    const int mb_height = h->mb_height;
 93.1997 +    const int mb_width  = h->mb_width;
 93.1998 +    const int mb_stride = h->mb_stride;
 93.1999 +    EntropyContext *ec = av_mallocz(sizeof(EntropyContext));
 93.2000 +    if (!ec)
 93.2001 +        return NULL; /* nothing allocated yet, so there is nothing to clean up */
 93.2002 +    ec->mb_width = mb_width;
 93.2003 +    ec->mb_height = mb_height;
 93.2004 +    ec->b_stride  = mb_width*4;
 93.2005 +    ec->mb_stride = mb_stride;
 93.2006 +    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[0], mb_stride * 8 * sizeof(uint8_t), fail)
 93.2007 +    FF_ALLOCZ_OR_GOTO(ec->non_zero_count_row[1], mb_stride * 8 * sizeof(uint8_t), fail)
 93.2008 +
 93.2009 +    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][0], 16*mb_stride * sizeof(uint8_t), fail);
 93.2010 +    FF_ALLOCZ_OR_GOTO(ec->mvd_table[0][1], 16*mb_stride * sizeof(uint8_t), fail);
 93.2011 +    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][0], 16*mb_stride * sizeof(uint8_t), fail);
 93.2012 +    FF_ALLOCZ_OR_GOTO(ec->mvd_table[1][1], 16*mb_stride * sizeof(uint8_t), fail);
 93.2013 +
 93.2014 +    FF_ALLOCZ_OR_GOTO(ec->direct_table[0], 4*mb_stride * sizeof(uint8_t) , fail);
 93.2015 +    FF_ALLOCZ_OR_GOTO(ec->direct_table[1], 4*mb_stride * sizeof(uint8_t) , fail);
 93.2016 +
 93.2017 +    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[0], mb_stride * sizeof(uint8_t), fail)
 93.2018 +    FF_ALLOCZ_OR_GOTO(ec->chroma_pred_mode_table[1], mb_stride * sizeof(uint8_t), fail)
 93.2019 +
 93.2020 +    FF_ALLOCZ_OR_GOTO(ec->cbp_table[0], mb_stride * sizeof(uint16_t), fail)
 93.2021 +    FF_ALLOCZ_OR_GOTO(ec->cbp_table[1], mb_stride * sizeof(uint16_t), fail)
 93.2022 +    /* one extra element: init_entropy_buf() hands the qscale tables out with a +1 offset, exactly like mb_type_table below */
 93.2023 +    FF_ALLOCZ_OR_GOTO(ec->qscale_table[0], (mb_stride+1) * sizeof(uint8_t) , fail)
 93.2024 +    FF_ALLOCZ_OR_GOTO(ec->qscale_table[1], (mb_stride+1) * sizeof(uint8_t) , fail)
 93.2025 +
 93.2026 +    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[0] , (mb_stride+1) * sizeof(uint32_t), fail)
 93.2027 +    FF_ALLOCZ_OR_GOTO(ec->mb_type_table[1] , (mb_stride+1) * sizeof(uint32_t), fail)
 93.2028 +
 93.2029 +    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][0], 4*mb_stride * sizeof(int8_t), fail)
 93.2030 +    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][0], 4*mb_stride * sizeof(int8_t), fail)
 93.2031 +    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[0][1], 4*mb_stride * sizeof(int8_t), fail)
 93.2032 +    FF_ALLOCZ_OR_GOTO(ec->ref_index_table[1][1], 4*mb_stride * sizeof(int8_t), fail)
 93.2033 +
 93.2034 +    ec->zigzag_scan = h->zigzag_scan;
 93.2035 +    ec->zigzag_scan8x8 = h->zigzag_scan8x8;
 93.2036 +
 93.2037 +    return ec;
 93.2038 +fail: /* ec came from av_mallocz, so tables that were never allocated are still NULL when freed here */
 93.2039 +    free_entropy_context(ec);
 93.2040 +    return NULL;
 93.2041 +}
 93.2042 +
 93.2043 +int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line){
 93.2044 +    int top = (line+1)%2;
 93.2045 +    int cur = line%2;
 93.2046 +    /* Points the current/top row views at the two table copies, swapping roles with the parity of line (assumed >= 0); s is unused. Returns 0 so the definition matches the int prototype in h264_entropy.h. */
 93.2047 +    ec->non_zero_count_top      = ec->non_zero_count_row[top];
 93.2048 +    ec->non_zero_count          = ec->non_zero_count_row[cur];
 93.2049 +    ec->mvd_top[0]              = ec->mvd_table[0][top];
 93.2050 +    ec->mvd[0]                  = ec->mvd_table[0][cur];
 93.2051 +    ec->mvd_top[1]              = ec->mvd_table[1][top];
 93.2052 +    ec->mvd[1]                  = ec->mvd_table[1][cur];
 93.2053 +    ec->direct_top              = ec->direct_table[top];
 93.2054 +    ec->direct                  = ec->direct_table[cur];
 93.2055 +    ec->chroma_pred_mode_top    = ec->chroma_pred_mode_table[top];
 93.2056 +    ec->chroma_pred_mode        = ec->chroma_pred_mode_table[cur];
 93.2057 +    ec->cbp_top                 = ec->cbp_table[top];
 93.2058 +    ec->cbp                     = ec->cbp_table[cur];
 93.2059 +    ec->qscale_top              = ec->qscale_table[top] +1; /* +1: the views start one element into the table */
 93.2060 +    ec->qscale                  = ec->qscale_table[cur] +1;
 93.2061 +    ec->mb_type_top             = ec->mb_type_table[top]+1;
 93.2062 +    ec->mb_type                 = ec->mb_type_table[cur]+1;
 93.2063 +    ec->ref_index_top[0]        = ec->ref_index_table[0][top];
 93.2064 +    ec->ref_index_top[1]        = ec->ref_index_table[1][top];
 93.2065 +    ec->ref_index[0]            = ec->ref_index_table[0][cur];
 93.2066 +    ec->ref_index[1]            = ec->ref_index_table[1][cur];
 93.2067 +    return 0;
 93.2068 +}

    94.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    94.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_entropy.h	Mon Aug 27 12:09:56 2012 +0200
    94.3 @@ -0,0 +1,20 @@
    94.4 +#ifndef H264_CABAC_H
    94.5 +#define H264_CABAC_H
    94.6 +/* NOTE(review): the guard name does not match this file's name (h264_entropy.h). */
    94.7 +#include "h264_types.h"
    94.8 +#include "cabac.h"
    94.9 +
   94.10 +/**
   94.11 + * Decodes one CABAC coded macroblock.
   94.12 + * @return 0 if OK, -1 on error (NOTE(review): AC_ERROR / DC_ERROR / MV_ERROR do not appear to be returned by the implementation; confirm)
   94.13 + */
   94.14 +
   94.15 +int ff_h264_decode_mb_cabac(EntropyContext *ec, H264Slice *s, CABACContext *c);
   94.16 +void ff_h264_init_cabac_states(EntropyContext *ec, H264Slice *s, CABACContext *c);
   94.17 +
   94.18 +int init_entropy_buf(EntropyContext *ec, H264Slice *s, int line); /* selects the current/top row buffers for macroblock line `line`; NOTE(review): keep the return type in sync with the definition */
   94.19 +EntropyContext * get_entropy_context(H264Context *h); /* returns NULL if an allocation fails; release with free_entropy_context() */
   94.20 +void init_dequant_tables(H264Slice *s, EntropyContext *ec);
   94.21 +void free_entropy_context(EntropyContext *ec); /* frees the context's tables and the context itself */
   94.22 +
   94.23 +#endif

    95.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    95.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_idct.c	Mon Aug 27 12:09:56 2012 +0200
    95.3 @@ -0,0 +1,270 @@
    95.4 +/*
    95.5 + * H.264 IDCT
    95.6 + * Copyright (c) 2004 Michael Niedermayer <michaelni@gmx.at>
    95.7 + *
    95.8 + * This file is part of FFmpeg.
    95.9 + *
   95.10 + * FFmpeg is free software; you can redistribute it and/or
   95.11 + * modify it under the terms of the GNU Lesser General Public
   95.12 + * License as published by the Free Software Foundation; either
   95.13 + * version 2.1 of the License, or (at your option) any later version.
   95.14 + *
   95.15 + * FFmpeg is distributed in the hope that it will be useful,
   95.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
   95.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   95.18 + * Lesser General Public License for more details.
   95.19 + *
   95.20 + * You should have received a copy of the GNU Lesser General Public
   95.21 + * License along with FFmpeg; if not, write to the Free Software
   95.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   95.23 + */
   95.24 +
   95.25 +/**
   95.26 + * @file
   95.27 + * H.264 IDCT.
   95.28 + * @author Michael Niedermayer <michaelni@gmx.at>
   95.29 + */
   95.30 +
   95.31 +#include "dsputil.h"
   95.32 +#include "h264_data.h"
   95.33 +
   95.34 +static av_always_inline void idct_internal(uint8_t *dst, DCTELEM *block, int stride, int block_stride, int shift, int add){
   95.35 +    int i;
   95.36 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* lookup applied to every output sample; presumably saturates to the uint8_t range -- confirm against ff_cropTbl */
   95.37 +    /* 4x4 inverse transform of block (overwritten by pass 1); the result >> shift is added to dst when add=1 or replaces dst when add=0. */
   95.38 +    block[0] += 1<<(shift-1); /* rounding term for the final >> shift; needs shift >= 1 */
   95.39 +    /* pass 1: transform the four coefficients block[0..3 + block_stride*i] of each group i and store them back */
   95.40 +    for(i=0; i<4; i++){
   95.41 +        const int z0=  block[0 + block_stride*i]     +  block[2 + block_stride*i];
   95.42 +        const int z1=  block[0 + block_stride*i]     -  block[2 + block_stride*i];
   95.43 +        const int z2= (block[1 + block_stride*i]>>1) -  block[3 + block_stride*i];
   95.44 +        const int z3=  block[1 + block_stride*i]     + (block[3 + block_stride*i]>>1);
   95.45 +
   95.46 +        block[0 + block_stride*i]= z0 + z3;
   95.47 +        block[1 + block_stride*i]= z1 + z2;
   95.48 +        block[2 + block_stride*i]= z1 - z2;
   95.49 +        block[3 + block_stride*i]= z0 - z3;
   95.50 +    }
   95.51 +    /* pass 2: transform across the groups (element i of each) and emit column i of dst, rows 0..3 */
   95.52 +    for(i=0; i<4; i++){
   95.53 +        const int z0=  block[i + block_stride*0]     +  block[i + block_stride*2];
   95.54 +        const int z1=  block[i + block_stride*0]     -  block[i + block_stride*2];
   95.55 +        const int z2= (block[i + block_stride*1]>>1) -  block[i + block_stride*3];
   95.56 +        const int z3=  block[i + block_stride*1]     + (block[i + block_stride*3]>>1);
   95.57 +
   95.58 +        dst[i + 0*stride]= cm[ add*dst[i + 0*stride] + ((z0 + z3) >> shift) ];
   95.59 +        dst[i + 1*stride]= cm[ add*dst[i + 1*stride] + ((z1 + z2) >> shift) ];
   95.60 +        dst[i + 2*stride]= cm[ add*dst[i + 2*stride] + ((z1 - z2) >> shift) ];
   95.61 +        dst[i + 3*stride]= cm[ add*dst[i + 3*stride] + ((z0 - z3) >> shift) ];
   95.62 +    }
   95.63 +}
   95.64 +
   95.65 +void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride){
   95.66 +    idct_internal(dst, block, stride, 4, 6, 1); /* 4x4 IDCT: coefficient groups 4 apart, result >> 6, added onto dst */
   95.67 +}
   95.68 +
   95.69 +void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block){ /* argument order is (dst, stride, block), unlike ff_h264_idct_add_c */
   95.70 +    idct_internal(dst, block, stride, 8, 3, 1); /* coefficient groups 8 apart, result >> 3, added onto dst */
   95.71 +}
   95.72 +
   95.73 +void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block){ /* argument order is (dst, stride, block), unlike ff_h264_idct_add_c */
   95.74 +    idct_internal(dst, block, stride, 8, 3, 0); /* coefficient groups 8 apart, result >> 3, overwrites dst (add=0) */
   95.75 +}
   95.76 +
   95.77 +void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
   95.78 +    int i;
   95.79 +    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
   95.80 +    /* 8x8 inverse transform of block (overwritten by the first pass); the result >> 6 is added to dst through the cm lookup. */
   95.81 +    block[0] += 32; /* rounding term for the final >> 6 */
   95.82 +
   95.83 +    for( i = 0; i < 8; i++ )
   95.84 +    { /* pass 1: the eight coefficients block[0..7 + i*8], transformed in place */
   95.85 +        const int a0 =  block[0+i*8] + block[4+i*8]; /* a0,a2,a4,a6 combine the even inputs 0,2,4,6 */
   95.86 +        const int a2 =  block[0+i*8] - block[4+i*8];
   95.87 +        const int a4 = (block[2+i*8]>>1) - block[6+i*8];
   95.88 +        const int a6 = (block[6+i*8]>>1) + block[2+i*8];
   95.89 +
   95.90 +        const int b0 = a0 + a6;
   95.91 +        const int b2 = a2 + a4;
   95.92 +        const int b4 = a2 - a4;
   95.93 +        const int b6 = a0 - a6;
   95.94 +
   95.95 +        const int a1 = -block[3+i*8] + block[5+i*8] - block[7+i*8] - (block[7+i*8]>>1); /* a1,a3,a5,a7 combine the odd inputs 1,3,5,7 */
   95.96 +        const int a3 =  block[1+i*8] + block[7+i*8] - block[3+i*8] - (block[3+i*8]>>1);
   95.97 +        const int a5 = -block[1+i*8] + block[7+i*8] + block[5+i*8] + (block[5+i*8]>>1);
   95.98 +        const int a7 =  block[3+i*8] + block[5+i*8] + block[1+i*8] + (block[1+i*8]>>1);
   95.99 +
  95.100 +        const int b1 = (a7>>2) + a1;
  95.101 +        const int b3 =  a3 + (a5>>2);
  95.102 +        const int b5 = (a3>>2) - a5;
  95.103 +        const int b7 =  a7 - (a1>>2);
  95.104 +
  95.105 +        block[0+i*8] = b0 + b7;
  95.106 +        block[7+i*8] = b0 - b7;
  95.107 +        block[1+i*8] = b2 + b5;
  95.108 +        block[6+i*8] = b2 - b5;
  95.109 +        block[2+i*8] = b4 + b3;
  95.110 +        block[5+i*8] = b4 - b3;
  95.111 +        block[3+i*8] = b6 + b1;
  95.112 +        block[4+i*8] = b6 - b1;
  95.113 +    }
  95.114 +    for( i = 0; i < 8; i++ )
  95.115 +    { /* pass 2: same butterflies across block[i + 0..7*8]; outputs go to column i of dst, block is not written */
  95.116 +        const int a0 =  block[i+0*8] + block[i+4*8];
  95.117 +        const int a2 =  block[i+0*8] - block[i+4*8];
  95.118 +        const int a4 = (block[i+2*8]>>1) - block[i+6*8];
  95.119 +        const int a6 = (block[i+6*8]>>1) + block[i+2*8];
  95.120 +
  95.121 +        const int b0 = a0 + a6;
  95.122 +        const int b2 = a2 + a4;
  95.123 +        const int b4 = a2 - a4;
  95.124 +        const int b6 = a0 - a6;
  95.125 +
  95.126 +        const int a1 = -block[i+3*8] + block[i+5*8] - block[i+7*8] - (block[i+7*8]>>1);
  95.127 +        const int a3 =  block[i+1*8] + block[i+7*8] - block[i+3*8] - (block[i+3*8]>>1);
  95.128 +        const int a5 = -block[i+1*8] + block[i+7*8] + block[i+5*8] + (block[i+5*8]>>1);
  95.129 +        const int a7 =  block[i+3*8] + block[i+5*8] + block[i+1*8] + (block[i+1*8]>>1);
  95.130 +
  95.131 +        const int b1 = (a7>>2) + a1;
  95.132 +        const int b3 =  a3 + (a5>>2);
  95.133 +        const int b5 = (a3>>2) - a5;
  95.134 +        const int b7 =  a7 - (a1>>2);
  95.135 +
  95.136 +        dst[i + 0*stride] = cm[ dst[i + 0*stride] + ((b0 + b7) >> 6) ];
  95.137 +        dst[i + 1*stride] = cm[ dst[i + 1*stride] + ((b2 + b5) >> 6) ];
  95.138 +        dst[i + 2*stride] = cm[ dst[i + 2*stride] + ((b4 + b3) >> 6) ];
  95.139 +        dst[i + 3*stride] = cm[ dst[i + 3*stride] + ((b6 + b1) >> 6) ];
  95.140 +        dst[i + 4*stride] = cm[ dst[i + 4*stride] + ((b6 - b1) >> 6) ];
  95.141 +        dst[i + 5*stride] = cm[ dst[i + 5*stride] + ((b4 - b3) >> 6) ];
  95.142 +        dst[i + 6*stride] = cm[ dst[i + 6*stride] + ((b2 - b5) >> 6) ];
  95.143 +        dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
  95.144 +    }
  95.145 +}
  95.146 +
  95.147 +// DC-only 4x4 case: assumes all AC coefs are 0, so only block[0] is read.
  95.148 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
  95.149 +    uint8_t *const clip = ff_cropTbl + MAX_NEG_CROP;
  95.150 +    const int dc = (block[0] + 32) >> 6; // rounded DC term, added to every pixel below
  95.151 +    int row, col;
  95.152 +
  95.153 +    for (row = 0; row < 4; row++, dst += stride) {
  95.154 +        for (col = 0; col < 4; col++) {
  95.155 +            dst[col] = clip[dst[col] + dc];
  95.156 +        }
  95.157 +    }
  95.158 +}
  95.159 +
  95.160 +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
  95.161 +    // DC-only 8x8 case: the rounded value of block[0] is added to all 64 pixels.
  95.162 +    uint8_t *const clip = ff_cropTbl + MAX_NEG_CROP;
  95.163 +    const int dc = (block[0] + 32) >> 6;
  95.164 +    int row, col;
  95.165 +    for (row = 0; row < 8; row++, dst += stride) {
  95.166 +        for (col = 0; col < 8; col++) {
  95.167 +            dst[col] = clip[dst[col] + dc];
  95.168 +        }
  95.169 +    }
  95.170 +}
  95.171 +
  95.172 +void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  95.173 +    int blk; // walks the 16 4x4 blocks; the coefficients of block k start at block + k*16
  95.174 +    for (blk = 0; blk < 16; blk++) {
  95.175 +        const int coded = nnzc[scan8[blk]];
  95.176 +        if (!coded)
  95.177 +            continue; // no coefficients recorded for this block
  95.178 +        if (coded == 1 && block[blk*16]) ff_h264_idct_dc_add_c(dst + block_offset[blk], block + blk*16, stride); // the single coefficient is the DC one
  95.179 +        else                             idct_internal(dst + block_offset[blk], block + blk*16, stride, 4, 6, 1);
  95.180 +    }
  95.181 +}
  95.182 +
  95.183 +void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  95.184 +    for (int blk = 0; blk < 16; blk++) { // full IDCT when nnzc is set, DC-only add when just block[blk*16] is non-zero
  95.185 +        DCTELEM *const coef = block + blk*16;
  95.186 +        if (nnzc[scan8[blk]]) idct_internal(dst + block_offset[blk], coef, stride, 4, 6, 1);
  95.187 +        else if (coef[0])     ff_h264_idct_dc_add_c(dst + block_offset[blk], coef, stride);
  95.188 +    }
  95.189 +}
  95.190 +
  95.191 +void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  95.192 +    int blk; // visits indices 0, 4, 8, 12: one per 8x8 block
  95.193 +    for (blk = 0; blk < 16; blk += 4) {
  95.194 +        const int coded = nnzc[scan8[blk]];
  95.195 +        if (!coded)
  95.196 +            continue; // no coefficients recorded for this 8x8 block
  95.197 +        if (coded == 1 && block[blk*16]) ff_h264_idct8_dc_add_c(dst + block_offset[blk], block + blk*16, stride); // the single coefficient is the DC one
  95.198 +        else                             ff_h264_idct8_add_c(dst + block_offset[blk], block + blk*16, stride);
  95.199 +    }
  95.200 +}
  95.201 +
  95.202 +void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
  95.203 +    int blk;
  95.204 +    for (blk = 16; blk < 16 + 8; blk++) {
  95.205 +        const int plane = (blk >> 2) & 1; // blocks 16..19 go to dest[0], 20..23 to dest[1]
  95.206 +        DCTELEM *const coef = block + blk*16;
  95.207 +        if (nnzc[scan8[blk]]) ff_h264_idct_add_c(dest[plane] + block_offset[blk], coef, stride);
  95.208 +        else if (coef[0])     ff_h264_idct_dc_add_c(dest[plane] + block_offset[blk], coef, stride);
  95.209 +    }
  95.210 +}
  95.211 +
  95.212 +/**
  95.213 +* IDCT transforms the 16 dc values and dequantizes them (in place, inside block).
  95.214 +* @param qmul dequantization multiplier: each result is (value*qmul + 128) >> 8
  95.215 +*/
  95.216 +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul){
  95.217 +	#define stride 16 // distance between the DC values that are read and written below
  95.218 +	int i;
  95.219 +	int temp[16]; //FIXME check if this is a good idea
  95.220 +	static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
  95.221 +	static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
  95.222 +	// pass 1: for each y_offset, butterfly the values at stride*0, *1, *4 and *5 from that offset into temp
  95.223 +	//return;
  95.224 +	for(i=0; i<4; i++){
  95.225 +		const int offset= y_offset[i];
  95.226 +		const int z0= block[offset+stride*0] + block[offset+stride*4];
  95.227 +		const int z1= block[offset+stride*0] - block[offset+stride*4];
  95.228 +		const int z2= block[offset+stride*1] - block[offset+stride*5];
  95.229 +		const int z3= block[offset+stride*1] + block[offset+stride*5];
  95.230 +
  95.231 +		temp[4*i+0]= z0+z3;
  95.232 +		temp[4*i+1]= z1+z2;
  95.233 +		temp[4*i+2]= z1-z2;
  95.234 +		temp[4*i+3]= z0-z3;
  95.235 +	}
  95.236 +	// pass 2: butterfly across temp, scale by qmul with rounding and write back at stride*0, *2, *8, *10 plus x_offset
  95.237 +	for(i=0; i<4; i++){
  95.238 +		const int offset= x_offset[i];
  95.239 +		const int z0= temp[4*0+i] + temp[4*2+i];
  95.240 +		const int z1= temp[4*0+i] - temp[4*2+i];
  95.241 +		const int z2= temp[4*1+i] - temp[4*3+i];
  95.242 +		const int z3= temp[4*1+i] + temp[4*3+i];
  95.243 +
  95.244 +		block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
  95.245 +		block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
  95.246 +		block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
  95.247 +		block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
  95.248 +	}
  95.249 +}
  95.250 +
  95.251 +#undef xStride // NOTE(review): no xStride macro appears to be #defined in this file; likely a leftover
  95.252 +#undef stride
  95.253 +
  95.254 +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){
  95.255 +	// 2x2 transform and dequantisation of the four DC values stored at
  95.256 +	// block[0], block[16], block[32] and block[48]; results are written back in place.
  95.257 +	DCTELEM *const tl = block;
  95.258 +	DCTELEM *const tr = block + 16;
  95.259 +	DCTELEM *const bl = block + 32;
  95.260 +	DCTELEM *const br = block + 48;
  95.261 +
  95.262 +	// first stage: sums and differences within each pair (all inputs are read before any store)
  95.263 +	const int top_sum  = *tl + *tr;
  95.264 +	const int top_diff = *tl - *tr;
  95.265 +	const int bot_sum  = *bl + *br;
  95.266 +	const int bot_diff = *bl - *br;
  95.267 +
  95.268 +	// second stage: combine the pairs, multiply by qmul and shift down by 7
  95.269 +	*tl = ((top_sum  + bot_sum ) * qmul) >> 7;
  95.270 +	*tr = ((top_diff + bot_diff) * qmul) >> 7;
  95.271 +	*bl = ((top_sum  - bot_sum ) * qmul) >> 7;
  95.272 +	*br = ((top_diff - bot_diff) * qmul) >> 7;
  95.273 +}

    96.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    96.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_idct.h	Mon Aug 27 12:09:56 2012 +0200
    96.3 @@ -0,0 +1,19 @@
    96.4 +#ifndef H264_IDCT_H
    96.5 +#define H264_IDCT_H
    96.6 +
    96.7 +#include "avcodec.h"
    96.8 +/* Prototypes for the C inverse-transform routines defined in h264_idct.c. */
    96.9 +void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
   96.10 +void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
   96.11 +void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
   96.12 +void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
   96.13 +void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); /* note: (dst, stride, block) order */
   96.14 +void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); /* note: (dst, stride, block) order */
   96.15 +void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
   96.16 +void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
   96.17 +void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
   96.18 +void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); /* dest[0]/dest[1]: one destination pointer per group of four blocks */
   96.19 +void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qmul); /* in-place transform + dequantisation of the 16 luma DC values */
   96.20 +void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul); /* in-place 2x2 transform + dequantisation of four DC values */
   96.21 +
   96.22 +#endif

    97.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    97.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_mc.c	Mon Aug 27 12:09:56 2012 +0200
    97.3 @@ -0,0 +1,272 @@
    97.4 +#include "h264_types.h"
    97.5 +#include "h264_data.h"
    97.6 +
    97.7 +static inline void mc_dir_part(MBRecContext *d, MBRecState *mrs, H264Mb *m, DecodedPicture *pic, int n, int square,
    97.8 +							   int chroma_height, int delta, int list,uint8_t *dest_y,
    97.9 +							   uint8_t *dest_cb, uint8_t *dest_cr, int src_x_offset, int src_y_offset,
   97.10 +							   qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
   97.11 +	const int mx= mrs->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
   97.12 +	const int my= mrs->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
   97.13 +	const int luma_xy= (mx&3) + ((my&3)<<2); // low two bits of mx/my pick one of the 16 qpix_op entries
   97.14 +	const int pic_width  = 16*d->mb_width;
   97.15 +	const int pic_height = 16*d->mb_height;
   97.16 +	// Fetches one prediction for partition n from pic: luma via qpix_op[luma_xy], both chroma planes via chroma_op; sample positions are clamped first. (m is not referenced here.)
   97.17 +	uint8_t *src_y, *src_cb, *src_cr;
   97.18 +	int ymx= mx>>2;
   97.19 +	int ymy= my>>2;
   97.20 +	int cmy= my>>3;
   97.21 +	int cmx= mx>>3;
   97.22 +
   97.23 +	//truncate the motion vectors references
   97.24 +	if(ymy>= pic_height+2){
   97.25 +		ymy=pic_height+1;
   97.26 +	}else if(ymy <=-19){
   97.27 +		ymy=-18;
   97.28 +	}
   97.29 +	if(ymx>= pic_width+2){
   97.30 +		ymx= pic_width+1;
   97.31 +	}else if(ymx<=-19){
   97.32 +		ymx=-19; // NOTE(review): the other lower clamps map to bound+1 (-18 for ymy, -8 for chroma); -19 leaves ymx == -19 unchanged -- confirm this is intended
   97.33 +	}
   97.34 +
   97.35 +	src_y = pic->data[0] + ymx + ymy*d->linesize;
   97.36 +	qpix_op[luma_xy](dest_y, src_y, d->linesize); //FIXME try variable height perhaps?
   97.37 +	if(!square){
   97.38 +		qpix_op[luma_xy](dest_y + delta, src_y + delta, d->linesize); // non-square partition: second call, delta further on in both buffers
   97.39 +	}
   97.40 +
   97.41 +	if(cmy >= pic_height>>1){
   97.42 +		cmy = (pic_height>>1) -1;
   97.43 +	}else if(cmy<=-9){
   97.44 +		cmy=-8;
   97.45 +	}
   97.46 +	if(cmx >= pic_width>>1){
   97.47 +		cmx = (pic_width>>1) -1;
   97.48 +	}else if(cmx<=-9){
   97.49 +		cmx=-8;
   97.50 +	}
   97.51 +
   97.52 +	src_cb= pic->data[1] + cmx + cmy*d->uvlinesize;
   97.53 +	src_cr= pic->data[2] + cmx + cmy*d->uvlinesize;
   97.54 +
   97.55 +	chroma_op(dest_cb, src_cb, d->uvlinesize, chroma_height, mx&7, my&7); // low three bits of mx/my are passed as the chroma fraction
   97.56 +	chroma_op(dest_cr, src_cr, d->uvlinesize, chroma_height, mx&7, my&7);
   97.57 +}
   97.58 +/* Unweighted prediction of one partition: a list-0 block is written with the put functions; a list-1 block is written with put when it is the only prediction and blended in with the avg functions otherwise. */
   97.59 +static inline void mc_part_std(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
   97.60 +								uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
   97.61 +								int x_offset, int y_offset,
   97.62 +								qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
   97.63 +								qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
   97.64 +								int list0, int list1){
   97.65 +	const int cache_pos = scan8[n];
   97.66 +
   97.67 +	/* destination of this partition; the offsets count chroma samples, luma uses twice the step */
   97.68 +	uint8_t *const out_y  = dest_y  + 2*x_offset + 2*y_offset*d->  linesize;
   97.69 +	uint8_t *const out_cb = dest_cb +   x_offset +   y_offset*d->uvlinesize;
   97.70 +	uint8_t *const out_cr = dest_cr +   x_offset +   y_offset*d->uvlinesize;
   97.71 +	/* the same offsets, now relative to the whole picture */
   97.72 +	const int pic_x = x_offset + 8*m->mb_x;
   97.73 +	const int pic_y = y_offset + 8*m->mb_y;
   97.74 +
   97.75 +	if(list0){
   97.76 +		/* the first prediction overwrites the destination */
   97.77 +		mc_dir_part(d, mrs, m, s->dp_ref_list[0][ mrs->ref_cache[0][cache_pos] ], n, square, chroma_height, delta, 0,
   97.78 +					out_y, out_cb, out_cr, pic_x, pic_y, qpix_put, chroma_put);
   97.79 +	}
   97.80 +
   97.81 +	if(list1){
   97.82 +		/* averaged with the list-0 result when there is one, plain put otherwise */
   97.83 +		qpel_mc_func *luma_fn = list0 ? qpix_avg : qpix_put;
   97.84 +		h264_chroma_mc_func chroma_fn = list0 ? chroma_avg : chroma_put;
   97.85 +		mc_dir_part(d, mrs, m, s->dp_ref_list[1][ mrs->ref_cache[1][cache_pos] ], n, square, chroma_height, delta, 1,
   97.86 +					out_y, out_cb, out_cr, pic_x, pic_y, luma_fn, chroma_fn);
   97.87 +	}
   97.88 +}
   97.89 +/* Weighted prediction of one partition: bi-predicted blocks are blended with luma_weight_avg/chroma_weight_avg, single-list blocks are predicted normally and then scaled in place with luma_weight_op/chroma_weight_op. */
   97.90 +static inline void mc_part_weighted(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
   97.91 +									uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
   97.92 +									int x_offset, int y_offset,
   97.93 +									qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
   97.94 +									h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
   97.95 +									h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
   97.96 +									int list0, int list1){
   97.97 +	dest_y  += 2*x_offset + 2*y_offset*d->  linesize;
   97.98 +	dest_cb +=   x_offset +   y_offset*d->uvlinesize;
   97.99 +	dest_cr +=   x_offset +   y_offset*d->uvlinesize;
  97.100 +	x_offset += 8*m->mb_x;
  97.101 +	y_offset += 8*m->mb_y;
  97.102 +	/* bi-prediction: list 0 goes to the destination, list 1 to scratch buffers, then the two are blended */
  97.103 +	if(list0 && list1){
  97.104 +		/* don't optimize for luma-only case, since B-frames usually
  97.105 +		* use implicit weights => chroma too. */
  97.106 +		uint8_t *tmp_y  = d->scratchpad_y  + 2*x_offset +16 ;
  97.107 +		uint8_t *tmp_cb = d->scratchpad_cb + x_offset + 8;
  97.108 +		uint8_t *tmp_cr = d->scratchpad_cr + x_offset + 8;
  97.109 +		/* disabled alternative below: carve all three temporaries out of a single d->scratchpad buffer */
  97.110 +/*
  97.111 +		uint8_t *tmp_cb = d->scratchpad;
  97.112 +		uint8_t *tmp_cr = d->scratchpad + 8;
  97.113 +		uint8_t *tmp_y  = d->scratchpad + 8*d->uvlinesize;*/
  97.114 +		int refn0 = mrs->ref_cache[0][ scan8[n] ];
  97.115 +		int refn1 = mrs->ref_cache[1][ scan8[n] ];
  97.116 +
  97.117 +		mc_dir_part(d, mrs, m, s->dp_ref_list[0][refn0], n, square, chroma_height, delta, 0,
  97.118 +					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put);
  97.119 +		mc_dir_part(d, mrs, m, s->dp_ref_list[1][refn1], n, square, chroma_height, delta, 1,
  97.120 +					tmp_y, tmp_cb, tmp_cr, x_offset, y_offset, qpix_put, chroma_put);
  97.121 +		/* use_weight == 2: implicit weights that sum to 64 (denominator argument 5, last argument 0); otherwise the slice weight tables */
  97.122 +		if(s->use_weight == 2){
  97.123 +			int weight0 = s->implicit_weight[refn0][refn1][m->mb_y&1];
  97.124 +			int weight1 = 64 - weight0;
  97.125 +			luma_weight_avg(  dest_y,  tmp_y,  d->  linesize, 5, weight0, weight1, 0);
  97.126 +			chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, 5, weight0, weight1, 0);
  97.127 +			chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, 5, weight0, weight1, 0);
  97.128 +		}else{
  97.129 +			luma_weight_avg(dest_y, tmp_y, d->linesize, s->luma_log2_weight_denom,
  97.130 +							s->luma_weight[refn0][0][0] , s->luma_weight[refn1][1][0],
  97.131 +							s->luma_weight[refn0][0][1] + s->luma_weight[refn1][1][1]);
  97.132 +			chroma_weight_avg(dest_cb, tmp_cb, d->uvlinesize, s->chroma_log2_weight_denom,
  97.133 +							s->chroma_weight[refn0][0][0][0] , s->chroma_weight[refn1][1][0][0],
  97.134 +							s->chroma_weight[refn0][0][0][1] + s->chroma_weight[refn1][1][0][1]);
  97.135 +			chroma_weight_avg(dest_cr, tmp_cr, d->uvlinesize, s->chroma_log2_weight_denom,
  97.136 +							s->chroma_weight[refn0][0][1][0] , s->chroma_weight[refn1][1][1][0],
  97.137 +							s->chroma_weight[refn0][0][1][1] + s->chroma_weight[refn1][1][1][1]);
  97.138 +		}
  97.139 +	}else{
  97.140 +		int list = list1 ? 1 : 0;
  97.141 +		int refn = mrs->ref_cache[list][ scan8[n] ];
  97.142 +		DecodedPicture *ref= s->dp_ref_list[list][refn];
  97.143 +		mc_dir_part(d, mrs, m, ref, n, square, chroma_height, delta, list,
  97.144 +					dest_y, dest_cb, dest_cr, x_offset, y_offset, qpix_put, chroma_put);
  97.145 +		/* single-list prediction: weight the result in place; chroma only when s->use_weight_chroma is set */
  97.146 +		luma_weight_op(dest_y, d->linesize, s->luma_log2_weight_denom,
  97.147 +						s->luma_weight[refn][list][0], s->luma_weight[refn][list][1]);
  97.148 +		if(s->use_weight_chroma){
  97.149 +			chroma_weight_op(dest_cb, d->uvlinesize, s->chroma_log2_weight_denom,
  97.150 +							s->chroma_weight[refn][list][0][0], s->chroma_weight[refn][list][0][1]);
  97.151 +			chroma_weight_op(dest_cr, d->uvlinesize, s->chroma_log2_weight_denom,
  97.152 +							s->chroma_weight[refn][list][1][0], s->chroma_weight[refn][list][1][1]);
  97.153 +		}
  97.154 +	}
  97.155 +}
  97.156 +/* Predict one partition, choosing the weighted path when s->use_weight is 1, or when it is 2 and the bi-predicted block's implicit weight differs from 32 (an even split of 64); everything else takes the unweighted path. */
  97.157 +static inline void mc_part(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int n, int square, int chroma_height, int delta,
  97.158 +							uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
  97.159 +							int x_offset, int y_offset,
  97.160 +							qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
  97.161 +							qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
  97.162 +							h264_weight_func *weight_op, h264_biweight_func *weight_avg,
  97.163 +							int list0, int list1){
  97.164 +	int weighted = (s->use_weight==1);
  97.165 +	if(!weighted && s->use_weight==2 && list0 && list1)	/* the implicit weight is only looked up for bi-predicted blocks */
  97.166 +		weighted = (s->implicit_weight[ mrs->ref_cache[0][scan8[n]] ][ mrs->ref_cache[1][scan8[n]] ][m->mb_y&1] != 32);
  97.167 +	if(weighted)
  97.168 +		mc_part_weighted(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, x_offset, y_offset,
  97.169 +						qpix_put, chroma_put, weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
  97.170 +	else
  97.171 +		mc_part_std(d, mrs, s, m, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
  97.172 +					x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
  97.173 +}
  97.174 +/* Prefetch reference pixels near the position the first motion vector of `list` points at; does nothing when that block has no reference (index < 0). */
  97.175 +static inline void prefetch_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, int list){
  97.176 +	/* the fetched pixels are for an estimated mv 4 macroblocks ahead;
  97.177 +	* the offsets are tuned for 64 byte cache lines */
  97.178 +	const int ref_idx = mrs->ref_cache[list][scan8[0]];
  97.179 +	uint8_t **planes;
  97.180 +	int px, py, luma_off, chroma_off;
  97.181 +	if(ref_idx < 0)
  97.182 +		return;
  97.183 +	px = (mrs->mv_cache[list][scan8[0]][0]>>2) + 16*m->mb_x + 8;
  97.184 +	py = (mrs->mv_cache[list][scan8[0]][1]>>2) + 16*m->mb_y;
  97.185 +	planes = s->dp_ref_list[list][ref_idx]->data;
  97.186 +	luma_off   = px + (py + (m->mb_x&3)*4)*d->linesize + 64;
  97.187 +	chroma_off = (px>>1) + ((py>>1) + (m->mb_x&7))*d->uvlinesize + 64;
  97.188 +	d->dsp.prefetch(planes[0]+luma_off, d->linesize, 4);
  97.189 +	d->dsp.prefetch(planes[1]+chroma_off, planes[2]-planes[1], 2);
  97.190 +}
  97.191 +/* Inter prediction of macroblock m into dest_y/dest_cb/dest_cr: prefetches the reference data, then calls mc_part once per partition according to mb_type and, for 8x8 macroblocks, each sub_mb_type. */
  97.192 +void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
  97.193 +					qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
  97.194 +					qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
  97.195 +					h264_weight_func *weight_op, h264_biweight_func *weight_avg){
  97.196 +	const int mb_type= m->mb_type;
  97.197 +	assert(IS_INTER(mb_type));
  97.198 +	/* prefetch for every reference list this macroblock type uses */
  97.199 +	if (mb_type & MB_TYPE_L0)
  97.200 +		prefetch_motion(d, mrs, s, m, 0);
  97.201 +	if (mb_type & MB_TYPE_L1)
  97.202 +		prefetch_motion(d, mrs, s, m, 1);
  97.203 +	/* mc_part arguments after m: n (scan8 index), square, chroma_height, delta; the x/y offsets count chroma samples */
  97.204 +	if(IS_16X16(mb_type)){
  97.205 +		mc_part(d, mrs, s, m, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
  97.206 +				qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
  97.207 +				weight_op, weight_avg,
  97.208 +				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
  97.209 +	}else if(IS_16X8(mb_type)){ /* first partition at y_offset 0 (n=0), second at y_offset 4 (n=8) */
  97.210 +		mc_part(d, mrs, s, m, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
  97.211 +				qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
  97.212 +				&weight_op[1], &weight_avg[1],
  97.213 +				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
  97.214 +		mc_part(d, mrs, s, m, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
  97.215 +				qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
  97.216 +				&weight_op[1], &weight_avg[1],
  97.217 +				IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
  97.218 +	}else if(IS_8X16(mb_type)){ /* first partition at x_offset 0 (n=0), second at x_offset 4 (n=4) */
  97.219 +		mc_part(d, mrs, s, m, 0, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 0, 0,
  97.220 +				qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
  97.221 +				&weight_op[2], &weight_avg[2],
  97.222 +				IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
  97.223 +		mc_part(d, mrs, s, m, 4, 0, 8, 8*d->linesize, dest_y, dest_cb, dest_cr, 4, 0,
  97.224 +				qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
  97.225 +				&weight_op[2], &weight_avg[2],
  97.226 +				IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
  97.227 +	}else{
  97.228 +		int i;
  97.229 +
  97.230 +		assert(IS_8X8(mb_type));
  97.231 +		/* four quadrants; each one is predicted according to its own sub_mb_type */
  97.232 +		for(i=0; i<4; i++){
  97.233 +			const int sub_mb_type= m->sub_mb_type[i];
  97.234 +			const int n= 4*i;
  97.235 +			int x_offset= (i&1)<<2;
  97.236 +			int y_offset= (i&2)<<1;
  97.237 +
  97.238 +			if(IS_SUB_8X8(sub_mb_type)){
  97.239 +				mc_part(d, mrs, s, m, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
  97.240 +						qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
  97.241 +						&weight_op[3], &weight_avg[3],
  97.242 +						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
  97.243 +			}else if(IS_SUB_8X4(sub_mb_type)){ /* two calls, the second one at y_offset+2 (n+2) */
  97.244 +				mc_part(d, mrs, s, m, n, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
  97.245 +						qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
  97.246 +						&weight_op[4], &weight_avg[4],
  97.247 +						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
  97.248 +				mc_part(d, mrs, s, m, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
  97.249 +						qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
  97.250 +						&weight_op[4], &weight_avg[4],
  97.251 +						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
  97.252 +			}else if(IS_SUB_4X8(sub_mb_type)){ /* two calls, the second one at x_offset+2 (n+1) */
  97.253 +				mc_part(d, mrs, s, m, n, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
  97.254 +						qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
  97.255 +						&weight_op[5], &weight_avg[5],
  97.256 +						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
  97.257 +				mc_part(d, mrs, s, m, n+1, 0, 4, 4*d->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
  97.258 +						qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
  97.259 +						&weight_op[5], &weight_avg[5],
  97.260 +						IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
  97.261 +			}else{
  97.262 +				int j;
  97.263 +				assert(IS_SUB_4X4(sub_mb_type));
  97.264 +				for(j=0; j<4; j++){ /* four 4x4 blocks inside the quadrant */
  97.265 +					int sub_x_offset= x_offset + 2*(j&1);
  97.266 +					int sub_y_offset= y_offset +   (j&2);
  97.267 +					mc_part(d, mrs, s, m, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
  97.268 +							qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
  97.269 +							&weight_op[6], &weight_avg[6],
  97.270 +							IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
  97.271 +				}
  97.272 +			}
  97.273 +		}
  97.274 +	}
  97.275 +}

    98.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    98.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_mc.h	Mon Aug 27 12:09:56 2012 +0200
    98.3 @@ -0,0 +1,12 @@
    98.4 +#ifndef H264_MC_H
    98.5 +#define H264_MC_H
    98.6 +
    98.7 +#include "dsputil.h"
    98.8 +#include "h264_types.h"
    98.9 +/* Motion-compensated (inter) prediction of macroblock m into dest_y/dest_cb/dest_cr; defined in h264_mc.c. */
   98.10 +void hl_motion(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
   98.11 +					qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
   98.12 +					qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
   98.13 +					h264_weight_func *weight_op, h264_biweight_func *weight_avg);
   98.14 +
   98.15 +#endif

    99.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
    99.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_misc.c	Mon Aug 27 12:09:56 2012 +0200
    99.3 @@ -0,0 +1,944 @@
    99.4 +#include "config.h"
    99.5 +
    99.6 +#include "h264_types.h"
    99.7 +
    99.8 +#include <unistd.h>
    99.9 +#include <sys/resource.h>
   99.10 +#include <sys/time.h>
   99.11 +#include <time.h>
   99.12 +#include <pthread.h>
   99.13 +#undef NDEBUG
   99.14 +#include <assert.h>
   99.15 +
   99.16 +#if HAVE_LIBSDL2
   99.17 +#include <SDL2/SDL.h>
   99.18 +#if HAVE_LIBSDL_TTF
   99.19 +#include <SDL/SDL_ttf.h>
   99.20 +#endif
   99.21 +#endif
   99.22 +/* Record the current CLOCK_REALTIME time as the start of timing stage `stage`. */
   99.23 +void start_timer(H264Context *h, int stage){
   99.24 +    clock_gettime(CLOCK_REALTIME, h->start_time + stage);
   99.25 +}
   99.26 +/* Record the end of timing stage `stage` and account the elapsed time, in milliseconds, as both the last and the accumulated time of that stage. */
   99.27 +void stop_timer(H264Context *h, int stage){
   99.28 +    clock_gettime(CLOCK_REALTIME, h->end_time + stage);
   99.29 +    const double elapsed_ms = 1.e3*(h->end_time[stage].tv_sec - h->start_time[stage].tv_sec) + 1.e-6*(h->end_time[stage].tv_nsec - h->start_time[stage].tv_nsec);
   99.30 +    h->total_time[stage] += elapsed_ms;
   99.31 +    h->last_time [stage]  = elapsed_ms;
   99.32 +}
   99.33 +/* Allocate the macroblock array (mb_width*mb_height entries) of a slice buffer entry and flag the entry as initialized. */
   99.34 +void init_sb_entry(H264Context *h, SliceBufferEntry *sbe){
   99.35 +    sbe->initialized = 1;   /* set unconditionally; the allocation result is not checked here */
   99.36 +    sbe->mbs = av_malloc((h->mb_width*h->mb_height) * sizeof(H264Mb));
   99.37 +}
   99.38 +/* Release the buffers owned by a slice buffer entry (macroblock array, raw and rbsp bitstream buffers) and mark it uninitialized. */
   99.39 +void free_sb_entry(SliceBufferEntry *sbe){
   99.40 +    /* av_freep() also resets each pointer to NULL, so a repeated call cannot free twice */
   99.41 +    av_freep(&sbe->mbs);
   99.42 +    av_freep(&sbe->gb.raw);
   99.43 +    av_freep(&sbe->gb.rbsp);
   99.44 +    sbe->initialized = 0;
   99.45 +}
   99.46 +/* Block until a slice buffer entry is free, claim the first one with state 0 and return it with its slice header zeroed. */
   99.47 +SliceBufferEntry *get_sb_entry(H264Context *h){
   99.48 +    SliceBufferEntry *sb = NULL;
   99.49 +
   99.50 +    pthread_mutex_lock(&h->lock[PARSE]);
   99.51 +    while (h->free_sb_cnt<=0)
   99.52 +        pthread_cond_wait(&h->cond[PARSE], &h->lock[PARSE]);
   99.53 +    /* use first free slice buffer entry */
   99.54 +    for(int i=0; i<h->sb_size; i++){
   99.55 +        if(h->sb[i].state==0){
   99.56 +            sb= &h->sb[i];
   99.57 +            sb->state=1;
   99.58 +            sb->lines_taken=0;
   99.59 +            sb->lines_total=h->mb_height;
   99.60 +            break;
   99.61 +        }
   99.62 +    }
   99.63 +    assert(sb);   /* free_sb_cnt > 0 must mean a state-0 entry exists; fail loudly instead of writing through NULL below */
   99.64 +    h->free_sb_cnt--;
   99.65 +    pthread_mutex_unlock(&h->lock[PARSE]);
   99.66 +
   99.67 +    memset (&sb->slice, 0, sizeof(H264Slice));
   99.68 +
   99.69 +    return sb;
   99.70 +}
   99.71 +/* Hand a slice buffer entry back: mark it free (state 0), bump the free counter and signal the PARSE condition. */
   99.72 +void release_sb_entry(H264Context *h, SliceBufferEntry *sb){
   99.73 +    pthread_mutex_t *parse_lock = &h->lock[PARSE];
   99.74 +    pthread_mutex_lock(parse_lock);
   99.75 +    h->free_sb_cnt++;
   99.76 +    sb->state = 0;
   99.77 +    /* get_sb_entry() waits on this condition while no entry is free */
   99.78 +    pthread_cond_signal(&h->cond[PARSE]);
   99.79 +    pthread_mutex_unlock(parse_lock);
   99.80 +}
   99.81 +/* Bind DPB slot `pic` to slice `s` and, on first use of the slot, allocate its pixel planes and per-macroblock arrays. Returns 0, or -1 when execution reaches the `fail` label. */
   99.82 +int init_dpb_entry(H264Context *h, DecodedPicture *pic, H264Slice *s, int width, int height){
   99.83 +    int i;
   99.84 +    /* per-picture metadata is refreshed from the slice on every call */
   99.85 +    s->curr_pic=pic;
   99.86 +    pic->poc = s->poc;
   99.87 +    pic->key_frame = s->key_frame;
   99.88 +    pic->mmco_reset = s->mmco_reset;
   99.89 +    pic->reference = s->nal_ref_idc? 3:1;
   99.90 +    pic->cpn = s->coded_pic_num;
   99.91 +    /* pixel planes are allocated only while data[0] is still NULL */
   99.92 +    if(pic->data[0]==NULL) {
   99.93 +        int size[3] = {0};
   99.94 +        /* pad the picture with an EDGE_WIDTH border on every side */
   99.95 +        width+= EDGE_WIDTH*2;
   99.96 +        height+= EDGE_WIDTH*2;
   99.97 +
   99.98 +        pic->linesize[0]= width;
   99.99 +        pic->linesize[1]=  pic->linesize[2] = width>>1;
  99.100 +        /* each chroma plane gets a quarter of the luma plane size */
  99.101 +        size[0] = width*height;
  99.102 +        size[1] = size[2] = width*height>>2;
  99.103 +        /* NOTE(review): the av_malloc() results are not checked before the offsets below are applied */
  99.104 +        for(i=0; i<3; i++){
  99.105 +            pic->base[i]= av_malloc(size[i]);
  99.106 +        }
  99.107 +        /* data[] points at the first picture sample, just inside the border */
  99.108 +        pic->data[0] = pic->base[0] + (pic->linesize[0]*EDGE_WIDTH) + EDGE_WIDTH;
  99.109 +        pic->data[1] = pic->base[1] + (pic->linesize[1]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
  99.110 +        pic->data[2] = pic->base[2] + (pic->linesize[2]*EDGE_WIDTH>>1) + (EDGE_WIDTH>>1);
  99.111 +    }
  99.112 +
  99.113 +    const int big_mb_num= h->mb_stride*(h->mb_height+1) + 1; //the +1 is needed so memset(,,stride*height) does not sig11
  99.114 +    const int mb_array_size= h->mb_stride*h->mb_height;
  99.115 +    const int b4_array_size= h->b4_stride*h->mb_height*4;
  99.116 +    /* side-info arrays are likewise allocated once; FF_ALLOCZ_OR_GOTO presumably zero-allocates and jumps to `fail` on error (macro not visible here) */
  99.117 +    if(pic->mb_type_base==NULL){
  99.118 +        FF_ALLOCZ_OR_GOTO(pic->mb_type_base , big_mb_num * sizeof(uint32_t), fail)
  99.119 +        pic->mb_type= pic->mb_type_base + h->mb_stride+1;
  99.120 +
  99.121 +        for(int i=0; i<2; i++){
  99.122 +            FF_ALLOCZ_OR_GOTO(pic->motion_val_base[i], 2 * (b4_array_size+4)  * sizeof(int16_t), fail)
  99.123 +            pic->motion_val[i]= pic->motion_val_base[i]+4;
  99.124 +            FF_ALLOCZ_OR_GOTO(pic->ref_index[i], 4*mb_array_size * sizeof(uint8_t), fail)
  99.125 +        }
  99.126 +        FF_ALLOCZ_OR_GOTO(pic->intra4x4_pred_mode, h->mb_width*h->mb_height * 4* sizeof(int8_t), fail)
  99.127 +    }
  99.128 +
  99.129 +    return 0;
  99.130 +    fail:
  99.131 +    return -1;
  99.132 +}
  99.133 +/* Free the pixel planes and side-info arrays of a decoded picture. Every freed pointer is reset to NULL, so the slot can be freed again or re-initialised by init_dpb_entry() safely. */
  99.134 +void free_dp(DecodedPicture *pic){
  99.135 +    if(pic->base[0]){
  99.136 +        for (int i=0; i<3; i++){
  99.137 +            av_freep(&pic->base[i]);
  99.138 +            pic->data[i]= NULL;
  99.139 +        }
  99.140 +    }
  99.141 +    if (pic->mb_type_base){
  99.142 +        av_freep(&pic->mb_type_base);
  99.143 +        pic->mb_type= NULL;
  99.144 +        for(int i=0; i<2; i++){
  99.145 +            av_freep(&pic->motion_val_base[i]);
  99.146 +            av_freep(&pic->ref_index[i]);
  99.147 +        }
  99.148 +        av_freep(&pic->intra4x4_pred_mode);
  99.149 +    }
  99.150 +}
  99.151 +/* Block until a decoded-picture-buffer slot is free, take the first slot whose reference flags are zero and initialise it for slice `s`. */
  99.152 +DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s){
  99.153 +    DecodedPicture *slot = NULL;
  99.154 +    int idx;
  99.155 +
  99.156 +    pthread_mutex_lock(&h->lock[REORDER2]);
  99.157 +    while (h->free_dpb_cnt<=0){
  99.158 +    #if OMPSS
  99.159 +        assert(0);   /* OMPSS builds abort here instead of waiting for a slot */
  99.160 +    #endif
  99.161 +        pthread_cond_wait(&h->cond[REORDER2], &h->lock[REORDER2]);
  99.162 +    }
  99.163 +    /* first slot with no reference bits set */
  99.164 +    for(idx=0; idx<h->max_dpb_cnt && slot==NULL; idx++){
  99.165 +        if(h->dpb[idx].reference==0)
  99.166 +            slot= &h->dpb[idx];
  99.167 +    }
  99.168 +    assert(slot);
  99.169 +    /* the slot is set up while the lock is still held */
  99.170 +    init_dpb_entry(h, slot, s, h->width, h->height);
  99.171 +    h->acdpb_cnt++; //debug
  99.172 +    h->free_dpb_cnt--;
  99.173 +    pthread_mutex_unlock(&h->lock[REORDER2]);
  99.174 +
  99.175 +    return slot;
  99.176 +}
  99.177 +/* Clear the `mode` bits of a picture's reference flags; once none remain, the slot is counted as free again and the REORDER2 condition is signalled. */
  99.178 +void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode){
  99.179 +    pthread_mutex_lock(&h->lock[REORDER2]);
  99.180 +    pic->reference = pic->reference & ~mode;
  99.181 +    if (!pic->reference){
  99.182 +        h->reldpb_cnt++; //debug
  99.183 +        h->free_dpb_cnt++;
  99.184 +        pthread_cond_signal(&h->cond[REORDER2]);
  99.185 +    }
  99.186 +    pthread_mutex_unlock(&h->lock[REORDER2]);
  99.187 +}
  99.188 +
  99.189 +
  99.190 +/**
  99.191 +*   Extends the edges of macroblock line `line`: the first/last pixel of each row is replicated into the left/right border; for line 0 the top row and for the last line the bottom row is copied into the border above/below.
  99.192 +*/
  99.193 +void draw_edges(MBRecContext *d, H264Slice *s, int line){
  99.194 +    int i;
  99.195 +    int mb_width=d->mb_width;
  99.196 +    int mb_height=d->mb_height;
  99.197 +    int last = (line+1 == mb_height);
  99.198 +    int lines = last?16:12; /* the loops below start 4 luma rows above this line and stop 4 rows short of its end, except on the last line */
  99.199 +    int linesize = d->linesize;
  99.200 +    int uvlinesize = d->uvlinesize;
  99.201 +    uint8_t *y = s->curr_pic->data[0] + 16*line*linesize;
  99.202 +    uint8_t *cb = s->curr_pic->data[1] + 8*line*uvlinesize;
  99.203 +    uint8_t *cr = s->curr_pic->data[2] + 8*line*uvlinesize;
  99.204 +    /* left and right borders: EDGE_WIDTH luma bytes, EDGE_WIDTH/2 chroma bytes per row */
  99.205 +    for (i=-4; i<lines; i++){
  99.206 +        memset(y + i*linesize - EDGE_WIDTH, y[i*linesize], EDGE_WIDTH);
  99.207 +        memset(y + i*linesize + mb_width*16, y[i*linesize +mb_width*16 -1], EDGE_WIDTH);
  99.208 +    }
  99.209 +    for (i=-2; i<lines/2; i++){
  99.210 +        memset(cb + i*uvlinesize - EDGE_WIDTH/2, cb[i*uvlinesize], EDGE_WIDTH/2);
  99.211 +        memset(cb + i*uvlinesize + mb_width*8, cb[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2);
  99.212 +        memset(cr + i*uvlinesize - EDGE_WIDTH/2, cr[i*uvlinesize], EDGE_WIDTH/2);
  99.213 +        memset(cr + i*uvlinesize + mb_width*8, cr[i*uvlinesize +mb_width*8 -1], EDGE_WIDTH/2);
  99.214 +    }
  99.215 +    /* top/bottom borders: copy a whole row (linesize bytes from the left border on) into 21 luma and 9 chroma rows */
  99.216 +    if (line==0){
  99.217 +        y -= EDGE_WIDTH;
  99.218 +        cb -= EDGE_WIDTH/2;
  99.219 +        cr -= EDGE_WIDTH/2;
  99.220 +        for (i=1; i<=21; i++){
  99.221 +            memcpy(y -i*linesize, y, linesize);
  99.222 +        }
  99.223 +        for (i=1; i<=9; i++){
  99.224 +            memcpy(cb -i*uvlinesize, cb, uvlinesize);
  99.225 +            memcpy(cr -i*uvlinesize, cr, uvlinesize);
  99.226 +        }
  99.227 +    }else if (last){ /* NOTE(review): with mb_height == 1 line 0 is also the last line and this branch is never taken */
  99.228 +        y += -EDGE_WIDTH + 15*linesize;
  99.229 +        cb += -EDGE_WIDTH/2 + 7*uvlinesize;
  99.230 +        cr += -EDGE_WIDTH/2 + 7*uvlinesize;
  99.231 +        for (i=1; i<=21; i++){
  99.232 +            memcpy(y +i*linesize, y, linesize);
  99.233 +        }
  99.234 +        for (i=1; i<=9; i++){
  99.235 +            memcpy(cb +i*uvlinesize, cb, uvlinesize);
  99.236 +            memcpy(cr +i*uvlinesize, cr, uvlinesize);
  99.237 +        }
  99.238 +    }
  99.239 +}
  99.240 +/* File-scope timestamp in microseconds, written by av_start_timer(). */
  99.241 +static int64_t timer_start;
  99.242 +int64_t av_gettime(void) {
  99.243 +    struct timeval now;   /* gettimeofday() result, returned below as a single microsecond count */
  99.244 +    gettimeofday(&now, NULL);
  99.245 +    return now.tv_usec + (int64_t)now.tv_sec * 1000000;
  99.246 +}
  99.247 +/* Store the current av_gettime() value in timer_start; print_report() measures the elapsed time from it. */
  99.248 +void av_start_timer(){
  99.249 +    timer_start = av_gettime();
  99.250 +}
  99.251 +/* Progress reporting on stderr. Periodic calls print at most one line per 0.5 s (only when verbose); the final call prints the overall average fps and the output size in kB. */
  99.252 +void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose) {
  99.253 +    static int64_t last_time = -1;
  99.254 +    static int64_t last_frame_number = 0;
  99.255 +
  99.256 +    if (is_last_report) {
  99.257 +        /* no periodic line here: its divisors are only computed for periodic reports and would be zero */
  99.258 +        float t = (av_gettime()-timer_start) / 1000000.0;
  99.259 +        int avgfps = t > 0 ? (int)(frame_number/t+0.5) : 0;
  99.260 +        last_frame_number = frame_number;
  99.261 +        last_time = 0;   /* periodic-report state as left after a final report */
  99.262 +        fprintf(stderr, "%c[2Kframe=%5d avgfps=%3d\r", 27, frame_number, avgfps);
  99.263 +        fprintf(stderr, "\n");
  99.264 +        fprintf(stderr, "video:%1.0fkB\n", video_size/1024.0);
  99.265 +        fflush(stderr);
  99.266 +        return;
  99.267 +    }
  99.268 +
  99.269 +    /* display the report every 0.5 seconds */
  99.270 +    int64_t cur_time = av_gettime();
  99.271 +    if (last_time == -1) {
  99.272 +        last_time = cur_time;
  99.273 +        return;
  99.274 +    }
  99.275 +    if ((cur_time - last_time) < 500000)
  99.276 +        return;
  99.277 +    if (verbose){
  99.278 +        float t  = (cur_time-timer_start) / 1000000.0;   /* seconds since av_start_timer() */
  99.279 +        float t2 = (cur_time-last_time) / 1000000.0;     /* seconds since the previous report */
  99.280 +        fprintf(stderr, "frame=%5d avgfps=%3d curfps=%3d\r", frame_number, (int)(frame_number/t+0.5), (int)((frame_number - last_frame_number)/t2+0.5) );
  99.281 +        fflush(stderr);
  99.282 +    }
  99.283 +    last_frame_number = frame_number;
  99.284 +    last_time = cur_time;
  99.285 +}
  99.286 +/* Pick the next picture in display order from the delayed-picture queue (B-frame reordering). */
  99.287 +/* Returns NULL when the queue is empty, or when it is not yet over MAX_DELAYED_PIC_COUNT and no flush is requested. */
  99.288 +static DecodedPicture *get_reordered_picture(OutputContext *w, int flush){
  99.289 +    DecodedPicture **queue = w->delayed_pic;
  99.290 +    DecodedPicture *best = queue[0];
  99.291 +    int best_pos = 0, k;
  99.292 +    if (best == NULL)
  99.293 +        return NULL;
  99.294 +    /* lowest POC among the pictures queued ahead of the next key frame or MMCO reset */
  99.295 +    for(k=1; queue[k] != NULL; k++){
  99.296 +        if(queue[k]->key_frame || queue[k]->mmco_reset)
  99.297 +            break;
  99.298 +        if(queue[k]->poc < best->poc){
  99.299 +            best = queue[k];
  99.300 +            best_pos = k;
  99.301 +        }
  99.302 +    }
  99.303 +    /* hold the picture back until the queue is over-full, unless flushing */
  99.304 +    if(!flush && w->dp_cnt <= MAX_DELAYED_PIC_COUNT)
  99.305 +        return NULL;
  99.306 +    for(k=best_pos; queue[k] != NULL; k++)
  99.307 +        queue[k] = queue[k+1];
  99.308 +    w->dp_cnt--;
  99.309 +    return best;
  99.310 +}
  99.311 +
  99.312 +/**
  99.313 +*  Copies the three planes of src, without their borders, back to back into dest and returns the number of bytes written.
  99.314 +*/
  99.315 +static int raw_encode(const DecodedPicture* src, int width, int height, unsigned char *dest) {
  99.316 +    int plane;
  99.317 +/** To write entire image including extra borders*/
  99.318 +//  int w = src->linesize[0];
  99.319 +//  int h = height+64;
  99.320 +//  int w2 = w>>1;
  99.321 +//  int h2 = h>>1;
  99.322 +//     int data_planes=3;
  99.323 +//     int size = w * h + 2 *w2*h2;
  99.324 +//     const unsigned char* s;
  99.325 +//     for (i=0; i<data_planes; i++) {
  99.326 +//         if (i == 1) {
  99.327 +//             w = w2;
  99.328 +//             h = h2;
  99.329 +//         }
  99.330 +//         s = src->base[i];
  99.331 +//         for(j=0; j<h; j++) {
  99.332 +//             memcpy(dest, s, src->linesize[i]);
  99.333 +//             dest += w;
  99.334 +//             s += src->linesize[i];
  99.335 +//         }
  99.336 +//     }
  99.337 +
  99.338 +    const int luma_w   = (width*8 + 7)/8;
  99.339 +    const int luma_h   = height;
  99.340 +    const int chroma_w = ((width >>1) * 8 + 7) / 8;
  99.341 +    const int chroma_h = ((height+1) >>1); //not sure about +1
  99.342 +    const int total    = luma_w * luma_h + 2 *chroma_w*chroma_h;
  99.343 +
  99.344 +    /* plane 0 uses the luma size, planes 1 and 2 the chroma size */
  99.345 +    for (plane=0; plane<3; plane++) {
  99.346 +        const int row_bytes = plane ? chroma_w : luma_w;
  99.347 +        const int rows      = plane ? chroma_h : luma_h;
  99.348 +        const unsigned char *row = src->data[plane];
  99.349 +        int y;
  99.350 +
  99.351 +        /* copy row_bytes per row but advance the source by the full linesize */
  99.352 +        for(y=0; y<rows; y++) {
  99.353 +            memcpy(dest, row, row_bytes);
  99.354 +            dest += row_bytes;
  99.355 +            row += src->linesize[plane];
  99.356 +        }
  99.357 +    }
  99.358 +
  99.359 +    return total;
  99.360 +}
  99.361 +
  99.362 +#ifdef HAVE_LIBSDL2
  99.363 +static SDL_Texture *get_next_texture(H264Context *h, int side){ /* next texture of the queue for the sending (side != 0) or receiving (side == 0) end; `ready` itself is only changed by signal_texture() */
  99.364 +    SDLTextureQueue *sdlq = &h->sdlq;
  99.365 +    SDL_Texture *texture;
  99.366 +    pthread_mutex_lock (&sdlq->sdl_lock);
  99.367 +    if (side ){ //send: wait while all `size` textures are marked ready, then take the one at index fi
  99.368 +        while (sdlq->ready >= sdlq->size)
  99.369 +            pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock);
  99.370 +        texture = sdlq->queue[sdlq->fi];
  99.371 +        sdlq->fi++; sdlq->fi %= sdlq->size;
  99.372 +    } else { //recv: wait until a texture is ready or exit has been requested
  99.373 +        while (sdlq->ready <= 0 && !sdlq->exit)
  99.374 +            pthread_cond_wait(&sdlq->sdl_cond, &sdlq->sdl_lock);
  99.375 +        /* NULL tells the receiver that the queue is drained and exit was requested */
  99.376 +        if (sdlq->ready == 0 && sdlq->exit){
  99.377 +            texture = NULL;
  99.378 +        }else{
  99.379 +            texture = sdlq->queue[sdlq->fo];
  99.380 +            sdlq->fo++; sdlq->fo %= sdlq->size;
  99.381 +        }
  99.382 +    }
  99.383 +    pthread_mutex_unlock(&sdlq->sdl_lock);
  99.384 +
  99.385 +    return texture;
  99.386 +}
  99.387 +/* Adjust the count of ready textures under the queue lock (side != 0: one more, side == 0: one fewer) and signal the queue condition. */
  99.388 +static void signal_texture(H264Context *h, int side){
  99.389 +    SDLTextureQueue *sdlq = &h->sdlq;
  99.390 +    pthread_mutex_lock (&sdlq->sdl_lock);
  99.391 +    if (side)
  99.392 +        sdlq->ready++;
  99.393 +    else
  99.394 +        sdlq->ready--;
  99.395 +    pthread_cond_signal(&sdlq->sdl_cond);
  99.396 +    pthread_mutex_unlock(&sdlq->sdl_lock);
  99.397 +}
  99.398 +/* Set the exit flag of the texture queue and signal its condition, so a receiver waiting in get_next_texture() re-checks its wait condition. */
  99.399 +void signal_sdl_exit(H264Context *h){
  99.400 +    SDLTextureQueue *sdlq = &h->sdlq;
  99.401 +    pthread_mutex_lock (&sdlq->sdl_lock);
  99.402 +    sdlq->exit=1;
  99.403 +    pthread_cond_signal(&sdlq->sdl_cond);
  99.404 +    pthread_mutex_unlock(&sdlq->sdl_lock);
  99.405 +}
  99.406 +/* Copy a decoded picture into the next free SDL texture and mark it ready; droppable frames arriving less than 8125 us after the last shown one are skipped. The parameters w and fd are not used here. */
  99.407 +static void display_frame(H264Context *h, OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height, int dropable){
  99.408 +    static int64_t last_time = -1;
  99.409 +    int64_t cur_time;
  99.410 +//     SDLContext *sdlc = h->sdlc;
  99.411 +    uint8_t *iyuv_pixels;
  99.412 +    int pitch;
  99.413 +
  99.414 +    /* the first call only initialises the reference time */
  99.415 +    if (last_time == -1){
  99.416 +        last_time = av_gettime();
  99.417 +    }
  99.418 +
  99.419 +    /* last_time is only advanced for droppable frames that are actually shown */
  99.420 +    /* do not display frames that are less than 8.125 ms apart (120fps)*/
  99.421 +    if (dropable){
  99.422 +        cur_time = av_gettime();
  99.423 +
  99.424 +        if ((cur_time - last_time) < 8125)
  99.425 +            return;
  99.426 +
  99.427 +        last_time =cur_time;
  99.428 +    }
  99.429 +
  99.430 +    if(in_picture){
  99.431 +        /* may block in get_next_texture() until the consumer has released a texture */
  99.432 +        SDL_Texture *texture= get_next_texture(h, 1);
  99.433 +        /* NOTE(review): the returned pitch is not used by raw_encode(), which packs rows at frame_width */
  99.434 +        SDL_LockTexture( texture, NULL, (void **)&iyuv_pixels, &pitch );
  99.435 +        /* NOTE(review): no SDL_UnlockTexture() call here; presumably done by the consumer, confirm */
  99.436 +        raw_encode(in_picture, frame_width, frame_height, iyuv_pixels);
  99.437 +
  99.438 +        signal_texture(h, 1);
  99.439 +    }
  99.440 +}
  99.441 +#endif
  99.442 +/* Pack a picture into w->bit_buffer and write it to fd, retrying after short writes; the packed size is added to w->video_size. */
  99.443 +// TODO: Parallelize the raw_encode (either split frame or over frames)
  99.444 +static void do_video_out(OutputContext *w, int fd, DecodedPicture *in_picture, int frame_width, int frame_height) {
  99.445 +    const unsigned char *pos = w->bit_buffer;
  99.446 +    ssize_t n;
  99.447 +    int size=0;
  99.448 +    if(in_picture)   /* pack the picture without its extra borders */
  99.449 +        size= raw_encode(in_picture, frame_width, frame_height, w->bit_buffer);
  99.450 +    if (size < 0) fprintf(stderr, "Video encoding failed\n");
  99.451 +    for (int left = size; left > 0; left -= n, pos += n) {   /* write() may accept fewer bytes than requested */
  99.452 +        n = write(fd, pos, left);
  99.453 +        if (n <= 0) {
  99.454 +            fprintf(stderr, "Write frame failed\n");
  99.455 +            break;
  99.456 +        }
  99.457 +    }
  99.458 +    w->video_size += size;
  99.459 +}
  99.460 +/* Queue `pic` for display-order output (a NULL pic flushes the queue) and emit at most one reordered picture, to fd when fd is non-zero, otherwise to the SDL display when enabled. Returns the emitted picture or NULL. */
  99.461 +DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height) {
  99.462 +    const int flushing = (pic == NULL);
  99.463 +    DecodedPicture *out;
  99.464 +
  99.465 +    /* a new picture is appended before the queue is asked for its next output */
  99.466 +    if (!flushing)
  99.467 +        oc->delayed_pic[oc->dp_cnt++]=pic;
  99.468 +    out = get_reordered_picture(oc, flushing);
  99.469 +
  99.470 +    /* nothing is due for output yet */
  99.471 +    if (out == NULL)
  99.472 +        return NULL;
  99.473 +
  99.474 +    if (fd){
  99.475 +        do_video_out(oc, fd, out, frame_width, frame_height);
  99.476 +    }
  99.477 +#ifdef HAVE_LIBSDL2
  99.478 +    else if (h->display){
  99.479 +        display_frame(h, oc, fd, out, frame_width, frame_height, !flushing);
  99.480 +    }
  99.481 +#endif
  99.482 +    /* counted even when the picture was neither written nor displayed */
  99.483 +    oc->frame_number++;
  99.484 +    return out;
  99.485 +}
  99.486 +/* Allocate an OutputContext with a zeroed bit buffer of max(256 KiB, 2*frame_width*frame_height) bytes; returns NULL when either allocation fails. */
  99.487 +OutputContext *get_output_context(H264Context *h){
  99.488 +    const int frame_size = h->frame_width*h->frame_height;
  99.489 +    OutputContext *oc = av_mallocz(sizeof(OutputContext));
  99.490 +    if (!oc)
  99.491 +        return NULL;
  99.492 +    oc->bit_buffer_size= FFMAX(1024*256, frame_size*2); // oversize a little bit to allow extra border write
  99.493 +    oc->bit_buffer=  av_mallocz(oc->bit_buffer_size);
  99.494 +    if (!oc->bit_buffer)
  99.495 +        av_freep(&oc);   /* frees the context and leaves oc NULL, so the failure is reported to the caller */
  99.496 +    return oc;
  99.497 +}
  99.498 +
  99.499 +void free_output_context(OutputContext *oc){
  99.500 +
  99.501 +    av_free(oc->bit_buffer);
  99.502 +    av_free(oc);
  99.503 +}
  99.504 +
  99.505 +SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height){
  99.506 +    SuperMBContext *smbc = av_mallocz(sizeof(SuperMBContext));
  99.507 +
  99.508 +    smbc->smb_width = smb_width;
  99.509 +    smbc->smb_height = smb_height;
  99.510 +
  99.511 +    smbc->nsmb_height = h->mb_height / smbc->smb_height +  (h->mb_height%smbc->smb_height ? 1:0);    //only need one extra if mb_height was not dividable
  99.512 +    smbc->nsmb_width  = h->mb_width / smbc->smb_width;
  99.513 +    while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width )
  99.514 +        smbc->nsmb_width++;
  99.515 +
  99.516 +    smbc->nsmb_3dheight= smbc->nsmb_height - ((h->mb_height/2)/smbc->smb_height +1); //assuming max motion vector of half the height
  99.517 +
  99.518 +    smbc->smbs[0] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask));
  99.519 +    smbc->smbs[1] = av_malloc (smbc->nsmb_width * smbc->nsmb_height * sizeof(SuperMBTask));
  99.520 +    for (int y=0, i=0; i<smbc->nsmb_height; i++, y+=smbc->smb_height){
  99.521 +        for (int x=0, j=0; j<smbc->nsmb_width; j++, x+=smbc->smb_width){
  99.522 +            smbc->smbs[0][i*smbc->nsmb_width +j].smb_y = y;
  99.523 +            smbc->smbs[0][i*smbc->nsmb_width +j].smb_x = x;
  99.524 +            smbc->smbs[1][i*smbc->nsmb_width +j].smb_y = y;
  99.525 +            smbc->smbs[1][i*smbc->nsmb_width +j].smb_x = x;
  99.526 +        }
  99.527 +    }
  99.528 +
  99.529 +    smbc->refcount = 1;
  99.530 +
  99.531 +    return smbc;
  99.532 +}
  99.533 +
  99.534 +void freeSuperMBContext(SuperMBContext *smbc){
  99.535 +    av_free(smbc->smbs[0]);
  99.536 +    av_free(smbc->smbs[1]);
  99.537 +    av_free(smbc);
  99.538 +}
  99.539 +
  99.540 +SuperMBContext * acquire_smbc(H264Context *h ){
  99.541 +    SuperMBContext *smbc;
  99.542 +
  99.543 +    pthread_mutex_lock (&h->smb_lock);
  99.544 +    smbc = h->smbc;
  99.545 +    smbc->refcount++;
  99.546 +    pthread_mutex_unlock(&h->smb_lock);
  99.547 +    return smbc;
  99.548 +}
  99.549 +
  99.550 +void release_smbc(H264Context *h, SuperMBContext *smbc){
  99.551 +    pthread_mutex_lock (&h->smb_lock);
  99.552 +    smbc->refcount--;
  99.553 +    if (smbc->refcount==0){
  99.554 +        freeSuperMBContext(smbc);
  99.555 +    }
  99.556 +    pthread_mutex_unlock(&h->smb_lock);
  99.557 +
  99.558 +}
  99.559 +
  99.560 +
  99.561 +#ifdef HAVE_LIBSDL2
  99.562 +
  99.563 +// #if OMPSS
  99.564 +static void draw_sb_border(H264Context *h, uint32_t *rgba_pixels, int smb_x, int smb_y){
  99.565 +    int mb_width = h->mb_width;
  99.566 +    int mb_height = h->mb_height;
  99.567 +    int width = h->frame_width;
  99.568 +    int height = h->frame_height;
  99.569 +
  99.570 +    int mb_x = smb_x * h->smb_width;
  99.571 +    int mb_y = smb_y * h->smb_height;
  99.572 +
  99.573 +    uint32_t pix= 0x0000FFC0;
  99.574 +
  99.575 +    for (int k=0, i=mb_y; i< mb_y + h->smb_height; i++, k++){
  99.576 +        for (int l=0, j=mb_x -k ; j< mb_x - k + h->smb_width; j++, l++){
  99.577 +            //outside frame
  99.578 +            if (i<0 || i>=mb_height || j<0 || j>=mb_width) {
  99.579 +                continue;
  99.580 +            }
  99.581 +
  99.582 +            //draw top
  99.583 +            if (i==0 || k==0 || l==0){
  99.584 +                int mx = j*16;
  99.585 +                int my = i*16;
  99.586 +                uint32_t *top = rgba_pixels + my*width + mx;
  99.587 +                int endx = mx+16 < width? 16: width-mx;
  99.588 +
  99.589 +                for (int x = 0; x<endx; x++){
  99.590 +                    top[x] = pix;
  99.591 +                }
  99.592 +            }
  99.593 +
  99.594 +            //draw bottom
  99.595 +            if (i==mb_height-1 || k==h->smb_height-1 || l==h->smb_width-1){
  99.596 +                int mx = j*16;
  99.597 +                int my = i*16 + 15; my = my < height ? my: height-1;
  99.598 +                uint32_t *bottom = rgba_pixels + my*width + mx;
  99.599 +                int endx = mx+16 < width? 16: width-mx;
  99.600 +
  99.601 +                for (int x = 0; x<endx; x++){
  99.602 +                    bottom[x] = pix;
  99.603 +                }
  99.604 +            }
  99.605 +
  99.606 +            //draw left
  99.607 +            if (j==0 || l==0 ){
  99.608 +                int mx = j*16;
  99.609 +                int my = i*16;
  99.610 +                uint32_t *left = rgba_pixels + my*width + mx;
  99.611 +                int endy = my +16 < height ? 16: height - my;
  99.612 +
  99.613 +                for (int y = 0; y<endy; y++){
  99.614 +                    left[y*width] = pix;
  99.615 +                }
  99.616 +            }
  99.617 +
  99.618 +            //draw right
  99.619 +            if (j==mb_width -1 || l==h->smb_width-1 ){
  99.620 +                int mx = j*16 + 15; mx = mx < width ? mx: width-1;
  99.621 +                int my = i*16;
  99.622 +                uint32_t *right = rgba_pixels + my*width + mx;
  99.623 +                int endy = my +16 < height ? 16: height - my;
  99.624 +
  99.625 +                for (int y = 0; y<endy; y++){
  99.626 +                    right[y*width] = pix;
  99.627 +                }
  99.628 +            }
  99.629 +        }
  99.630 +    }
  99.631 +}
  99.632 +
  99.633 +static void draw_sbmap (H264Context *h, SuperMBContext *smbc, SDLContext *sdlc){
  99.634 +    int pitch;
  99.635 +    uint32_t *rgba_pixels;
  99.636 +    SDL_Texture *sbmap= sdlc->sbmap_texture;
  99.637 +
  99.638 +    SDL_LockTexture( sbmap, NULL, (void **)&rgba_pixels, &pitch );
  99.639 +
  99.640 +    memset (rgba_pixels, 0, pitch * h->height);
  99.641 +    for (int i=0; i< smbc->nsmb_height; i++){
  99.642 +        for (int j=0; j< smbc->nsmb_width; j++){
  99.643 +            draw_sb_border(h, rgba_pixels, j, i);
  99.644 +        }
  99.645 +    }
  99.646 +
  99.647 +    SDL_UnlockTexture( sbmap );
  99.648 +}
  99.649 +// #endif
  99.650 +
  99.651 +// static void calc_sb_sizes (H264Context *h, SuperMBContext *smbc){
  99.652 +//     smbc->smb_height = h->smb_height;
  99.653 +//     smbc->smb_width = h->smb_width;
  99.654 +//
  99.655 +//     smbc->nsmb_height = h->mb_height / smbc->smb_height +  (h->mb_height%smbc->smb_height ? 1:0);    //only need one extra if mb_height was not dividable
  99.656 +//     smbc->nsmb_width  = h->mb_width / smbc->smb_width;
  99.657 +//     while ( (smbc->nsmb_width * smbc->smb_width)-(smbc->smb_height-1) < h->mb_width )
  99.658 +//         smbc->nsmb_width++;
  99.659 +// }
  99.660 +
  99.661 +
  99.662 +static void handle_key_event(H264Context *h, SDLContext *sdlc, SDL_Keysym keysym){
  99.663 +    int arrow=0;
  99.664 +
  99.665 +    switch (keysym.sym){
  99.666 +        case SDLK_ESCAPE:
  99.667 +            if (sdlc->fullscreen){
  99.668 +                SDL_SetWindowFullscreen(sdlc->window, SDL_FALSE);
  99.669 +                sdlc->fullscreen = 0;
  99.670 +            }
  99.671 +            break;
  99.672 +        case SDLK_SPACE:
  99.673 +            pthread_mutex_lock(&h->sdl_lock);
  99.674 +            sdlc->pause = !sdlc->pause;
  99.675 +            pthread_cond_signal(&h->sdl_cond);
  99.676 +            pthread_mutex_unlock(&h->sdl_lock);
  99.677 +            break;
  99.678 +        case SDLK_f:
  99.679 +            if (!sdlc->fullscreen){
  99.680 +                if (keysym.mod == KMOD_LCTRL){
  99.681 +//                     SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full);
  99.682 +                    SDL_SetWindowFullscreen(sdlc->window, SDL_TRUE);
  99.683 +
  99.684 +                    sdlc->fullscreen = 1;
  99.685 +                }
  99.686 +            }
  99.687 +            break;
  99.688 +        case SDLK_m:
  99.689 +            sdlc->showmap = !sdlc->showmap;
  99.690 +            break;
  99.691 +        case SDLK_UP:
  99.692 +            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height < h->mb_height && h->smb_height < h->smb_width){
  99.693 +                h->smb_height++;
  99.694 +                arrow =1;
  99.695 +            }
  99.696 +            break;
  99.697 +        case SDLK_DOWN:
  99.698 +            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_height > 1 ){
  99.699 +                h->smb_height--;
  99.700 +                arrow =1;
  99.701 +            }
  99.702 +            break;
  99.703 +        case SDLK_LEFT:
  99.704 +            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width > 1 && h->smb_width > h->smb_height){
  99.705 +                h->smb_width--;
  99.706 +                arrow =1;
  99.707 +            }
  99.708 +            break;
  99.709 +        case SDLK_RIGHT:
  99.710 +            if (keysym.mod == KMOD_NONE && sdlc->showmap && h->smb_width < h->mb_width){
  99.711 +                h->smb_width++;
  99.712 +                arrow =1;
  99.713 +            }
  99.714 +            break;
  99.715 +    }
  99.716 +
  99.717 +    if (arrow){
  99.718 +        SuperMBContext *smbc = getSuperMBContext(h, h->smb_width, h->smb_height);
  99.719 +        pthread_mutex_lock(&h->smb_lock);
  99.720 +        h->smbc->refcount--;
  99.721 +        if (h->smbc->refcount == 0)
  99.722 +            freeSuperMBContext(h->smbc);
  99.723 +        h->smbc = smbc;
  99.724 +        sdlc->updatemap =1;
  99.725 +        pthread_mutex_unlock(&h->smb_lock);
  99.726 +    }
  99.727 +}
  99.728 +
  99.729 +void handle_window_event(H264Context *h, SDLContext *sdlc, SDL_WindowEvent winevent){
  99.730 +    SDL_Rect nrect;
  99.731 +    switch (winevent.event){
  99.732 +        case SDL_WINDOWEVENT_RESIZED:
  99.733 +
  99.734 +            sdlc->win_w =  winevent.data1;
  99.735 +            sdlc->win_h =  winevent.data2;
  99.736 +
  99.737 +            double aspect = (double) sdlc->win_w/ sdlc->win_h;
  99.738 +            if ( aspect < sdlc->aspect){
  99.739 +                double r = (double) sdlc->win_w / sdlc->rect.w;
  99.740 +                double h = (double) sdlc->rect.h * r;
  99.741 +
  99.742 +                nrect.y = lrint(( (double) sdlc->win_h - h)/2);
  99.743 +                nrect.h = lrint(h);
  99.744 +
  99.745 +                nrect.x=0;
  99.746 +                nrect.w= sdlc->win_w;
  99.747 +
  99.748 +            }else {
  99.749 +                double r = (double) sdlc->win_h / sdlc->rect.h;
  99.750 +                double w = (double) sdlc->rect.w * r;
  99.751 +
  99.752 +                nrect.x = lrint(( (double) sdlc->win_w - w)/2);
  99.753 +                nrect.w = lrint(w);
  99.754 +
  99.755 +                nrect.y=0;
  99.756 +                nrect.h= sdlc->win_h;
  99.757 +            }
  99.758 +            //prob better to lock
  99.759 +            sdlc->win_rect = nrect;
  99.760 +            sdlc->resized=1;
  99.761 +            break;
  99.762 +    }
  99.763 +}
  99.764 +
  99.765 +void *sdl_event_listen_thread(void *arg){
  99.766 +    H264Context *h = (H264Context *) arg;
  99.767 +    SDLContext *sdlc = h->sdlc;
  99.768 +    SDL_Event event;
  99.769 +
  99.770 +    while ( SDL_WaitEvent(&event) ) {
  99.771 +        switch (event.type) {
  99.772 +            case SDL_KEYDOWN:
  99.773 +                handle_key_event(h, sdlc, event.key.keysym);
  99.774 +                break;
  99.775 +            case SDL_WINDOWEVENT:
  99.776 +                handle_window_event(h, sdlc, event.window);
  99.777 +                break;
  99.778 +            case SDL_QUIT:
  99.779 +                h->quit=1;
  99.780 +                goto finish;
  99.781 +        }
  99.782 +    }
  99.783 +finish:
  99.784 +    pthread_exit(NULL);
  99.785 +    return NULL;
  99.786 +}
  99.787 +
  99.788 +//XInitThreads not called in SDL2 library, causes crash
  99.789 +//remove in future when fixed ...
  99.790 +#include <X11/Xlib.h>
  99.791 +
  99.792 +SDLContext *get_SDL_context(H264Context *h){
  99.793 +    const int frame_width=h->frame_width;
  99.794 +    const int frame_height=h->frame_height;
  99.795 +
  99.796 +    SDLContext *sdlc = av_mallocz(sizeof(SDLContext));
  99.797 +    sdlc->display = h->display;
  99.798 +    sdlc->fullscreen = h->fullscreen;
  99.799 +
  99.800 +    sdlc->aspect = (double) frame_width / (double) frame_height;
  99.801 +    sdlc->rect.x =0;
  99.802 +    sdlc->rect.y =0;
  99.803 +    sdlc->rect.w =frame_width;
  99.804 +    sdlc->rect.h =frame_height;
  99.805 +
  99.806 +    XInitThreads(); //workaround
  99.807 +
  99.808 +    // Initializes the video subsystem
  99.809 +    if (SDL_Init(SDL_INIT_VIDEO) < 0) {
  99.810 +        fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError());
  99.811 +        #undef exit
  99.812 +        exit(-1);
  99.813 +    }
  99.814 +    SDL_SetHint("SDL_HINT_RENDER_SCALE_QUALITY", "best");
  99.815 +    SDL_SetHint("SDL_HINT_RENDER_OPENGL_SHADERS", "1");
  99.816 +
  99.817 +    SDL_GetDesktopDisplayMode(0, &sdlc->full);
  99.818 +    sdlc->full.format = SDL_PIXELFORMAT_IYUV;
  99.819 +
  99.820 +    sdlc->wind = sdlc->full;
  99.821 +    if (sdlc->wind.w > frame_width) sdlc->wind.w = frame_width;
  99.822 +    if (sdlc->wind.h > frame_height) sdlc->wind.h = frame_height;
  99.823 +
  99.824 +    sdlc->win_rect.x =0;
  99.825 +    sdlc->win_rect.y =0;
  99.826 +    sdlc->win_rect.w =sdlc->wind.w;
  99.827 +    sdlc->win_rect.h =sdlc->wind.h;
  99.828 +
  99.829 +    if (sdlc->fullscreen){
  99.830 +        sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED,  SDL_WINDOWPOS_UNDEFINED, sdlc->full.w, sdlc->full.h, SDL_WINDOW_FULLSCREEN|SDL_WINDOW_SHOWN|SDL_WINDOW_RESIZABLE);
  99.831 +        SDL_SetWindowDisplayMode (sdlc->window, &sdlc->full);
  99.832 +    } else {
  99.833 +        sdlc->window = SDL_CreateWindow( h->file_name, SDL_WINDOWPOS_UNDEFINED,  SDL_WINDOWPOS_UNDEFINED, sdlc->wind.w, sdlc->wind.h, SDL_WINDOW_RESIZABLE|SDL_WINDOW_SHOWN);
  99.834 +        SDL_SetWindowDisplayMode (sdlc->window, &sdlc->wind);
  99.835 +    }
  99.836 +
  99.837 +    sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_ACCELERATED);
  99.838 +//     sdlc->renderer = SDL_CreateRenderer(sdlc->window, -1, SDL_RENDERER_SOFTWARE);
  99.839 +
  99.840 +    h->sdlq.queue[0] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
  99.841 +    h->sdlq.queue[1] = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_IYUV, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
  99.842 +
  99.843 +    sdlc->sbmap_texture = SDL_CreateTexture (sdlc->renderer, SDL_PIXELFORMAT_RGBA8888, SDL_TEXTUREACCESS_STREAMING, frame_width, frame_height);
  99.844 +    SDL_SetTextureBlendMode(sdlc->sbmap_texture, SDL_BLENDMODE_BLEND);
  99.845 +    sdlc->updatemap = 1;
  99.846 +
  99.847 +#if HAVE_LIBSDL_TTF
  99.848 +    //not working with SDL 2.0, try again in future when supported
  99.849 +    if(TTF_Init()==-1) {
  99.850 +        printf("TTF_Init: %s\n", TTF_GetError());
  99.851 +        exit(2);
  99.852 +    }
  99.853 +
  99.854 +    // Load a font
  99.855 +    TTF_Font *font;
  99.856 +    font = TTF_OpenFont("/usr/share/fonts/truetype/freefont/FreeSans.ttf", 24);
  99.857 +    if (font == NULL)
  99.858 +    {
  99.859 +        printf("TTF_OpenFont() Failed: %s\n", TTF_GetError());
  99.860 +        TTF_Quit();
  99.861 +        exit(1);
  99.862 +    }
  99.863 +#endif
  99.864 +    
  99.865 +    pthread_create(&sdlc->listen_thread, NULL, sdl_event_listen_thread, h);
  99.866 +
  99.867 +    return sdlc;
  99.868 +
  99.869 +}
  99.870 +
  99.871 +void free_SDL_context(H264Context *h){
  99.872 +    SDLContext *sdlc = h->sdlc;
  99.873 +    pthread_join(sdlc->listen_thread, NULL);
  99.874 +
  99.875 +#if HAVE_LIBSDL_TTF
  99.876 +    TTF_Quit();
  99.877 +#endif
  99.878 +    SDL_DestroyTexture(h->sdlq.queue[0]);
  99.879 +    SDL_DestroyTexture(h->sdlq.queue[1]);
  99.880 +    SDL_DestroyTexture(sdlc->sbmap_texture);
  99.881 +    SDL_DestroyRenderer(sdlc->renderer);
  99.882 +    SDL_DestroyWindow(sdlc->window);
  99.883 +    SDL_Quit();
  99.884 +
  99.885 +}
  99.886 +
  99.887 +void *sdl_thread(void *arg){
  99.888 +    H264Context *h = (H264Context *) arg;
  99.889 +
  99.890 +    SDLContext *sdlc = get_SDL_context(h);
  99.891 +    h->sdlc = sdlc;
  99.892 +
  99.893 +    signal_texture(h, 0);
  99.894 +    signal_texture(h, 0);
  99.895 +
  99.896 +    SDL_Texture *texture;
  99.897 +    for (;;){
  99.898 +        pthread_mutex_lock(&h->sdl_lock);
  99.899 +        while (sdlc->pause){
  99.900 +            pthread_cond_wait(&h->sdl_cond, &h->sdl_lock);
  99.901 +        }
  99.902 +        pthread_mutex_unlock(&h->sdl_lock);
  99.903 +
  99.904 +        texture = get_next_texture(h, 0);
  99.905 +        if (texture == NULL)
  99.906 +            break;
  99.907 +        
  99.908 +        SDL_UnlockTexture(texture);
  99.909 +
  99.910 +        //clear if resized
  99.911 +        if (sdlc->resized){
  99.912 +            // KDE bug prob, reset viewport change after resize from max
  99.913 +            SDL_RenderSetViewport(sdlc->renderer, NULL);
  99.914 +            SDL_SetRenderDrawColor(sdlc->renderer, 0, 0, 0, 255);
  99.915 +            SDL_RenderClear(sdlc->renderer);
  99.916 +            sdlc->resized = 0;
  99.917 +        }
  99.918 +
  99.919 +        SDL_RenderCopy(sdlc->renderer, texture, &sdlc->rect, &sdlc->win_rect);
  99.920 +
  99.921 +        if (sdlc->showmap){
  99.922 +            if (sdlc->updatemap){
  99.923 +                SuperMBContext *smbc;
  99.924 +                pthread_mutex_lock (&h->smb_lock);
  99.925 +                smbc = h->smbc;
  99.926 +                smbc->refcount++;
  99.927 +                sdlc->updatemap=0;
  99.928 +                pthread_mutex_unlock(&h->smb_lock);
  99.929 +
  99.930 +                draw_sbmap(h, smbc, sdlc);
  99.931 +
  99.932 +                release_smbc(h, smbc);
  99.933 +            }
  99.934 +            SDL_RenderCopy(sdlc->renderer, sdlc->sbmap_texture, &sdlc->rect, &sdlc->win_rect);
  99.935 +        }
  99.936 +
  99.937 +        SDL_RenderPresent(sdlc->renderer);
  99.938 +        signal_texture(h, 0);
  99.939 +    }
  99.940 +
  99.941 +    free_SDL_context(h);
  99.942 +
  99.943 +    pthread_exit(NULL);
  99.944 +    return NULL;
  99.945 +}
  99.946 +#endif
  99.947 +

   100.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   100.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_misc.h	Mon Aug 27 12:09:56 2012 +0200
   100.3 @@ -0,0 +1,52 @@
   100.4 +#ifndef H264_MISC_H
   100.5 +#define H264_MISC_H
   100.6 +
   100.7 +#include "avcodec.h"
   100.8 +#include "h264_types.h"
   100.9 +
  100.10 +void start_timer(H264Context *h, int stage);
  100.11 +void stop_timer(H264Context *h, int stage);
  100.12 +
  100.13 +void init_sb_entry(H264Context *h, SliceBufferEntry *sbe);
  100.14 +void free_sb_entry(SliceBufferEntry *sb);
  100.15 +SliceBufferEntry *get_sb_entry(H264Context *h);
  100.16 +void release_sb_entry(H264Context *h, SliceBufferEntry *sb);
  100.17 +
  100.18 +DecodedPicture *get_dpb_entry(H264Context *h, H264Slice *s);
  100.19 +void release_dpb_entry(H264Context *h, DecodedPicture *pic, int mode);
  100.20 +
  100.21 +void draw_edges(MBRecContext *d, H264Slice *s, int line);
  100.22 +
  100.23 +int ff_init_slice(NalContext *n, H264Slice *s);
  100.24 +void free_picture(PictureInfo *pic);
  100.25 +void free_dp(DecodedPicture *pic);
  100.26 +
  100.27 +void av_start_timer();
  100.28 +int copyEDtoH264Slice(H264Slice *ms, H264Slice *es);
  100.29 +void print_report(int frame_number, uint64_t video_size, int is_last_report, int verbose);
  100.30 +
  100.31 +int ff_alloc_picture_info(NalContext *n, H264Slice *s, PictureInfo *pic);
  100.32 +DecodedPicture *output_frame(H264Context *h, OutputContext *oc, DecodedPicture *pic, int fd, int frame_width, int frame_height);
  100.33 +OutputContext *get_output_context(H264Context *h);
  100.34 +void free_output_context(OutputContext *oc);
  100.35 +
  100.36 +void freeSuperMBContext(SuperMBContext *smbc);
  100.37 +SuperMBContext *getSuperMBContext(H264Context *h, int smb_width, int smb_height);
  100.38 +void release_smbc(H264Context *h, SuperMBContext *smbc);
  100.39 +SuperMBContext * acquire_smbc(H264Context *h );
  100.40 +
  100.41 +#if HAVE_LIBSDL2
  100.42 +void signal_sdl_exit(H264Context *h);
  100.43 +void *sdl_thread(void *arg);
  100.44 +SDLContext *get_SDL_context(H264Context *h);
  100.45 +void free_SDL_context(SDLContext *sdlc);
  100.46 +#endif
  100.47 +
  100.48 +/**
  100.49 +* gets the chroma qp.
  100.50 +*/
  100.51 +static inline int get_chroma_qp(H264Slice *s, int t, int qscale){
  100.52 +    return s->pps.chroma_qp_table[t][qscale];
  100.53 +}
  100.54 +
  100.55 +#endif

   101.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   101.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_nal.c	Mon Aug 27 12:09:56 2012 +0200
   101.3 @@ -0,0 +1,628 @@
   101.4 +#include "h264_types.h"
   101.5 +#include "h264_data.h"
   101.6 +
   101.7 +#include "golomb.h"
   101.8 +#include "h264_sei.h"
   101.9 +#include "h264_refs.h"
  101.10 +#include "h264_ps.h"
  101.11 +#include "h264_pred_mode.h"
  101.12 +#include "h264_misc.h"
  101.13 +
  101.14 +static int ff_h264_decode_rbsp_trailing(const uint8_t *src){
  101.15 +    int v= *src;
  101.16 +    int r;
  101.17 +
  101.18 +    for(r=1; r<9; r++){
  101.19 +        if(v&1) return r;
  101.20 +        v>>=1;
  101.21 +    }
  101.22 +    return 0;
  101.23 +}
  101.24 +
  101.25 +static int pred_weight_table(H264Slice *s, GetBitContext *gb){
  101.26 +    int luma_def, chroma_def;
  101.27 +
  101.28 +    s->use_weight= 0;
  101.29 +    s->use_weight_chroma= 0;
  101.30 +    s->luma_log2_weight_denom= get_ue_golomb(gb);
  101.31 +    s->chroma_log2_weight_denom= get_ue_golomb(gb);
  101.32 +    luma_def = 1<<s->luma_log2_weight_denom;
  101.33 +    chroma_def = 1<<s->chroma_log2_weight_denom;
  101.34 +
  101.35 +    for(int list=0; list<2; list++){
  101.36 +        for(int i=0; i<s->ref_count[list]; i++){
  101.37 +            int luma_weight_flag, chroma_weight_flag;
  101.38 +
  101.39 +            luma_weight_flag= get_bits1(gb);
  101.40 +            if(luma_weight_flag){
  101.41 +                s->luma_weight[i][list][0]= get_se_golomb(gb);
  101.42 +                s->luma_weight[i][list][1]= get_se_golomb(gb);
  101.43 +                if(   s->luma_weight[i][list][0] != luma_def
  101.44 +                    || s->luma_weight[i][list][1] != 0) {
  101.45 +                    s->use_weight= 1;
  101.46 +                }
  101.47 +            }else{
  101.48 +                s->luma_weight[i][list][0]= luma_def;
  101.49 +                s->luma_weight[i][list][1]= 0;
  101.50 +            }
  101.51 +
  101.52 +            chroma_weight_flag= get_bits1(gb);
  101.53 +            if(chroma_weight_flag){
  101.54 +                int j;
  101.55 +                for(j=0; j<2; j++){
  101.56 +                    s->chroma_weight[i][list][j][0]= get_se_golomb(gb);
  101.57 +                    s->chroma_weight[i][list][j][1]= get_se_golomb(gb);
  101.58 +                    if(   s->chroma_weight[i][list][j][0] != chroma_def
  101.59 +                    || s->chroma_weight[i][list][j][1] != 0) {
  101.60 +                        s->use_weight_chroma= 1;
  101.61 +                    }
  101.62 +                }
  101.63 +            }else{
  101.64 +                int j;
  101.65 +                for(j=0; j<2; j++){
  101.66 +                    s->chroma_weight[i][list][j][0]= chroma_def;
  101.67 +                    s->chroma_weight[i][list][j][1]= 0;
  101.68 +                }
  101.69 +            }
  101.70 +        }
  101.71 +        if(s->slice_type_nos != FF_B_TYPE) break;
  101.72 +    }
  101.73 +    s->use_weight= s->use_weight || s->use_weight_chroma;
  101.74 +    return 0;
  101.75 +}
  101.76 +
  101.77 +/**
  101.78 +* Initialize implicit_weight table.
  101.79 +*/
  101.80 +static void implicit_weight_table(H264Slice *s){
  101.81 +    int ref0, ref1, cur_poc, ref_start, ref_count0, ref_count1;
  101.82 +
  101.83 +    cur_poc = s->poc;
  101.84 +    if(   s->ref_count[0] == 1 && s->ref_count[1] == 1  && s->ref_list[0][0]->poc + s->ref_list[1][0]->poc == 2*cur_poc){
  101.85 +        s->use_weight= 0;
  101.86 +        s->use_weight_chroma= 0;
  101.87 +        return;
  101.88 +    }
  101.89 +    ref_start= 0;
  101.90 +    ref_count0= s->ref_count[0];
  101.91 +    ref_count1= s->ref_count[1];
  101.92 +
  101.93 +    s->use_weight= 2;
  101.94 +    s->use_weight_chroma= 2;
  101.95 +    s->luma_log2_weight_denom= 5;
  101.96 +    s->chroma_log2_weight_denom= 5;
  101.97 +
  101.98 +    for(ref0=ref_start; ref0 < ref_count0; ref0++){
  101.99 +        int poc0 = s->ref_list[0][ref0]->poc;
 101.100 +        for(ref1=ref_start; ref1 < ref_count1; ref1++){
 101.101 +            int poc1 = s->ref_list[1][ref1]->poc;
 101.102 +            int td = av_clip(poc1 - poc0, -128, 127);
 101.103 +            int w= 32;
 101.104 +            if(td){
 101.105 +                int tb = av_clip(cur_poc - poc0, -128, 127);
 101.106 +                int tx = (16384 + (FFABS(td) >> 1)) / td;
 101.107 +                int dist_scale_factor = (tb*tx + 32) >> 8;
 101.108 +                if(dist_scale_factor >= -64 && dist_scale_factor <= 128)
 101.109 +                    w = 64 - dist_scale_factor;
 101.110 +            }
 101.111 +            s->implicit_weight[ref0][ref1][0]=
 101.112 +            s->implicit_weight[ref0][ref1][1]= w;
 101.113 +        }
 101.114 +    }
 101.115 +}
 101.116 +
 101.117 +/**
 101.118 +* instantaneous decoder refresh.
 101.119 +*/
 101.120 +static void idr(NalContext *n, H264Slice *s){
 101.121 +    ff_h264_remove_all_refs(n, s);
 101.122 +    n->prev_frame_num= 0;
 101.123 +    n->prev_frame_num_offset= 0;
 101.124 +    n->poc_offset +=  (n->prev_poc_msb<<16) + n->prev_poc_lsb;
 101.125 +    n->prev_poc_msb=
 101.126 +    n->prev_poc_lsb= 0;
 101.127 +}
 101.128 +
 101.129 +static int init_poc(NalContext *n, H264Slice *s, GetBitContext *gb){
 101.130 +    const int max_frame_num= 1<<n->sps.log2_max_frame_num;
 101.131 +    int frame_poc;
 101.132 +
 101.133 +    if(n->sps.poc_type==0){
 101.134 +        n->poc_lsb= get_bits(gb, n->sps.log2_max_poc_lsb);
 101.135 +    }
 101.136 +
 101.137 +    if(n->sps.poc_type==1 && !n->sps.delta_pic_order_always_zero_flag){
 101.138 +        n->delta_poc= get_se_golomb(gb);
 101.139 +    }
 101.140 +
 101.141 +    n->frame_num_offset= n->prev_frame_num_offset;
 101.142 +    if(n->frame_num < n->prev_frame_num)
 101.143 +        n->frame_num_offset += max_frame_num;
 101.144 +
 101.145 +    if(n->sps.poc_type==0){
 101.146 +        const int max_poc_lsb= 1<<n->sps.log2_max_poc_lsb;
 101.147 +
 101.148 +        if(n->poc_lsb < n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb >= max_poc_lsb/2)
 101.149 +            n->poc_msb = n->prev_poc_msb + max_poc_lsb;
 101.150 +        else if(n->poc_lsb > n->prev_poc_lsb && n->prev_poc_lsb - n->poc_lsb < -max_poc_lsb/2)
 101.151 +            n->poc_msb = n->prev_poc_msb - max_poc_lsb;
 101.152 +        else
 101.153 +            n->poc_msb = n->prev_poc_msb;
 101.154 +
 101.155 +        frame_poc = n->poc_msb + n->poc_lsb;
 101.156 +    }else if(n->sps.poc_type==1){
 101.157 +        int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
 101.158 +        int i;
 101.159 +
 101.160 +        if(n->sps.poc_cycle_length != 0)
 101.161 +            abs_frame_num = n->frame_num_offset + n->frame_num;
 101.162 +        else
 101.163 +            abs_frame_num = 0;
 101.164 +
 101.165 +        if(s->nal_ref_idc==0 && abs_frame_num > 0)
 101.166 +            abs_frame_num--;
 101.167 +
 101.168 +        expected_delta_per_poc_cycle = 0;
 101.169 +        for(i=0; i < n->sps.poc_cycle_length; i++)
 101.170 +            expected_delta_per_poc_cycle += n->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
 101.171 +
 101.172 +        if(abs_frame_num > 0){
 101.173 +            int poc_cycle_cnt          = (abs_frame_num - 1) / n->sps.poc_cycle_length;
 101.174 +            int frame_num_in_poc_cycle = (abs_frame_num - 1) % n->sps.poc_cycle_length;
 101.175 +
 101.176 +            expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
 101.177 +            for(i = 0; i <= frame_num_in_poc_cycle; i++)
 101.178 +                expectedpoc = expectedpoc + n->sps.offset_for_ref_frame[ i ];
 101.179 +        } else
 101.180 +            expectedpoc = 0;
 101.181 +        if(s->nal_ref_idc == 0)
 101.182 +            expectedpoc = expectedpoc + n->sps.offset_for_non_ref_pic;
 101.183 +        frame_poc = expectedpoc + n->delta_poc;
 101.184 +    }else{
 101.185 +        int poc= 2*(n->frame_num_offset + n->frame_num);
 101.186 +        if(!s->nal_ref_idc)
 101.187 +            poc--;
 101.188 +        frame_poc= poc;
 101.189 +    }
 101.190 +    s->current_picture_info->poc= s->poc = frame_poc + n->poc_offset;
 101.191 +    s->coded_pic_num = n->coded_pic_num++;
 101.192 +
 101.193 +    return 0;
 101.194 +}
 101.195 +
 101.196 +static void ref2frame(NalContext *n, H264Slice *s){
 101.197 +    for(int j=0; j<s->list_count; j++){
 101.198 +        int *ref2frm= s->ref2frm[j];
 101.199 +
 101.200 +        ref2frm[0]=
 101.201 +        ref2frm[1]= -1;
 101.202 +
 101.203 +        for(int i=0; i<s->ref_count[j]; i++){
 101.204 +            ref2frm[i+2]= 15;
 101.205 +            if(s->ref_list[j][i]->cpn >=0){
 101.206 +                int k;
 101.207 +                for(k=0; k<n->short_ref_count; k++){
 101.208 +                    if(n->short_ref[k]->cpn == s->ref_list[j][i]->cpn){
 101.209 +                        ref2frm[i+2]= k;
 101.210 +                        break;
 101.211 +                    }
 101.212 +                }
 101.213 +            }
 101.214 +        }
 101.215 +    }
 101.216 +}
 101.217 +
 101.218 +/**
 101.219 +* decodes a slice header.
 101.220 +* This will also call MPV_common_init() and frame_start() as needed.
 101.221 +*
 101.222 +* @param h h264context
 101.223 +* @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
 101.224 +*
 101.225 +* @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
 101.226 +*/
 101.227 +static int decode_slice_header(NalContext *n, H264Slice *s, GetBitContext *gb){
 101.228 +    unsigned int first_mb_in_slice;
 101.229 +    unsigned int pps_id;
 101.230 +    int num_ref_idx_active_override_flag;
 101.231 +    unsigned int slice_type, tmp;
 101.232 +
 101.233 +    first_mb_in_slice= get_ue_golomb(gb);
 101.234 +    (void) first_mb_in_slice;
 101.235 +
 101.236 +    slice_type= get_ue_golomb_31(gb);
 101.237 +    if(slice_type > 9){
 101.238 +        av_log(AV_LOG_ERROR, "slice type too large (%d)\n", s->slice_type);
 101.239 +        return -1;
 101.240 +    }
 101.241 +    if(slice_type > 4)
 101.242 +        slice_type -= 5;
 101.243 +
 101.244 +    slice_type= golomb_to_pict_type[ slice_type ];
 101.245 +
 101.246 +    s->slice_type= slice_type;
 101.247 +    s->slice_type_nos= slice_type & 3;
 101.248 +    s->current_picture_info->slice_type_nos = s->slice_type_nos;
 101.249 +    s->current_picture_info->reference= s->nal_ref_idc? 2:0;
 101.250 +    s->key_frame = s->slice_type == FF_I_TYPE;
 101.251 +
 101.252 +    pps_id= get_ue_golomb(gb);
 101.253 +
 101.254 +    if(pps_id>=MAX_PPS_COUNT){
 101.255 +        av_log(AV_LOG_ERROR, "pps_id out of range\n");
 101.256 +        return -1;
 101.257 +    }
 101.258 +    if(!n->pps_buffers[pps_id]) {
 101.259 +        av_log(AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
 101.260 +        return -1;
 101.261 +    }
 101.262 +    s->pps= *n->pps_buffers[pps_id];
 101.263 +
 101.264 +    if(!n->sps_buffers[s->pps.sps_id]) {
 101.265 +        av_log(AV_LOG_ERROR, "non-existing SPS %u referenced\n", s->pps.sps_id);
 101.266 +        return -1;
 101.267 +    }
 101.268 +    n->sps = *n->sps_buffers[s->pps.sps_id];
 101.269 +
 101.270 +    n->mb_width= n->sps.mb_width;
 101.271 +    n->mb_height= n->sps.mb_height;
 101.272 +
 101.273 +    int chroma444 = (n->sps.chroma_format_idc == 3);
 101.274 +    n->width = 16*n->mb_width - (2>>chroma444)*FFMIN(n->sps.crop_right, (8<<chroma444)-1);
 101.275 +    if(n->sps.frame_mbs_only_flag)
 101.276 +        n->height= 16*n->mb_height - (2>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1);
 101.277 +    else
 101.278 +        n->height= 16*n->mb_height - (4>>chroma444)*FFMIN(n->sps.crop_bottom, (8<<chroma444)-1);
 101.279 +
 101.280 +    s->direct_8x8_inference_flag = n->sps.direct_8x8_inference_flag;
 101.281 +    s->transform_bypass = n->sps.transform_bypass;
 101.282 +
 101.283 +    n->frame_num= get_bits(gb, n->sps.log2_max_frame_num);
 101.284 +    if(n->frame_num !=  n->prev_frame_num && n->frame_num != (n->prev_frame_num+1)%(1<<n->sps.log2_max_frame_num)){
 101.285 +        av_log(AV_LOG_ERROR, "unexpected frame_num \n");
 101.286 +    }
 101.287 +
 101.288 +    s->current_picture_info->frame_num= n->frame_num; //FIXME frame_num cleanup
 101.289 +    n->max_pic_num= 1<< n->sps.log2_max_frame_num;
 101.290 +
 101.291 +    if(s->nal_unit_type == NAL_IDR_SLICE){
 101.292 +        get_ue_golomb(gb); /* idr_pic_id */
 101.293 +    }
 101.294 +
 101.295 +    init_poc(n, s, gb);
 101.296 +
 101.297 +    if(s->pps.redundant_pic_cnt_present){
 101.298 +        n->redundant_pic_count= get_ue_golomb(gb);
 101.299 +    }
 101.300 +
 101.301 +    //set defaults, might be overridden a few lines later
 101.302 +    s->ref_count[0]= s->pps.ref_count[0];
 101.303 +    s->ref_count[1]= s->pps.ref_count[1];
 101.304 +
 101.305 +    if(s->slice_type_nos != FF_I_TYPE){
 101.306 +        if(s->slice_type_nos == FF_B_TYPE){
 101.307 +            s->direct_spatial_mv_pred= get_bits1(gb);
 101.308 +        }
 101.309 +        num_ref_idx_active_override_flag= get_bits1(gb);
 101.310 +
 101.311 +        if(num_ref_idx_active_override_flag){
 101.312 +            s->ref_count[0]= get_ue_golomb(gb) + 1;
 101.313 +            if(s->slice_type_nos==FF_B_TYPE)
 101.314 +                s->ref_count[1]= get_ue_golomb(gb) + 1;
 101.315 +
 101.316 +            if(s->ref_count[0]-1 > 32-1 || s->ref_count[1]-1 > 32-1){
 101.317 +                av_log(AV_LOG_ERROR, "reference overflow\n");
 101.318 +                s->ref_count[0]= s->ref_count[1]= 1;
 101.319 +                return -1;
 101.320 +            }
 101.321 +        }
 101.322 +        if(s->slice_type_nos == FF_B_TYPE)
 101.323 +            s->list_count= 2;
 101.324 +        else
 101.325 +            s->list_count= 1;
 101.326 +    }else
 101.327 +        s->list_count= 0;
 101.328 +
 101.329 +
 101.330 +    if(s->slice_type_nos!=FF_I_TYPE){
 101.331 +        ff_h264_fill_default_ref_list(n, s);
 101.332 +        ff_h264_decode_ref_pic_list_reordering(n, s, gb);
 101.333 +        ref2frame(n, s);
 101.334 +
 101.335 +        for(int i=0; i<2; i++){
 101.336 +            for(int j=0; j<s->ref_count[i]; j++){
 101.337 +                if (s->ref_list[i][j]==NULL || s->ref_list[i][j]->reference < 2) // Don't know why sometimes the ref_count=1 while there are no references
 101.338 +                    s->ref_list_cpn[i][j] = -1;
 101.339 +                else
 101.340 +                    s->ref_list_cpn[i][j] = s->ref_list[i][j]->cpn;
 101.341 +            }
 101.342 +        }
 101.343 +    }
 101.344 +
 101.345 +    if(   (s->pps.weighted_pred          && s->slice_type_nos == FF_P_TYPE )
 101.346 +    ||  (s->pps.weighted_bipred_idc==1 && s->slice_type_nos== FF_B_TYPE ) ){
 101.347 +        pred_weight_table(s, gb);
 101.348 +    }
 101.349 +    else if(s->pps.weighted_bipred_idc==2 && s->slice_type_nos== FF_B_TYPE){
 101.350 +        implicit_weight_table( s);
 101.351 +    }else {
 101.352 +        s->use_weight = 0;
 101.353 +    }
 101.354 +
 101.355 +    if(s->nal_ref_idc){
 101.356 +        ff_h264_ref_pic_marking(n, s, gb);
 101.357 +        n->prev_poc_msb= n->poc_msb;
 101.358 +        n->prev_poc_lsb= n->poc_lsb;
 101.359 +    }
 101.360 +
 101.361 +    n->prev_frame_num_offset= n->frame_num_offset;
 101.362 +    n->prev_frame_num= n->frame_num;
 101.363 +
 101.364 +    if(s->slice_type_nos != FF_B_TYPE){
 101.365 +        s->ip_id= n->ip_id++;
 101.366 +    }
 101.367 +
 101.368 +    if(s->slice_type_nos==FF_B_TYPE && !s->direct_spatial_mv_pred){
 101.369 +        ff_h264_direct_dist_scale_factor(s);
 101.370 +    }
 101.371 +    ff_h264_direct_ref_list_init(s);
 101.372 +
 101.373 +
 101.374 +    if( s->slice_type_nos != FF_I_TYPE && s->pps.cabac ){
 101.375 +        tmp = get_ue_golomb_31(gb);
 101.376 +        if(tmp > 2){
 101.377 +            av_log(AV_LOG_ERROR, "cabac_init_idc overflow\n");
 101.378 +            return -1;
 101.379 +        }
 101.380 +        s->cabac_init_idc= tmp;
 101.381 +    }
 101.382 +
 101.383 +    tmp = s->pps.init_qp + get_se_golomb(gb);
 101.384 +    if(tmp>51){
 101.385 +        av_log(AV_LOG_ERROR, "QP %u out of range\n", tmp);
 101.386 +        return -1;
 101.387 +    }
 101.388 +    s->qscale= tmp;
 101.389 +
 101.390 +    //FIXME qscale / qp ... stuff
 101.391 +    if(s->slice_type == FF_SP_TYPE){
 101.392 +        get_bits1(gb); /* sp_for_switch_flag */
 101.393 +    }
 101.394 +    if(s->slice_type==FF_SP_TYPE || s->slice_type == FF_SI_TYPE){
 101.395 +        get_se_golomb(gb); /* slice_qs_delta */
 101.396 +    }
 101.397 +
 101.398 +    s->slice_alpha_c0_offset = 52;
 101.399 +    s->slice_beta_offset = 52;
 101.400 +    if( s->pps.deblocking_filter_parameters_present ) {
 101.401 +        tmp= get_ue_golomb_31(gb);
 101.402 +        if(tmp > 1){
 101.403 +            av_log(AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
 101.404 +            return -1;
 101.405 +        }
 101.406 +
 101.407 +        if(tmp < 2)
 101.408 +            tmp^= 1; // 1<->0
 101.409 +
 101.410 +        if( tmp ) {
 101.411 +            s->slice_alpha_c0_offset += get_se_golomb(gb) << 1;
 101.412 +            s->slice_beta_offset     += get_se_golomb(gb) << 1;
 101.413 +            if( (unsigned) s->slice_alpha_c0_offset > 104U
 101.414 +            ||(unsigned) s->slice_beta_offset    > 104U){
 101.415 +                av_log(AV_LOG_ERROR, "deblocking filter parameters %d %d out of range\n", s->slice_alpha_c0_offset, s->slice_beta_offset);
 101.416 +                return -1;
 101.417 +            }
 101.418 +        }
 101.419 +    }
 101.420 +
 101.421 +    s->qp_thresh= 15 + 52 - FFMIN(s->slice_alpha_c0_offset, s->slice_beta_offset) - FFMAX3(0, s->pps.chroma_qp_index_offset[0], s->pps.chroma_qp_index_offset[1]);
 101.422 +
 101.423 +    return 0;
 101.424 +}
 101.425 +
 101.426 +PictureInfo *get_pib_entry(NalContext *nc, int coded_pic_num){
 101.427 +    PictureInfo *pic = NULL;
 101.428 +
 101.429 +    for(int i=0; i<MAX_REF_PIC_COUNT+1; i++){
 101.430 +        if(nc->picture[i].reference==0){
 101.431 +            pic= &nc->picture[i];
 101.432 +            break;
 101.433 +        }
 101.434 +    }
 101.435 +    pic->cpn = coded_pic_num;
 101.436 +
 101.437 +    return pic;
 101.438 +}
 101.439 +
 101.440 +int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb1){
 101.441 +    GetBitContext *gb = gb1;
 101.442 +    uint8_t *buf = gb1->raw;
 101.443 +    int buf_size = gb1->buf_size;
 101.444 +    int next_avc = buf_size;
 101.445 +    int buf_index=0;
 101.446 +    uint8_t *dst=NULL;
 101.447 +//     gb->raw = gb1->raw;
 101.448 +//     gb->rbsp = NULL;
 101.449 +    s->release_cnt=0;
 101.450 +    ff_h264_reset_sei(n);
 101.451 +
 101.452 +    s->current_picture_info = get_pib_entry(n, n->coded_pic_num);
 101.453 +
 101.454 +    for(;;){
 101.455 +        int consumed;
 101.456 +        int dst_length;
 101.457 +        int bit_length;
 101.458 +        const uint8_t *ptr;
 101.459 +        int err;
 101.460 +
 101.461 +        if (buf_index >= buf_size){
 101.462 +            break;
 101.463 +        } else {
 101.464 +            // start code prefix search
 101.465 +            for(; buf_index + 3 < buf_size; buf_index++){
 101.466 +                // This should always succeed in the first iteration.
 101.467 +                if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
 101.468 +                    break;
 101.469 +            }
 101.470 +            if(buf_index+3 >= buf_size) break;
 101.471 +            buf_index+=3;
 101.472 +        }
 101.473 +
 101.474 +        {
 101.475 +            int length = next_avc - buf_index;
 101.476 +            int i, si, di;
 101.477 +            uint8_t *src= buf+buf_index;
 101.478 +            //    src[0]&0x80;                //forbidden bit
 101.479 +            s->nal_ref_idc= src[0]>>5;
 101.480 +            s->nal_unit_type= src[0]&0x1F;
 101.481 +
 101.482 +            src++; length--;
 101.483 +
 101.484 +            for(i=0; i+1<length; i+=2){
 101.485 +                if(src[i]) continue;
 101.486 +                if(i>0 && src[i-1]==0) i--;
 101.487 +                if(i+2<length && src[i+1]==0 && src[i+2]<=3){
 101.488 +                    if(src[i+2]!=3){
 101.489 +                        /* startcode, so we must be past the end */
 101.490 +                        length=i;
 101.491 +                    }
 101.492 +                    break;
 101.493 +                }
 101.494 +            }
 101.495 +
 101.496 +            if(i>=length-1){ //no escaped 0
 101.497 +                dst_length= length;
 101.498 +                consumed= length+1; //+1 for the header
 101.499 +                ptr=src;
 101.500 +            }else{
 101.501 +                av_fast_malloc(&gb->rbsp, &gb->rbsp_size, length+FF_INPUT_BUFFER_PADDING_SIZE);
 101.502 +                dst = gb->rbsp;
 101.503 +//                 if (dst){
 101.504 +//                     av_free(dst);
 101.505 +//                 }
 101.506 +//                 dst = av_malloc(length+FF_INPUT_BUFFER_PADDING_SIZE);
 101.507 +
 101.508 +                if (dst == NULL){
 101.509 +                    return -1;
 101.510 +                }
 101.511 +
 101.512 +                //printf("decoding esc\n");
 101.513 +                memcpy(dst, src, i);
 101.514 +                si=di=i;
 101.515 +                while(si+2<length){
 101.516 +                    //remove escapes (very rare 1:2^22)
 101.517 +                    if(src[si+2]>3){
 101.518 +                        dst[di++]= src[si++];
 101.519 +                        dst[di++]= src[si++];
 101.520 +                    }else if(src[si]==0 && src[si+1]==0){
 101.521 +                        if(src[si+2]==3){ //escape
 101.522 +                            dst[di++]= 0;
 101.523 +                            dst[di++]= 0;
 101.524 +                            si+=3;
 101.525 +                            continue;
 101.526 +                        }else //next start code
 101.527 +                            goto nsc;
 101.528 +                    }
 101.529 +
 101.530 +                    dst[di++]= src[si++];
 101.531 +                }
 101.532 +                while(si<length)
 101.533 +                    dst[di++]= src[si++];
 101.534 +                nsc:
 101.535 +
 101.536 +                memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
 101.537 +
 101.538 +                dst_length= di;
 101.539 +                consumed= si + 1;//+1 for the header
 101.540 +                //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
 101.541 +                ptr=dst;
 101.542 +//                 gb->rbsp=ptr;
 101.543 +            }
 101.544 +        }
 101.545 +        if (ptr==NULL || dst_length < 0){
 101.546 +            return -1;
 101.547 +        }
 101.548 +
 101.549 +        //error prevention, should not touch dst_length
 101.550 +        while(ptr[dst_length - 1] == 0 && dst_length > 0)
 101.551 +            dst_length--;
 101.552 +
 101.553 +        bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(ptr + dst_length - 1));
 101.554 +        buf_index += consumed;
 101.555 +
 101.556 +        err = 0;
 101.557 +        init_get_bits(gb, ptr, bit_length);
 101.558 +        switch(s->nal_unit_type){
 101.559 +            case NAL_IDR_SLICE:
 101.560 +                idr(n, s); //FIXME ensure we don't loose some frames if there is reordering
 101.561 +            case NAL_SLICE:
 101.562 +                if((err = decode_slice_header(n, s, gb)))
 101.563 +                    break;
 101.564 +                s->key_frame |= (s->nal_unit_type == NAL_IDR_SLICE) || (n->sei_recovery_frame_cnt >= 0);
 101.565 +                break;
 101.566 +            case NAL_DPA:
 101.567 +            case NAL_DPB:
 101.568 +            case NAL_DPC:
 101.569 +                av_log(AV_LOG_ERROR,"no slices/data partitioning support\n");
 101.570 +                break;
 101.571 +            case NAL_SEI:
 101.572 +                ff_h264_decode_sei(n, gb);
 101.573 +                break;
 101.574 +            case NAL_SPS:
 101.575 +                ff_h264_decode_seq_parameter_set(n, gb);
 101.576 +                break;
 101.577 +            case NAL_PPS:
 101.578 +                ff_h264_decode_picture_parameter_set(n, gb, bit_length);
 101.579 +                break;
 101.580 +            case NAL_AUD:
 101.581 +            case NAL_END_SEQUENCE:
 101.582 +            case NAL_END_STREAM:
 101.583 +            case NAL_FILLER_DATA:
 101.584 +            case NAL_SPS_EXT:
 101.585 +            case NAL_AUXILIARY_SLICE:
 101.586 +                break;
 101.587 +            default:
 101.588 +                av_log(AV_LOG_ERROR, "Unknown NAL code: %d (%d bits)\n", s->nal_unit_type, bit_length);
 101.589 +        }
 101.590 +        if (err < 0)
 101.591 +            av_log(AV_LOG_ERROR, "decode_slice_header error\n");
 101.592 +
 101.593 +    }
 101.594 +
 101.595 +    return buf_index;
 101.596 +}
 101.597 +
 101.598 +NalContext *get_nal_context(int width, int height){
 101.599 +    const int mb_height = (height + 15) / 16;
 101.600 +    const int mb_width  = (width  + 15) / 16;
 101.601 +    const int mb_stride = ((mb_width+1)/16 + 1) *16; //align mb_stride to 16
 101.602 +
 101.603 +    NalContext *nc = av_mallocz(sizeof(NalContext));
 101.604 +    nc->width = width;
 101.605 +    nc->height = height;
 101.606 +    nc->mb_height = mb_height;
 101.607 +    nc->mb_width  = mb_width;
 101.608 +    nc->b4_stride = mb_width*4 + 1;
 101.609 +    nc->mb_stride = mb_stride;
 101.610 +    nc->outputed_poc = INT_MIN;
 101.611 +
 101.612 +    for(int i=0; i<16; i++){
 101.613 +        nc->picture[i].cpn =-1;
 101.614 +    }
 101.615 +
 101.616 +    return nc;
 101.617 +}
 101.618 +
 101.619 +void free_nal_context(NalContext *nc){
 101.620 +    for(int i = 0; i < MAX_SPS_COUNT; i++){
 101.621 +        if (nc->sps_buffers[i]){
 101.622 +            av_free( nc->sps_buffers[i]);
 101.623 +        }
 101.624 +    }
 101.625 +    for(int i = 0; i < MAX_PPS_COUNT; i++){
 101.626 +        if (nc->pps_buffers[i]){
 101.627 +            av_free( nc->pps_buffers[i]);
 101.628 +        }
 101.629 +    }
 101.630 +    av_free(nc);
 101.631 +}

   102.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   102.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_nal.h	Mon Aug 27 12:09:56 2012 +0200
   102.3 @@ -0,0 +1,11 @@
   102.4 +#ifndef H264_NAL_H
   102.5 +#define H264_NAL_H
   102.6 +
   102.7 +#include "avcodec.h"
   102.8 +#include "h264_types.h"
   102.9 +
/**
 * Decode the NAL units found in the raw buffer attached to gb, filling in
 * the slice s and updating the NAL context n.
 * @return offset just past the last NAL unit processed, or -1 on failure
 */
int decode_nal_units(NalContext *n, H264Slice *s, GetBitContext *gb);
/**
 * Allocate a zero-initialised NAL context and set its frame size and
 * macroblock dimensions from width and height.
 */
NalContext *get_nal_context(int width, int height);
/** Free the SPS/PPS buffers held by nc, then nc itself. */
void free_nal_context(NalContext *nc);
  102.13 +
  102.14 +#endif

   103.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   103.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_numa.c	Mon Aug 27 12:09:56 2012 +0200
   103.3 @@ -0,0 +1,33 @@
   103.4 +
   103.5 +#include <pthread.h>
   103.6 +#include "h264.h"
   103.7 +#include "malloc.h"
   103.8 +
   103.9 +/*
  103.10 +* Pthread version with affinity lock for ED and MBD threads. Deprecated
  103.11 +*/
  103.12 +int av_transcode_pthread_affinity(int ifile, int ofile, int frame_width, int frame_height, h264_options *opts) {
  103.13 +	H264Context *h;
  103.14 +	pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;
  103.15 +
  103.16 +	h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height, opts);	
  103.17 +	timer_start = av_gettime();
  103.18 +
  103.19 +	pthread_create(&read_thr, NULL, read_thread, h);
  103.20 +	pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
  103.21 +	pthread_create(&entropy_thr, NULL, entropy_IPB_thread, h);
  103.22 +	pthread_create(&mbdec_thr, NULL, mbdec_thread, h);
  103.23 +	pthread_create(&write_thr, NULL, write_thread, h);
  103.24 +
  103.25 +
  103.26 +	pthread_join(read_thr, NULL);
  103.27 +	pthread_join(parsenal_thr, NULL);
  103.28 +	pthread_join(entropy_thr, NULL);
  103.29 +	pthread_join(mbdec_thr, NULL);
  103.30 +	pthread_join(write_thr, NULL);
  103.31 +
  103.32 +	/* finished ! */
  103.33 +	ff_h264_decode_end(h);
  103.34 +
  103.35 +	return 0;
  103.36 +}

   104.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   104.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_ompss.c	Mon Aug 27 12:09:56 2012 +0200
   104.3 @@ -0,0 +1,400 @@
   104.4 +/*
   104.5 +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   104.6 +* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   104.7 +*
   104.8 +* This file is part of FFmpeg.
   104.9 +*
  104.10 +* FFmpeg is free software; you can redistribute it and/or
  104.11 +* modify it under the terms of the GNU Lesser General Public
  104.12 +* License as published by the Free Software Foundation; either
  104.13 +* version 2.1 of the License, or (at your option) any later version.
  104.14 +*
  104.15 +* FFmpeg is distributed in the hope that it will be useful,
  104.16 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
  104.17 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  104.18 +* Lesser General Public License for more details.
  104.19 +*
  104.20 +* You should have received a copy of the GNU Lesser General Public
  104.21 +* License along with FFmpeg; if not, write to the Free Software
  104.22 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  104.23 +*/
  104.24 +#include "h264_types.h"
  104.25 +#include "h264_parser.h"
  104.26 +#include "h264_nal.h"
  104.27 +#include "h264_entropy.h"
  104.28 +#include "h264_rec.h"
  104.29 +#include "h264_pred_mode.h"
  104.30 +#include "h264_misc.h"
  104.31 +// #undef NDEBUG
  104.32 +#include <assert.h>
  104.33 +
  104.34 +#pragma omp task inout(*pc, *nc) output(*sbe)
  104.35 +static void parse_task(H264Context *h, ParserContext *pc, NalContext *nc, SliceBufferEntry *sbe){
  104.36 +    H264Slice *s;
  104.37 +
  104.38 +    if (!sbe->initialized){
  104.39 +        init_sb_entry(h, sbe);
  104.40 +        sbe->lines_total=h->mb_height;
  104.41 +    }
  104.42 +
  104.43 +    av_read_frame_internal(pc, &sbe->gb);
  104.44 +    s = &sbe->slice;
  104.45 +
  104.46 +    decode_nal_units(nc, s, &sbe->gb);
  104.47 +}
  104.48 +
  104.49 +#pragma omp task inout(*ec) inout(*sbe)
  104.50 +static void decode_slice_entropy_task(H264Context *h, EntropyContext *ec, SliceBufferEntry *sbe){
  104.51 +    int i,j;
  104.52 +    H264Slice *s = &sbe->slice;
  104.53 +    GetBitContext *gb = &sbe->gb;
  104.54 +    H264Mb *mbs = sbe->mbs;
  104.55 +//     GetBitContext *gb = s->gb;
  104.56 +    CABACContext *c = &ec->c;
  104.57 +
  104.58 +    if( !s->pps.cabac ){
  104.59 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
  104.60 +        return ;
  104.61 +    }
  104.62 +
  104.63 +    init_dequant_tables(s, ec);
  104.64 +    ec->curr_qscale = s->qscale;
  104.65 +    ec->last_qscale_diff = 0;
  104.66 +    ec->chroma_qp[0] = get_chroma_qp((H264Slice *) s, 0, s->qscale);
  104.67 +    ec->chroma_qp[1] = get_chroma_qp((H264Slice *) s, 1, s->qscale);
  104.68 +
  104.69 +    /* realign */
  104.70 +    align_get_bits( gb );
  104.71 +    /* init cabac */
  104.72 +    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
  104.73 +
  104.74 +    ff_h264_init_cabac_states(ec, s, c);
  104.75 +
  104.76 +    for(j=0; j<ec->mb_height; j++){
  104.77 +        init_entropy_buf(ec, s, j);
  104.78 +        for(i=0; i<ec->mb_width; i++){
  104.79 +            int eos,ret;
  104.80 +            H264Mb *m = &mbs[i + j*ec->mb_width];
  104.81 +            m->mb_x=i;
  104.82 +            m->mb_y=j;
  104.83 +            ec->m = m;
  104.84 +
  104.85 +            ret = ff_h264_decode_mb_cabac(ec, s, c);
  104.86 +            eos = get_cabac_terminate( c);
  104.87 +            (void) eos;
  104.88 +            if( ret < 0 || c->bytestream > c->bytestream_end + 2) {
  104.89 +                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
  104.90 +                return ;
  104.91 +            }
  104.92 +        }
  104.93 +    }
  104.94 +}
  104.95 +
  104.96 +static void decode_super_mb_block(MBRecContext *d, H264Slice *s, SuperMBContext *smbc, H264Mb *mbs, int smb_x, int smb_y){
  104.97 +    MBRecState mrs;
  104.98 +//     memset(&mrs, 0, sizeof(MBRecState));
  104.99 +
 104.100 +    for (int k=0, i= smb_y; i< smb_y + smbc->smb_height; i++, k++){
 104.101 +        init_mbrec_context(d, &mrs, s, i);
 104.102 +        for (int j= smb_x -k ; j< smb_x - k + smbc->smb_width; j++){
 104.103 +            if (i< d->mb_height && j >= 0 && j < d->mb_width){
 104.104 +                h264_decode_mb_internal (d, &mrs, s, &mbs[i*d->mb_width+j]);
 104.105 +            }
 104.106 +        }
 104.107 +    }
 104.108 +}
 104.109 +
 104.110 +#pragma omp task input(*d, *sbe, *ml, *mur) inout(*m)
 104.111 +static void decode_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml,
 104.112 +SuperMBTask *mur, SuperMBTask *m){
 104.113 +    H264Slice *s = &sbe->slice;
 104.114 +    H264Mb *mbs = sbe->mbs;
 104.115 +    decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y);
 104.116 +}
 104.117 +
 104.118 +#pragma omp task input(*d, *sbe) inout(*sm)
 104.119 +static void draw_edges_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *sm, int line){
 104.120 +    H264Slice *s = &sbe->slice;
 104.121 +    for (int i=line*smbc->smb_height; i< (line+1)*smbc->smb_height && i< d->mb_height; i++)
 104.122 +        draw_edges(d, s, i);
 104.123 +}
 104.124 +
 104.125 +static void decode_mb_in_slice(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){
 104.126 +    int i,j;
 104.127 +
 104.128 +    SuperMBContext *smbc = acquire_smbc(h);
 104.129 +    int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width;
 104.130 +    SuperMBTask *smbs = smbc->smbs[0];
 104.131 +
 104.132 +    SuperMBTask *sm=NULL, *sml, *smur;
 104.133 +    for(j=0; j< smb_height; j++){
 104.134 +        for(i=0; i< smb_width; i++){
 104.135 +            sm = smbs + j*smb_width + i;
 104.136 +            sml  = sm - ((i > 0) ? 1: 0);
 104.137 +            smur = sm + (((i < smb_width-1) && (j >0))  ? -smb_width+1: 0);
 104.138 +            decode_super_mb_task(d, sbe, smbc, sml, smur, sm);
 104.139 +        }
 104.140 +        draw_edges_task(d, sbe, smbc, sm, j);
 104.141 +    }
 104.142 +    #pragma omp taskwait on(*sm)
 104.143 +
 104.144 +    release_smbc(h, smbc);
 104.145 +}
 104.146 +
 104.147 +#pragma omp task inout(*d) inout(*sbe)
 104.148 +static void decode_slice_mb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe){
 104.149 +    H264Slice *s = &sbe->slice;
 104.150 +
 104.151 +    for (int i=0; i<2; i++){
 104.152 +        for(int j=0; j< s->ref_count[i]; j++){
 104.153 +            if (s->ref_list_cpn[i][j] ==-1)
 104.154 +                continue;
 104.155 +            int k;
 104.156 +            for (k=0; k< h->max_dpb_cnt; k++){
 104.157 +                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
 104.158 +                    s->dp_ref_list[i][j] = &h->dpb[k];
 104.159 +                    break;
 104.160 +                }
 104.161 +            }
 104.162 +        }
 104.163 +    }
 104.164 +
 104.165 +    #pragma omp critical (dpb)
 104.166 +    get_dpb_entry(h, s);
 104.167 +
 104.168 +    if (!h->no_mbd){
 104.169 +        decode_mb_in_slice (h, d, sbe);
 104.170 +    }
 104.171 +
 104.172 +    for (int i=0; i<s->release_cnt; i++){
 104.173 +        for(int j=0; j<h->max_dpb_cnt; j++){
 104.174 +            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
 104.175 +                #pragma omp critical (dpb)
 104.176 +                release_dpb_entry(h, &h->dpb[j], 2);
 104.177 +                break;
 104.178 +            }
 104.179 +        }
 104.180 +    }
 104.181 +    s->release_cnt=0;
 104.182 +}
 104.183 +
 104.184 +// for static 3d wave
 104.185 +/*-------------------------------------------------------------------------------*/
 104.186 +#pragma omp task input(*d, *sbe, *ml, *mur, *mprev) inout(*m)
 104.187 +static void decode_3dwave_super_mb_task(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc, SuperMBTask *ml,
 104.188 +SuperMBTask *mur, SuperMBTask *mprev, SuperMBTask *m){
 104.189 +    H264Slice *s = &sbe->slice;
 104.190 +    H264Mb *mbs = sbe->mbs;
 104.191 +
 104.192 +    decode_super_mb_block(d, s, smbc, mbs, m->smb_x, m->smb_y);
 104.193 +}
 104.194 +
 104.195 +// int init_ref_count=0;
 104.196 +#pragma omp task inout(*d, *sbe, *init)
 104.197 +static void init_ref_list_and_get_dpb_task(H264Context *h, MBRecContext *d, SliceBufferEntry *sbe, int *init){
 104.198 +    H264Slice *s = &sbe->slice;
 104.199 +    for (int i=0; i<2; i++){
 104.200 +        for(int j=0; j< s->ref_count[i]; j++){
 104.201 +            if (s->ref_list_cpn[i][j] ==-1)
 104.202 +                continue;
 104.203 +            int k;
 104.204 +            for (k=0; k<h->max_dpb_cnt; k++){
 104.205 +                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
 104.206 +                    s->dp_ref_list[i][j] = &h->dpb[k];
 104.207 +                    break;
 104.208 +                }
 104.209 +            }
 104.210 +        }
 104.211 +    }
 104.212 +
 104.213 +    #pragma omp critical (dpb)
 104.214 +    get_dpb_entry(h, s);
 104.215 +
 104.216 +}
 104.217 +
 104.218 +static SuperMBTask* add_decode_slice_3dwave_tasks(MBRecContext *d, SliceBufferEntry *sbe, SuperMBContext *smbc){
 104.219 +    int i,j;
 104.220 +    
 104.221 +    int smb_3d_height =smbc->nsmb_3dheight;
 104.222 +    int smb_height =smbc->nsmb_height, smb_width= smbc->nsmb_width;
 104.223 +    int smb_diff_prev = smb_height - smb_3d_height;
 104.224 +    SuperMBTask *sm=NULL, *sml, *smur, *smprev;
 104.225 +
 104.226 +    SuperMBTask *smbs = smbc->smbs[smbc->index++]; smbc->index%=2; 
 104.227 +    SuperMBTask *smbs_prev = smbc->smbs[smbc->index]; // index rotates -> next == prev
 104.228 +    
 104.229 +    for(j=0; j<smb_3d_height ; j++){
 104.230 +        for(i=0; i< smb_width; i++){
 104.231 +            sm = smbs + j*smb_width + i;
 104.232 +            sml  = sm - ((i > 0) ? 1: 0);
 104.233 +            smur = sm + (((i < smb_width-1) && (j >0))  ? -smb_width+1: 0);
 104.234 +            smprev = smbs_prev + (j + smb_diff_prev+1)*smb_width -1;
 104.235 +            decode_3dwave_super_mb_task(d, sbe, smbc, sml, smur, smprev, sm);
 104.236 +        }
 104.237 +        draw_edges_task(d, sbe, smbc, sm, j);
 104.238 +    }
 104.239 +
 104.240 +    for(; j< smb_height; j++){
 104.241 +        for(i=0; i< smb_width; i++){
 104.242 +            sm = smbs + j*smb_width + i;
 104.243 +            sml  = sm - ((i > 0) ? 1: 0);
 104.244 +            smur = sm + (((i < smb_width-1) && (j >0))  ? -smb_width+1: 0);
 104.245 +            decode_super_mb_task(d, sbe, smbc, sml, smur, sm);
 104.246 +        }
 104.247 +        draw_edges_task(d, sbe, smbc, sm, j);
 104.248 +    }
 104.249 +    return sm;
 104.250 +}
 104.251 +
 104.252 +#pragma omp task inout(*d, *sbe, *release) input (*lastsmb)
 104.253 +static void release_ref_list_task(H264Context *h, SuperMBContext *smbc, MBRecContext *d, SliceBufferEntry *sbe, SuperMBTask *lastsmb, int *release){
 104.254 +    H264Slice *s = &sbe->slice;
 104.255 +    for (int i=0; i<s->release_cnt; i++){
 104.256 +        for(int j=0; j<h->max_dpb_cnt; j++){
 104.257 +            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
 104.258 +                #pragma omp critical (dpb)
 104.259 +                release_dpb_entry(h, &h->dpb[j], 2);
 104.260 +                break;
 104.261 +            }
 104.262 +        }
 104.263 +    }
 104.264 +    s->release_cnt=0;
 104.265 +
 104.266 +    release_smbc(h, smbc);
 104.267 +    
 104.268 +}
 104.269 +
 104.270 +// static void decode_mb_static_3dwave(H264Context *h, int mb_height, int mb_width, MBRecContext *d, H264Slice *s, H264Mb *mbs, SuperMBTask *smbs, SuperMBTask *smbs_prev){
 104.271 +//
 104.272 +// }
 104.273 +/*-------------------------------------------------------------------------------*/
 104.274 +//end for static 3d wave
 104.275 +
 104.276 +#pragma omp task inout (*oc) input(*sbe)
 104.277 +static void output_task(H264Context *h, OutputContext *oc, SliceBufferEntry *sbe){
 104.278 +    DecodedPicture* out =output_frame(h, oc, sbe->slice.curr_pic, h->ofile, h->frame_width, h->frame_height);
 104.279 +    if (out){
 104.280 +        #pragma omp critical (dpb)
 104.281 +        release_dpb_entry(h, out, 1);
 104.282 +    }
 104.283 +    print_report(oc->frame_number, oc->video_size, 0, h->verbose);
 104.284 +}
 104.285 +
 104.286 +/*
 104.287 +* Main loop of the decoder: sets up the per-stage contexts, issues parse, entropy-decode, macroblock-reconstruction and output work for each frame until the input ends, h->num_frames is reached or h->quit is set, then flushes the output and frees the contexts. Always returns 0.
 104.288 +*/
 104.289 +int h264_decode_ompss( H264Context *h) {
 104.290 +    const int bufs = h->pipe_bufs; /* number of slice-buffer / entropy-context slots */
 104.291 +
 104.292 +    ParserContext *pc;
 104.293 +    NalContext *nc;
 104.294 +    EntropyContext *ec[bufs]; /* variable-length array: one entropy context per slot */
 104.295 +    MBRecContext *rc[2]; /* two reconstruction contexts, selected with k%2 in the pipelined path */
 104.296 +    OutputContext *oc;
 104.297 +    SliceBufferEntry *sbe;
 104.298 +    SuperMBContext *smbc;
 104.299 +
 104.300 +    DecodedPicture *out;
 104.301 +    int frames=0;
 104.302 +
 104.303 +#if HAVE_LIBSDL2
 104.304 +    pthread_t sdl_thr;
 104.305 +    if (h->display){
 104.306 +        pthread_create(&sdl_thr, NULL, sdl_thread, h); /* NOTE(review): return value is not checked */
 104.307 +    }
 104.308 +#endif
 104.309 +    sbe= av_mallocz(sizeof(SliceBufferEntry) * bufs); /* zero-initialised array of bufs slots; NOTE(review): result is not checked for NULL */
 104.310 +
 104.311 +
 104.312 +    pc = get_parse_context(h->ifile);
 104.313 +    nc = get_nal_context(h->width, h->height);
 104.314 +
 104.315 +    for(int i=0; i<bufs; i++){
 104.316 +        ec[i] = get_entropy_context( h );
 104.317 +    }
 104.318 +
 104.319 +    for(int i=0; i<2; i++){
 104.320 +        rc[i] = get_mbrec_context(h);
 104.321 +    }
 104.322 +
 104.323 +    oc = get_output_context( h );
 104.324 +
 104.325 +    av_start_timer();
 104.326 +    int k=0; int init, release; /* k: running slot counter; init and release are only passed by address to the *_task calls and never read here */
 104.327 +    if (h->static_3d && bufs < h->num_frames ){ /* pipelined path: entropy decoding runs ahead of reconstruction */
 104.328 +        int num_pre_ed =0;
 104.329 +        for (num_pre_ed=0; num_pre_ed< bufs -1 && !pc->final_frame; num_pre_ed++){ /* prologue: parse and entropy-decode up to bufs-1 frames without reconstructing them */
 104.330 +            parse_task( h, pc, nc, &sbe[k%bufs] );
 104.331 +            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
 104.332 +            #pragma omp taskwait on(*pc)
 104.333 +            k++;
 104.334 +        }
 104.335 +
 104.336 +        while(!pc->final_frame && frames++ < h->num_frames && !h->quit){ /* steady state: each iteration entropy-decodes one new frame and reconstructs + outputs one earlier slot */
 104.337 +            parse_task( h, pc, nc, &sbe[k%bufs] );
 104.338 +            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
 104.339 +
 104.340 +            k++; /* from here on k%bufs selects the slot to reconstruct and k%2 the MBRecContext */
 104.341 +
 104.342 +            init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init);
 104.343 +            smbc = acquire_smbc(h);
 104.344 +            SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc);
 104.345 +            release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release);
 104.346 +
 104.347 +            output_task (h, oc, &sbe[k%bufs]);
 104.348 +            #pragma omp taskwait on(*pc)
 104.349 +        }
 104.350 +
 104.351 +        for (int i=0; i< num_pre_ed; i++){ /* epilogue: reconstruct and output the frames that were entropy-decoded ahead; NOTE(review): if the prologue stopped early on final_frame, the k++ below appears to select a slot that was never filled -- confirm */
 104.352 +            k++;
 104.353 +            init_ref_list_and_get_dpb_task(h, rc[k%2], &sbe[k%bufs], &init);
 104.354 +            smbc = acquire_smbc(h);
 104.355 +            SuperMBTask *lastsmb= add_decode_slice_3dwave_tasks(rc[k%2], &sbe[k%bufs], smbc);
 104.356 +            release_ref_list_task(h, smbc, rc[k%2], &sbe[k%bufs], lastsmb, &release);
 104.357 +
 104.358 +            output_task (h, oc, &sbe[k%bufs]);
 104.359 +        }
 104.360 +
 104.361 +    } else { /* simple path: all four stages are issued per frame on the same slot, using rc[0] only */
 104.362 +        while(!pc->final_frame && frames++ < h->num_frames && !h->quit){
 104.363 +            parse_task( h, pc, nc, &sbe[k%bufs] );
 104.364 +
 104.365 +            decode_slice_entropy_task(h, ec[k%bufs], &sbe[k%bufs]);
 104.366 +
 104.367 +            decode_slice_mb_task(h, rc[0], &sbe[k%bufs]);
 104.368 +
 104.369 +            output_task (h, oc, &sbe[k%bufs]);
 104.370 +            #pragma omp taskwait on(*pc)
 104.371 +            k++;
 104.372 +        }
 104.373 +    }
 104.374 +    #pragma omp taskwait
 104.375 +
 104.376 +    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; /* flush: keep calling output_frame() with a NULL picture until it returns NULL */
 104.377 +
 104.378 +    print_report(oc->frame_number, oc->video_size, 1, h->verbose); /* third argument is 1 for this final report */
 104.379 +    h->num_frames = oc->frame_number; /* overwrite the requested frame count with the output context's frame counter */
 104.380 +    /* finished ! */
 104.381 +
 104.382 +    free_parse_context(pc);
 104.383 +    free_nal_context  (nc);
 104.384 +    free_output_context(oc);
 104.385 +    for (int i=0; i<bufs; i++){
 104.386 +        free_sb_entry(&sbe[i]);
 104.387 +        free_entropy_context(ec[i]);
 104.388 +    }
 104.389 +    av_free(sbe);
 104.390 +
 104.391 +    for (int i=0; i<2; i++){
 104.392 +        free_mbrec_context(rc[i]);
 104.393 +    }
 104.394 +
 104.395 +#if HAVE_LIBSDL2
 104.396 +    if (h->display){
 104.397 +        signal_sdl_exit(h);
 104.398 +        pthread_join(sdl_thr, NULL);
 104.399 +    }
 104.400 +#endif
 104.401 +
 104.402 +    return 0;
 104.403 +}

   105.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   105.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_parser.c	Mon Aug 27 12:09:56 2012 +0200
   105.3 @@ -0,0 +1,224 @@
   105.4 +/*
   105.5 + * H.26L/H.264/AVC/JVT/14496-10/... parser
   105.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   105.7 + *
   105.8 + * This file is part of FFmpeg.
   105.9 + *
  105.10 + * FFmpeg is free software; you can redistribute it and/or
  105.11 + * modify it under the terms of the GNU Lesser General Public
  105.12 + * License as published by the Free Software Foundation; either
  105.13 + * version 2.1 of the License, or (at your option) any later version.
  105.14 + *
  105.15 + * FFmpeg is distributed in the hope that it will be useful,
  105.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  105.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  105.18 + * Lesser General Public License for more details.
  105.19 + *
  105.20 + * You should have received a copy of the GNU Lesser General Public
  105.21 + * License along with FFmpeg; if not, write to the Free Software
  105.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  105.23 + */
  105.24 +
  105.25 +/**
  105.26 + * @file
  105.27 + * H.264 / AVC / MPEG4 part10 parser.
  105.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  105.29 + */
  105.30 +
  105.31 +#include <unistd.h>
  105.32 +
  105.33 +#include "golomb.h"
  105.34 +#include "libavutil/error.h"
  105.35 +#include "h264_types.h"
  105.36 +
  105.37 +#undef NDEBUG
  105.38 +#include <assert.h>
  105.39 +
  105.40 +#define END_NOT_FOUND (-100)
  105.41 +/* Scan buf[0..buf_size) for the start of the next frame, resuming the start-code state machine saved in s->state. Returns the offset of that boundary relative to buf (negative when the start code began before buf) or END_NOT_FOUND, in which case the state is saved back into s->state. */
  105.42 +static int ff_h264_find_frame_end(ParserContext *s, const uint8_t *buf, int buf_size)
  105.43 +{
  105.44 +    int i;
  105.45 +    uint32_t state;
  105.46 +
  105.47 +    state= s->state;
  105.48 +    if(state>13)
  105.49 +        state= 7; /* any saved value above 13 is treated as state 7 (searching) */
  105.50 +
  105.51 +    for(i=0; i<buf_size; i++){
  105.52 +        if(state==7){ /* state 7: searching for the next zero byte */
  105.53 +        /* we check i<buf_size instead of i+3/7 because its simpler
  105.54 +         * and there should be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end
  105.55 +         */
  105.56 +            while(i<buf_size && !((~*(const uint64_t*)(buf+i) & (*(const uint64_t*)(buf+i) - 0x0101010101010101ULL)) & 0x8080808080808080ULL)) /* skip 8 bytes at a time while the loaded 64-bit word contains no zero byte */
  105.57 +                i+=8;
  105.58 +
  105.59 +            for(; i<buf_size; i++){
  105.60 +                if(!buf[i]){
  105.61 +                    state=2;
  105.62 +                    break;
  105.63 +                }
  105.64 +            }
  105.65 +        }else if(state<=2){ /* states 2,1,0: one, two, three-or-more consecutive zero bytes seen */
  105.66 +            if(buf[i]==1)   state^= 5; //2->7, 1->4, 0->5
  105.67 +            else if(buf[i]) state = 7;
  105.68 +            else            state>>=1; //2->1, 1->0, 0->0
  105.69 +        }else if(state<=5){ /* states 4,5: a start code just ended; buf[i] is the byte that follows it */
  105.70 +            int v= buf[i] & 0x1F; /* low five bits, i.e. the H.264 nal_unit_type */
  105.71 +            if(v==6 || v==7 || v==8 || v==9){ /* SEI, SPS, PPS or access unit delimiter */
  105.72 +                if(s->frame_start_found){
  105.73 +                    i++;
  105.74 +                    goto found;
  105.75 +                }
  105.76 +            }else if(v==1 || v==2 || v==5){ /* coded slice, slice data partition A or IDR slice */
  105.77 +                if(s->frame_start_found){
  105.78 +                    state+=8; /* 4->12, 5->13: inspect the next byte on the following iteration */
  105.79 +                    continue;
  105.80 +                }else
  105.81 +                    s->frame_start_found = 1;
  105.82 +            }
  105.83 +            state= 7;
  105.84 +        }else{ /* states 12,13: byte after the header of a slice NAL seen while a frame was already open */
  105.85 +            if(buf[i] & 0x80) /* top bit set; presumably first_mb_in_slice == 0, i.e. a new picture begins -- TODO confirm */
  105.86 +                goto found;
  105.87 +            state= 7;
  105.88 +        }
  105.89 +    }
  105.90 +    s->state= state;
  105.91 +    return END_NOT_FOUND;
  105.92 +
  105.93 +found:
  105.94 +    s->state=7;
  105.95 +    s->frame_start_found= 0;
  105.96 +    return i-(state&5); /* state is 4, 5, 12 or 13 here, so state&5 is 4 or 5: step back over the start code and the byte after it */
  105.97 +}
  105.98 +/* Accumulate input in gb->raw. With next == END_NOT_FOUND (and *buf_size != 0) all of *buf is appended and -1 is returned; otherwise the frame is completed with the first next bytes of *buf, *buf_size is set to the frame size and 0 is returned. */
  105.99 +static int ff_combine_frame(ParserContext *s, GetBitContext *gb, int next, uint8_t **buf, int *buf_size)
 105.100 +{
 105.101 +    int i;
 105.102 +    /* Copy overread bytes from last frame into buffer. */
 105.103 +    for(i =0; s->overread_cnt>0; s->overread_cnt--, i++){
 105.104 +        gb->raw[s->index++]= s->overread[i];
 105.105 +    }
 105.106 +
 105.107 +    /* EOF - END_NOT_FOUND means no next frame start is found in current partial read. If buf_size of the partial read is 0 we are at EOF */
 105.108 +    if(!*buf_size && next == END_NOT_FOUND){
 105.109 +        next= 0;
 105.110 +    }
 105.111 +    s->last_index= s->index; /* remember how many bytes were buffered before this call's data is appended */
 105.112 +
 105.113 +    /* copy into buffer and return */
 105.114 +    if(next == END_NOT_FOUND){
 105.115 +        gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, (*buf_size) + s->index + FF_INPUT_BUFFER_PADDING_SIZE); /* NOTE(review): the result is assigned back without a NULL check */
 105.116 +        memcpy(&gb->raw[s->index], *buf, *buf_size);
 105.117 +        s->index += *buf_size;
 105.118 +        return -1;
 105.119 +    }
 105.120 +
 105.121 +    ///end found
 105.122 +    *buf_size=  s->index + next; /* frame size: bytes already buffered plus the part taken from *buf (next may be negative) */
 105.123 +    /* append to buffer */
 105.124 +
 105.125 +    gb->raw = av_fast_realloc(gb->raw, &gb->alloc_size, next + s->index + FF_INPUT_BUFFER_PADDING_SIZE); /* NOTE(review): unchecked, as above */
 105.126 +    memcpy(&gb->raw[s->index], *buf, next + FF_INPUT_BUFFER_PADDING_SIZE ); /* also copies FF_INPUT_BUFFER_PADDING_SIZE bytes beyond the frame end */
 105.127 +    s->index = 0;
 105.128 +
 105.129 +    /* store overread bytes */
 105.130 +    for(i=0; next < 0; next++, i++){ /* next < 0: the frame ended inside previously buffered data; keep those trailing bytes for the next frame */
 105.131 +        s->state = (s->state<<8) | gb->raw[s->last_index + next];
 105.132 +        s->overread[i] = gb->raw[s->last_index + next];
 105.133 +        s->overread_cnt++;
 105.134 +    }
 105.135 +
 105.136 +    return 0;
 105.137 +}
 105.138 +/* Run frame-end detection on buf and merge the data into gb. When no complete frame is available, sets gb->buf_size to 0 and returns buf_size (everything consumed); otherwise sets gb->buf_size to the frame size and returns next, the possibly negative offset of the following frame within buf. */
 105.139 +static int h264_parse(ParserContext *s, GetBitContext *gb,
 105.140 +                      uint8_t *buf, int buf_size)
 105.141 +{
 105.142 +    int next;
 105.143 +
 105.144 +    next= ff_h264_find_frame_end(s, buf, buf_size);
 105.145 +
 105.146 +    if (ff_combine_frame(s, gb, next, &buf, &buf_size) < 0) { /* < 0: data was buffered, frame not finished yet */
 105.147 +        gb->buf_size = 0;
 105.148 +        return buf_size;
 105.149 +    }
 105.150 +
 105.151 +    if(next<0 && next != END_NOT_FOUND){ /* boundary lies in bytes buffered by an earlier call */
 105.152 +        assert(s->last_index + next >= 0 );
 105.153 +        ff_h264_find_frame_end(s, &gb->raw[s->last_index + next], -next); //update state
 105.154 +    }
 105.155 +
 105.156 +    gb->buf_size = buf_size; /* ff_combine_frame() replaced buf_size with the size of the assembled frame */
 105.157 +    return next;
 105.158 +}
 105.159 +
 105.160 +static int ff_raw_read_partial_packet(ParserContext *pc)
 105.161 +{
 105.162 +    int len= -1;
 105.163 +
 105.164 +    if (!pc->eof_reached){
 105.165 +        len = read( pc->ifile, pc->data, pc->buffer_size);
 105.166 +//         printf("read task %d\t%d\n", pc->ifile, len); fflush(NULL);
 105.167 +        if (len < pc->buffer_size) {
 105.168 +            pc->eof_reached = 1;
 105.169 +        }
 105.170 +    }
 105.171 +
 105.172 +    return len;
 105.173 +}
 105.174 +/* Fill gb with the next frame from pc's input, reading more data as needed. Returns with gb->buf_size set by h264_parse(); pc->final_frame is set when no more data can be read and the remaining buffered bytes have been handed out. */
 105.175 +void av_read_frame_internal(ParserContext *pc, GetBitContext *gb){
 105.176 +    int len;
 105.177 +    uint8_t dummy_buf[FF_INPUT_BUFFER_PADDING_SIZE]={0}; /* zero-filled stand-in input for the final zero-length parse */
 105.178 +    av_fast_malloc(&gb->raw, &gb->alloc_size, 2048+FF_INPUT_BUFFER_PADDING_SIZE);
 105.179 +
 105.180 +    //Parsing is performed before read, since there are usually leftovers from parsing the previous frame.
 105.181 +    for(;;) {
 105.182 +        if (pc->cur_len>0){
 105.183 +            len = h264_parse(pc, gb, pc->cur_ptr, pc->cur_len);
 105.184 +            if (len<0) /* negative offset: the frame ended in data consumed earlier, so consume nothing now */
 105.185 +                len =0;
 105.186 +            //* increment read pointer */
 105.187 +            pc->cur_ptr += len;
 105.188 +            pc->cur_len -= len;
 105.189 +
 105.190 +            if (gb->buf_size) { /* non-zero only when h264_parse() assembled a frame */
 105.191 +                break;
 105.192 +            }
 105.193 +        }
 105.194 +
 105.195 +        //check for ret and not parser->eof_reached as one "read" can contain more than 1 frame
 105.196 +        pc->size= ff_raw_read_partial_packet(pc);
 105.197 +        if (pc->size < 0) { /* negative: nothing more can be read */
 105.198 +            pc->final_frame =1;
 105.199 +            /* return the last frames, if any */
 105.200 +            h264_parse(pc, gb, dummy_buf, 0);
 105.201 +            break;
 105.202 +        }
 105.203 +        pc->cur_ptr = pc->data; /* restart parsing at the beginning of the freshly read chunk */
 105.204 +        pc->cur_len = pc->size;
 105.205 +    }
 105.206 +
 105.207 +    assert(gb->raw!=NULL);
 105.208 +
 105.209 +}
 105.210 +/* Allocate a zeroed ParserContext that reads from file descriptor ifile through a 2048-byte buffer (allocated with FF_INPUT_BUFFER_PADDING_SIZE extra bytes). Release it with free_parse_context(). NOTE(review): neither allocation is checked for NULL. */
 105.211 +ParserContext *get_parse_context(int ifile){
 105.212 +    ParserContext *pc = av_mallocz(sizeof(ParserContext));
 105.213 +    pc->buffer_size = 2048; /* number of bytes requested per read() */
 105.214 +    pc->final_frame = 0;
 105.215 +    pc->cur_len= 0; /* nothing buffered yet, so the first av_read_frame_internal() call starts with a read */
 105.216 +    pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE);
 105.217 +    pc->size = 2048;
 105.218 +    pc->eof_reached =0;
 105.219 +    pc->ifile = ifile;
 105.220 +
 105.221 +    return pc;
 105.222 +}
 105.223 +/* Free the read buffer pc->data and then the context itself. pc->ifile is not closed here. */
 105.224 +void free_parse_context(ParserContext *pc){
 105.225 +    av_free(pc->data);
 105.226 +    av_free(pc);
 105.227 +}

   106.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   106.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_parser.h	Mon Aug 27 12:09:56 2012 +0200
   106.3 @@ -0,0 +1,10 @@
   106.4 +#ifndef H264_PARSER_H
   106.5 +#define H264_PARSER_H
   106.6 +
   106.7 +#include "h264_types.h"
   106.8 +/* Frame-level reader for a raw H.264 byte stream taken from a file descriptor. */
   106.9 +void av_read_frame_internal(ParserContext *pc, GetBitContext *gb); /* assemble the next frame into gb->raw / gb->buf_size */
  106.10 +ParserContext *get_parse_context(int ifile); /* allocate a parser reading from file descriptor ifile */
  106.11 +void free_parse_context(ParserContext *pc); /* free a parser obtained from get_parse_context() */
  106.12 +
  106.13 +#endif

   107.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   107.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred.c	Mon Aug 27 12:09:56 2012 +0200
   107.3 @@ -0,0 +1,945 @@
   107.4 +/*
   107.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   107.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   107.7 + *
   107.8 + * This file is part of FFmpeg.
   107.9 + *
  107.10 + * FFmpeg is free software; you can redistribute it and/or
  107.11 + * modify it under the terms of the GNU Lesser General Public
  107.12 + * License as published by the Free Software Foundation; either
  107.13 + * version 2.1 of the License, or (at your option) any later version.
  107.14 + *
  107.15 + * FFmpeg is distributed in the hope that it will be useful,
  107.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  107.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  107.18 + * Lesser General Public License for more details.
  107.19 + *
  107.20 + * You should have received a copy of the GNU Lesser General Public
  107.21 + * License along with FFmpeg; if not, write to the Free Software
  107.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  107.23 + */
  107.24 +
  107.25 +/**
  107.26 + * @file
  107.27 + * H.264 / AVC / MPEG4 part10 prediction functions.
  107.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  107.29 + */
  107.30 +
  107.31 +#include "avcodec.h"
  107.32 +#include "h264_pred.h"
  107.33 +//#include "dsputil.h"
  107.34 +/* 4x4 vertical prediction: copy the four pixels of the row above src into each of the four rows. topright is unused. */
  107.35 +static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
  107.36 +	(void) topright;
  107.37 +    const uint32_t a= ((uint32_t*)(src-stride))[0]; /* the four top neighbours, loaded as one 32-bit word */
  107.38 +    ((uint32_t*)(src+0*stride))[0]= a;
  107.39 +    ((uint32_t*)(src+1*stride))[0]= a;
  107.40 +    ((uint32_t*)(src+2*stride))[0]= a;
  107.41 +    ((uint32_t*)(src+3*stride))[0]= a;
  107.42 +}
  107.43 +/* 4x4 horizontal prediction: fill each row with the pixel immediately to its left. topright is unused. */
  107.44 +static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
  107.45 +	(void) topright;
  107.46 +    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101; /* multiplying by 0x01010101 replicates the byte into all four bytes of the word */
  107.47 +    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
  107.48 +    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
  107.49 +    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
  107.50 +}
  107.51 +/* 4x4 DC prediction: fill the block with the rounded mean of the four top and four left neighbours. topright is unused. */
  107.52 +static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
  107.53 +	(void) topright;
  107.54 +    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
  107.55 +                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3; /* eight samples: +4 then >>3 rounds to nearest */
  107.56 +
  107.57 +    ((uint32_t*)(src+0*stride))[0]=
  107.58 +    ((uint32_t*)(src+1*stride))[0]=
  107.59 +    ((uint32_t*)(src+2*stride))[0]=
  107.60 +    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
  107.61 +}
  107.62 +/* 4x4 DC prediction from the left column only: fill the block with the rounded mean of the four left neighbours. topright is unused. */
  107.63 +static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
  107.64 +	(void) topright;
  107.65 +    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
  107.66 +
  107.67 +    ((uint32_t*)(src+0*stride))[0]=
  107.68 +    ((uint32_t*)(src+1*stride))[0]=
  107.69 +    ((uint32_t*)(src+2*stride))[0]=
  107.70 +    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
  107.71 +}
  107.72 +/* 4x4 DC prediction from the top row only: fill the block with the rounded mean of the four top neighbours. topright is unused. */
  107.73 +static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
  107.74 +	(void) topright;
  107.75 +    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
  107.76 +
  107.77 +    ((uint32_t*)(src+0*stride))[0]=
  107.78 +    ((uint32_t*)(src+1*stride))[0]=
  107.79 +    ((uint32_t*)(src+2*stride))[0]=
  107.80 +    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
  107.81 +}
  107.82 +/* 4x4 DC prediction without neighbours: fill the block with the constant 128. topright is unused. */
  107.83 +static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
  107.84 +	(void) topright;
  107.85 +    ((uint32_t*)(src+0*stride))[0]=
  107.86 +    ((uint32_t*)(src+1*stride))[0]=
  107.87 +    ((uint32_t*)(src+2*stride))[0]=
  107.88 +    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
  107.89 +}
  107.90 +
  107.91 +/* Neighbour-loading helpers for the 4x4 predictors: each macro declares four const int locals (marked av_unused) read from topright or from around src, so src, stride and (for the top-right edge) topright must be in scope where they are expanded. */
  107.92 +#define LOAD_TOP_RIGHT_EDGE\
  107.93 +    const int av_unused t4= topright[0];\
  107.94 +    const int av_unused t5= topright[1];\
  107.95 +    const int av_unused t6= topright[2];\
  107.96 +    const int av_unused t7= topright[3];\
  107.97 +
  107.98 +#define LOAD_DOWN_LEFT_EDGE\
  107.99 +    const int av_unused l4= src[-1+4*stride];\
 107.100 +    const int av_unused l5= src[-1+5*stride];\
 107.101 +    const int av_unused l6= src[-1+6*stride];\
 107.102 +    const int av_unused l7= src[-1+7*stride];\
 107.103 +
 107.104 +#define LOAD_LEFT_EDGE\
 107.105 +    const int av_unused l0= src[-1+0*stride];\
 107.106 +    const int av_unused l1= src[-1+1*stride];\
 107.107 +    const int av_unused l2= src[-1+2*stride];\
 107.108 +    const int av_unused l3= src[-1+3*stride];\
 107.109 +
 107.110 +#define LOAD_TOP_EDGE\
 107.111 +    const int av_unused t0= src[ 0-1*stride];\
 107.112 +    const int av_unused t1= src[ 1-1*stride];\
 107.113 +    const int av_unused t2= src[ 2-1*stride];\
 107.114 +    const int av_unused t3= src[ 3-1*stride];\
 107.115 +
 107.116 +static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){ /* 4x4 diagonal down-right prediction: pixels on the same down-right diagonal share one (1,2,1)/4-filtered value taken from the left column, the top-left corner and the top row. topright is unused. */
 107.117 +	(void) topright;
 107.118 +    const int lt= src[-1-1*stride]; /* top-left corner neighbour */
 107.119 +    LOAD_TOP_EDGE
 107.120 +    LOAD_LEFT_EDGE
 107.121 +
 107.122 +    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
 107.123 +    src[0+2*stride]=
 107.124 +    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
 107.125 +    src[0+1*stride]=
 107.126 +    src[1+2*stride]=
 107.127 +    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
 107.128 +    src[0+0*stride]=
 107.129 +    src[1+1*stride]=
 107.130 +    src[2+2*stride]=
 107.131 +    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
 107.132 +    src[1+0*stride]=
 107.133 +    src[2+1*stride]=
 107.134 +    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
 107.135 +    src[2+0*stride]=
 107.136 +    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 107.137 +    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 107.138 +}
 107.139 +/* 4x4 diagonal down-left prediction: pixels with the same x+y share one (1,2,1)/4-filtered value taken from the top row (t0..t3) and the four topright pixels (t4..t7). */
 107.140 +static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
 107.141 +    LOAD_TOP_EDGE
 107.142 +    LOAD_TOP_RIGHT_EDGE
 107.143 +//    LOAD_LEFT_EDGE
 107.144 +
 107.145 +    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
 107.146 +    src[1+0*stride]=
 107.147 +    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
 107.148 +    src[2+0*stride]=
 107.149 +    src[1+1*stride]=
 107.150 +    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
 107.151 +    src[3+0*stride]=
 107.152 +    src[2+1*stride]=
 107.153 +    src[1+2*stride]=
 107.154 +    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
 107.155 +    src[3+1*stride]=
 107.156 +    src[2+2*stride]=
 107.157 +    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
 107.158 +    src[3+2*stride]=
 107.159 +    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
 107.160 +    src[3+3*stride]=(t6 + 3*t7 + 2)>>2; /* last sample has no right neighbour, so t7 is weighted three times */
 107.161 +}
 107.162 +/* 4x4 vertical-right prediction: built from 2-tap averages and (1,2,1)/4-filtered values of the top-left corner, the top row and the left column (l0..l2). topright is unused. */
 107.163 +static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
 107.164 +	(void) topright;
 107.165 +    const int lt= src[-1-1*stride]; /* top-left corner neighbour */
 107.166 +    LOAD_TOP_EDGE
 107.167 +    LOAD_LEFT_EDGE
 107.168 +
 107.169 +    src[0+0*stride]=
 107.170 +    src[1+2*stride]=(lt + t0 + 1)>>1;
 107.171 +    src[1+0*stride]=
 107.172 +    src[2+2*stride]=(t0 + t1 + 1)>>1;
 107.173 +    src[2+0*stride]=
 107.174 +    src[3+2*stride]=(t1 + t2 + 1)>>1;
 107.175 +    src[3+0*stride]=(t2 + t3 + 1)>>1;
 107.176 +    src[0+1*stride]=
 107.177 +    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
 107.178 +    src[1+1*stride]=
 107.179 +    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
 107.180 +    src[2+1*stride]=
 107.181 +    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 107.182 +    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 107.183 +    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
 107.184 +    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 107.185 +}
 107.186 +/* 4x4 vertical-left prediction: rows 0 and 2 hold 2-tap averages, rows 1 and 3 hold (1,2,1)/4-filtered values, all taken from the top row (t0..t3) and the topright pixels (t4..t6). */
 107.187 +static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
 107.188 +    LOAD_TOP_EDGE
 107.189 +    LOAD_TOP_RIGHT_EDGE
 107.190 +
 107.191 +    src[0+0*stride]=(t0 + t1 + 1)>>1;
 107.192 +    src[1+0*stride]=
 107.193 +    src[0+2*stride]=(t1 + t2 + 1)>>1;
 107.194 +    src[2+0*stride]=
 107.195 +    src[1+2*stride]=(t2 + t3 + 1)>>1;
 107.196 +    src[3+0*stride]=
 107.197 +    src[2+2*stride]=(t3 + t4+ 1)>>1;
 107.198 +    src[3+2*stride]=(t4 + t5+ 1)>>1;
 107.199 +    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 107.200 +    src[1+1*stride]=
 107.201 +    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
 107.202 +    src[2+1*stride]=
 107.203 +    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
 107.204 +    src[3+1*stride]=
 107.205 +    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
 107.206 +    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
 107.207 +}
 107.208 +/* 4x4 horizontal-up prediction: built only from the left column l0..l3; positions past the last left neighbour are filled with l3. topright is unused. */
 107.209 +static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
 107.210 +	(void) topright;
 107.211 +    LOAD_LEFT_EDGE
 107.212 +
 107.213 +    src[0+0*stride]=(l0 + l1 + 1)>>1;
 107.214 +    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 107.215 +    src[2+0*stride]=
 107.216 +    src[0+1*stride]=(l1 + l2 + 1)>>1;
 107.217 +    src[3+0*stride]=
 107.218 +    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
 107.219 +    src[2+1*stride]=
 107.220 +    src[0+2*stride]=(l2 + l3 + 1)>>1;
 107.221 +    src[3+1*stride]=
 107.222 +    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2; /* l3 stands in for the missing neighbour below it, giving (l2 + 3*l3 + 2)>>2 */
 107.223 +    src[3+2*stride]=
 107.224 +    src[1+3*stride]=
 107.225 +    src[0+3*stride]=
 107.226 +    src[2+2*stride]=
 107.227 +    src[2+3*stride]=
 107.228 +    src[3+3*stride]=l3;
 107.229 +}
 107.230 +
 107.231 +/* 4x4 horizontal-down prediction: built from 2-tap averages and (1,2,1)/4-filtered values of the top-left corner, the top row (t0..t2) and the left column (l0..l3). topright is unused. */
 107.232 +static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
 107.233 +	(void) topright;
 107.234 +    const int lt= src[-1-1*stride]; /* top-left corner neighbour */
 107.235 +    LOAD_TOP_EDGE
 107.236 +    LOAD_LEFT_EDGE
 107.237 +
 107.238 +    src[0+0*stride]=
 107.239 +    src[2+1*stride]=(lt + l0 + 1)>>1;
 107.240 +    src[1+0*stride]=
 107.241 +    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
 107.242 +    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
 107.243 +    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
 107.244 +    src[0+1*stride]=
 107.245 +    src[2+2*stride]=(l0 + l1 + 1)>>1;
 107.246 +    src[1+1*stride]=
 107.247 +    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
 107.248 +    src[0+2*stride]=
 107.249 +    src[2+3*stride]=(l1 + l2+ 1)>>1;
 107.250 +    src[1+2*stride]=
 107.251 +    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
 107.252 +    src[0+3*stride]=(l2 + l3 + 1)>>1;
 107.253 +    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
 107.254 +}
 107.255 +/* 16x16 vertical prediction: copy the 16 pixels of the row above src into each of the 16 rows. */
 107.256 +static void pred16x16_vertical_c(uint8_t *src, int stride){
 107.257 +    int i;
 107.258 +    const uint32_t a= ((uint32_t*)(src-stride))[0]; /* the top row is loaded as four 32-bit words a..d */
 107.259 +    const uint32_t b= ((uint32_t*)(src-stride))[1];
 107.260 +    const uint32_t c= ((uint32_t*)(src-stride))[2];
 107.261 +    const uint32_t d= ((uint32_t*)(src-stride))[3];
 107.262 +
 107.263 +    for(i=0; i<16; i++){
 107.264 +        ((uint32_t*)(src+i*stride))[0]= a;
 107.265 +        ((uint32_t*)(src+i*stride))[1]= b;
 107.266 +        ((uint32_t*)(src+i*stride))[2]= c;
 107.267 +        ((uint32_t*)(src+i*stride))[3]= d;
 107.268 +    }
 107.269 +}
 107.270 +/* 16x16 horizontal prediction: fill each of the 16 rows with the pixel immediately to its left. */
 107.271 +static void pred16x16_horizontal_c(uint8_t *src, int stride){
 107.272 +    int i;
 107.273 +
 107.274 +    for(i=0; i<16; i++){
 107.275 +        ((uint32_t*)(src+i*stride))[0]=
 107.276 +        ((uint32_t*)(src+i*stride))[1]=
 107.277 +        ((uint32_t*)(src+i*stride))[2]=
 107.278 +        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101; /* left neighbour replicated into all four bytes of the word */
 107.279 +    }
 107.280 +}
 107.281 +/* 16x16 DC prediction: fill the block with the rounded mean of the 16 left and 16 top neighbours. */
 107.282 +static void pred16x16_dc_c(uint8_t *src, int stride){
 107.283 +    int i, dc=0;
 107.284 +
 107.285 +    for(i=0;i<16; i++){
 107.286 +        dc+= src[-1+i*stride]; /* left column */
 107.287 +    }
 107.288 +
 107.289 +    for(i=0;i<16; i++){
 107.290 +        dc+= src[i-stride]; /* top row */
 107.291 +    }
 107.292 +
 107.293 +    dc= 0x01010101*((dc + 16)>>5); /* 32 samples: +16 then >>5 rounds to nearest; the multiply replicates the byte */
 107.294 +
 107.295 +    for(i=0; i<16; i++){
 107.296 +        ((uint32_t*)(src+i*stride))[0]=
 107.297 +        ((uint32_t*)(src+i*stride))[1]=
 107.298 +        ((uint32_t*)(src+i*stride))[2]=
 107.299 +        ((uint32_t*)(src+i*stride))[3]= dc;
 107.300 +    }
 107.301 +}
 107.302 +/* 16x16 DC prediction from the left column only: fill the block with the rounded mean of the 16 left neighbours. */
 107.303 +static void pred16x16_left_dc_c(uint8_t *src, int stride){
 107.304 +    int i, dc=0;
 107.305 +
 107.306 +    for(i=0;i<16; i++){
 107.307 +        dc+= src[-1+i*stride];
 107.308 +    }
 107.309 +
 107.310 +    dc= 0x01010101*((dc + 8)>>4); /* 16 samples: +8 then >>4 rounds to nearest */
 107.311 +
 107.312 +    for(i=0; i<16; i++){
 107.313 +        ((uint32_t*)(src+i*stride))[0]=
 107.314 +        ((uint32_t*)(src+i*stride))[1]=
 107.315 +        ((uint32_t*)(src+i*stride))[2]=
 107.316 +        ((uint32_t*)(src+i*stride))[3]= dc;
 107.317 +    }
 107.318 +}
 107.319 +/* 16x16 DC prediction from the top row only: fill the block with the rounded mean of the 16 top neighbours. */
 107.320 +static void pred16x16_top_dc_c(uint8_t *src, int stride){
 107.321 +    int i, dc=0;
 107.322 +
 107.323 +    for(i=0;i<16; i++){
 107.324 +        dc+= src[i-stride];
 107.325 +    }
 107.326 +    dc= 0x01010101*((dc + 8)>>4); /* 16 samples: +8 then >>4 rounds to nearest */
 107.327 +
 107.328 +    for(i=0; i<16; i++){
 107.329 +        ((uint32_t*)(src+i*stride))[0]=
 107.330 +        ((uint32_t*)(src+i*stride))[1]=
 107.331 +        ((uint32_t*)(src+i*stride))[2]=
 107.332 +        ((uint32_t*)(src+i*stride))[3]= dc;
 107.333 +    }
 107.334 +}
 107.335 +/* 16x16 DC prediction without neighbours: fill the block with the constant 128. */
 107.336 +static void pred16x16_128_dc_c(uint8_t *src, int stride){
 107.337 +    int i;
 107.338 +
 107.339 +    for(i=0; i<16; i++){
 107.340 +        ((uint32_t*)(src+i*stride))[0]=
 107.341 +        ((uint32_t*)(src+i*stride))[1]=
 107.342 +        ((uint32_t*)(src+i*stride))[2]=
 107.343 +        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
 107.344 +    }
 107.345 +}
 107.346 +/* 16x16 plane prediction: derive a horizontal gradient H from the top row and a vertical gradient V from the left column, then fill the block with the clipped linear ramp. The svq3 and rv40 flags select alternative scalings of H and V (svq3 additionally swaps them); both zero gives the H.264 scaling. */
 107.347 +static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3, const int rv40){
 107.348 +  int i, j, k;
 107.349 +  int a;
 107.350 +  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clipping lookup table, indexed with possibly negative values */
 107.351 +  const uint8_t * const src0 = src+7-stride; /* centre of the top neighbour row */
 107.352 +  const uint8_t *src1 = src+8*stride-1;
 107.353 +  const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
 107.354 +  int H = src0[1] - src0[-1];
 107.355 +  int V = src1[0] - src2[ 0];
 107.356 +  for(k=2; k<=8; ++k) { /* weighted differences of samples k positions either side of the centre */
 107.357 +    src1 += stride; src2 -= stride;
 107.358 +    H += k*(src0[k] - src0[-k]);
 107.359 +    V += k*(src1[0] - src2[ 0]);
 107.360 +  }
 107.361 +  if(svq3){
 107.362 +    H = ( 5*(H/4) ) / 16;
 107.363 +    V = ( 5*(V/4) ) / 16;
 107.364 +
 107.365 +    /* required for 100% accuracy */
 107.366 +    i = H; H = V; V = i;
 107.367 +  }else if(rv40){
 107.368 +    H = ( H + (H>>2) ) >> 4;
 107.369 +    V = ( V + (V>>2) ) >> 4;
 107.370 +  }else{
 107.371 +    H = ( 5*H+32 ) >> 6;
 107.372 +    V = ( 5*V+32 ) >> 6;
 107.373 +  }
 107.374 +
 107.375 +  a = 16*(src1[0] + src2[16] + 1) - 7*(V+H); /* after the loop src1[0] is the bottom-left and src2[16] the top-right neighbour */
 107.376 +  for(j=16; j>0; --j) {
 107.377 +    int b = a;
 107.378 +    a += V; /* advance the row base by the vertical gradient */
 107.379 +    for(i=-16; i<0; i+=4) { /* four pixels per iteration, columns 0..15 */
 107.380 +      src[16+i] = cm[ (b    ) >> 5 ];
 107.381 +      src[17+i] = cm[ (b+  H) >> 5 ];
 107.382 +      src[18+i] = cm[ (b+2*H) >> 5 ];
 107.383 +      src[19+i] = cm[ (b+3*H) >> 5 ];
 107.384 +      b += 4*H;
 107.385 +    }
 107.386 +    src += stride;
 107.387 +  }
 107.388 +}
 107.389 +/* 16x16 plane prediction with both the svq3 and rv40 variants disabled. */
 107.390 +static void pred16x16_plane_c(uint8_t *src, int stride){
 107.391 +    pred16x16_plane_compat_c(src, stride, 0, 0);
 107.392 +}
 107.393 +
 107.394 +/* 8x8 vertical prediction: copy the eight pixels of the row above src into each of the eight rows. */
 107.395 +static void pred8x8_vertical_c(uint8_t *src, int stride){
 107.396 +    int i;
 107.397 +    const uint32_t a= ((uint32_t*)(src-stride))[0]; /* top row loaded as two 32-bit words */
 107.398 +    const uint32_t b= ((uint32_t*)(src-stride))[1];
 107.399 +
 107.400 +    for(i=0; i<8; i++){
 107.401 +        ((uint32_t*)(src+i*stride))[0]= a;
 107.402 +        ((uint32_t*)(src+i*stride))[1]= b;
 107.403 +    }
 107.404 +}
 107.405 +/* 8x8 horizontal prediction: fill each of the eight rows with the pixel immediately to its left. */
 107.406 +static void pred8x8_horizontal_c(uint8_t *src, int stride){
 107.407 +    int i;
 107.408 +
 107.409 +    for(i=0; i<8; i++){
 107.410 +        ((uint32_t*)(src+i*stride))[0]=
 107.411 +        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101; /* left neighbour replicated into all four bytes of the word */
 107.412 +    }
 107.413 +}
 107.414 +/* 8x8 DC prediction without neighbours: fill the block with the constant 128. */
 107.415 +static void pred8x8_128_dc_c(uint8_t *src, int stride){
 107.416 +    int i;
 107.417 +
 107.418 +    for(i=0; i<8; i++){
 107.419 +        ((uint32_t*)(src+i*stride))[0]=
 107.420 +        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
 107.421 +    }
 107.422 +}
 107.423 +/* 8x8 DC prediction from the left column only: the top four rows get the rounded mean of left neighbours 0..3, the bottom four rows that of left neighbours 4..7. */
 107.424 +static void pred8x8_left_dc_c(uint8_t *src, int stride){
 107.425 +    int i;
 107.426 +    int dc0, dc2;
 107.427 +
 107.428 +    dc0=dc2=0;
 107.429 +    for(i=0;i<4; i++){
 107.430 +        dc0+= src[-1+i*stride]; /* left neighbours of rows 0..3 */
 107.431 +        dc2+= src[-1+(i+4)*stride]; /* left neighbours of rows 4..7 */
 107.432 +    }
 107.433 +    dc0= 0x01010101*((dc0 + 2)>>2);
 107.434 +    dc2= 0x01010101*((dc2 + 2)>>2);
 107.435 +
 107.436 +    for(i=0; i<4; i++){
 107.437 +        ((uint32_t*)(src+i*stride))[0]=
 107.438 +        ((uint32_t*)(src+i*stride))[1]= dc0;
 107.439 +    }
 107.440 +    for(i=4; i<8; i++){
 107.441 +        ((uint32_t*)(src+i*stride))[0]=
 107.442 +        ((uint32_t*)(src+i*stride))[1]= dc2;
 107.443 +    }
 107.444 +}
 107.445 +
 107.446 +/* 8x8 DC prediction from the top row only: the left four columns get the rounded mean of top neighbours 0..3, the right four columns that of top neighbours 4..7. */
 107.447 +static void pred8x8_top_dc_c(uint8_t *src, int stride){
 107.448 +    int i;
 107.449 +    int dc0, dc1;
 107.450 +
 107.451 +    dc0=dc1=0;
 107.452 +    for(i=0;i<4; i++){
 107.453 +        dc0+= src[i-stride]; /* top neighbours of columns 0..3 */
 107.454 +        dc1+= src[4+i-stride]; /* top neighbours of columns 4..7 */
 107.455 +    }
 107.456 +    dc0= 0x01010101*((dc0 + 2)>>2);
 107.457 +    dc1= 0x01010101*((dc1 + 2)>>2);
 107.458 +
 107.459 +    for(i=0; i<4; i++){
 107.460 +        ((uint32_t*)(src+i*stride))[0]= dc0;
 107.461 +        ((uint32_t*)(src+i*stride))[1]= dc1;
 107.462 +    }
 107.463 +    for(i=4; i<8; i++){ /* rows 4..7 receive the same values as rows 0..3 */
 107.464 +        ((uint32_t*)(src+i*stride))[0]= dc0;
 107.465 +        ((uint32_t*)(src+i*stride))[1]= dc1;
 107.466 +    }
 107.467 +}
 107.468 +/* 8x8 DC prediction with one DC value per 4x4 quadrant: top-left from its left and top neighbours, top-right from the top neighbours 4..7, bottom-left from the left neighbours 4..7, bottom-right from both of those groups. */
 107.469 +static void pred8x8_dc_c(uint8_t *src, int stride){
 107.470 +    int i;
 107.471 +    int dc0, dc1, dc2, dc3;
 107.472 +
 107.473 +    dc0=dc1=dc2=0;
 107.474 +    for(i=0;i<4; i++){
 107.475 +        dc0+= src[-1+i*stride] + src[i-stride]; /* left rows 0..3 plus top columns 0..3 */
 107.476 +        dc1+= src[4+i-stride]; /* top columns 4..7 */
 107.477 +        dc2+= src[-1+(i+4)*stride]; /* left rows 4..7 */
 107.478 +    }
 107.479 +    dc3= 0x01010101*((dc1 + dc2 + 4)>>3); /* must be computed before dc1 and dc2 are overwritten with their replicated means below */
 107.480 +    dc0= 0x01010101*((dc0 + 4)>>3);
 107.481 +    dc1= 0x01010101*((dc1 + 2)>>2);
 107.482 +    dc2= 0x01010101*((dc2 + 2)>>2);
 107.483 +
 107.484 +    for(i=0; i<4; i++){
 107.485 +        ((uint32_t*)(src+i*stride))[0]= dc0;
 107.486 +        ((uint32_t*)(src+i*stride))[1]= dc1;
 107.487 +    }
 107.488 +    for(i=4; i<8; i++){
 107.489 +        ((uint32_t*)(src+i*stride))[0]= dc2;
 107.490 +        ((uint32_t*)(src+i*stride))[1]= dc3;
 107.491 +    }
 107.492 +}
 107.493 +/* 8x8 DC variant: top-only DC for the whole block, then the top-left 4x4 quadrant is redone with the left+top 4x4 DC. */
 107.494 +//the following 4 functions should not be optimized!
 107.495 +static void pred8x8_mad_cow_dc_l0t(uint8_t *src, int stride){
 107.496 +    pred8x8_top_dc_c(src, stride);
 107.497 +    pred4x4_dc_c(src, NULL, stride); /* overwrites only the top-left 4x4 quadrant */
 107.498 +}
 107.499 +/* 8x8 DC variant: regular per-quadrant DC for the whole block, then the top-left 4x4 quadrant is redone with the top-only 4x4 DC. */
 107.500 +static void pred8x8_mad_cow_dc_0lt(uint8_t *src, int stride){
 107.501 +    pred8x8_dc_c(src, stride);
 107.502 +    pred4x4_top_dc_c(src, NULL, stride); /* overwrites only the top-left 4x4 quadrant */
 107.503 +}
 107.504 +/* 8x8 DC variant: left-only DC for the whole block, then both bottom 4x4 quadrants are overwritten with the constant 128. */
 107.505 +static void pred8x8_mad_cow_dc_l00(uint8_t *src, int stride){
 107.506 +    pred8x8_left_dc_c(src, stride);
 107.507 +    pred4x4_128_dc_c(src + 4*stride    , NULL, stride); /* bottom-left quadrant */
 107.508 +    pred4x4_128_dc_c(src + 4*stride + 4, NULL, stride); /* bottom-right quadrant */
 107.509 +}
 107.510 +/* 8x8 DC variant: left-only DC for the whole block, then both top 4x4 quadrants are overwritten with the constant 128. */
 107.511 +static void pred8x8_mad_cow_dc_0l0(uint8_t *src, int stride){
 107.512 +    pred8x8_left_dc_c(src, stride);
 107.513 +    pred4x4_128_dc_c(src    , NULL, stride); /* top-left quadrant */
 107.514 +    pred4x4_128_dc_c(src + 4, NULL, stride); /* top-right quadrant */
 107.515 +}
 107.516 +
 107.517 +static void pred8x8_plane_c(uint8_t *src, int stride){ // 8x8 plane prediction: fits a linear ramp to the row above and the column left of src and writes it into the block
 107.518 +  int j, k;
 107.519 +  int a;
 107.520 +  uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; // lookup table applied to every output value; presumably clamps to the 8-bit range -- confirm against ff_cropTbl
 107.521 +  const uint8_t * const src0 = src+3-stride; // row above the block, column 3
 107.522 +  const uint8_t *src1 = src+4*stride-1; // column left of the block, row 4
 107.523 +  const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
 107.524 +  int H = src0[1] - src0[-1]; // k=1 term of the horizontal gradient
 107.525 +  int V = src1[0] - src2[ 0]; // k=1 term of the vertical gradient
 107.526 +  for(k=2; k<=4; ++k) { // add the k=2..4 terms, moving src1 down and src2 up one row per step
 107.527 +    src1 += stride; src2 -= stride;
 107.528 +    H += k*(src0[k] - src0[-k]);
 107.529 +    V += k*(src1[0] - src2[ 0]);
 107.530 +  }
 107.531 +  H = ( 17*H+16 ) >> 5;
 107.532 +  V = ( 17*V+16 ) >> 5;
 107.533 +
 107.534 +  a = 16*(src1[0] + src2[8]+1) - 3*(V+H); // src1 is now the left column at row 7, src2[8] the top row at column 7
 107.535 +  for(j=8; j>0; --j) { // one output row per iteration; a advances by V per row, each column adds H
 107.536 +    int b = a;
 107.537 +    a += V;
 107.538 +    src[0] = cm[ (b    ) >> 5 ];
 107.539 +    src[1] = cm[ (b+  H) >> 5 ];
 107.540 +    src[2] = cm[ (b+2*H) >> 5 ];
 107.541 +    src[3] = cm[ (b+3*H) >> 5 ];
 107.542 +    src[4] = cm[ (b+4*H) >> 5 ];
 107.543 +    src[5] = cm[ (b+5*H) >> 5 ];
 107.544 +    src[6] = cm[ (b+6*H) >> 5 ];
 107.545 +    src[7] = cm[ (b+7*H) >> 5 ];
 107.546 +    src += stride;
 107.547 +  }
 107.548 +}
 107.549 +
 107.550 +#define SRC(x,y) src[(x)+(y)*stride] /* sample at column x, row y relative to src; -1 addresses the neighbouring row/column */
 107.551 +#define PL(y) \
 107.552 +    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; /* (1,2,1)-filtered left neighbour of row y */
 107.553 +#define PREDICT_8x8_LOAD_LEFT \
 107.554 +    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
 107.555 +                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
 107.556 +    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
 107.557 +    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2 /* declares l0..l7; l0 substitutes SRC(-1,0) when has_topleft is 0, l7 weights the last sample by 3 */
 107.558 +
 107.559 +#define PT(x) \
 107.560 +    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; /* (1,2,1)-filtered top neighbour of column x */
 107.561 +#define PREDICT_8x8_LOAD_TOP \
 107.562 +    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
 107.563 +                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
 107.564 +    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
 107.565 +    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
 107.566 +                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2 /* declares t0..t7; edge taps fall back to the nearest in-block column when has_topleft / has_topright is 0 */
 107.567 +
 107.568 +#define PTR(x) \
 107.569 +    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; /* same filter as PT, but assigns to an already declared t##x */
 107.570 +#define PREDICT_8x8_LOAD_TOPRIGHT \
 107.571 +    int t8, t9, t10, t11, t12, t13, t14, t15; \
 107.572 +    if(has_topright) { \
 107.573 +        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
 107.574 +        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
 107.575 +    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); /* without a top-right neighbour, t8..t15 all take the unfiltered sample at column 7 of the top row */
 107.576 +
 107.577 +#define PREDICT_8x8_LOAD_TOPLEFT \
 107.578 +    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2 /* filtered top-left corner sample */
 107.579 +
 107.580 +#define PREDICT_8x8_DC(v) \
 107.581 +    int y; \
 107.582 +    for( y = 0; y < 8; y++ ) { \
 107.583 +        ((uint32_t*)src)[0] = \
 107.584 +        ((uint32_t*)src)[1] = v; \
 107.585 +        src += stride; \
 107.586 +    } /* stores the 32-bit word v twice per row for 8 rows; declares y and advances src, so use once per function and last */
 107.587 +
 107.588 +static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // fills the 8x8 block with the constant byte 0x80; reads no neighbouring samples
 107.589 +	(void) has_topleft; (void) has_topright;
 107.590 +    PREDICT_8x8_DC(0x80808080);
 107.591 +}
 107.592 +
 107.593 +static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // fills the 8x8 block with the rounded mean of the filtered left samples l0..l7
 107.594 +	(void) has_topleft; (void) has_topright; // has_topleft is in fact read by PREDICT_8x8_LOAD_LEFT; only has_topright is unused
 107.595 +    PREDICT_8x8_LOAD_LEFT;
 107.596 +    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; // multiply replicates the byte into the whole word
 107.597 +    PREDICT_8x8_DC(dc);
 107.598 +}
 107.599 +
 107.600 +static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // fills the 8x8 block with the rounded mean of the filtered top samples t0..t7
 107.601 +    PREDICT_8x8_LOAD_TOP; // reads both has_topleft and has_topright
 107.602 +    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; // multiply replicates the byte into the whole word
 107.603 +    PREDICT_8x8_DC(dc);
 107.604 +}
 107.605 +
 107.606 +static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // fills the 8x8 block with the rounded mean of the 16 filtered left and top samples
 107.607 +    PREDICT_8x8_LOAD_LEFT;
 107.608 +    PREDICT_8x8_LOAD_TOP;
 107.609 +    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
 107.610 +                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; // multiply replicates the byte into the whole word
 107.611 +    PREDICT_8x8_DC(dc);
 107.612 +}
 107.613 +
 107.614 +static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // horizontal prediction: row y of the 8x8 block is filled with the filtered left sample l<y>
 107.615 +	(void) has_topleft; (void) has_topright; // has_topleft is in fact read by PREDICT_8x8_LOAD_LEFT; only has_topright is unused
 107.616 +    PREDICT_8x8_LOAD_LEFT;
 107.617 +#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
 107.618 +               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
 107.619 +    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); // each ROW stores the replicated byte into both 32-bit halves of the row
 107.620 +#undef ROW
 107.621 +}
 107.622 +
 107.623 +static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // vertical prediction: every row of the 8x8 block is the filtered top row t0..t7
 107.624 +    int y;
 107.625 +    PREDICT_8x8_LOAD_TOP;
 107.626 +    src[0] = t0; // write the first row byte by byte
 107.627 +    src[1] = t1;
 107.628 +    src[2] = t2;
 107.629 +    src[3] = t3;
 107.630 +    src[4] = t4;
 107.631 +    src[5] = t5;
 107.632 +    src[6] = t6;
 107.633 +    src[7] = t7;
 107.634 +    for( y = 1; y < 8; y++ ) // then copy row 0 into rows 1..7, eight bytes at a time
 107.635 +        *(uint64_t*)(src+y*stride) = *(uint64_t*)src; // NOTE(review): type-punned 64-bit access assumes src and stride keep rows 8-byte aligned -- confirm with callers
 107.636 +}
 107.637 +
 107.638 +static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // diagonal down-left: all samples with the same x+y share one (1,2,1)-filtered value of t0..t15
 107.639 +    PREDICT_8x8_LOAD_TOP;
 107.640 +    PREDICT_8x8_LOAD_TOPRIGHT;
 107.641 +    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; // anti-diagonal x+y == 0
 107.642 +    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
 107.643 +    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
 107.644 +    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
 107.645 +    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
 107.646 +    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
 107.647 +    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
 107.648 +    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
 107.649 +    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
 107.650 +    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
 107.651 +    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
 107.652 +    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
 107.653 +    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
 107.654 +    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
 107.655 +    SRC(7,7)= (t14 + 3*t15 + 2) >> 2; // last anti-diagonal: t15 is weighted by 3 because there is no t16
 107.656 +}
 107.657 +
 107.658 +static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // diagonal down-right: all samples with the same x-y share one (1,2,1)-filtered value of the left, top-left and top samples
 107.659 +    PREDICT_8x8_LOAD_TOP;
 107.660 +    PREDICT_8x8_LOAD_LEFT;
 107.661 +    PREDICT_8x8_LOAD_TOPLEFT;
 107.662 +    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; // bottom-left corner, fed by the left column only
 107.663 +    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
 107.664 +    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
 107.665 +    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
 107.666 +    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
 107.667 +    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
 107.668 +    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
 107.669 +    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; // main diagonal, centred on the top-left sample
 107.670 +    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
 107.671 +    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
 107.672 +    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
 107.673 +    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
 107.674 +    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
 107.675 +    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
 107.676 +    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; // top-right corner, fed by the top row only
 107.677 +}
 107.678 +
 107.679 +static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // vertical-right prediction from the filtered top (t0..t7), top-left (lt) and left (l0..l6) samples; l7 is not read
 107.680 +    PREDICT_8x8_LOAD_TOP;
 107.681 +    PREDICT_8x8_LOAD_LEFT;
 107.682 +    PREDICT_8x8_LOAD_TOPLEFT;
 107.683 +    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; // lower-left samples are fed by the left column
 107.684 +    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
 107.685 +    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; // each value is shared by samples one column right and two rows down
 107.686 +    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
 107.687 +    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
 107.688 +    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
 107.689 +    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
 107.690 +    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; // from here on, even rows use the 2-tap average and odd rows the 3-tap filter
 107.691 +    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
 107.692 +    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
 107.693 +    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
 107.694 +    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
 107.695 +    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
 107.696 +    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
 107.697 +    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
 107.698 +    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
 107.699 +    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
 107.700 +    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
 107.701 +    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
 107.702 +    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
 107.703 +    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
 107.704 +    SRC(7,0)= (t6 + t7 + 1) >> 1;
 107.705 +}
 107.706 +
 107.707 +static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // horizontal-down prediction from the filtered left (l0..l7), top-left (lt) and top (t0..t6) samples; t7 is not read
 107.708 +    PREDICT_8x8_LOAD_TOP;
 107.709 +    PREDICT_8x8_LOAD_LEFT;
 107.710 +    PREDICT_8x8_LOAD_TOPLEFT;
 107.711 +    SRC(0,7)= (l6 + l7 + 1) >> 1; // even columns use the 2-tap average, odd columns the 3-tap filter
 107.712 +    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
 107.713 +    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; // each value is shared by samples two columns right and one row down
 107.714 +    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
 107.715 +    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
 107.716 +    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
 107.717 +    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
 107.718 +    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
 107.719 +    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
 107.720 +    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
 107.721 +    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
 107.722 +    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
 107.723 +    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
 107.724 +    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
 107.725 +    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
 107.726 +    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
 107.727 +    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; // upper-right samples are fed by the top row, 3-tap filter only
 107.728 +    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
 107.729 +    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
 107.730 +    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
 107.731 +    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
 107.732 +    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
 107.733 +}
 107.734 +
 107.735 +static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // vertical-left prediction from the filtered top and top-right samples; only t0..t12 are read
 107.736 +    PREDICT_8x8_LOAD_TOP;
 107.737 +    PREDICT_8x8_LOAD_TOPRIGHT;
 107.738 +    SRC(0,0)= (t0 + t1 + 1) >> 1; // even rows use the 2-tap average
 107.739 +    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; // odd rows use the 3-tap filter
 107.740 +    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; // each value is shared by samples one column right and two rows up
 107.741 +    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
 107.742 +    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
 107.743 +    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
 107.744 +    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
 107.745 +    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
 107.746 +    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
 107.747 +    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
 107.748 +    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
 107.749 +    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
 107.750 +    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
 107.751 +    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
 107.752 +    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
 107.753 +    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
 107.754 +    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
 107.755 +    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
 107.756 +    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
 107.757 +    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
 107.758 +    SRC(7,6)= (t10 + t11 + 1) >> 1;
 107.759 +    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
 107.760 +}
 107.761 +
 107.762 +static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride){ // horizontal-up prediction from the filtered left samples l0..l7 only
 107.763 +	(void) has_topleft; (void) has_topright; // has_topleft is in fact read by PREDICT_8x8_LOAD_LEFT; only has_topright is unused
 107.764 +    PREDICT_8x8_LOAD_LEFT;
 107.765 +    SRC(0,0)= (l0 + l1 + 1) >> 1; // even columns use the 2-tap average
 107.766 +    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; // odd columns use the 3-tap filter
 107.767 +    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; // each value is shared by samples two columns right and one row up
 107.768 +    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
 107.769 +    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
 107.770 +    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
 107.771 +    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
 107.772 +    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
 107.773 +    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
 107.774 +    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
 107.775 +    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
 107.776 +    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
 107.777 +    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
 107.778 +    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; // l7 is weighted by 3 because there is no l8
 107.779 +    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
 107.780 +    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
 107.781 +    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
 107.782 +    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; // all remaining lower-right samples take l7 unchanged
 107.783 +}
 107.784 +#undef PREDICT_8x8_LOAD_LEFT
 107.785 +#undef PREDICT_8x8_LOAD_TOP
 107.786 +#undef PREDICT_8x8_LOAD_TOPLEFT
 107.787 +#undef PREDICT_8x8_LOAD_TOPRIGHT
 107.788 +#undef PREDICT_8x8_DC
 107.789 +#undef PTR
 107.790 +#undef PT
 107.791 +#undef PL
 107.792 +#undef SRC
 107.793 +
 107.794 +static void pred4x4_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ // 4x4 vertical predict-and-add: each output sample is the sample above it plus the matching entry of block
 107.795 +    int i;
 107.796 +    pix -= stride; // start on the row above the block
 107.797 +    for(i=0; i<4; i++){ // one column per iteration
 107.798 +        uint8_t v = pix[0]; // running value; uint8_t, so the sums wrap modulo 256
 107.799 +        pix[1*stride]= v += block[0];
 107.800 +        pix[2*stride]= v += block[4]; // block is read as block[row*4 + column]
 107.801 +        pix[3*stride]= v += block[8];
 107.802 +        pix[4*stride]= v +  block[12]; // last row: v is not needed afterwards, so no compound assignment
 107.803 +        pix++;
 107.804 +        block++;
 107.805 +    }
 107.806 +}
 107.807 +
 107.808 +static void pred4x4_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ // 4x4 horizontal predict-and-add: each output sample is the sample to its left plus the matching entry of block
 107.809 +    int i;
 107.810 +    for(i=0; i<4; i++){ // one row per iteration
 107.811 +        uint8_t v = pix[-1]; // start from the sample left of the row; uint8_t, so the sums wrap modulo 256
 107.812 +        pix[0]= v += block[0];
 107.813 +        pix[1]= v += block[1];
 107.814 +        pix[2]= v += block[2];
 107.815 +        pix[3]= v +  block[3]; // last column: v is not needed afterwards
 107.816 +        pix+= stride;
 107.817 +        block+= 4; // next row of the 4-wide block
 107.818 +    }
 107.819 +}
 107.820 +
 107.821 +static void pred8x8l_vertical_add_c(uint8_t *pix, const DCTELEM *block, int stride){ // 8x8 vertical predict-and-add: each output sample is the sample above it plus the matching entry of block
 107.822 +    int i;
 107.823 +    pix -= stride; // start on the row above the block
 107.824 +    for(i=0; i<8; i++){ // one column per iteration
 107.825 +        uint8_t v = pix[0]; // running value; uint8_t, so the sums wrap modulo 256
 107.826 +        pix[1*stride]= v += block[0];
 107.827 +        pix[2*stride]= v += block[8]; // block is read as block[row*8 + column]
 107.828 +        pix[3*stride]= v += block[16];
 107.829 +        pix[4*stride]= v += block[24];
 107.830 +        pix[5*stride]= v += block[32];
 107.831 +        pix[6*stride]= v += block[40];
 107.832 +        pix[7*stride]= v += block[48];
 107.833 +        pix[8*stride]= v +  block[56]; // last row: v is not needed afterwards
 107.834 +        pix++;
 107.835 +        block++;
 107.836 +    }
 107.837 +}
 107.838 +
 107.839 +static void pred8x8l_horizontal_add_c(uint8_t *pix, const DCTELEM *block, int stride){ // 8x8 horizontal predict-and-add: each output sample is the sample to its left plus the matching entry of block
 107.840 +    int i;
 107.841 +    for(i=0; i<8; i++){ // one row per iteration
 107.842 +        uint8_t v = pix[-1]; // start from the sample left of the row; uint8_t, so the sums wrap modulo 256
 107.843 +        pix[0]= v += block[0];
 107.844 +        pix[1]= v += block[1];
 107.845 +        pix[2]= v += block[2];
 107.846 +        pix[3]= v += block[3];
 107.847 +        pix[4]= v += block[4];
 107.848 +        pix[5]= v += block[5];
 107.849 +        pix[6]= v += block[6];
 107.850 +        pix[7]= v +  block[7]; // last column: v is not needed afterwards
 107.851 +        pix+= stride;
 107.852 +        block+= 8; // next row of the 8-wide block
 107.853 +    }
 107.854 +}
 107.855 +
 107.856 +static void pred16x16_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ // applies pred4x4_vertical_add_c to 16 sub-blocks
 107.857 +    int i;
 107.858 +    for(i=0; i<16; i++)
 107.859 +        pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); // sub-block i sits at pix + block_offset[i] and uses 16 entries starting at block + i*16
 107.860 +}
 107.861 +
 107.862 +static void pred16x16_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ // applies pred4x4_horizontal_add_c to 16 sub-blocks
 107.863 +    int i;
 107.864 +    for(i=0; i<16; i++)
 107.865 +        pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); // sub-block i sits at pix + block_offset[i] and uses 16 entries starting at block + i*16
 107.866 +}
 107.867 +
 107.868 +static void pred8x8_vertical_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ // applies pred4x4_vertical_add_c to 4 sub-blocks
 107.869 +    int i;
 107.870 +    for(i=0; i<4; i++)
 107.871 +        pred4x4_vertical_add_c(pix + block_offset[i], block + i*16, stride); // sub-block i sits at pix + block_offset[i] and uses 16 entries starting at block + i*16
 107.872 +}
 107.873 +
 107.874 +static void pred8x8_horizontal_add_c(uint8_t *pix, const int *block_offset, const DCTELEM *block, int stride){ // applies pred4x4_horizontal_add_c to 4 sub-blocks
 107.875 +    int i;
 107.876 +    for(i=0; i<4; i++)
 107.877 +        pred4x4_horizontal_add_c(pix + block_offset[i], block + i*16, stride); // sub-block i sits at pix + block_offset[i] and uses 16 entries starting at block + i*16
 107.878 +}
 107.879 +
 107.880 + 
 107.881 +/**
 107.882 + * Sets the intra prediction function pointers.
 107.883 + */
 107.884 +void ff_h264_pred_init(H264PredContext *h){ // fills every table of *h with the C implementations, then lets the ARM init run when HAVE_NEON is set
 107.885 +
 107.886 +    h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c; // 4x4 luma predictors
 107.887 +    h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
 107.888 +    h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
 107.889 +    h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
 107.890 +    h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
 107.891 +    h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
 107.892 +    h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
 107.893 +    h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
 107.894 +    h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
 107.895 +    h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
 107.896 +    h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
 107.897 +    h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c; // the *_RV40_NODOWN slots of pred4x4 are not assigned here
 107.898 +
 107.899 +    h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c; // 8x8 luma predictors
 107.900 +    h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
 107.901 +    h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
 107.902 +    h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
 107.903 +    h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
 107.904 +    h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
 107.905 +    h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
 107.906 +    h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
 107.907 +    h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
 107.908 +    h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
 107.909 +    h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
 107.910 +    h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
 107.911 +
 107.912 +    h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c; // pred8x8 table (presumably the chroma predictors -- confirm with callers)
 107.913 +    h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
 107.914 +    h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
 107.915 +
 107.916 +    h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
 107.917 +    h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
 107.918 +    h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
 107.919 +    h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8 ]= pred8x8_mad_cow_dc_l0t; // the four mixed DC variants built from 8x8 and 4x4 helpers
 107.920 +    h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8 ]= pred8x8_mad_cow_dc_0lt;
 107.921 +    h->pred8x8[ALZHEIMER_DC_L00_PRED8x8 ]= pred8x8_mad_cow_dc_l00;
 107.922 +    h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8 ]= pred8x8_mad_cow_dc_0l0;
 107.923 +
 107.924 +    h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
 107.925 +
 107.926 +    h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c; // 16x16 predictors share the *_PRED8x8 index constants
 107.927 +    h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
 107.928 +    h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
 107.929 +    h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
 107.930 +
 107.931 +    h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c; // NOTE(review): repeats the assignment two lines above; redundant but harmless
 107.932 +
 107.933 +    h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
 107.934 +    h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
 107.935 +    h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
 107.936 +
 107.937 +    //special lossless h/v prediction for h264
 107.938 +    h->pred4x4_add  [VERT_PRED   ]= pred4x4_vertical_add_c;
 107.939 +    h->pred4x4_add  [ HOR_PRED   ]= pred4x4_horizontal_add_c;
 107.940 +    h->pred8x8l_add [VERT_PRED   ]= pred8x8l_vertical_add_c;
 107.941 +    h->pred8x8l_add [ HOR_PRED   ]= pred8x8l_horizontal_add_c;
 107.942 +    h->pred8x8_add  [VERT_PRED8x8]= pred8x8_vertical_add_c;
 107.943 +    h->pred8x8_add  [ HOR_PRED8x8]= pred8x8_horizontal_add_c;
 107.944 +    h->pred16x16_add[VERT_PRED8x8]= pred16x16_vertical_add_c;
 107.945 +    h->pred16x16_add[ HOR_PRED8x8]= pred16x16_horizontal_add_c;
 107.946 +
 107.947 +    if (HAVE_NEON) ff_h264_pred_init_arm(h); // runs last, after every C default has been stored
 107.948 +}

   108.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   108.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred.h	Mon Aug 27 12:09:56 2012 +0200
   108.3 @@ -0,0 +1,90 @@
   108.4 +/*
   108.5 + * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   108.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   108.7 + *
   108.8 + * This file is part of FFmpeg.
   108.9 + *
  108.10 + * FFmpeg is free software; you can redistribute it and/or
  108.11 + * modify it under the terms of the GNU Lesser General Public
  108.12 + * License as published by the Free Software Foundation; either
  108.13 + * version 2.1 of the License, or (at your option) any later version.
  108.14 + *
  108.15 + * FFmpeg is distributed in the hope that it will be useful,
  108.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  108.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  108.18 + * Lesser General Public License for more details.
  108.19 + *
  108.20 + * You should have received a copy of the GNU Lesser General Public
  108.21 + * License along with FFmpeg; if not, write to the Free Software
  108.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  108.23 + */
  108.24 +
  108.25 +/**
  108.26 + * @file
  108.27 + * H.264 / AVC / MPEG4 prediction functions.
  108.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  108.29 + */
  108.30 +
  108.31 +#ifndef AVCODEC_H264PRED_H
  108.32 +#define AVCODEC_H264PRED_H
  108.33 +
  108.34 +#include "libavutil/common.h"
  108.35 +#include "dsputil.h"
  108.36 +
  108.37 +/**
  108.38 + * Prediction types
  108.39 + */
  108.40 +//@{
  108.41 +#define VERT_PRED             0
  108.42 +#define HOR_PRED              1
  108.43 +#define DC_PRED               2
  108.44 +#define DIAG_DOWN_LEFT_PRED   3
  108.45 +#define DIAG_DOWN_RIGHT_PRED  4
  108.46 +#define VERT_RIGHT_PRED       5
  108.47 +#define HOR_DOWN_PRED         6
  108.48 +#define VERT_LEFT_PRED        7
  108.49 +#define HOR_UP_PRED           8
  108.50 +
  108.51 +#define LEFT_DC_PRED          9
  108.52 +#define TOP_DC_PRED           10
  108.53 +#define DC_128_PRED           11
  108.54 +
  108.55 +#define DIAG_DOWN_LEFT_PRED_RV40_NODOWN   12
  108.56 +#define HOR_UP_PRED_RV40_NODOWN           13
  108.57 +#define VERT_LEFT_PRED_RV40_NODOWN        14
  108.58 +
  108.59 +#define DC_PRED8x8            0
  108.60 +#define HOR_PRED8x8           1
  108.61 +#define VERT_PRED8x8          2
  108.62 +#define PLANE_PRED8x8         3
  108.63 +
  108.64 +#define LEFT_DC_PRED8x8       4
  108.65 +#define TOP_DC_PRED8x8        5
  108.66 +#define DC_128_PRED8x8        6
  108.67 +
  108.68 +#define ALZHEIMER_DC_L0T_PRED8x8 7
  108.69 +#define ALZHEIMER_DC_0LT_PRED8x8 8
  108.70 +#define ALZHEIMER_DC_L00_PRED8x8 9
  108.71 +#define ALZHEIMER_DC_0L0_PRED8x8 10
  108.72 +//@}
  108.73 +
  108.74 +/**
  108.75 + * Context for storing H.264 prediction functions
  108.76 + */
  108.77 +typedef struct H264PredContext{ // tables of prediction function pointers, indexed by the prediction type constants defined above
  108.78 +    void (*pred4x4  [9+3+3])(uint8_t *src, uint8_t *topright, int stride);// sized for indices 0..14 (VERT_PRED .. VERT_LEFT_PRED_RV40_NODOWN); FIXME move to dsp?
  108.79 +    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride); // sized for indices 0..11 (VERT_PRED .. DC_128_PRED)
  108.80 +    void (*pred8x8  [4+3+4])(uint8_t *src, int stride); // sized for indices 0..10 (DC_PRED8x8 .. ALZHEIMER_DC_0L0_PRED8x8)
  108.81 +    void (*pred16x16[4+3])(uint8_t *src, int stride); // sized for indices 0..6 (DC_PRED8x8 .. DC_128_PRED8x8)
  108.82 +
  108.83 +    void (*pred4x4_add  [2])(uint8_t *pix/*align  4*/, const DCTELEM *block/*align 16*/, int stride); // indexed by VERT_PRED (0) and HOR_PRED (1)
  108.84 +    void (*pred8x8l_add [2])(uint8_t *pix/*align  8*/, const DCTELEM *block/*align 16*/, int stride); // indexed by VERT_PRED (0) and HOR_PRED (1)
  108.85 +    void (*pred8x8_add  [3])(uint8_t *pix/*align  8*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); // indexed by HOR_PRED8x8 (1) and VERT_PRED8x8 (2); slot 0 is left unset by ff_h264_pred_init
  108.86 +    void (*pred16x16_add[3])(uint8_t *pix/*align 16*/, const int *block_offset, const DCTELEM *block/*align 16*/, int stride); // indexed by HOR_PRED8x8 (1) and VERT_PRED8x8 (2); slot 0 is left unset by ff_h264_pred_init
  108.87 +}H264PredContext;
  108.88 +
  108.89 +void ff_h264_pred_init(H264PredContext *h);
  108.90 +void ff_h264_pred_init_arm(H264PredContext *h);
  108.91 +
  108.92 +
  108.93 +#endif /* AVCODEC_H264PRED_H */

   109.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   109.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.c	Mon Aug 27 12:09:56 2012 +0200
   109.3 @@ -0,0 +1,1013 @@
   109.4 +/*
   109.5 + * H.26L/H.264/AVC/JVT/14496-10/... direct mb/block decoding
   109.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   109.7 + *
   109.8 + * This file is part of FFmpeg.
   109.9 + *
  109.10 + * FFmpeg is free software; you can redistribute it and/or
  109.11 + * modify it under the terms of the GNU Lesser General Public
  109.12 + * License as published by the Free Software Foundation; either
  109.13 + * version 2.1 of the License, or (at your option) any later version.
  109.14 + *
  109.15 + * FFmpeg is distributed in the hope that it will be useful,
  109.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  109.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  109.18 + * Lesser General Public License for more details.
  109.19 + *
  109.20 + * You should have received a copy of the GNU Lesser General Public
  109.21 + * License along with FFmpeg; if not, write to the Free Software
  109.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  109.23 + */
  109.24 +
  109.25 +/**
  109.26 + * @file
  109.27 + * H.264 / AVC / MPEG4 part10 direct mb/block decoding.
  109.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  109.29 + */
  109.30 +
  109.31 +#include "dsputil.h"
  109.32 +#include "avcodec.h"
  109.33 +#include "h264_data.h"
  109.34 +#include "h264.h"
  109.35 +#include "rectangle.h"
  109.36 +
  109.37 +//#undef NDEBUG
  109.38 +#include <assert.h>
  109.39 +
  109.40 +static const uint8_t left_block_options[4][16]={ // four alternative 16-entry index tables; fill_decode_caches_rec below always selects row 0
  109.41 +    {0,1,2,3,7,10,8,11,7+0*8, 7+1*8, 7+2*8, 7+3*8, 2+0*8, 2+3*8, 2+1*8, 2+2*8},
  109.42 +    {2,2,3,3,8,11,8,11,7+2*8, 7+2*8, 7+3*8, 7+3*8, 2+1*8, 2+2*8, 2+1*8, 2+2*8}, // NOTE(review): rows 1..3 are not referenced in the visible code -- presumably kept from the MBAFF/field-decoding variants; confirm
  109.43 +    {0,0,1,1,7,10,7,10,7+0*8, 7+0*8, 7+1*8, 7+1*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8},
  109.44 +    {0,2,0,2,7,10,7,10,7+0*8, 7+2*8, 7+0*8, 7+2*8, 2+0*8, 2+3*8, 2+0*8, 2+3*8}
  109.45 +};
  109.46 +
  109.47 +
  109.48 +// static void check_cache_copy(MBRecContext *mrc, H264Slice *s, H264Mb *m){
  109.49 +//     for (int list=0; list<2; list++){
  109.50 +//         for (int i=0; i<40; i++){
  109.51 +//             assert (m->ref_cache[list][i] == m->ref_cache_copy[list][i]);
  109.52 +//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy[list][i][0]);
  109.53 +//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy[list][i][1]);
  109.54 +//         }
  109.55 +//     }
  109.56 +// }
  109.57 +
  109.58 +// static void check_cache_copy2(MBRecContext *mrc, H264Slice *s, H264Mb *m){
  109.59 +//     for (int list=0; list<2; list++){
  109.60 +//         for (int i=0; i<40; i++){
  109.61 +//             assert (m->ref_cache[list][i] == m->ref_cache_copy2[list][i]);
  109.62 +//             assert (mrs->mv_cache[list][i][0] == mrs->mv_cache_copy2[list][i][0]);
  109.63 +//             assert (mrs->mv_cache[list][i][1] == mrs->mv_cache_copy2[list][i][1]);
  109.64 +//         }
  109.65 +//     }
  109.66 +// }
  109.67 +
  109.68 +static void fill_decode_caches_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
  109.69 +    int topleft_type, top_type, topright_type, left_type;
  109.70 +    const uint8_t * left_block= left_block_options[0];
  109.71 +    const int mb_x = m->mb_x;
  109.72 +    int i;
  109.73 +
  109.74 +    mrs->top_type  = mrs->mb_type_top[mb_x  ];
  109.75 +    mrs->left_type = mrs->mb_type    [mb_x-1];
  109.76 +
  109.77 +    topleft_type = mrs->mb_type_top[mb_x-1];
  109.78 +    top_type     = mrs->mb_type_top[mb_x  ];
  109.79 +    topright_type= mrs->mb_type_top[mb_x+1];
  109.80 +    left_type    = mrs->mb_type    [mb_x-1];
  109.81 +
  109.82 +    int type_mask= s->pps.constrained_intra_pred ? 1 : -1;
  109.83 +
  109.84 +    if(!IS_SKIP(mb_type)){
  109.85 +//         memset(mrc->non_zero_count_cache, 0, sizeof(mrc->non_zero_count_cache));
  109.86 +        AV_COPY32(&mrs->non_zero_count_cache[4+8*1], &m->non_zero_count[ 0]);
  109.87 +        AV_COPY32(&mrs->non_zero_count_cache[4+8*2], &m->non_zero_count[ 4]);
  109.88 +        AV_COPY32(&mrs->non_zero_count_cache[4+8*3], &m->non_zero_count[ 8]);
  109.89 +        AV_COPY32(&mrs->non_zero_count_cache[4+8*4], &m->non_zero_count[12]);
  109.90 +
  109.91 +        for (int i=0; i<2; i++) {
  109.92 +            mrs->non_zero_count_cache[8*1 + 8*i + 1] = m->non_zero_count[16 + i*2   ];
  109.93 +            mrs->non_zero_count_cache[8*1 + 8*i + 2] = m->non_zero_count[16 + i*2 +1];
  109.94 +            mrs->non_zero_count_cache[8*4 + 8*i + 1] = m->non_zero_count[20 + i*2   ];
  109.95 +            mrs->non_zero_count_cache[8*4 + 8*i + 2] = m->non_zero_count[20 + i*2 +1];
  109.96 +        }
  109.97 +
  109.98 +        if(IS_INTRA(mb_type)){
  109.99 +//             memset(mrc->intra4x4_pred_mode_cache, 0, sizeof(mrc->intra4x4_pred_mode_cache));
 109.100 +
 109.101 +            mrs->topleft_samples_available=
 109.102 +            mrs->top_samples_available=
 109.103 +            mrs->left_samples_available= 0xFFFF;
 109.104 +            mrs->topright_samples_available= 0xEEEA;
 109.105 +
 109.106 +            if(!(top_type & type_mask)){
 109.107 +                mrs->topleft_samples_available= 0xB3FF;
 109.108 +                mrs->top_samples_available= 0x33FF;
 109.109 +                mrs->topright_samples_available= 0x26EA;
 109.110 +            }
 109.111 +
 109.112 +            if(!(left_type & type_mask)){
 109.113 +                mrs->topleft_samples_available&= 0xDF5F;
 109.114 +                mrs->left_samples_available&= 0x5F5F;
 109.115 +            }
 109.116 +
 109.117 +            if(!(topleft_type & type_mask))
 109.118 +                mrs->topleft_samples_available&= 0x7FFF;
 109.119 +
 109.120 +            if(!(topright_type & type_mask))
 109.121 +                mrs->topright_samples_available&= 0xFBFF;
 109.122 +
 109.123 +            if(IS_INTRA4x4(mb_type)){
 109.124 +                if(IS_INTRA4x4(top_type)){
 109.125 +                    AV_COPY32(mrs->intra4x4_pred_mode_cache+4+8*0, &mrs->intra4x4_pred_mode_top[4*mb_x]);
 109.126 +                }else{
 109.127 +                    mrs->intra4x4_pred_mode_cache[4+8*0]=
 109.128 +                    mrs->intra4x4_pred_mode_cache[5+8*0]=
 109.129 +                    mrs->intra4x4_pred_mode_cache[6+8*0]=
 109.130 +                    mrs->intra4x4_pred_mode_cache[7+8*0]= 2 - 3*!(top_type & type_mask);
 109.131 +                }
 109.132 +
 109.133 +                if(IS_INTRA4x4(left_type)){
 109.134 +#if OMPSS
 109.135 +                    mrs->intra4x4_pred_mode_cache[3+8*1]= m->intra4x4_pred_mode_left[0];
 109.136 +                    mrs->intra4x4_pred_mode_cache[3+8*2]= m->intra4x4_pred_mode_left[1];
 109.137 +                    mrs->intra4x4_pred_mode_cache[3+8*3]= m->intra4x4_pred_mode_left[2];
 109.138 +                    mrs->intra4x4_pred_mode_cache[3+8*4]= m->intra4x4_pred_mode_left[3];
 109.139 +#else
 109.140 +                    mrs->intra4x4_pred_mode_cache[3+8*1]= mrs->intra4x4_pred_mode_left[0];
 109.141 +                    mrs->intra4x4_pred_mode_cache[3+8*2]= mrs->intra4x4_pred_mode_left[1];
 109.142 +                    mrs->intra4x4_pred_mode_cache[3+8*3]= mrs->intra4x4_pred_mode_left[2];
 109.143 +                    mrs->intra4x4_pred_mode_cache[3+8*4]= mrs->intra4x4_pred_mode_left[3];
 109.144 +#endif
 109.145 +                }else{
 109.146 +                    mrs->intra4x4_pred_mode_cache[3+8*1]=
 109.147 +                    mrs->intra4x4_pred_mode_cache[3+8*2]=
 109.148 +                    mrs->intra4x4_pred_mode_cache[3+8*3]=
 109.149 +                    mrs->intra4x4_pred_mode_cache[3+8*4]= 2 - 3*!(left_type & type_mask);
 109.150 +                }
 109.151 +            }
 109.152 +        }
 109.153 +    }
 109.154 +
 109.155 +    if(IS_INTER(mb_type) ||(IS_DIRECT(mb_type) && s->direct_spatial_mv_pred)){
 109.156 +        int list;
 109.157 +
 109.158 +//         memset(mrs->mv_cache, 0, sizeof(mrs->mv_cache));
 109.159 +//         memset(mrs->ref_cache, 0, sizeof(mrs->ref_cache));
 109.160 +
 109.161 +        mrs->ref_cache[0][scan8[5 ]+1] = mrs->ref_cache[0][scan8[7 ]+1] = mrs->ref_cache[0][scan8[13]+1] =
 109.162 +        mrs->ref_cache[1][scan8[5 ]+1] = mrs->ref_cache[1][scan8[7 ]+1] = mrs->ref_cache[1][scan8[13]+1] = PART_NOT_AVAILABLE;
 109.163 +
 109.164 +        for(list=0; list<s->list_count; list++){
 109.165 +            if(!USES_LIST(mb_type, list)){
 109.166 +                continue;
 109.167 +            }
 109.168 +            assert(!(IS_DIRECT(mb_type) && !s->direct_spatial_mv_pred));
 109.169 +
 109.170 +            if(USES_LIST(top_type, list)){
 109.171 +                const int b_xy= 4*mb_x + 3*mrc->b_stride;
 109.172 +                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
 109.173 +                    mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
 109.174 +                    mrs->ref_cache[list][scan8[0] + 1 - 1*8]= mrs->ref_index_top[list][4*mb_x + 2];
 109.175 +                    mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
 109.176 +                    mrs->ref_cache[list][scan8[0] + 3 - 1*8]= mrs->ref_index_top[list][4*mb_x + 3];
 109.177 +            }else{
 109.178 +                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
 109.179 +                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101);
 109.180 +            }
 109.181 +
 109.182 +            if(mb_type & (MB_TYPE_16x8|MB_TYPE_8x8)){
 109.183 +                for(i=0; i<2; i++){
 109.184 +                    int cache_idx = scan8[0] - 1 + i*2*8;
 109.185 +                    if(USES_LIST(left_type, list)){
 109.186 +                        const int b_xy= 4*(mb_x-1) + 3;
 109.187 +                        const int b8_x= 4*(mb_x-1) + 1;
 109.188 +                        AV_COPY32(mrs->mv_cache[list][cache_idx  ], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[0+i*2]]);
 109.189 +                        AV_COPY32(mrs->mv_cache[list][cache_idx+8], mrs->motion_val[list][b_xy + mrc->b_stride*left_block[1+i*2]]);
 109.190 +                        mrs->ref_cache[list][cache_idx  ]= mrs->ref_index[list][b8_x + (left_block[0+i*2]&~1)];
 109.191 +                        mrs->ref_cache[list][cache_idx+8]= mrs->ref_index[list][b8_x + (left_block[1+i*2]&~1)];
 109.192 +                    }else{
 109.193 +                        AV_ZERO32(mrs->mv_cache [list][cache_idx  ]);
 109.194 +                        AV_ZERO32(mrs->mv_cache [list][cache_idx+8]);
 109.195 +                        mrs->ref_cache[list][cache_idx  ]=
 109.196 +                        mrs->ref_cache[list][cache_idx+8]= (left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE);
 109.197 +                    }
 109.198 +                }
 109.199 +            }else{
 109.200 +                if(USES_LIST(left_type, list)){
 109.201 +                    const int b_x = 4*(mb_x-1) + 3;
 109.202 +                    const int b8_x= 4*(mb_x-1) + 1;
 109.203 +                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1], mrs->motion_val[list][b_x + mrc->b_stride*left_block[0]]);
 109.204 +                    mrs->ref_cache[list][scan8[0] - 1]= mrs->ref_index[list][b8_x + (left_block[0]&~1)];
 109.205 +                }else{
 109.206 +                    AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1]);
 109.207 +                    mrs->ref_cache[list][scan8[0] - 1]= left_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 109.208 +                }
 109.209 +            }
 109.210 +
 109.211 +            if(USES_LIST(topright_type, list)){
 109.212 +                const int b_xy= 4*(mb_x+1) + 3*mrc->b_stride;
 109.213 +                AV_COPY32(mrs->mv_cache[list][scan8[0] + 4 - 1*8], mrs->motion_val_top[list][b_xy]);
 109.214 +                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= mrs->ref_index_top[list][4*(mb_x+1) + 2];
 109.215 +            }else{
 109.216 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] + 4 - 1*8]);
 109.217 +                mrs->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 109.218 +            }
 109.219 +            if(mrs->ref_cache[list][scan8[0] + 4 - 1*8] < 0){
 109.220 +                int topleft_partition= -1;
 109.221 +                if(USES_LIST(topleft_type, list)){
 109.222 +                    const int b_xy = 4*(mb_x-1) + 3 + mrc->b_stride + (topleft_partition & 2*mrc->b_stride);
 109.223 +                    const int b8_x= 4*(mb_x-1) + 1 + (topleft_partition & 2);
 109.224 +                    AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 - 1*8], mrs->motion_val_top[list][b_xy]);
 109.225 +                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= mrs->ref_index_top[list][b8_x];
 109.226 +                }else{
 109.227 +                    AV_ZERO32(mrs->mv_cache[list][scan8[0] - 1 - 1*8]);
 109.228 +                    mrs->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 109.229 +                }
 109.230 +            }
 109.231 +
 109.232 +            if((mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2)))
 109.233 +                continue;
 109.234 +
 109.235 +            if(!(mb_type&(MB_TYPE_SKIP|MB_TYPE_DIRECT2))) {
 109.236 +                mrs->ref_cache[list][scan8[4 ]] =
 109.237 +                mrs->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 109.238 +                AV_ZERO32(mrs->mv_cache [list][scan8[4 ]]);
 109.239 +                AV_ZERO32(mrs->mv_cache [list][scan8[12]]);
 109.240 +            }
 109.241 +        }
 109.242 +    }
 109.243 +}
 109.244 +/**
 109.245 + * Writes the current macroblock's motion data back from the per-macroblock caches:
 109.246 + * four rows of vectors from mv_cache into mrs->motion_val and four entries of ref_cache into mrs->ref_index.
 109.247 + */
 109.248 +static inline void write_back_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
 109.249 +    const int row_stride = mrc->b_stride;
 109.250 +    const int first_col  = 4*m->mb_x; // offset of this macroblock, used for both motion_val and ref_index
 109.251 +    int list, row;
 109.252 +
 109.253 +    // list 0 not used by this macroblock: mark its ref_index entries as LIST_NOT_USED
 109.254 +    if(!USES_LIST(mb_type, 0)){
 109.255 +        fill_rectangle(&mrs->ref_index[0][first_col], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
 109.256 +    }
 109.257 +
 109.258 +    for(list=0; list<s->list_count; list++){
 109.259 +        if(USES_LIST(mb_type, list)){
 109.260 +            int16_t (*const dst_mv)[2] = &mrs->motion_val[list][first_col];
 109.261 +            int16_t (*const src_mv)[2] = &mrs->mv_cache[list][scan8[0]];
 109.262 +            int8_t *const dst_ref      = &mrs->ref_index[list][first_col];
 109.263 +
 109.264 +            // destination rows are row_stride entries apart, cache rows 8 entries apart
 109.265 +            for(row=0; row<4; row++){
 109.266 +                AV_COPY128(dst_mv + row*row_stride, src_mv + 8*row);
 109.267 +            }
 109.268 +
 109.269 +            // reference indices are taken from cache positions scan8[0], [4], [8] and [12]
 109.270 +            dst_ref[0]= mrs->ref_cache[list][scan8[0]];
 109.271 +            dst_ref[1]= mrs->ref_cache[list][scan8[4]];
 109.272 +            dst_ref[2]= mrs->ref_cache[list][scan8[8]];
 109.273 +            dst_ref[3]= mrs->ref_cache[list][scan8[12]];
 109.274 +        }
 109.275 +    }
 109.276 +}
 109.277 +
 109.278 +
 109.279 +/**
 109.280 + * Checks the intra4x4 modes cached for the top row and left column of the macroblock against the neighbouring samples that are available; where the lookup table offers a replacement mode, the cache entry is overwritten with it.
 109.281 + * @return 0 if every checked mode is usable, -1 (after logging an error) if one needs unavailable samples
 109.282 + */
 109.283 +static int check_intra4x4_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){
 109.284 +    static const int8_t no_top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0}; // per mode: <0 unusable, 0 keep, >0 replacement
 109.285 +    static const int8_t no_left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 109.286 +    static const int left_bit[4]={0x8000,0x2000,0x80,0x20}; // availability bit tested for each row of the left column
 109.287 +
 109.288 +    if(!(mrs->top_samples_available&0x8000)){
 109.289 +        for(int k=0; k<4; k++){
 109.290 +            const int pos= scan8[0] + k;
 109.291 +            const int verdict= no_top[ mrs->intra4x4_pred_mode_cache[pos] ];
 109.292 +            if(verdict<0){
 109.293 +                av_log(AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", verdict, m->mb_x, m->mb_y);
 109.294 +                return -1;
 109.295 +            }
 109.296 +            if(verdict) mrs->intra4x4_pred_mode_cache[pos]= verdict;
 109.297 +        }
 109.298 +    }
 109.299 +
 109.300 +    if((mrs->left_samples_available&0x8888)!=0x8888){
 109.301 +        for(int k=0; k<4; k++){
 109.302 +            const int pos= scan8[0] + 8*k;
 109.303 +            if(mrs->left_samples_available&left_bit[k])
 109.304 +                continue; // this row's left samples are present, nothing to check
 109.305 +            const int verdict= no_left[ mrs->intra4x4_pred_mode_cache[pos] ];
 109.306 +            if(verdict<0){
 109.307 +                av_log(AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", verdict, m->mb_x, m->mb_y);
 109.308 +                return -1;
 109.309 +            }
 109.310 +            if(verdict) mrs->intra4x4_pred_mode_cache[pos]= verdict;
 109.311 +        }
 109.312 +    }
 109.313 +    return 0;
 109.314 +}
 109.315 +
 109.316 +/**
 109.317 + * Checks that the samples needed by an intra prediction mode are available and, where the tables provide one, substitutes a DC variant that only uses the available side.
 109.318 + * @return the mode to use (>= 0), or -1 after logging when mode is outside 0..6 or depends on unavailable samples
 109.319 + */
 109.320 +static int check_intra_pred_mode(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mode){
 109.321 +    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 109.322 +    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 109.323 +    if((unsigned)mode > 6U) { // unsigned compare also rejects negative modes, which would otherwise index the tables out of bounds
 109.324 +        av_log(AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", m->mb_x, m->mb_y);
 109.325 +        return -1;
 109.326 +    }
 109.327 +
 109.328 +    if(!(mrs->top_samples_available&0x8000)){
 109.329 +        mode= top[ mode ];
 109.330 +        if(mode<0){
 109.331 +            av_log(AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y);
 109.332 +            return -1;
 109.333 +        }
 109.334 +    }
 109.335 +
 109.336 +    if((mrs->left_samples_available&0x8080) != 0x8080){
 109.337 +        mode= left[ mode ];
 109.338 +        if(mrs->left_samples_available&0x8080){ // exactly one of the two tested left bits is set (original note: MBAFF + constrained_intra_pred)
 109.339 +            mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(mrs->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 109.340 +        }
 109.341 +        if(mode<0){
 109.342 +            av_log(AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", m->mb_x, m->mb_y);
 109.343 +            return -1;
 109.344 +        }
 109.345 +    }
 109.346 +    return mode;
 109.347 +}
 109.348 +
 109.349 +/**
 109.350 + * Predicts the intra4x4 mode of block n as the smaller of the modes cached for its left and top neighbours; DC_PRED is returned when that minimum is negative.
 109.351 + */
 109.352 +static inline int pred_intra_mode(MBRecContext *mrc, MBRecState *mrs, int n){
 109.353 +    const int pos= scan8[n];
 109.354 +    const int mode_left= mrs->intra4x4_pred_mode_cache[pos - 1];
 109.355 +    const int mode_top = mrs->intra4x4_pred_mode_cache[pos - 8];
 109.356 +    const int smaller= FFMIN(mode_left, mode_top);
 109.357 +
 109.358 +    // a negative entry wins the minimum and forces the DC fallback
 109.359 +    return smaller < 0 ? DC_PRED : smaller;
 109.360 +}
 109.361 +/* Saves the bottom row of the intra4x4 mode cache into mrs->intra4x4_pred_mode and the right column into the intra4x4_pred_mode_left array that the cache fill reads for the left neighbour. */
 109.362 +static void write_back_intra_pred_mode_rec(MBRecContext *mrc, MBRecState *mrs, H264Mb *m, int mb_x){
 109.363 +    int8_t *left_dst;
 109.364 +    int row;
 109.365 +
 109.366 +    // bottom row of the cache, copied with a single 32-bit store
 109.367 +    AV_COPY32(&mrs->intra4x4_pred_mode[4*mb_x], mrs->intra4x4_pred_mode_cache + 4 + 8*4);
 109.368 +
 109.369 +#if OMPSS
 109.370 +    // the right column goes into the following H264Mb; skipped when m->mb_x is the last column
 109.371 +    if (m->mb_x >= mrc->mb_width-1)
 109.372 +        return;
 109.373 +    left_dst = (m+1)->intra4x4_pred_mode_left;
 109.374 +#else
 109.375 +    left_dst = mrs->intra4x4_pred_mode_left;
 109.376 +#endif
 109.377 +
 109.378 +    // right column of the cache, rows 1..4
 109.379 +    for(row=1; row<=4; row++){
 109.380 +        left_dst[row-1]= mrs->intra4x4_pred_mode_cache[7 + 8*row];
 109.381 +    }
 109.382 +}
 109.383 +/* Spatial direct prediction: derives ref[]/mv[] for both lists from the left, top and diagonal neighbours in ref_cache/mv_cache, then fills mrs->ref_cache and mrs->mv_cache for the macroblock (or only its direct 8x8 sub-blocks), zeroing vectors where the list1_* data (presumably the co-located block -- confirm) has small motion; updates *mb_type and m->sub_mb_type. */
 109.384 +static void pred_spatial_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
 109.385 +    int b4_stride = mrc->b_stride;
 109.386 +    const int mb_x = m->mb_x;
 109.387 +    int mb_type_col[2];
 109.388 +    const int16_t (*l1mv0)[2], (*l1mv1)[2];
 109.389 +    const int8_t *l1ref0, *l1ref1;
 109.390 +    const int is_b8x8 = IS_8X8(*mb_type);
 109.391 +    unsigned int sub_mb_type= MB_TYPE_L0L1;
 109.392 +    int i8, i4;
 109.393 +    int ref[2];
 109.394 +    int mv[2];
 109.395 +    int list;
 109.396 +
 109.397 +    //assert(h->ref_list[1][0].reference&3);
 109.398 +    // NOTE: the macro below is never #undef'd; pred_temp_direct_motion_rec further down uses it as well
 109.399 +#define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 109.400 +
 109.401 +    /* ref = min(neighbors) */
 109.402 +    for(list=0; list<2; list++){
 109.403 +        int left_ref = mrs->ref_cache[list][scan8[0] - 1];
 109.404 +        int top_ref  = mrs->ref_cache[list][scan8[0] - 8];
 109.405 +        int refc = mrs->ref_cache[list][scan8[0] - 8 + 4];
 109.406 +        const int16_t *C= mrs->mv_cache[list][ scan8[0] - 8 + 4];
 109.407 +        if(refc == PART_NOT_AVAILABLE){ // top-right entry missing: use the top-left entry instead
 109.408 +            refc = mrs->ref_cache[list][scan8[0] - 8 - 1];
 109.409 +            C    = mrs->mv_cache[list][scan8[0] - 8 - 1];
 109.410 +        }
 109.411 +        ref[list] = FFMIN3((unsigned)left_ref, (unsigned)top_ref, (unsigned)refc); // compared as unsigned, so a negative ref never beats a non-negative one; the int result is negative only if all three are
 109.412 +        if(ref[list] >= 0){
 109.413 +            //this is just pred_motion() but with the cases removed that cannot happen for direct blocks
 109.414 +            const int16_t * const A= mrs->mv_cache[list][ scan8[0] - 1 ];
 109.415 +            const int16_t * const B= mrs->mv_cache[list][ scan8[0] - 8 ];
 109.416 +
 109.417 +            int match_count= (left_ref==ref[list]) + (top_ref==ref[list]) + (refc==ref[list]);
 109.418 +            if(match_count > 1){ //most common
 109.419 +                mv[list]= pack16to32(mid_pred(A[0], B[0], C[0]),
 109.420 +                                     mid_pred(A[1], B[1], C[1]) );
 109.421 +            }else {
 109.422 +                assert(match_count==1);
 109.423 +                if(left_ref==ref[list]){
 109.424 +                    mv[list]= AV_RN32A(A);
 109.425 +                }else if(top_ref==ref[list]){
 109.426 +                    mv[list]= AV_RN32A(B);
 109.427 +                }else{
 109.428 +                    mv[list]= AV_RN32A(C);
 109.429 +                }
 109.430 +            }
 109.431 +        }else{
 109.432 +            int mask= ~(MB_TYPE_L0 << (2*list)); // clears the MB_TYPE_L0 bit shifted by 2*list from the type words below
 109.433 +            mv[list] = 0;
 109.434 +            ref[list] = -1;
 109.435 +            if(!is_b8x8)
 109.436 +                *mb_type &= mask;
 109.437 +            sub_mb_type &= mask;
 109.438 +        }
 109.439 +    }
 109.440 +
 109.441 +    if(ref[0] < 0 && ref[1] < 0){ // neither list has a usable neighbour reference: fall back to ref 0 in both lists
 109.442 +        ref[0] = ref[1] = 0;
 109.443 +        if(!is_b8x8)
 109.444 +            *mb_type |= MB_TYPE_L0L1;
 109.445 +        sub_mb_type |= MB_TYPE_L0L1;
 109.446 +    }
 109.447 +
 109.448 +    if(!(is_b8x8|mv[0]|mv[1])){ // not an 8x8 macroblock and both predicted vectors are zero: fill the caches and return early
 109.449 +        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
 109.450 +        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
 109.451 +        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
 109.452 +        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
 109.453 +        *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
 109.454 +        return;
 109.455 +    }
 109.456 +
 109.457 +    mb_type_col[0] =
 109.458 +    mb_type_col[1] = mrs->list1_mb_type[mb_x];
 109.459 +
 109.460 +    sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 109.461 +    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
 109.462 +        *mb_type   |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_16x16 */
 109.463 +    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
 109.464 +        *mb_type   |= MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
 109.465 +    }else{
 109.466 +        if(!s->direct_8x8_inference_flag){
 109.467 +            /* FIXME save sub mb types from previous frames (or derive from MVs)
 109.468 +            * so we know exactly what block size to use */
 109.469 +            sub_mb_type += (MB_TYPE_8x8-MB_TYPE_16x16); /* B_SUB_4x4 */
 109.470 +        }
 109.471 +        *mb_type   |= MB_TYPE_8x8;
 109.472 +    }
 109.473 +
 109.474 +    l1mv0  = (void *) &mrs->list1_motion_val[0][4*mb_x];
 109.475 +    l1mv1  = (void *) &mrs->list1_motion_val[1][4*mb_x];
 109.476 +    l1ref0 = &mrs->list1_ref_index [0][4*mb_x];
 109.477 +    l1ref1 = &mrs->list1_ref_index [1][4*mb_x];
 109.478 +//     if(!b8_stride){
 109.479 +//         if(m->mb_y&1){
 109.480 +//             l1ref0 += 2;
 109.481 +//             l1ref1 += 2;
 109.482 +//             l1mv0  +=  2*b4_stride;
 109.483 +//             l1mv1  +=  2*b4_stride;
 109.484 +//         }
 109.485 +//     }
 109.486 +
 109.487 +    if(IS_16X16(*mb_type)){
 109.488 +        int a,b;
 109.489 +
 109.490 +        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
 109.491 +        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
 109.492 +        if(!IS_INTRA(mb_type_col[0]) && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
 109.493 +            || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
 109.494 +            ))){
 109.495 +            a=b=0; // list1 block refers to index 0 with both components within +-1: a list keeps its predicted vector only when its ref is > 0
 109.496 +            if(ref[0] > 0)
 109.497 +                a= mv[0];
 109.498 +            if(ref[1] > 0)
 109.499 +                b= mv[1];
 109.500 +        }else{
 109.501 +            a= mv[0];
 109.502 +            b= mv[1];
 109.503 +        }
 109.504 +        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
 109.505 +        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
 109.506 +    }else{
 109.507 +        int n=0; // number of 4x4 blocks whose list1 vector passed the +-1 test
 109.508 +        for(i8=0; i8<4; i8++){
 109.509 +            const int x8 = i8&1;
 109.510 +            const int y8 = i8>>1;
 109.511 +
 109.512 +            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
 109.513 +                continue;
 109.514 +            m->sub_mb_type[i8] = sub_mb_type;
 109.515 +
 109.516 +            fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, mv[0], 4);
 109.517 +            fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, mv[1], 4);
 109.518 +            fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
 109.519 +            fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
 109.520 +
 109.521 +            /* col_zero_flag */
 109.522 +            if(!IS_INTRA(mb_type_col[0]) && (l1ref0[i8] == 0 || (l1ref0[i8] < 0 && l1ref1[i8] == 0 ))
 109.523 +                ){
 109.524 +                const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
 109.525 +                if(IS_SUB_8X8(sub_mb_type)){
 109.526 +                    const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; // one vector per 8x8 block, read at offset 0 or 3 in each direction
 109.527 +                    if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
 109.528 +                        if(ref[0] == 0)
 109.529 +                            fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
 109.530 +                        if(ref[1] == 0)
 109.531 +                            fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
 109.532 +                        n+=4;
 109.533 +                    }
 109.534 +                }else{
 109.535 +                    int k=0;
 109.536 +                    for(i4=0; i4<4; i4++){
 109.537 +                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
 109.538 +                        if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
 109.539 +                            if(ref[0] == 0)
 109.540 +                                AV_ZERO32(mrs->mv_cache[0][scan8[i8*4+i4]]);
 109.541 +                            if(ref[1] == 0)
 109.542 +                                AV_ZERO32(mrs->mv_cache[1][scan8[i8*4+i4]]);
 109.543 +                            k++;
 109.544 +                        }
 109.545 +                    }
 109.546 +                    if(!(k&3)) // k is 0 or 4, i.e. all four 4x4 blocks were handled alike
 109.547 +                        m->sub_mb_type[i8]+= MB_TYPE_16x16 - MB_TYPE_8x8;
 109.548 +                    n+=k;
 109.549 +                }
 109.550 +            }
 109.551 +        }
 109.552 +        if(!is_b8x8 && !(n&15)){ // n is 0 or 16: every 4x4 block got the same treatment, so the macroblock is re-tagged as 16x16 direct
 109.553 +            *mb_type= (*mb_type & ~(MB_TYPE_8x8|MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_P1L0|MB_TYPE_P1L1))|MB_TYPE_16x16|MB_TYPE_DIRECT2;
 109.554 +        }
 109.555 +    }
 109.556 +}
 109.557 +/* Temporal direct prediction: for the macroblock (or only its direct 8x8 sub-blocks) maps the list1_* reference (presumably of the co-located block -- confirm) through s->map_col_to_list0, scales the stored vector by s->dist_scale_factor for list 0 and uses the difference for list 1; fills mrs->ref_cache and mrs->mv_cache and updates *mb_type and m->sub_mb_type. */
 109.558 +static void pred_temp_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
 109.559 +    const int mb_x = m->mb_x;
 109.560 +    int b4_stride = mrc->b_stride;
 109.561 +    int mb_type_col[2];
 109.562 +    const int16_t (*l1mv0)[2], (*l1mv1)[2];
 109.563 +    const int8_t *l1ref0, *l1ref1;
 109.564 +    const int is_b8x8 = IS_8X8(*mb_type);
 109.565 +    unsigned int sub_mb_type;
 109.566 +    int i8, i4;
 109.567 +    const int *map_col_to_list0[2] = {s->map_col_to_list0[0], s->map_col_to_list0[1]};
 109.568 +    const int *dist_scale_factor = s->dist_scale_factor;
 109.569 +
 109.570 +    mb_type_col[0] =
 109.571 +    mb_type_col[1] = mrs->list1_mb_type[mb_x];
 109.572 +
 109.573 +    sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 109.574 +    if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){ // macro is #defined inside pred_spatial_direct_motion_rec above
 109.575 +        *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 109.576 +    }else if(!is_b8x8 && (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16))){
 109.577 +        *mb_type   |= MB_TYPE_L0L1|MB_TYPE_DIRECT2 | (mb_type_col[0] & (MB_TYPE_16x8|MB_TYPE_8x16));
 109.578 +    }else{
 109.579 +        if(!s->direct_8x8_inference_flag){
 109.580 +            /* FIXME save sub mb types from previous frames (or derive from MVs)
 109.581 +            * so we know exactly what block size to use */
 109.582 +            sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 109.583 +        }
 109.584 +        *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
 109.585 +    }
 109.586 +
 109.587 +    l1mv0  = (void *) &mrs->list1_motion_val[0][4*mb_x];
 109.588 +    l1mv1  = (void *) &mrs->list1_motion_val[1][4*mb_x];
 109.589 +    l1ref0 = &mrs->list1_ref_index [0][4*mb_x];
 109.590 +    l1ref1 = &mrs->list1_ref_index [1][4*mb_x];
 109.591 +
 109.592 +    /* one-to-one mv scaling */
 109.593 +    if(IS_16X16(*mb_type)){
 109.594 +        int ref, mv0, mv1;
 109.595 +
 109.596 +        fill_rectangle(&mrs->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1); // the list 1 reference is always index 0 here
 109.597 +        if(IS_INTRA(mb_type_col[0])){
 109.598 +            ref=mv0=mv1=0;
 109.599 +        }else{
 109.600 +            const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
 109.601 +            : map_col_to_list0[1][l1ref1[0]]; // second ref array is consulted only when the first holds a negative index
 109.602 +            const int scale = dist_scale_factor[ref0];
 109.603 +            const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
 109.604 +            int mv_l0[2];
 109.605 +            mv_l0[0] = (scale * mv_col[0] + 128) >> 8; // multiply by scale/256 with rounding
 109.606 +            mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
 109.607 +            ref= ref0;
 109.608 +            mv0= pack16to32(mv_l0[0],mv_l0[1]);
 109.609 +            mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]); // list 1 vector = scaled list 0 vector minus the stored vector
 109.610 +        }
 109.611 +        fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
 109.612 +        fill_rectangle(&mrs->mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
 109.613 +        fill_rectangle(&mrs->mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
 109.614 +    }else{
 109.615 +        for(i8=0; i8<4; i8++){
 109.616 +            const int x8 = i8&1;
 109.617 +            const int y8 = i8>>1;
 109.618 +            int ref0, scale;
 109.619 +            const int16_t (*l1mv)[2]= l1mv0;
 109.620 +
 109.621 +            if(is_b8x8 && !IS_DIRECT(m->sub_mb_type[i8]))
 109.622 +                continue;
 109.623 +            m->sub_mb_type[i8] = sub_mb_type;
 109.624 +            fill_rectangle(&mrs->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
 109.625 +            if(IS_INTRA(mb_type_col[0])){ // intra list1 macroblock: reference 0 and zero vectors in both lists
 109.626 +                fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
 109.627 +                fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
 109.628 +                fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
 109.629 +                continue;
 109.630 +            }
 109.631 +
 109.632 +            ref0 = l1ref0[i8];
 109.633 +            if(ref0 >= 0)
 109.634 +                ref0 = map_col_to_list0[0][ref0 ];
 109.635 +            else{
 109.636 +                ref0 = map_col_to_list0[1][l1ref1[i8]];
 109.637 +                l1mv= l1mv1; // vectors are then taken from the second motion_val array as well
 109.638 +            }
 109.639 +            scale = dist_scale_factor[ref0];
 109.640 +
 109.641 +            fill_rectangle(&mrs->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
 109.642 +            if(IS_SUB_8X8(sub_mb_type)){
 109.643 +                const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride]; // one vector per 8x8 block, read at offset 0 or 3 in each direction
 109.644 +                int mx = (scale * mv_col[0] + 128) >> 8;
 109.645 +                int my = (scale * mv_col[1] + 128) >> 8;
 109.646 +                fill_rectangle(&mrs->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
 109.647 +                fill_rectangle(&mrs->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
 109.648 +            }else
 109.649 +            for(i4=0; i4<4; i4++){ // body of the else above: each 4x4 block is scaled individually
 109.650 +                const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
 109.651 +                int16_t *mv_l0 = mrs->mv_cache[0][scan8[i8*4+i4]];
 109.652 +                mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
 109.653 +                mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
 109.654 +                AV_WN32A(mrs->mv_cache[1][scan8[i8*4+i4]],
 109.655 +                    pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]));
 109.656 +            }
 109.657 +        }
 109.658 +    }
 109.659 +}
 109.660 +/* Dispatches direct-mode motion prediction: the spatial variant when s->direct_spatial_mv_pred is set, the temporal variant otherwise. */
 109.661 +void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type){
 109.662 +    if(!s->direct_spatial_mv_pred){
 109.663 +        pred_temp_direct_motion_rec(mrc, mrs, s, m, mb_type);
 109.664 +        return;
 109.665 +    }
 109.666 +    pred_spatial_direct_motion_rec(mrc, mrs, s, m, mb_type);
 109.667 +}
 109.668 +/* Returns the cached reference index of the diagonal neighbour of cache position i and points *C at its motion vector: the entry at i - 8 + part_width, or the one at i - 8 - 1 when the former is PART_NOT_AVAILABLE. */
 109.669 +static inline int fetch_diagonal_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, const int16_t **C, int i, int list, int part_width){
 109.670 +    const int above_right= i - 8 + part_width;
 109.671 +    const int above_left = i - 8 - 1;
 109.672 +    int pos= above_right;
 109.673 +
 109.674 +    if(mrs->ref_cache[list][above_right] == PART_NOT_AVAILABLE)
 109.675 +        pos= above_left;
 109.676 +
 109.677 +    *C= mrs->mv_cache[list][pos];
 109.678 +    return mrs->ref_cache[list][pos];
 109.679 +}
 109.680 +
 109.681 +/**
 109.682 + * gets the predicted MV from the left (A), top (B) and diagonal (C) neighbours held in the caches.
 109.683 + * @param n the block index
 109.684 + * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 109.685 + * @param list the reference list whose ref_cache/mv_cache are read
 109.686 + * @param ref the reference index that the neighbours' cached indices are compared against
 109.687 + * @param mx the x component of the predicted motion vector
 109.688 + * @param my the y component of the predicted motion vector
 109.689 + */
 109.690 +static inline void pred_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int part_width, int list, int ref, int * const mx, int * const my){
 109.691 +    const int pos= scan8[n];
 109.692 +    const int ref_above= mrs->ref_cache[list][ pos - 8 ];
 109.693 +    const int ref_left = mrs->ref_cache[list][ pos - 1 ];
 109.694 +    const int16_t * const A= mrs->mv_cache[list][ pos - 1 ];
 109.695 +    const int16_t * const B= mrs->mv_cache[list][ pos - 8 ];
 109.696 +    const int16_t * C;
 109.697 +    const int16_t * only= A; // the single vector to copy when the median is not used
 109.698 +    int ref_diag, matches, take_median;
 109.699 +
 109.700 +    assert(part_width==1 || part_width==2 || part_width==4);
 109.701 +
 109.702 +/* mv_cache
 109.703 +  B . . A T T T T
 109.704 +  U . . L . . , .
 109.705 +  U . . L . . . .
 109.706 +  U . . L . . , .
 109.707 +  . . . L . . . .
 109.708 +*/
 109.709 +
 109.710 +    ref_diag= fetch_diagonal_mv(mrc, mrs, s, &C, pos, list, part_width);
 109.711 +    matches= (ref_diag==ref) + (ref_above==ref) + (ref_left==ref);
 109.712 +
 109.713 +    if(matches > 1){ //most common
 109.714 +        take_median= 1;
 109.715 +    }else if(matches==1){
 109.716 +        // exactly one neighbour has the same reference: copy its vector (left is tested first, then top)
 109.717 +        take_median= 0;
 109.718 +        if(ref_left==ref)       only= A;
 109.719 +        else if(ref_above==ref) only= B;
 109.720 +        else                    only= C;
 109.721 +    }else{
 109.722 +        // no neighbour matches: the left vector is copied only when top and diagonal are both unavailable and left is not
 109.723 +        take_median= !(ref_above == PART_NOT_AVAILABLE && ref_diag == PART_NOT_AVAILABLE && ref_left != PART_NOT_AVAILABLE);
 109.724 +    }
 109.725 +
 109.726 +    // store the result: component-wise median of A, B, C or the single selected vector
 109.727 +    if(take_median){
 109.728 +        *mx= mid_pred(A[0], B[0], C[0]);
 109.729 +        *my= mid_pred(A[1], B[1], C[1]);
 109.730 +    }else{
 109.731 +        *mx= only[0];
 109.732 +        *my= only[1];
 109.733 +    }
 109.734 +}
 109.735 +
 109.736 +/**
 109.737 + * Gets the directionally predicted MV of one 16x8 partition.
 109.738 + *
 109.739 + * n == 0 tries the cache entry above block 0, any other n the entry left of
 109.740 + * block 8. If that entry's reference is not ref, pred_motion() is called
 109.741 + * with a partition width of 4 instead.
 109.742 + *
 109.743 + * @param mrs state whose ref_cache and mv_cache are read
 109.744 + * @param n the block index (pred_motion_mb_rec() passes 0 or 8)
 109.745 + * @param list the list whose caches are consulted
 109.746 + * @param ref the reference index the candidate has to match
 109.747 + * @param mx the x component of the predicted motion vector
 109.748 + * @param my the y component of the predicted motion vector
 109.749 + */
 109.750 +static inline void pred_16x8_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){
 109.751 +    /* cache slot of the one neighbour that is tried first */
 109.752 +    const int cand= n==0 ? scan8[0] - 8 : scan8[8] - 1;
 109.753 +
 109.754 +    if(mrs->ref_cache[list][cand] == ref){
 109.755 +        const int16_t * const mv= mrs->mv_cache[list][cand];
 109.756 +
 109.757 +        *mx= mv[0];
 109.758 +        *my= mv[1];
 109.759 +        return;
 109.760 +    }
 109.761 +
 109.762 +    //RARE
 109.763 +    /* the directional neighbour uses another reference */
 109.764 +    pred_motion(mrc, mrs, s, n, 4, list, ref, mx, my);
 109.765 +}
 109.766 +
 109.767 +/**
 109.768 + * Gets the directionally predicted MV of one 8x16 partition.
 109.769 + *
 109.770 + * For n == 0 the candidate is the cache entry left of block 0; otherwise it
 109.771 + * is whatever fetch_diagonal_mv() returns for block 4 with width 2. When the
 109.772 + * candidate's reference differs from ref, pred_motion() decides instead.
 109.773 + *
 109.774 + * @param n the block index (pred_motion_mb_rec() passes 0 or 4)
 109.775 + * @param mx the x component of the predicted motion vector
 109.776 + * @param my the y component of the predicted motion vector
 109.777 + */
 109.778 +static inline void pred_8x16_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int n, int list, int ref, int * const mx, int * const my){
 109.779 +    const int16_t *cand;
 109.780 +    int cand_ref;
 109.781 +
 109.782 +    if(n != 0)
 109.783 +        cand_ref= fetch_diagonal_mv(mrc, mrs, s, &cand, scan8[4], list, 2);
 109.784 +    else{
 109.785 +        cand_ref= mrs->ref_cache[list][ scan8[0] - 1 ];
 109.786 +        cand=     mrs->mv_cache[list][ scan8[0] - 1 ];
 109.787 +    }
 109.788 +
 109.789 +    if(cand_ref == ref){
 109.790 +        *mx= cand[0];
 109.791 +        *my= cand[1];
 109.792 +        return;
 109.793 +    }
 109.794 +
 109.795 +    //RARE
 109.796 +    pred_motion(mrc, mrs, s, n, 2, list, ref, mx, my);
 109.797 +}
 109.798 +
 109.799 +static inline void pred_pskip_motion(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb * m, int * const mx, int * const my){
 109.800 +    /* Skip vector for list 0: zero unless both neighbours exist and neither is
 109.801 +     * a zero vector on reference 0. The m argument is not used here. */
 109.802 +    const int above= scan8[0] - 8;
 109.803 +    const int left=  scan8[0] - 1;
 109.804 +    const int above_ref= mrs->ref_cache[0][above];
 109.805 +    const int left_ref=  mrs->ref_cache[0][left];
 109.806 +
 109.807 +    if(above_ref != PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE
 109.808 +       && (above_ref | AV_RN32A(mrs->mv_cache[0][above]))
 109.809 +       && ( left_ref | AV_RN32A(mrs->mv_cache[0][left ]))){
 109.810 +        pred_motion(mrc, mrs, s, 0, 4, 0, 0, mx, my);
 109.811 +        return;
 109.812 +    }
 109.813 +    *mx = *my = 0;
 109.814 +}
 109.815 +
 109.816 +#define ADD_MVD(list) /* adds m->mvd[list][mp] to mx/my and advances mp; expects m, mp, mx and my in the caller's scope */ \
 109.817 +{ \
 109.818 +    mx += m->mvd[list][mp][0]; \
 109.819 +    my += m->mvd[list][mp][1]; \
 109.820 +    mp++; /* call sites omit the trailing semicolon, so this stays a plain brace block */ \
 109.821 +}
 109.822 +
 109.823 +int pred_motion_mb_rec (MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m){ /* fills the prediction caches for macroblock m; returns 0, or -1 when an intra prediction mode check fails */
 109.824 +    int mp=0; /* index of the next m->mvd entry consumed by ADD_MVD */
 109.825 +    int mb_type = m->mb_type;
 109.826 +    const int mb_x = m->mb_x;
 109.827 +
 109.828 +//     mrc->m =m;
 109.829 +
 109.830 +    fill_decode_caches_rec(mrc, mrs, s, m, mb_type);
 109.831 +    if (IS_SKIP(mb_type)){ /* skipped MB: the type is rebuilt and the motion derived without any mvd */
 109.832 +        mb_type=0;
 109.833 +
 109.834 +        if( s->slice_type_nos == FF_B_TYPE )
 109.835 +        {
 109.836 +            mb_type|= MB_TYPE_L0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
 109.837 +            ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
 109.838 +        }
 109.839 +        else
 109.840 +        {
 109.841 +            int mx, my;
 109.842 +
 109.843 +            mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP; //FIXME check required
 109.844 +            pred_pskip_motion(mrc, mrs, s, m, &mx, &my);
 109.845 +            fill_rectangle(&mrs->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1); /* whole MB uses reference 0 of list 0 */
 109.846 +            fill_rectangle(mrs->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
 109.847 +        }
 109.848 +
 109.849 +        write_back_motion_rec(mrc, mrs, s, m, mb_type);
 109.850 +        m->mb_type = mrs->mb_type[mb_x]= mb_type;
 109.851 +        return 0;
 109.852 +    }
 109.853 +
 109.854 +
 109.855 +    if (IS_INTRA_PCM(mb_type)){ /* PCM: only the type is recorded in mrs */
 109.856 +        mrs->mb_type[mb_x] =  mb_type;
 109.857 +        return 0;
 109.858 +    }
 109.859 +    else if (IS_INTRA(mb_type)){
 109.860 +        int i, pred_mode;
 109.861 +
 109.862 +        if( IS_INTRA4x4( mb_type ) ) {
 109.863 +            if ( IS_8x8DCT(mb_type) ) {
 109.864 +                for( i = 0; i < 16; i+=4 ) { /* one mode per 8x8 block, replicated over its 2x2 cache entries */
 109.865 +                    int pred = pred_intra_mode(mrc, mrs, i );
 109.866 +                    int mode = m->intra4x4_pred_mode[i];
 109.867 +
 109.868 +                    mode = mode < 0 ?  pred : mode + ( mode >= pred ); /* negative selects the predicted mode; otherwise the stored value skips over pred */
 109.869 +                    fill_rectangle( &mrs->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
 109.870 +                }
 109.871 +            } else {
 109.872 +                for( i = 0; i < 16; i++ ) {
 109.873 +                    int pred = pred_intra_mode(mrc, mrs, i );
 109.874 +                    int mode = m->intra4x4_pred_mode[i];
 109.875 +                    mode = mode < 0 ?  pred : mode + ( mode >= pred ); /* same rule as in the 8x8 transform case */
 109.876 +                    mrs->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
 109.877 +                }
 109.878 +            }
 109.879 +            write_back_intra_pred_mode_rec(mrc, mrs, m, mb_x);
 109.880 +            if( check_intra4x4_pred_mode(mrc, mrs, s, m) < 0 ) return -1;
 109.881 +        } else {
 109.882 +            m->intra16x16_pred_mode= check_intra_pred_mode(mrc, mrs, s, m, m->intra16x16_pred_mode );
 109.883 +            if( m->intra16x16_pred_mode < 0 ) return -1;
 109.884 +        }
 109.885 +
 109.886 +        pred_mode = m->chroma_pred_mode;
 109.887 +        pred_mode= check_intra_pred_mode( mrc, mrs, s, m, pred_mode );
 109.888 +        if( pred_mode < 0 ) return -1; /* m->chroma_pred_mode is left untouched on failure */
 109.889 +        m->chroma_pred_mode= pred_mode;
 109.890 +
 109.891 +    }
 109.892 +    else if (IS_8X8(mb_type)){ /* four 8x8 sub-macroblocks, each with its own sub_mb_type */
 109.893 +        int i, j, list;
 109.894 +
 109.895 +        if( s->slice_type_nos == FF_B_TYPE ) {
 109.896 +            if( IS_DIRECT(m->sub_mb_type[0] | m->sub_mb_type[1] |
 109.897 +                            m->sub_mb_type[2] | m->sub_mb_type[3]) ) {
 109.898 +                ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
 109.899 +                mrs->ref_cache[0][scan8[4]] =
 109.900 +                mrs->ref_cache[1][scan8[4]] =
 109.901 +                mrs->ref_cache[0][scan8[12]] =
 109.902 +                mrs->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
 109.903 +            }
 109.904 +        }
 109.905 +
 109.906 +        for(list=0; list<s->list_count; list++){
 109.907 +            for(i=0; i<4; i++){
 109.908 +                if(IS_DIRECT(m->sub_mb_type[i])){
 109.909 +                    mrs->ref_cache[list][ scan8[4*i]   ]=mrs->ref_cache[list][ scan8[4*i]+1 ];
 109.910 +                    continue;
 109.911 +                } else {
 109.912 +                    mrs->ref_cache[list][ scan8[4*i]   ]=mrs->ref_cache[list][ scan8[4*i]+1 ]=
 109.913 +                    mrs->ref_cache[list][ scan8[4*i]+8 ]=mrs->ref_cache[list][ scan8[4*i]+9 ]= m->ref_index[list][i];
 109.914 +
 109.915 +                    if(IS_DIR(m->sub_mb_type[i], 0, list) ){
 109.916 +                        const int sub_mb_type= m->sub_mb_type[i];
 109.917 +                        const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
 109.918 +
 109.919 +                        int sub_partition_count = IS_SUB_8X8(sub_mb_type) ? 1 : (IS_SUB_4X4(sub_mb_type)? 4 :2);
 109.920 +                        for(j=0; j<sub_partition_count; j++){
 109.921 +                            int mx, my;
 109.922 +                            const int index= 4*i + block_width*j;
 109.923 +                            int16_t (* mv_cache)[2]= &mrs->mv_cache[list][ scan8[index]];
 109.924 +                            pred_motion(mrc, mrs, s, index, block_width, list, mrs->ref_cache[list][ scan8[index] ], &mx, &my);
 109.925 +
 109.926 +                            ADD_MVD(list)
 109.927 +
 109.928 +                            if(IS_SUB_8X8(sub_mb_type)){ /* replicate the vector over the cache entries the sub-partition covers (+1 right, +8 below, +9 diagonal) */
 109.929 +                                mv_cache[ 1 ][0]=
 109.930 +                                mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
 109.931 +                                mv_cache[ 1 ][1]=
 109.932 +                                mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
 109.933 +                            }else if(IS_SUB_8X4(sub_mb_type)){
 109.934 +                                mv_cache[ 1 ][0]= mx;
 109.935 +                                mv_cache[ 1 ][1]= my;
 109.936 +                            }else if(IS_SUB_4X8(sub_mb_type)){
 109.937 +                                mv_cache[ 8 ][0]= mx;
 109.938 +                                mv_cache[ 8 ][1]= my;
 109.939 +                            }
 109.940 +                            mv_cache[ 0 ][0]= mx;
 109.941 +                            mv_cache[ 0 ][1]= my;
 109.942 +                        }
 109.943 +                    }else{
 109.944 +                        fill_rectangle(mrs->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); /* list not used by this sub-macroblock: zero its vectors */
 109.945 +                    }
 109.946 +                }
 109.947 +            }
 109.948 +        }
 109.949 +    } else if( IS_DIRECT(mb_type) ) {
 109.950 +        mb_type &= ~MB_TYPE_16x16;  //FIXME not nice
 109.951 +        ff_h264_pred_direct_motion_rec(mrc, mrs, s, m, &mb_type);
 109.952 +    }
 109.953 +    else {
 109.954 +        int list, i;
 109.955 +        if(IS_16X16(mb_type)){
 109.956 +            for(list=0; list<s->list_count; list++){
 109.957 +                if(IS_DIR(mb_type, 0, list)){
 109.958 +                    int ref;
 109.959 +                    int mx,my;
 109.960 +
 109.961 +                    ref = m->ref_index[list][0];
 109.962 +                    fill_rectangle(&mrs->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
 109.963 +                    pred_motion(mrc, mrs, s, 0, 4, list, mrs->ref_cache[list][ scan8[0] ], &mx, &my);
 109.964 +                    ADD_MVD(list)
 109.965 +                    fill_rectangle(mrs->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
 109.966 +                }
 109.967 +            }
 109.968 +        }
 109.969 +        else if(IS_16X8(mb_type)){
 109.970 +            for(list=0; list<s->list_count; list++){
 109.971 +                for(i=0; i<2; i++){ /* i selects the partition; its cache rows start at scan8[0] + 16*i */
 109.972 +                    if(IS_DIR(mb_type, i, list)){
 109.973 +                        int ref;
 109.974 +                        int mx,my;
 109.975 +                        ref = m->ref_index[list][i];
 109.976 +                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
 109.977 +
 109.978 +                        pred_16x8_motion(mrc, mrs, s, 8*i, list, mrs->ref_cache[list][scan8[0] + 16*i], &mx, &my);
 109.979 +                        ADD_MVD(list)
 109.980 +
 109.981 +                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
 109.982 +                    }else{
 109.983 +                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
 109.984 +                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
 109.985 +                    }
 109.986 +                }
 109.987 +            }
 109.988 +
 109.989 +        }else{
 109.990 +            assert(IS_8X16(mb_type));
 109.991 +
 109.992 +            for(list=0; list<s->list_count; list++){
 109.993 +                for(i=0; i<2; i++){ /* i selects the partition; its cache columns start at scan8[0] + 2*i */
 109.994 +                    if(IS_DIR(mb_type, i, list)){ //FIXME optimize
 109.995 +                        int ref;
 109.996 +                        int mx,my;
 109.997 +                        ref = m->ref_index[list][i];
 109.998 +                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
 109.999 +                        pred_8x16_motion(mrc, mrs, s, i*4, list, mrs->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
109.1000 +                        ADD_MVD(list)
109.1001 +                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
109.1002 +                    }else{
109.1003 +                        fill_rectangle(&mrs->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
109.1004 +                        fill_rectangle(mrs->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
109.1005 +                    }
109.1006 +                }
109.1007 +            }
109.1008 +        }
109.1009 +    }
109.1010 +
109.1011 +    if (IS_INTER(mb_type)||(IS_DIRECT(mb_type))) /* intra macroblocks have no motion to write back */
109.1012 +        write_back_motion_rec(mrc, mrs, s, m, mb_type);
109.1013 +    m->mb_type = mrs->mb_type[mb_x]= mb_type; /* mb_type may have been changed by the direct prediction above */
109.1014 +
109.1015 +    return 0;
109.1016 +}

   110.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   110.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pred_mode.h	Mon Aug 27 12:09:56 2012 +0200
   110.3 @@ -0,0 +1,10 @@
   110.4 +#ifndef H264_PRED_MODE_H
   110.5 +#define H264_PRED_MODE_H
   110.6 +/* Entry points of the macroblock motion / intra-mode prediction stage. */
   110.7 +#include "h264_types.h"
   110.8 +
   110.9 +void ff_h264_pred_direct_motion_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int *mb_type); /* may rewrite *mb_type */
  110.10 +int pred_motion_mb_rec(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m); /* 0 on success, -1 on an invalid intra prediction mode */
  110.11 +
  110.12 +
  110.13 +#endif /* H264_PRED_MODE_H */

   111.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   111.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_ps.c	Mon Aug 27 12:09:56 2012 +0200
   111.3 @@ -0,0 +1,462 @@
   111.4 +/*
   111.5 + * H.26L/H.264/AVC/JVT/14496-10/... parameter set decoding
   111.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   111.7 + *
   111.8 + * This file is part of FFmpeg.
   111.9 + *
  111.10 + * FFmpeg is free software; you can redistribute it and/or
  111.11 + * modify it under the terms of the GNU Lesser General Public
  111.12 + * License as published by the Free Software Foundation; either
  111.13 + * version 2.1 of the License, or (at your option) any later version.
  111.14 + *
  111.15 + * FFmpeg is distributed in the hope that it will be useful,
  111.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  111.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  111.18 + * Lesser General Public License for more details.
  111.19 + *
  111.20 + * You should have received a copy of the GNU Lesser General Public
  111.21 + * License along with FFmpeg; if not, write to the Free Software
  111.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  111.23 + */
  111.24 +
  111.25 +/**
  111.26 + * @file
  111.27 + * H.264 / AVC / MPEG4 part10 parameter set decoding.
  111.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  111.29 + */
  111.30 +
  111.31 +#include "dsputil.h"
  111.32 +#include "avcodec.h"
  111.33 +#include "h264_types.h"
  111.34 +#include "h264_data.h"
  111.35 +#include "golomb.h"
  111.36 +
  111.37 +
  111.38 +//#undef NDEBUG
  111.39 +#include <assert.h>
  111.40 +
  111.41 +static const int pixel_aspect[17][2]={ /* decode_vui_parameters() bounds aspect_ratio_idc by this table's length; presumably {num, den} per idc as in H.264 Table E-1 -- TODO confirm */
  111.42 + {0, 1},
  111.43 + {1, 1},
  111.44 + {12, 11},
  111.45 + {10, 11},
  111.46 + {16, 11},
  111.47 + {40, 33},
  111.48 + {24, 11},
  111.49 + {20, 11},
  111.50 + {32, 11},
  111.51 + {80, 33},
  111.52 + {18, 11},
  111.53 + {15, 11},
  111.54 + {64, 33},
  111.55 + {160,99},
  111.56 + {4, 3},
  111.57 + {3, 2},
  111.58 + {2, 1},
  111.59 +};
  111.60 +
  111.61 +const uint8_t ff_h264_chroma_qp[52]={ /* looked up in build_qp_table() with an index clipped to 0..51 */
  111.62 +    0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
  111.63 +   12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
  111.64 +   28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
  111.65 +   37,38,38,38,39,39,39,39
  111.66 +};
  111.67 +
  111.68 +static const uint8_t default_scaling4[2][16]={ /* [0] is the preset passed for the intra 4x4 lists, [1] for the inter ones (see decode_scaling_matrices) */
  111.69 +{   6,13,20,28,
  111.70 +   13,20,28,32,
  111.71 +   20,28,32,37,
  111.72 +   28,32,37,42
  111.73 +},{
  111.74 +   10,14,20,24,
  111.75 +   14,20,24,27,
  111.76 +   20,24,27,30,
  111.77 +   24,27,30,34
  111.78 +}};
  111.79 +
  111.80 +static const uint8_t default_scaling8[2][64]={ /* [0] is the preset passed for the intra 8x8 list, [1] for the inter one (see decode_scaling_matrices) */
  111.81 +{   6,10,13,16,18,23,25,27,
  111.82 +   10,11,16,18,23,25,27,29,
  111.83 +   13,16,18,23,25,27,29,31,
  111.84 +   16,18,23,25,27,29,31,33,
  111.85 +   18,23,25,27,29,31,33,36,
  111.86 +   23,25,27,29,31,33,36,38,
  111.87 +   25,27,29,31,33,36,38,40,
  111.88 +   27,29,31,33,36,38,40,42
  111.89 +},{
  111.90 +    9,13,15,17,19,21,22,24,
  111.91 +   13,13,17,19,21,22,24,25,
  111.92 +   15,17,19,21,22,24,25,27,
  111.93 +   17,19,21,22,24,25,27,28,
  111.94 +   19,21,22,24,25,27,28,30,
  111.95 +   21,22,24,25,27,28,30,32,
  111.96 +   22,24,25,27,28,30,32,33,
  111.97 +   24,25,27,28,30,32,33,35
  111.98 +}};
  111.99 +
 111.100 +static inline int decode_hrd_parameters(GetBitContext *gb, SPS *sps){ /* reads one HRD block into sps; returns 0, or -1 if cpb_count exceeds 32 */
 111.101 +    int cpb_count, i;
 111.102 +    cpb_count = get_ue_golomb_31(gb) + 1;
 111.103 +
 111.104 +    if(cpb_count > 32){
 111.105 +        av_log(AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
 111.106 +        return -1;
 111.107 +    }
 111.108 +
 111.109 +    get_bits(gb, 4); /* bit_rate_scale */
 111.110 +    get_bits(gb, 4); /* cpb_size_scale */
 111.111 +    for(i=0; i<cpb_count; i++){ /* per-CPB values are read and discarded */
 111.112 +        get_ue_golomb(gb); /* bit_rate_value_minus1 */
 111.113 +        get_ue_golomb(gb); /* cpb_size_value_minus1 */
 111.114 +        get_bits1(gb);     /* cbr_flag */
 111.115 +    }
 111.116 +    sps->initial_cpb_removal_delay_length = get_bits(gb, 5) + 1;
 111.117 +    sps->cpb_removal_delay_length = get_bits(gb, 5) + 1;
 111.118 +    sps->dpb_output_delay_length = get_bits(gb, 5) + 1;
 111.119 +    sps->time_offset_length = get_bits(gb, 5);
 111.120 +    sps->cpb_cnt = cpb_count; /* a second HRD block in the same VUI overwrites these fields */
 111.121 +    return 0;
 111.122 +}
 111.123 +
 111.124 +/**
 111.125 + * Parses the VUI part of an SPS into sps; syntax elements without a field are read and dropped.
 111.126 + * @return 0 on success, -1 on invalid timing info, HRD data or num_reorder_frames
 111.127 + */
 111.128 +static inline int decode_vui_parameters(GetBitContext *gb, SPS *sps){
 111.129 +    if( get_bits1(gb) ) {   /* aspect_ratio_info_present_flag */
 111.130 +        const unsigned int aspect_ratio_idc= get_bits(gb, 8);
 111.131 +        if( aspect_ratio_idc == EXTENDED_SAR ) {
 111.132 +            sps->num= get_bits(gb, 16);
 111.133 +            sps->den= get_bits(gb, 16);
 111.134 +        }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(int[2])){
 111.135 +            sps->num= pixel_aspect[aspect_ratio_idc][0]; /* predefined ratio from the table */
 111.136 +            sps->den= pixel_aspect[aspect_ratio_idc][1];
 111.137 +        }else{
 111.138 +            av_log( AV_LOG_ERROR, "illegal aspect ratio idc %u\n", aspect_ratio_idc);
 111.139 +            sps->num= sps->den= 0;  /* tolerated, but a reused SPS must not keep a stale ratio */
 111.140 +        }
 111.141 +    }else{
 111.142 +        sps->num=
 111.143 +        sps->den= 0;
 111.144 +    }
 111.145 +
 111.146 +    if(get_bits1(gb)){      /* overscan_info_present_flag */
 111.147 +        get_bits1(gb);      /* overscan_appropriate_flag */
 111.148 +    }
 111.149 +
 111.150 +    sps->video_signal_type_present_flag = get_bits1(gb);
 111.151 +    if(sps->video_signal_type_present_flag){
 111.152 +        get_bits(gb, 3);    /* video_format */
 111.153 +        sps->full_range = get_bits1(gb); /* video_full_range_flag */
 111.154 +
 111.155 +        sps->colour_description_present_flag = get_bits1(gb);
 111.156 +        if(sps->colour_description_present_flag){
 111.157 +            sps->color_primaries = get_bits(gb, 8); /* colour_primaries */
 111.158 +            sps->color_trc       = get_bits(gb, 8); /* transfer_characteristics */
 111.159 +            sps->colorspace      = get_bits(gb, 8); /* matrix_coefficients */
 111.160 +            if (sps->color_primaries >= AVCOL_PRI_NB)
 111.161 +                sps->color_primaries  = AVCOL_PRI_UNSPECIFIED;
 111.162 +            if (sps->color_trc >= AVCOL_TRC_NB)
 111.163 +                sps->color_trc  = AVCOL_TRC_UNSPECIFIED;
 111.164 +            if (sps->colorspace >= AVCOL_SPC_NB)
 111.165 +                sps->colorspace  = AVCOL_SPC_UNSPECIFIED;
 111.166 +        }
 111.167 +    }
 111.168 +
 111.169 +    if(get_bits1(gb)){      /* chroma_location_info_present_flag */
 111.170 +        av_log(AV_LOG_ERROR, "chroma_location_info_present_flag found, but not supported\n");
 111.171 +        (void) (get_ue_golomb(gb)+1);  /* chroma_sample_location_type_top_field */
 111.172 +        (void) get_ue_golomb(gb);  /* chroma_sample_location_type_bottom_field */
 111.173 +    }
 111.174 +
 111.175 +    sps->timing_info_present_flag = get_bits1(gb);
 111.176 +    if(sps->timing_info_present_flag){
 111.177 +        sps->num_units_in_tick = get_bits_long(gb, 32);
 111.178 +        sps->time_scale = get_bits_long(gb, 32);
 111.179 +        if(!sps->num_units_in_tick || !sps->time_scale){
 111.180 +            av_log(AV_LOG_ERROR, "time_scale/num_units_in_tick invalid or unsupported (%d/%d)\n", sps->time_scale, sps->num_units_in_tick);
 111.181 +            return -1;
 111.182 +        }
 111.183 +        sps->fixed_frame_rate_flag = get_bits1(gb);
 111.184 +    }
 111.185 +
 111.186 +    sps->nal_hrd_parameters_present_flag = get_bits1(gb);
 111.187 +    if(sps->nal_hrd_parameters_present_flag)
 111.188 +        if(decode_hrd_parameters(gb, sps) < 0)
 111.189 +            return -1;
 111.190 +    sps->vcl_hrd_parameters_present_flag = get_bits1(gb);
 111.191 +    if(sps->vcl_hrd_parameters_present_flag)
 111.192 +        if(decode_hrd_parameters(gb, sps) < 0)
 111.193 +            return -1;
 111.194 +    if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
 111.195 +        get_bits1(gb);     /* low_delay_hrd_flag */
 111.196 +    sps->pic_struct_present_flag = get_bits1(gb);
 111.197 +
 111.198 +    sps->bitstream_restriction_flag = get_bits1(gb);
 111.199 +    if(sps->bitstream_restriction_flag){
 111.200 +        get_bits1(gb);     /* motion_vectors_over_pic_boundaries_flag */
 111.201 +        get_ue_golomb(gb); /* max_bytes_per_pic_denom */
 111.202 +        get_ue_golomb(gb); /* max_bits_per_mb_denom */
 111.203 +        get_ue_golomb(gb); /* log2_max_mv_length_horizontal */
 111.204 +        get_ue_golomb(gb); /* log2_max_mv_length_vertical */
 111.205 +        sps->num_reorder_frames= get_ue_golomb(gb);
 111.206 +        get_ue_golomb(gb); /*max_dec_frame_buffering*/
 111.207 +
 111.208 +        if(sps->num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
 111.209 +            av_log(AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
 111.210 +            return -1;
 111.211 +        }
 111.212 +    }
 111.213 +
 111.214 +    return 0;
 111.215 +}
 111.216 +
 111.217 +static void decode_scaling_list(GetBitContext *gb, uint8_t *factors, int size, const uint8_t *jvt_list, const uint8_t *fallback_list){ /* fills factors[0..size-1] from the bitstream, or copies fallback_list / jvt_list */
 111.218 +    int i, last = 8, next = 8;
 111.219 +    const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct; /* coded order -> position in factors */
 111.220 +    if(!get_bits1(gb)) /* matrix not written, we use the predicted one */
 111.221 +        memcpy(factors, fallback_list, size*sizeof(uint8_t));
 111.222 +    else
 111.223 +    for(i=0;i<size;i++){
 111.224 +        if(next) /* once next has become 0 no further deltas are read */
 111.225 +            next = (last + get_se_golomb(gb)) & 0xff;
 111.226 +        if(!i && !next){ /* matrix not written, we use the preset one */
 111.227 +            memcpy(factors, jvt_list, size*sizeof(uint8_t));
 111.228 +            break;
 111.229 +        }
 111.230 +        last = factors[scan[i]] = next ? next : last; /* a zero next repeats the previous value for the remaining entries */
 111.231 +    }
 111.232 +}
 111.233 +
 111.234 +static void decode_scaling_matrices(GetBitContext *gb, SPS *sps, PPS *pps, int is_sps,
 111.235 +                                   uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){ /* pps is only read when is_sps is 0; the SPS caller passes NULL */
 111.236 +    int fallback_sps = !is_sps && sps->scaling_matrix_present; /* a PPS falls back to the SPS matrices when the SPS carried any */
 111.237 +    const uint8_t *fallback[4] = {
 111.238 +        fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
 111.239 +        fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
 111.240 +        fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
 111.241 +        fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
 111.242 +    };
 111.243 +    if(get_bits1(gb)){ /* flag bit: scaling lists follow; otherwise the output matrices are left untouched */
 111.244 +        sps->scaling_matrix_present |= is_sps;
 111.245 +        decode_scaling_list(gb, scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
 111.246 +        decode_scaling_list(gb, scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
 111.247 +        decode_scaling_list(gb, scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
 111.248 +        decode_scaling_list(gb, scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
 111.249 +        decode_scaling_list(gb, scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
 111.250 +        decode_scaling_list(gb, scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
 111.251 +        if(is_sps || pps->transform_8x8_mode){ /* 8x8 lists are coded for an SPS, or for a PPS with transform_8x8_mode set */
 111.252 +            decode_scaling_list(gb, scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
 111.253 +            decode_scaling_list(gb, scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
 111.254 +        }
 111.255 +    }
 111.256 +}
 111.257 +
 111.258 +/**
 111.259 + * Parses an SPS into n->sps_buffers[sps_id] (allocated on first use) and, on success, copies it to n->sps.
 111.260 + * @return 0 on success; -1 on error (a parse failure frees the buffer and clears its slot)
 111.261 + */
 111.262 +int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb){
 111.263 +    int profile_idc, level_idc;
 111.264 +    unsigned int sps_id;
 111.265 +    int i;
 111.266 +    SPS *sps;
 111.267 +
 111.268 +    profile_idc= get_bits(gb, 8);
 111.269 +    get_bits1(gb);   //constraint_set0_flag
 111.270 +    get_bits1(gb);   //constraint_set1_flag
 111.271 +    get_bits1(gb);   //constraint_set2_flag
 111.272 +    get_bits1(gb);   //constraint_set3_flag
 111.273 +    get_bits(gb, 4); // reserved
 111.274 +    level_idc= get_bits(gb, 8);
 111.275 +    sps_id= get_ue_golomb_31(gb);
 111.276 +
 111.277 +    if(sps_id >= MAX_SPS_COUNT) {
 111.278 +        av_log(AV_LOG_ERROR, "sps_id (%u) out of range\n", sps_id);
 111.279 +        return -1;
 111.280 +    }
 111.281 +    if (!n->sps_buffers[sps_id])
 111.282 +        n->sps_buffers[sps_id]= av_mallocz(sizeof(SPS));
 111.283 +    sps = n->sps_buffers[sps_id]; /* an existing buffer for this id is re-parsed in place */
 111.284 +    if(sps == NULL)
 111.285 +        return -1;
 111.286 +
 111.287 +    sps->profile_idc= profile_idc;
 111.288 +    sps->level_idc= level_idc;
 111.289 +
 111.290 +    memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
 111.291 +    memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
 111.292 +    sps->scaling_matrix_present = 0;
 111.293 +
 111.294 +    if(sps->profile_idc >= 100){ //high profile
 111.295 +        sps->chroma_format_idc= get_ue_golomb_31(gb);
 111.296 +        if(sps->chroma_format_idc == 3)
 111.297 +            sps->residual_color_transform_flag = get_bits1(gb);
 111.298 +        sps->bit_depth_luma   = get_ue_golomb(gb) + 8;
 111.299 +        sps->bit_depth_chroma = get_ue_golomb(gb) + 8;
 111.300 +        sps->transform_bypass = get_bits1(gb);
 111.301 +        decode_scaling_matrices(gb, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
 111.302 +    }else{
 111.303 +        sps->chroma_format_idc= 1;
 111.304 +        sps->bit_depth_luma   = 8;
 111.305 +        sps->bit_depth_chroma = 8;
 111.306 +    }
 111.307 +
 111.308 +    sps->log2_max_frame_num= get_ue_golomb(gb) + 4;
 111.309 +    sps->poc_type= get_ue_golomb_31(gb);
 111.310 +
 111.311 +    if(sps->poc_type == 0){ //FIXME #define
 111.312 +        sps->log2_max_poc_lsb= get_ue_golomb(gb) + 4;
 111.313 +    } else if(sps->poc_type == 1){//FIXME #define
 111.314 +        sps->delta_pic_order_always_zero_flag= get_bits1(gb);
 111.315 +        sps->offset_for_non_ref_pic= get_se_golomb(gb);
 111.316 +        sps->offset_for_top_to_bottom_field= get_se_golomb(gb);
 111.317 +        sps->poc_cycle_length                = get_ue_golomb(gb);
 111.318 +
 111.319 +        if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
 111.320 +            av_log(AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
 111.321 +            goto fail;
 111.322 +        }
 111.323 +
 111.324 +        for(i=0; i<sps->poc_cycle_length; i++)
 111.325 +            sps->offset_for_ref_frame[i]= get_se_golomb(gb);
 111.326 +    }else if(sps->poc_type != 2){
 111.327 +        av_log(AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
 111.328 +        goto fail;
 111.329 +    }
 111.330 +
 111.331 +    sps->ref_frame_count= get_ue_golomb_31(gb);
 111.332 +    if(sps->ref_frame_count >= 32){
 111.333 +        av_log(AV_LOG_ERROR, "too many reference frames\n");
 111.334 +        goto fail;
 111.335 +    }
 111.336 +    sps->gaps_in_frame_num_allowed_flag= get_bits1(gb);
 111.337 +    sps->mb_width = get_ue_golomb(gb) + 1;
 111.338 +    sps->mb_height= get_ue_golomb(gb) + 1;
 111.339 +
 111.340 +    sps->frame_mbs_only_flag= get_bits1(gb);
 111.341 +    if(!sps->frame_mbs_only_flag){
 111.342 +        av_log(AV_LOG_ERROR, "MBAFF support not included\n");
 111.343 +        get_bits1(gb); /* the following flag bit is consumed but not stored */
 111.344 +    }else
 111.345 +        sps->mb_aff= 0;
 111.346 +
 111.347 +    sps->direct_8x8_inference_flag= get_bits1(gb);
 111.348 +    if(!sps->frame_mbs_only_flag && !sps->direct_8x8_inference_flag){
 111.349 +        av_log(AV_LOG_ERROR, "This stream was generated by a broken encoder, invalid 8x8 inference\n");
 111.350 +        goto fail;
 111.351 +    }
 111.352 +
 111.353 +    sps->crop= get_bits1(gb);
 111.354 +    if(sps->crop){
 111.355 +        sps->crop_left = get_ue_golomb(gb);
 111.356 +        sps->crop_right = get_ue_golomb(gb);
 111.357 +        sps->crop_top = get_ue_golomb(gb);
 111.358 +        sps->crop_bottom= get_ue_golomb(gb);
 111.359 +        if(sps->crop_left || sps->crop_top){
 111.360 +            av_log( AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
 111.361 +        }
 111.362 +        if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
 111.363 +            av_log( AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
 111.364 +        }
 111.365 +    }else {
 111.366 +        sps->crop_left  =
 111.367 +        sps->crop_right =
 111.368 +        sps->crop_top   =
 111.369 +        sps->crop_bottom= 0;
 111.370 +    }
 111.371 +
 111.372 +    sps->vui_parameters_present_flag= get_bits1(gb);
 111.373 +    if( sps->vui_parameters_present_flag )
 111.374 +        if (decode_vui_parameters(gb, sps) < 0)
 111.375 +            goto fail;
 111.376 +
 111.377 +    n->sps = *sps;
 111.378 +
 111.379 +    if( sps->bitstream_restriction_flag){
 111.380 +        n->has_b_frames = sps->num_reorder_frames;
 111.381 +    }else
 111.382 +        n->has_b_frames= MAX_DELAYED_PIC_COUNT;
 111.383 +
 111.384 +    return 0;
 111.385 +fail:
 111.386 +    av_free(sps);
 111.387 +    n->sps_buffers[sps_id]= NULL; /* sps aliases this slot: clear it so later SPS/PPS parsing cannot touch freed memory */
 111.388 +    return -1;
 111.389 +}
 111.390 +
 111.391 +static void
 111.392 +build_qp_table(PPS *pps, int t, int index)
 111.393 +{
 111.394 +    int qp= 52;
 111.395 +    while(qp-- > 0) /* fills all 52 entries of row t: entry qp is ff_h264_chroma_qp at qp + index, clipped to 0..51 */
 111.396 +        pps->chroma_qp_table[t][qp] = ff_h264_chroma_qp[av_clip(qp + index, 0, 51)];
 111.397 +}
 111.398 +
 111.399 +int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length){
 111.400 +    unsigned int pps_id= get_ue_golomb(gb);
 111.401 +    PPS *pps;
 111.402 +
 111.403 +    if(pps_id >= MAX_PPS_COUNT) {
 111.404 +        av_log(AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
 111.405 +        return -1;
 111.406 +    }
 111.407 +    if (!n->pps_buffers[pps_id])
 111.408 +        n->pps_buffers[pps_id]= av_mallocz(sizeof(PPS));
 111.409 +    pps = n->pps_buffers[pps_id];
 111.410 +    if(pps == NULL)
 111.411 +        return -1;
 111.412 +    pps->sps_id= get_ue_golomb_31(gb);
 111.413 +    if((unsigned)pps->sps_id>=MAX_SPS_COUNT || n->sps_buffers[pps->sps_id] == NULL){
 111.414 +        av_log(AV_LOG_ERROR, "sps_id out of range\n");
 111.415 +        goto fail;
 111.416 +    }
 111.417 +
 111.418 +    pps->cabac= get_bits1(gb);
 111.419 +    pps->pic_order_present= get_bits1(gb);
 111.420 +    if(pps->pic_order_present){        
 111.421 +        av_log(AV_LOG_ERROR, "no interlaces support\n");
 111.422 +    }
 111.423 +    pps->slice_group_count= get_ue_golomb(gb) + 1;
 111.424 +    if(pps->slice_group_count > 1 ){
 111.425 +        pps->mb_slice_group_map_type= get_ue_golomb(gb);
 111.426 +        av_log(AV_LOG_ERROR, "multiple slices not supported\n");
 111.427 +    }
 111.428 +    pps->ref_count[0]= get_ue_golomb(gb) + 1;
 111.429 +    pps->ref_count[1]= get_ue_golomb(gb) + 1;
 111.430 +    if(pps->ref_count[0]> 32 || pps->ref_count[1]> 32){
 111.431 +        av_log(AV_LOG_ERROR, "reference overflow (pps)\n");
 111.432 +        goto fail;
 111.433 +    }
 111.434 +
 111.435 +    pps->weighted_pred= get_bits1(gb);
 111.436 +    pps->weighted_bipred_idc= get_bits(gb, 2);
 111.437 +    pps->init_qp= get_se_golomb(gb) + 26;
 111.438 +    pps->init_qs= get_se_golomb(gb) + 26;
 111.439 +    pps->chroma_qp_index_offset[0]= get_se_golomb(gb);
 111.440 +    pps->deblocking_filter_parameters_present= get_bits1(gb);
 111.441 +    pps->constrained_intra_pred= get_bits1(gb);
 111.442 +    pps->redundant_pic_cnt_present = get_bits1(gb);
 111.443 +
 111.444 +    pps->transform_8x8_mode= 0;
 111.445 +    memcpy(pps->scaling_matrix4, n->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
 111.446 +    memcpy(pps->scaling_matrix8, n->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
 111.447 +
 111.448 +    if(get_bits_count(gb) < bit_length){
 111.449 +        pps->transform_8x8_mode= get_bits1(gb);
 111.450 +        decode_scaling_matrices(gb, n->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
 111.451 +        pps->chroma_qp_index_offset[1]= get_se_golomb(gb); //second_chroma_qp_index_offset
 111.452 +    } else {
 111.453 +        pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
 111.454 +    }
 111.455 +
 111.456 +    build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
 111.457 +    build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
 111.458 +    if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
 111.459 +        pps->chroma_qp_diff= 1;
 111.460 +
 111.461 +    return 0;
 111.462 +fail:
 111.463 +    av_free(pps);
 111.464 +    return -1;
 111.465 +}

   112.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   112.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_ps.h	Mon Aug 27 12:09:56 2012 +0200
   112.3 @@ -0,0 +1,9 @@
   112.4 +#ifndef H264_PS_H /* parameter-set (SPS/PPS) parsing entry points */
   112.5 +#define H264_PS_H
   112.6 +
   112.7 +#include "h264_types.h"
   112.8 +
   112.9 +int ff_h264_decode_seq_parameter_set(NalContext *n, GetBitContext *gb); /* parses an SPS read from gb into n; returns 0 on success, -1 on failure */
  112.10 +int ff_h264_decode_picture_parameter_set(NalContext *n, GetBitContext *gb, int bit_length); /* parses a PPS into n->pps_buffers[]; bit_length bounds the optional trailing fields; returns 0 on success, -1 on failure */
  112.11 +
  112.12 +#endif /* H264_PS_H */

   113.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   113.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pthread.c	Mon Aug 27 12:09:56 2012 +0200
   113.3 @@ -0,0 +1,604 @@
   113.4 +#include "config.h"
   113.5 +
   113.6 +#include "h264_types.h"
   113.7 +#include "h264_parser.h"
   113.8 +#include "h264_nal.h"
   113.9 +#include "h264_entropy.h"
  113.10 +#include "h264_rec.h"
  113.11 +#include "h264_misc.h"
  113.12 +// #undef NDEBUG
  113.13 +#include <assert.h>
  113.14 +#include <pthread.h>
  113.15 +
  113.16 +#define XOANON 1 /* selects the 40/80-entry CPU tables below; presumably names the target machine -- TODO confirm */
  113.17 +
  113.18 +#ifdef XOANON
  113.19 +static int ed_rec_affinity[40] = { 0,  4,  8, 12, 16, 20, 24, 28, 32, 36, /* CPU number for worker i when h->smt is off (see create_ed_rec_threads) */
  113.20 +                                   1,  5,  9, 13, 17, 21, 25, 29, 33, 37,
  113.21 +                                   2,  6, 10, 14, 18, 22, 26, 30, 34, 38,
  113.22 +                                   3,  7, 11, 15, 19, 23, 27, 31, 35, 39 };
  113.23 +static int ed_rec_smt_aff[80]  = { 0,  40,  4, 44,  8, 48, 12, 52, 16, 56, 20, 60, 24, 64, 28, 68, 32, 72, 36, 76, /* CPU number for worker i when h->smt is on */
  113.24 +                                   1,  41,  5, 45,  9, 49, 13, 53, 17, 57, 21, 61, 25, 65, 29, 69, 33, 73, 37, 77,
  113.25 +                                   2,  42,  6, 46, 10, 50, 14, 54, 18, 58, 22, 62, 26, 66, 30, 70, 34, 74, 38, 78,
  113.26 +                                   3,  43,  7, 47, 11, 51, 15, 55, 19, 59, 23, 63, 27, 67, 31, 71, 35, 75, 39, 79 };
  113.27 +#else
  113.28 +static int ed_rec_affinity[10] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9};
  113.29 +static int ed_rec_smt_aff[20] = { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, }; /* NOTE(review): 20 slots but only 10 initialisers, so entries 10..19 are 0 -- confirm this is intended */
  113.30 +#endif
  113.31 +
  113.32 +static int frames=0; /* incremented by parse_thread for each frame it attempts to read; compared against h->num_frames */
  113.33 +
  113.34 +static void notify_one_worker(H264Context *h){
  113.35 +    pthread_mutex_lock(&h->task_lock);
  113.36 +    pthread_cond_signal(&h->task_cond);
  113.37 +    pthread_mutex_unlock(&h->task_lock);
  113.38 +}
  113.39 +
  113.40 +static void notify_all_workers(H264Context *h){
  113.41 +    pthread_mutex_lock(&h->task_lock);
  113.42 +    pthread_cond_broadcast(&h->task_cond);
  113.43 +    pthread_mutex_unlock(&h->task_lock);
  113.44 +}
  113.45 +
  113.46 +static void push_sbe (SliceBufferQueue *sbq, SliceBufferEntry *sbe, int notify ){
  113.47 +    pthread_mutex_lock(&sbq->lock);
  113.48 +    while (sbq->cnt >= sbq->size)
  113.49 +        pthread_cond_wait(&sbq->cond, &sbq->lock);
  113.50 +    sbq->queue[sbq->fi] = sbe;
  113.51 +    sbq->cnt++;
  113.52 +    sbq->fi++; sbq->fi %= sbq->size;
  113.53 +    if (notify)
  113.54 +        pthread_cond_signal(&sbq->cond);
  113.55 +    pthread_mutex_unlock(&sbq->lock);
  113.56 +}
  113.57 +
  113.58 +static SliceBufferEntry* pop_sbe (SliceBufferQueue *sbq, int block){
  113.59 +    SliceBufferEntry *sbe=NULL;
  113.60 +
  113.61 +    pthread_mutex_lock(&sbq->lock);
  113.62 +    if (block){
  113.63 +        while (sbq->cnt <= 0)
  113.64 +            pthread_cond_wait(&sbq->cond, &sbq->lock);
  113.65 +    }else {
  113.66 +        if (sbq->cnt <= 0)
  113.67 +            goto nonblock;
  113.68 +    }
  113.69 +    sbe = sbq->queue[sbq->fo];
  113.70 +    sbq->cnt--;
  113.71 +    sbq->fo++; sbq->fo %= sbq->size;
  113.72 +    pthread_cond_signal(&sbq->cond);
  113.73 +nonblock:
  113.74 +    pthread_mutex_unlock(&sbq->lock);
  113.75 +
  113.76 +    return sbe;
  113.77 +}
  113.78 +
  113.79 +// static void push_rle (RingLineQueue *rlq, SliceBufferEntry *sbe, int line, int notify){
  113.80 +//
  113.81 +//     //check for free slots
  113.82 +//     pthread_mutex_lock(&rlq->wslock);
  113.83 +//     while (rlq->free <= 0){
  113.84 +//         pthread_cond_wait(&rlq->wscond, &rlq->wslock);
  113.85 +//     }
  113.86 +//     //free slot is available, decrement one in this lock
  113.87 +//     rlq->free--;
  113.88 +//     pthread_mutex_unlock(&rlq->wslock);
  113.89 +//
  113.90 +//     pthread_mutex_lock(&rlq->swlock);
  113.91 +//     rlq->queue[rlq->fi]->sbe=sbe;
  113.92 +//     rlq->queue[rlq->fi]->line=line;
  113.93 +//     rlq->queue[rlq->fi]->mb_cnt=0;
  113.94 +//     rlq->fi++; rlq->fi %= rlq->size;
  113.95 +//     rlq->ready++;
  113.96 +//     if(notify)
  113.97 +//         pthread_cond_signal(&rlq->swcond);
  113.98 +//     pthread_mutex_unlock(&rlq->swlock);
  113.99 +// }
 113.100 +
 113.101 +// static RingLineEntry* pop_rle (RingLineQueue *rlq, int block){
 113.102 +//     RingLineEntry *rle=NULL;
 113.103 +//
 113.104 +//     pthread_mutex_lock(&rlq->swlock);
 113.105 +//     if (block){
 113.106 +//         while (rlq->ready <= 0)
 113.107 +//             pthread_cond_wait(&rlq->swcond, &rlq->swlock);
 113.108 +//     }else {
 113.109 +//         if (rlq->ready <= 0)
 113.110 +//             goto nonblock;
 113.111 +//     }
 113.112 +//     rle = rlq->queue[rlq->fo];
 113.113 +//     rlq->fo++; rlq->fo %= rlq->size;
 113.114 +//     rlq->ready--;
 113.115 +// nonblock:
 113.116 +//     pthread_mutex_unlock(&rlq->swlock);
 113.117 +//
 113.118 +//     return rle;
 113.119 +// }
 113.120 +//
 113.121 +// static void rel_rle (RingLineQueue *rlq){
 113.122 +//     pthread_mutex_lock(&rlq->wslock);
 113.123 +//     rlq->free++;
 113.124 +//     pthread_cond_signal(&rlq->wscond);
 113.125 +//     pthread_mutex_unlock(&rlq->wslock);
 113.126 +// }
 113.127 +
 113.128 +static RingLineEntry* pop_rle (SliceBufferQueue *sbq, RingLineQueue *rlq, int *has_token){
 113.129 +    RingLineEntry *rle=NULL;
 113.130 +    SliceBufferEntry *sbe=NULL;
 113.131 +    int line=-1;
 113.132 +
 113.133 +    pthread_mutex_lock(&sbq->lock);
 113.134 +    if (sbq->cnt <= 0)
 113.135 +        goto unlock;
 113.136 +    sbe = sbq->queue[sbq->fo];
 113.137 +    line = sbe->lines_taken;
 113.138 +
 113.139 +
 113.140 +    pthread_mutex_lock(&rlq->swlock);
 113.141 +    if (!*has_token){
 113.142 +        if (rlq->free <= 0)
 113.143 +            goto unlock2;
 113.144 +        rlq->free--;
 113.145 +        *has_token=1;
 113.146 +    }
 113.147 +    rle = rlq->queue[rlq->fo];
 113.148 +    rlq->fo++; rlq->fo %= rlq->size;
 113.149 +    rle->sbe=sbe;
 113.150 +    rle->line = line;
 113.151 +    rle->mb_cnt =0;
 113.152 +    if (++sbe->lines_taken >= sbe->lines_total){
 113.153 +        sbq->cnt--;
 113.154 +        sbq->fo++; sbq->fo %= sbq->size;
 113.155 +        pthread_cond_signal(&sbq->cond);
 113.156 +    }
 113.157 +unlock2:
 113.158 +    pthread_mutex_unlock(&rlq->swlock);
 113.159 +unlock:
 113.160 +    pthread_mutex_unlock(&sbq->lock);
 113.161 +
 113.162 +
 113.163 +    return rle;
 113.164 +}
 113.165 +
 113.166 +static void rel_rle (RingLineQueue *rlq, int *rec_token){
 113.167 +    pthread_mutex_lock(&rlq->swlock);
 113.168 +    rlq->free++;
 113.169 +    *rec_token=0;
 113.170 +//     pthread_cond_signal(&rlq->swcond);
 113.171 +    pthread_mutex_unlock(&rlq->swlock);
 113.172 +
 113.173 +}
 113.174 +
 113.175 +//get either a entropy or a line reconstruct task
 113.176 +static void pop_next_task(H264Context *h, SliceBufferEntry **psbe, RingLineEntry **prle, int *rec_token){
 113.177 +
 113.178 +    pthread_mutex_lock(&h->task_lock);
 113.179 +
 113.180 +    for(;;){
 113.181 +        if ( (*psbe = pop_sbe(&h->sb_q[ENTROPY], 0)) ){
 113.182 +            if (*rec_token){
 113.183 +                rel_rle(&h->rl_q, rec_token);
 113.184 +                pthread_cond_signal(&h->task_cond);
 113.185 +            }
 113.186 +            break;
 113.187 +        }
 113.188 +        else if ( (*prle = pop_rle(&h->sb_q[MBDEC], &h->rl_q, rec_token)) )
 113.189 +            break;
 113.190 +        pthread_cond_wait(&h->task_cond, &h->task_lock);
 113.191 +    }
 113.192 +
 113.193 +    pthread_mutex_unlock(&h->task_lock);
 113.194 +}
 113.195 +
 113.196 +void *parse_thread(void *arg){
 113.197 +    H264Context *h = (H264Context *) arg;
 113.198 +    ParserContext *pc = get_parse_context(h->ifile);
 113.199 +    NalContext *nc = get_nal_context(h->width, h->height);
 113.200 +    H264Slice *s;
 113.201 +    SliceBufferEntry *sbe = NULL;
 113.202 +
 113.203 +    while(!pc->final_frame && frames++ <h->num_frames && !h->quit){
 113.204 +        sbe = get_sb_entry(h);
 113.205 +
 113.206 +        av_read_frame_internal(pc, &sbe->gb);
 113.207 +        s = &sbe->slice;
 113.208 +
 113.209 +        decode_nal_units(nc, s, &sbe->gb);
 113.210 +
 113.211 +        push_sbe(&h->sb_q[ENTROPY], sbe, 0);
 113.212 +        notify_one_worker(h);
 113.213 +    }
 113.214 +
 113.215 +    if (!h->no_mbd){
 113.216 +        sbe = get_sb_entry(h);
 113.217 +        sbe->state=-1;
 113.218 +        sbe->slice.coded_pic_num=nc->coded_pic_num;
 113.219 +        sbe->lines_total=h->threads;
 113.220 +
 113.221 +        push_sbe(&h->sb_q[REORDER], sbe, 1);
 113.222 +    }else{
 113.223 +        for (int i=0; i<h->threads; i++){
 113.224 +            sbe = get_sb_entry(h);
 113.225 +            sbe->state=-1;
 113.226 +            push_sbe(&h->sb_q[ENTROPY], sbe, 1);
 113.227 +            notify_one_worker(h);
 113.228 +        }
 113.229 +    }
 113.230 +    free_nal_context(nc);
 113.231 +    free_parse_context(pc);
 113.232 +
 113.233 +    pthread_exit(NULL);
 113.234 +    return NULL;
 113.235 +}
 113.236 +
 113.237 +int decode_slice_entropy(EntropyContext *ec, SliceBufferEntry *sbe){
 113.238 +    int i,j;
 113.239 +    H264Slice *s = &sbe->slice;
 113.240 +    GetBitContext *gb = &sbe->gb;
 113.241 +    CABACContext *c = &ec->c;
 113.242 +    H264Mb *mbs = sbe->mbs;
 113.243 +
 113.244 +    if( !s->pps.cabac ){
 113.245 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
 113.246 +        return -1;
 113.247 +    }
 113.248 +
 113.249 +    init_dequant_tables(s, ec);
 113.250 +    ec->curr_qscale = s->qscale;
 113.251 +    ec->last_qscale_diff = 0;
 113.252 +    ec->chroma_qp[0] = get_chroma_qp( s, 0, s->qscale);
 113.253 +    ec->chroma_qp[1] = get_chroma_qp( s, 1, s->qscale);
 113.254 +
 113.255 +    /* realign */
 113.256 +    align_get_bits( gb );
 113.257 +    /* init cabac */
 113.258 +    ff_init_cabac_decoder( c, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
 113.259 +
 113.260 +    ff_h264_init_cabac_states(ec, s, c);
 113.261 +
 113.262 +    for(j=0; j<ec->mb_height; j++){
 113.263 +        init_entropy_buf(ec, s, j);
 113.264 +        for(i=0; i<ec->mb_width; i++){
 113.265 +            int eos,ret;
 113.266 +            H264Mb *m = &mbs[i + j*ec->mb_width];
 113.267 +            //memset(m, 0, sizeof(H264Mb));
 113.268 +            m->mb_x=i;
 113.269 +            m->mb_y=j;
 113.270 +            ec->m = m;
 113.271 +
 113.272 +            ret = ff_h264_decode_mb_cabac(ec, s, c);
 113.273 +            eos = get_cabac_terminate( c); (void) eos;
 113.274 +
 113.275 +            if( ret < 0 || c->bytestream > c->bytestream_end + 2) {
 113.276 +                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, c->bytestream_end - c->bytestream);
 113.277 +                return -1;
 113.278 +            }
 113.279 +        }
 113.280 +    }
 113.281 +
 113.282 +    return 0;
 113.283 +}
 113.284 +
 113.285 +static int decode_slice_mb(MBRecContext *d, RingLineEntry *rle, int frames){
 113.286 +    SliceBufferEntry *sbe= rle->sbe;
 113.287 +    H264Slice *s = &sbe->slice;
 113.288 +    H264Mb *mbs = sbe->mbs;
 113.289 +
 113.290 +    int mb_width= d->mb_width;
 113.291 +    int i;
 113.292 +    const int line = rle->line;
 113.293 +
 113.294 +    init_mbrec_context(d, d->mrs, s, line);
 113.295 +
 113.296 +    H264Mb *m = &mbs[line*mb_width];
 113.297 +    d->top=rle->prev_line->top;
 113.298 +    d->top_next=rle->top;
 113.299 +
 113.300 +//     assert(rle->mb_cnt ==0);
 113.301 +    for(i=0; i< mb_width; i++){
 113.302 +        if (frames || line>0){
 113.303 +            while (rle->mb_cnt >= rle->prev_line->mb_cnt -1);
 113.304 +        }
 113.305 +        h264_decode_mb_internal( d, d->mrs, s, &m[i]);
 113.306 +        rle->mb_cnt++;
 113.307 +    }
 113.308 +    draw_edges(d, s, line);
 113.309 +
 113.310 +    return 0;
 113.311 +}
 113.312 +
 113.313 +// static int decode_slice_mb_static(MBRecContext *d, H264Slice *s, RLThreadContext *r, RLThreadContext *rp,  int frames){
 113.314 +//     int mb_height= d->mb_height;
 113.315 +//     int mb_width= d->mb_width;
 113.316 +//     int thread_num = r->thread_num;
 113.317 +//     int thread_total = r->thread_total;
 113.318 +//     int i;
 113.319 +//     int j = thread_num;
 113.320 +//
 113.321 +//     r->mb_cnt=frames* mb_height*mb_width;
 113.322 +//     for(; j<mb_height; j+=thread_total){
 113.323 +//         H264Mb *m = &s->mbs[j*mb_width];
 113.324 +//         for(i=0; i< mb_width; i++){
 113.325 +//             if (j>0){
 113.326 +//                 while (r->mb_cnt- (thread_num? 0:mb_width) >= rp->mb_cnt-1);
 113.327 +//             }
 113.328 +//             h264_decode_mb_internal(d, s, m++);
 113.329 +//             r->mb_cnt++;
 113.330 +//         }
 113.331 +//         draw_edges(d, s, j);
 113.332 +//     }
 113.333 +//     return 0;
 113.334 +// }
 113.335 +
 113.336 +static void *ed_rec_thread(void *arg){
 113.337 +    H264Context *h =  (H264Context*) arg;
 113.338 +    EntropyContext *ec=NULL;
 113.339 +    MBRecContext *mrc=NULL;
 113.340 +
 113.341 +    RingLineEntry *rle=NULL;
 113.342 +    SliceBufferEntry *sbe=NULL;
 113.343 +    H264Slice *s;
 113.344 +    int rec_token=0;
 113.345 +
 113.346 +    if (!h->no_mbd){
 113.347 +        mrc = get_mbrec_context(h);
 113.348 +    }
 113.349 +    ec = get_entropy_context(h);
 113.350 +
 113.351 +    for(;;){
 113.352 +        pop_next_task(h, &sbe, &rle, &rec_token);
 113.353 +        if (sbe){
 113.354 +            if (h->no_mbd && sbe->state<0){
 113.355 +                break;
 113.356 +            }
 113.357 +            if (!sbe->initialized){
 113.358 +                init_sb_entry(h, sbe);
 113.359 +            }
 113.360 +            decode_slice_entropy(ec, sbe);
 113.361 +
 113.362 +            if (h->no_mbd){
 113.363 +                release_sb_entry(h, sbe);
 113.364 +                sbe=NULL;
 113.365 +            } else {
 113.366 +                push_sbe(&h->sb_q[REORDER], sbe, 1);
 113.367 +            }
 113.368 +        } else if (rle){
 113.369 +            if (rle->sbe->state<0)
 113.370 +                break;
 113.371 +            s = &rle->sbe->slice;
 113.372 +
 113.373 +            decode_slice_mb(mrc, rle, s->coded_pic_num);
 113.374 +
 113.375 +            if (rle->line == h->mb_height-1){
 113.376 +                push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1);
 113.377 +            }
 113.378 +            rle->mb_cnt++;
 113.379 +        }
 113.380 +    }
 113.381 +
 113.382 +    //make sure threads quit in order of rle assignment
 113.383 +    if (!h->no_mbd){
 113.384 +        while (rle->prev_line->mb_cnt <= h->mb_width);
 113.385 +        rel_rle(&h->rl_q, &rec_token);
 113.386 +        notify_one_worker(h);
 113.387 +        rle->mb_cnt = h->mb_width +1;
 113.388 +        if (rle->line == h->threads-1){
 113.389 +            push_sbe(&h->sb_q[OUTPUT], rle->sbe, 1);
 113.390 +        }
 113.391 +
 113.392 +        free_mbrec_context(mrc);
 113.393 +    }
 113.394 +
 113.395 +    free_entropy_context(ec);
 113.396 +
 113.397 +    pthread_exit(NULL);
 113.398 +    return NULL;
 113.399 +}
 113.400 +
 113.401 +static void *reorder_thread(void *arg){
 113.402 +    H264Context *h = (H264Context *) arg;
 113.403 +    int i;
 113.404 +    SliceBufferEntry *reorder[h->sb_size];
 113.405 +    SliceBufferEntry *sbe, *next_sbe;
 113.406 +    H264Slice *s;
 113.407 +    int reorder_cnt=0;
 113.408 +    unsigned next_pic_num=0;
 113.409 +
 113.410 +    for(;;){
 113.411 +
 113.412 +        sbe = pop_sbe(&h->sb_q[REORDER], 1);
 113.413 +
 113.414 +        s = &sbe->slice;
 113.415 +        for(i=reorder_cnt; i>0; i--){
 113.416 +            if (s->coded_pic_num < reorder[i-1]->slice.coded_pic_num)
 113.417 +                break;
 113.418 +            reorder[i]=reorder[i-1];
 113.419 +        }
 113.420 +        reorder[i]=sbe;
 113.421 +
 113.422 +        while(reorder_cnt>=0){
 113.423 +            if (next_pic_num!=reorder[reorder_cnt]->slice.coded_pic_num){
 113.424 +                break;
 113.425 +            }
 113.426 +            next_sbe = reorder[reorder_cnt];
 113.427 +            H264Slice *es = &next_sbe->slice;
 113.428 +
 113.429 +            if (next_sbe->state<0)
 113.430 +                goto end;
 113.431 +
 113.432 +            for (int i=0; i<2; i++){
 113.433 +                for(int j=0; j< es->ref_count[i]; j++){
 113.434 +                    if (es->ref_list_cpn[i][j] ==-1)
 113.435 +                        continue;
 113.436 +                    int k;
 113.437 +                    for (k=0; k<h->max_dpb_cnt; k++){
 113.438 +                        if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == es->ref_list_cpn[i][j]){
 113.439 +                            es->dp_ref_list[i][j] = &h->dpb[k];
 113.440 +                            break;
 113.441 +                        }
 113.442 +                    }
 113.443 +                }
 113.444 +            }
 113.445 +            next_sbe->dp = get_dpb_entry(h, es);
 113.446 +
 113.447 +            push_sbe(&h->sb_q[MBDEC], next_sbe, 0);
 113.448 +            notify_all_workers(h);
 113.449 +
 113.450 +//             for (int i=0; i< h->mb_height; i++){
 113.451 +//                 push_rle(&h->rl_q, next_sbe, i, 0);
 113.452 +//                 notify_one_worker(h);
 113.453 +//             }
 113.454 +
 113.455 +
 113.456 +            next_pic_num++;
 113.457 +            reorder_cnt--;
 113.458 +        }
 113.459 +        reorder_cnt++;
 113.460 +    }
 113.461 +
 113.462 +end:
 113.463 +    {
 113.464 +        push_sbe(&h->sb_q[MBDEC], next_sbe, 0);
 113.465 +        notify_all_workers(h);
 113.466 +        if (h->no_mbd){
 113.467 +            push_sbe(&h->sb_q[OUTPUT], next_sbe, 1);
 113.468 +        }
 113.469 +//         for (int i=0; i< h->threads; i++){
 113.470 +//             push_rle(&h->rl_q, next_sbe, i, 0);
 113.471 +//             notify_one_worker(h);
 113.472 +//         }
 113.473 +    }
 113.474 +
 113.475 +    pthread_exit(NULL);
 113.476 +    return NULL;
 113.477 +}
 113.478 +
 113.479 +void create_ed_rec_threads(H264Context *h){
 113.480 +    cpu_set_t cpuset;
 113.481 +    int* aff;
 113.482 +
 113.483 +    if (h->setaff){
 113.484 +        aff = h->smt ? ed_rec_smt_aff : ed_rec_affinity ;
 113.485 +        for (int i=0; i<h->threads; i++){
 113.486 +            pthread_attr_init(&h->ed_rec_attr[i]);
 113.487 +            CPU_ZERO(&cpuset);
 113.488 +            CPU_SET(aff[i], &cpuset);
 113.489 +            pthread_attr_setaffinity_np(&h->ed_rec_attr[i], sizeof(cpu_set_t), &cpuset);
 113.490 +            pthread_create(&h->ed_rec_thr[i], &h->ed_rec_attr[i], ed_rec_thread, h);
 113.491 +        }
 113.492 +    } else {
 113.493 +        for (int i=0; i<h->threads; i++){
 113.494 +            pthread_create(&h->ed_rec_thr[i], NULL, ed_rec_thread, h);
 113.495 +        }
 113.496 +    }
 113.497 +}
 113.498 +
 113.499 +void join_ed_rec_threads(H264Context *h){
 113.500 +    for (int i=0; i< h->threads; i++){
 113.501 +        pthread_join(h->ed_rec_thr[i], NULL);
 113.502 +    }
 113.503 +}
 113.504 +
 113.505 +void *output_thread(void *arg){
 113.506 +    H264Context *h = (H264Context *) arg;
 113.507 +
 113.508 +    OutputContext *oc = get_output_context( h );
 113.509 +
 113.510 +    SliceBufferEntry *sbe = NULL;
 113.511 +    H264Slice *s=NULL;
 113.512 +    for(;;) {
 113.513 +        DecodedPicture *out, *dp;
 113.514 +        sbe = pop_sbe(&h->sb_q[OUTPUT], 1);
 113.515 +
 113.516 +        if (sbe->state <0)
 113.517 +            break;
 113.518 +
 113.519 +        s = &sbe->slice;
 113.520 +        for (int i=0; i<s->release_cnt; i++){
 113.521 +            for(int j=0; j<h->max_dpb_cnt; j++){
 113.522 +                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
 113.523 +                    release_dpb_entry(h, &h->dpb[j], 2);
 113.524 +                    break;
 113.525 +                }
 113.526 +            }
 113.527 +        }
 113.528 +
 113.529 +        dp=sbe->dp;
 113.530 +        release_sb_entry(h, sbe);
 113.531 +
 113.532 +        out =output_frame(h, oc, dp, h->ofile, h->frame_width, h->frame_height);
 113.533 +        if (out){
 113.534 +            release_dpb_entry(h, out, 1);
 113.535 +        }
 113.536 +
 113.537 +        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
 113.538 +
 113.539 +    }
 113.540 +    /* at the end of stream, we must flush the decoder buffers */
 113.541 +    while (output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height));
 113.542 +    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
 113.543 +
 113.544 +    free_output_context(oc);
 113.545 +
 113.546 +    pthread_exit(NULL);
 113.547 +    return NULL;
 113.548 +}
 113.549 +
 113.550 +/*
 113.551 +* The following code is the main loop of the file converter
 113.552 +*/
 113.553 +int h264_decode_pthread(H264Context *h) {
 113.554 +    pthread_t parse_thr, reorder_thr, output_thr;
 113.555 +
 113.556 +    av_start_timer();
 113.557 +
 113.558 +    pthread_create(&parse_thr, NULL, parse_thread, h);
 113.559 +    if (!h->no_mbd){
 113.560 +        pthread_create(&reorder_thr, NULL, reorder_thread, h);
 113.561 +        pthread_create(&output_thr, NULL, output_thread, h);
 113.562 +    }
 113.563 +#if HAVE_LIBSDL2
 113.564 +    pthread_t sdl_thr;
 113.565 +    if (h->display){
 113.566 +        pthread_create(&sdl_thr, NULL, sdl_thread, h);
 113.567 +    }
 113.568 +#endif
 113.569 +    create_ed_rec_threads(h);
 113.570 +
 113.571 +
 113.572 +    if (h->rl_side_touch){
 113.573 +        pthread_mutex_lock(&h->ilock);
 113.574 +        while (h->init_threads< h->threads)
 113.575 +            pthread_cond_wait(&h->icond, &h->ilock);
 113.576 +        pthread_mutex_unlock(&h->ilock);
 113.577 +
 113.578 +        pthread_mutex_lock(&h->tlock);
 113.579 +        h->touch_start =1;
 113.580 +        pthread_cond_broadcast(&h->tcond);
 113.581 +        pthread_mutex_unlock(&h->tlock);
 113.582 +
 113.583 +        pthread_mutex_lock(&h->tdlock);
 113.584 +        while (h->touch_done < h->threads)
 113.585 +            pthread_cond_wait(&h->tdcond, &h->tdlock);
 113.586 +        pthread_mutex_unlock(&h->tdlock);
 113.587 +
 113.588 +        pthread_mutex_lock(&h->slock);
 113.589 +        h->start =1;
 113.590 +        pthread_cond_broadcast(&h->scond);
 113.591 +        pthread_mutex_unlock(&h->slock);
 113.592 +    }
 113.593 +    join_ed_rec_threads(h);
 113.594 +    pthread_join(parse_thr, NULL);
 113.595 +    if (!h->no_mbd){
 113.596 +        pthread_join(reorder_thr, NULL);
 113.597 +        pthread_join(output_thr, NULL);
 113.598 +    }
 113.599 +#if HAVE_LIBSDL2
 113.600 +    if (h->display)
 113.601 +        signal_sdl_exit(h);
 113.602 +        pthread_join(sdl_thr, NULL);
 113.603 +#endif
 113.604 +
 113.605 +
 113.606 +    return 0;
 113.607 +}

   114.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   114.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_pthread.h	Mon Aug 27 12:09:56 2012 +0200
   114.3 @@ -0,0 +1,14 @@
   114.4 +#ifndef H264_PTHREAD_H /* declarations for the pthread decoder pipeline */
   114.5 +#define H264_PTHREAD_H
   114.6 +
   114.7 +#include "h264_types.h"
   114.8 +
   114.9 +int decode_B_slice_entropy(EntropyContext *ec, EDSlice *s, EDThreadContext *eb, EDThreadContext *eb_prev); /* NOTE(review): no definition with this name appears in the visible part of h264_pthread.c -- confirm it still exists */
  114.10 +int decode_slice_entropy(EntropyContext *hc, EDSlice *s); /* NOTE(review): h264_pthread.c defines decode_slice_entropy(EntropyContext *, SliceBufferEntry *); this prototype looks stale -- confirm */
  114.11 +
  114.12 +void *read_thread(void *arg); /* NOTE(review): h264_pthread.c defines parse_thread and output_thread (non-static) plus static reorder_thread/ed_rec_thread, none of the four names declared here -- confirm whether this header is still in use */
  114.13 +void *parsenal_thread(void *arg);
  114.14 +void *mbrec_thread(void *arg);
  114.15 +void *write_thread(void *arg);
  114.16 +
  114.17 +#endif /* H264_PTHREAD_H */

   115.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   115.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_rec.c	Mon Aug 27 12:09:56 2012 +0200
   115.3 @@ -0,0 +1,412 @@
   115.4 +#include "config.h"
   115.5 +
   115.6 +#include "dsputil.h"
   115.7 +#include "h264_types.h"
   115.8 +#include "h264_data.h"
   115.9 +#include "h264_mc.h"
  115.10 +#include "h264_deblock.h"
  115.11 +#include "h264_pred_mode.h"
  115.12 +//#undef NDEBUG
  115.13 +#include <assert.h>
  115.14 +
  115.15 +void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line){
  115.16 +    DecodedPicture *pic = s->curr_pic;
  115.17 +    int mb_stride = mrc->mb_stride;
  115.18 +    int mb_width = mrc->mb_width;
  115.19 +    mrs->mb_type_top = pic->mb_type + (line -1)*mb_stride;
  115.20 +    mrs->mb_type = pic->mb_type + line*mb_stride;
  115.21 +    mrs->ref_index_top[0] = pic->ref_index[0] + 4*(line -1)*mb_stride;
  115.22 +    mrs->ref_index_top[1] = pic->ref_index[1] + 4*(line -1)*mb_stride;
  115.23 +    mrs->ref_index[0] = pic->ref_index[0] + 4*line*mb_stride;
  115.24 +    mrs->ref_index[1] = pic->ref_index[1] + 4*line*mb_stride;
  115.25 +
  115.26 +    mrs->motion_val_top[0] = pic->motion_val[0] + 4*mb_width*4*(line-1);
  115.27 +    mrs->motion_val_top[1] = pic->motion_val[1] + 4*mb_width*4*(line-1);
  115.28 +    mrs->motion_val[0] = pic->motion_val[0] + 4*mb_width*4*line;
  115.29 +    mrs->motion_val[1] = pic->motion_val[1] + 4*mb_width*4*line;
  115.30 +
  115.31 +    mrs->intra4x4_pred_mode_top = pic->intra4x4_pred_mode + 4*mb_width*(line-1);
  115.32 +    mrs->intra4x4_pred_mode = pic->intra4x4_pred_mode + 4*mb_width*line;
  115.33 +
  115.34 +    mrs->non_zero_count_top = pic->non_zero_count + 8*mb_width*(line-1);
  115.35 +    mrs->non_zero_count = pic->non_zero_count + 8*mb_width*line;
  115.36 +
  115.37 +    if (s->slice_type_nos == FF_B_TYPE){
  115.38 +        mrs->list1_mb_type = s->dp_ref_list[1][0]->mb_type + line*mb_stride;
  115.39 +        mrs->list1_ref_index[0]  = s->dp_ref_list[1][0]->ref_index[0] + 4*line*mb_stride;
  115.40 +        mrs->list1_ref_index[1]  = s->dp_ref_list[1][0]->ref_index[1] + 4*line*mb_stride;
  115.41 +        mrs->list1_motion_val[0] = s->dp_ref_list[1][0]->motion_val[0] + 4*mb_width*4*line;
  115.42 +        mrs->list1_motion_val[1] = s->dp_ref_list[1][0]->motion_val[1] + 4*mb_width*4*line;
  115.43 +    }
  115.44 +
  115.45 +}
  115.46 +
  115.47 +#if OMPSS
  115.48 +static void backup_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
  115.49 +    int i;
  115.50 +    uint8_t * top_border_y1 = m->top_border;
  115.51 +    uint8_t * top_border_y2 = m->top_border + 8;
  115.52 +    uint8_t * top_border_cb = m->top_border + 16;
  115.53 +    uint8_t * top_border_cr = m->top_border + 24;
  115.54 +    uint8_t * top_border_next = m->top_border_next;
  115.55 +
  115.56 +    src_y  -=   linesize;
  115.57 +    src_cb -= uvlinesize;
  115.58 +    src_cr -= uvlinesize;
  115.59 +
  115.60 +    m->left_border[0]= m->top_border[15];
  115.61 +    for(i=1; i<17 ; i++){
  115.62 +        m->left_border[i]= src_y[15 + i*linesize];
  115.63 +    }
  115.64 +
  115.65 +    *(uint64_t*)(top_border_y1)   = *(uint64_t*)(src_y +  16*linesize);
  115.66 +    *(uint64_t*)(top_border_next) = *(uint64_t*)(src_y +  16*linesize);
  115.67 +    *(uint64_t*)(top_border_y2)   = *(uint64_t*)(src_y +8+16*linesize);
  115.68 +
  115.69 +    m->left_border[17]= m->top_border[16+7];
  115.70 +    m->left_border[17+9]= m->top_border[24+7];
  115.71 +    for(i=1; i<9; i++){
  115.72 +        m->left_border[17  +i]= src_cb[7+i*uvlinesize];
  115.73 +        m->left_border[17+9+i]= src_cr[7+i*uvlinesize];
  115.74 +    }
  115.75 +    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize);
  115.76 +    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize);
  115.77 +}
  115.78 +
  115.79 +static void xchg_mb_border(H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){ /* OMPSS build: exchange the border bytes saved in m with the picture pixels directly left of and above this MB. */
  115.80 +    int temp8, i;
  115.81 +    uint64_t temp64;
  115.82 +    /* m->top_border layout as used here: luma at 0..15, Cb at 16..23, Cr at 24..31. */
  115.83 +    uint8_t * top_border_y1 = m->top_border;
  115.84 +    uint8_t * top_border_y2 = m->top_border + 8;
  115.85 +    uint8_t * top_border_cb = m->top_border + 16;
  115.86 +    uint8_t * top_border_cr = m->top_border + 24;
  115.87 +    uint8_t * top_border_next = m->top_border_next;
  115.88 +
  115.89 +    int deblock_left;
  115.90 +    int deblock_top;
  115.91 +
  115.92 +    deblock_left = (m->mb_x > 0); /* there is a macroblock column to the left */
  115.93 +    deblock_top =  (m->mb_y > 0); /* there is a macroblock row above */
  115.94 +    /* Move to the pixel above-left of the MB: offset i*linesize then walks the left neighbour column, offsets +1.. walk the row above. */
  115.95 +    src_y  -= (  linesize + 1);
  115.96 +    src_cb -= (uvlinesize + 1);
  115.97 +    src_cr -= (uvlinesize + 1);
  115.98 +    /* XCHG(a,b,t,xchg): b always receives the old a; a receives the old b only when xchg is non-zero. Expands to several statements, so it must stay inside braces. */
  115.99 +    #define XCHG(a,b,t,xchg)\
 115.100 +    t= a;\
 115.101 +    if(xchg)\
 115.102 +        a= b;\
 115.103 +    b= t;
 115.104 +
 115.105 +    if(deblock_left){
 115.106 +        for(i = !deblock_top; i<16; i++){ /* entry 0 is the above-left corner: skipped when there is no row above */
 115.107 +            XCHG(m->left_border[i], src_y [i*  linesize], temp8, xchg);
 115.108 +        }
 115.109 +        XCHG(m->left_border[i], src_y [i*  linesize], temp8, 1); /* i == 16 here: the last luma entry is always swapped */
 115.110 +
 115.111 +        for(i = !deblock_top; i<8; i++){
 115.112 +            XCHG(m->left_border[17  +i], src_cb[i*uvlinesize], temp8, xchg);
 115.113 +            XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, xchg);
 115.114 +        }
 115.115 +        XCHG(m->left_border[17  +i], src_cb[i*uvlinesize], temp8, 1); /* i == 8 here: the last chroma entries are always swapped */
 115.116 +        XCHG(m->left_border[17+9+i], src_cr[i*uvlinesize], temp8, 1);
 115.117 +    }
 115.118 +
 115.119 +    if(deblock_top){
 115.120 +        XCHG(*(uint64_t*)(top_border_y1)  , *(uint64_t*)(src_y +1), temp64, xchg); /* row above, luma bytes 0..7 */
 115.121 +        XCHG(*(uint64_t*)(top_border_y2)  , *(uint64_t*)(src_y +9), temp64, 1); /* row above, luma bytes 8..15 */
 115.122 +        XCHG(*(uint64_t*)(top_border_next), *(uint64_t*)(src_y +17), temp64, 1); /* 8 bytes above the MB to the right; NOTE(review): unlike the non-OMPSS variant there is no mb_x+1 < mb_width guard here */
 115.123 +
 115.124 +        XCHG(*(uint64_t*)(top_border_cb)  , *(uint64_t*)(src_cb+1), temp64, 1);
 115.125 +        XCHG(*(uint64_t*)(top_border_cr)  , *(uint64_t*)(src_cr+1), temp64, 1);
 115.126 +    }
 115.127 +}
 115.128 +#else
 115.129 +
 115.130 +static void backup_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){ /* non-OMPSS build: save this MB's right-most pixel column in d->left and its bottom pixel row in d->top[mb_x]. */
 115.131 +    int i;
 115.132 +    uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y;
 115.133 +    uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb;
 115.134 +    uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr;
 115.135 +
 115.136 +    uint8_t* left_border_y = d->left.unfiltered_y;
 115.137 +    uint8_t* left_border_cb = d->left.unfiltered_cb;
 115.138 +    uint8_t* left_border_cr = d->left.unfiltered_cr;
 115.139 +    /* Step back one row: from here on offset i*linesize addresses row i-1 of the macroblock. */
 115.140 +    src_y  -=   linesize;
 115.141 +    src_cb -= uvlinesize;
 115.142 +    src_cr -= uvlinesize;
 115.143 +
 115.144 +    // NOTE(review): the upstream comment here spoke of two saved lines for a macroblock pair;
 115.145 +    // this function stores a single row (the bottom row of this macroblock) per plane.
 115.146 +    left_border_y[0] = top_border_y[15]; /* corner byte, read before the top border is overwritten */
 115.147 +    for(i=1; i<17; i++){
 115.148 +        left_border_y[i] = src_y[15+i*  linesize]; /* luma column 15, rows 0..15 */
 115.149 +    }
 115.150 +    *(uint64_t*)(top_border_y   )   = *(uint64_t*)(src_y +  16*linesize); /* bottom luma row, bytes 0..7 */
 115.151 +    *(uint64_t*)(top_border_y +8)   = *(uint64_t*)(src_y +8+16*linesize); /* bottom luma row, bytes 8..15 */
 115.152 +
 115.153 +    left_border_cb[0] = top_border_cb[7];
 115.154 +    left_border_cr[0] = top_border_cr[7];
 115.155 +    for(i=1; i<9; i++){
 115.156 +        left_border_cb[i] = src_cb[7+i*uvlinesize]; /* chroma column 7, rows 0..7 */
 115.157 +        left_border_cr[i] = src_cr[7+i*uvlinesize];
 115.158 +    }
 115.159 +    *(uint64_t*)(top_border_cb)= *(uint64_t*)(src_cb+8*uvlinesize); /* bottom Cb row */
 115.160 +    *(uint64_t*)(top_border_cr)= *(uint64_t*)(src_cr+8*uvlinesize); /* bottom Cr row */
 115.161 +}
 115.162 +
 115.163 +static void xchg_mb_border(MBRecContext *d, H264Mb *m, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
 115.164 +    /* non-OMPSS build: exchange the border bytes kept in d->left / d->top[] with the picture pixels directly left of and above this MB. */
 115.165 +    int temp8, i;
 115.166 +    uint64_t temp64;
 115.167 +    int deblock_left;
 115.168 +    int deblock_top;
 115.169 +
 115.170 +    uint8_t* top_border_y = d->top[m->mb_x].unfiltered_y;
 115.171 +    uint8_t* top_border_cb = d->top[m->mb_x].unfiltered_cb;
 115.172 +    uint8_t* top_border_cr = d->top[m->mb_x].unfiltered_cr;
 115.173 +    uint8_t* top_border_y_next = d->top[m->mb_x +1].unfiltered_y; /* only dereferenced below when mb_x+1 < d->mb_width */
 115.174 +
 115.175 +    uint8_t* left_border_y = d->left.unfiltered_y;
 115.176 +    uint8_t* left_border_cb = d->left.unfiltered_cb;
 115.177 +    uint8_t* left_border_cr = d->left.unfiltered_cr;
 115.178 +
 115.179 +    deblock_left = (m->mb_x > 0); /* there is a macroblock column to the left */
 115.180 +    deblock_top =  (m->mb_y > 0); /* there is a macroblock row above */
 115.181 +    /* Move to the pixel above-left of the MB: offset i*linesize then walks the left neighbour column, offsets +1.. walk the row above. */
 115.182 +    src_y  -= (  linesize + 1);
 115.183 +    src_cb -= (uvlinesize + 1);
 115.184 +    src_cr -= (uvlinesize + 1);
 115.185 +    /* XCHG(a,b,t,xchg): b always receives the old a; a receives the old b only when xchg is non-zero. Expands to several statements, so it must stay inside braces. */
 115.186 +    #define XCHG(a,b,t,xchg)\
 115.187 +    t= a;\
 115.188 +    if(xchg)\
 115.189 +        a= b;\
 115.190 +    b= t;
 115.191 +
 115.192 +    if(deblock_left){
 115.193 +        for(i = !deblock_top; i<16; i++){ /* entry 0 is the above-left corner: skipped when there is no row above */
 115.194 +            XCHG(left_border_y[i], src_y [i*  linesize], temp8, xchg);
 115.195 +        }
 115.196 +        XCHG(left_border_y[i], src_y [i*  linesize], temp8, 1); /* i == 16 here: the last luma entry is always swapped */
 115.197 +
 115.198 +        for(i = !deblock_top; i<8; i++){
 115.199 +            XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, xchg);
 115.200 +            XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, xchg);
 115.201 +        }
 115.202 +        XCHG(left_border_cb[i], src_cb[i*uvlinesize], temp8, 1); /* i == 8 here: the last chroma entries are always swapped */
 115.203 +        XCHG(left_border_cr[i], src_cr[i*uvlinesize], temp8, 1);
 115.204 +    }
 115.205 +
 115.206 +    if(deblock_top){
 115.207 +        XCHG(*(uint64_t*)(top_border_y+0), *(uint64_t*)(src_y +1), temp64, xchg); /* row above, luma bytes 0..7 */
 115.208 +        XCHG(*(uint64_t*)(top_border_y+8), *(uint64_t*)(src_y +9), temp64, 1); /* row above, luma bytes 8..15 */
 115.209 +        if(m->mb_x+1 < d->mb_width){ /* the top-right neighbour exists only left of the last column */
 115.210 +            XCHG(*(uint64_t*)(top_border_y_next), *(uint64_t*)(src_y +17), temp64, 1);
 115.211 +        }
 115.212 +        XCHG(*(uint64_t*)(top_border_cb), *(uint64_t*)(src_cb+1), temp64, 1);
 115.213 +        XCHG(*(uint64_t*)(top_border_cr), *(uint64_t*)(src_cr+1), temp64, 1);
 115.214 +    }
 115.215 +}
 115.216 +
 115.217 +#endif
 115.218 +
 115.219 +void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m){ /* Reconstruct macroblock m into s->curr_pic: prediction, residual add, border backup, then ff_h264_filter_mb(). */
 115.220 +    int i;
 115.221 +    const int mb_x= m->mb_x;
 115.222 +    const int mb_y= m->mb_y;
 115.223 +    int *block_offset = d->block_offset; /* per-block pixel offsets filled in by get_mbrec_context() */
 115.224 +
 115.225 +    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
 115.226 +    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
 115.227 +
 115.228 +    int linesize   = d->linesize;
 115.229 +    int uvlinesize = d->uvlinesize;
 115.230 +    /* Top-left pixel of this MB in each plane: 16x16 luma, 8x8 per chroma plane. */
 115.231 +    uint8_t *dest_y  = s->curr_pic->data[0] + (mb_x + mb_y * linesize  ) * 16;
 115.232 +    uint8_t *dest_cb = s->curr_pic->data[1] + (mb_x + mb_y * uvlinesize) * 8;
 115.233 +    uint8_t *dest_cr = s->curr_pic->data[2] + (mb_x + mb_y * uvlinesize) * 8;
 115.234 +
 115.235 +    pred_motion_mb_rec (d, mrs, s, m);
 115.236 +
 115.237 +    const int mb_type= m->mb_type;
 115.238 +
 115.239 +    d->dsp.prefetch(dest_y + (m->mb_x&3)*4*linesize + 64, d->linesize, 4);
 115.240 +    d->dsp.prefetch(dest_cb + (m->mb_x&7)*uvlinesize + 64, dest_cr - dest_cb, 2);
 115.241 +
 115.242 +    if(IS_INTRA(mb_type)){
 115.243 +#if OMPSS
 115.244 +        xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
 115.245 +#else
 115.246 +        xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
 115.247 +#endif
 115.248 +        /* The saved border bytes are now in the picture for the duration of intra prediction; they are exchanged back after the prediction below. */
 115.249 +        d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cb, uvlinesize);
 115.250 +        d->hpc.pred8x8[ m->chroma_pred_mode ](dest_cr, uvlinesize);
 115.251 +
 115.252 +        if(IS_INTRA4x4(mb_type)){
 115.253 +            if(IS_8x8DCT(mb_type)){
 115.254 +                idct_dc_add = d->hdsp.h264_idct8_dc_add;
 115.255 +                idct_add    = d->hdsp.h264_idct8_add;
 115.256 +                /* Four 8x8 luma blocks (i = 0, 4, 8, 12): predict, then add the residual right away. */
 115.257 +                for(i=0; i<16; i+=4){
 115.258 +                    uint8_t * const ptr= dest_y + block_offset[i];
 115.259 +                    const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ];
 115.260 +
 115.261 +                    const int nnz = mrs->non_zero_count_cache[ scan8[i] ];
 115.262 +                    d->hpc.pred8x8l[ dir ](ptr, (mrs->topleft_samples_available<<i)&0x8000,
 115.263 +                                                (mrs->topright_samples_available<<i)&0x4000, linesize);
 115.264 +                    if(nnz){
 115.265 +                        if(nnz == 1 && m->mb[i*16]) /* a single coefficient and the first one is non-zero: DC-only add */
 115.266 +                            idct_dc_add(ptr, m->mb + i*16, linesize);
 115.267 +                        else
 115.268 +                            idct_add   (ptr, m->mb + i*16, linesize);
 115.269 +                    }
 115.270 +                }
 115.271 +            }else{
 115.272 +                idct_dc_add = d->hdsp.h264_idct_dc_add;
 115.273 +                idct_add    = d->hdsp.h264_idct_add;
 115.274 +                /* Sixteen 4x4 luma blocks: predict, then add the residual right away. */
 115.275 +                for(i=0; i<16; i++){
 115.276 +                    uint8_t * const ptr= dest_y + block_offset[i];
 115.277 +                    const int dir= mrs->intra4x4_pred_mode_cache[ scan8[i] ];
 115.278 +                    uint8_t *topright;
 115.279 +                    int nnz, tr;
 115.280 +                    if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ /* only these two modes are given a top-right pointer */
 115.281 +                        const int topright_avail= (mrs->topright_samples_available<<i)&0x8000;
 115.282 +                        assert(mb_y || linesize <= block_offset[i]);
 115.283 +                        if(!topright_avail){
 115.284 +                            tr= ptr[3 - linesize]*0x01010101; /* replicate the last pixel of the row above into 4 bytes */
 115.285 +                            topright= (uint8_t*) &tr;
 115.286 +                        }else
 115.287 +                            topright= ptr + 4 - linesize;
 115.288 +                    }else
 115.289 +                        topright= NULL;
 115.290 +
 115.291 +                    d->hpc.pred4x4[ dir ](ptr, topright, linesize);
 115.292 +                    nnz = mrs->non_zero_count_cache[ scan8[i] ];
 115.293 +                    if(nnz){
 115.294 +                        if(nnz == 1 && m->mb[i*16]) /* DC-only add, as in the 8x8 path */
 115.295 +                            idct_dc_add(ptr, m->mb + i*16, linesize);
 115.296 +                        else
 115.297 +                            idct_add   (ptr, m->mb + i*16, linesize);
 115.298 +                    }
 115.299 +                }
 115.300 +            }
 115.301 +        }else{
 115.302 +            d->hpc.pred16x16[ m->intra16x16_pred_mode ](dest_y , linesize); /* residual is added further down */
 115.303 +        }
 115.304 +#if OMPSS
 115.305 +        xchg_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
 115.306 +#else
 115.307 +        xchg_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
 115.308 +#endif
 115.309 +    }else {
 115.310 +        hl_motion(d, mrs, s, m, dest_y, dest_cb, dest_cr,
 115.311 +                    d->hdsp.qpel_put, d->dsp.put_h264_chroma_pixels_tab,
 115.312 +                    d->hdsp.qpel_avg, d->dsp.avg_h264_chroma_pixels_tab,
 115.313 +                    d->hdsp.weight_h264_pixels_tab, d->hdsp.biweight_h264_pixels_tab);
 115.314 +    }
 115.315 +    /* Luma residual for everything except intra 4x4, which was handled block by block above. */
 115.316 +    if(!IS_INTRA4x4(mb_type)){
 115.317 +
 115.318 +        if(IS_INTRA16x16(mb_type)){
 115.319 +
 115.320 +            d->hdsp.h264_idct_add16intra(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
 115.321 +
 115.322 +        }else if(m->cbp&15){ /* any of the four luma cbp bits set */
 115.323 +
 115.324 +            if(IS_8x8DCT(mb_type)){
 115.325 +                d->hdsp.h264_idct8_add4(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
 115.326 +            }else{
 115.327 +                d->hdsp.h264_idct_add16(dest_y, block_offset, m->mb, linesize, mrs->non_zero_count_cache);
 115.328 +            }
 115.329 +        }
 115.330 +    }
 115.331 +    /* Chroma residual: blocks 16..19 go to Cb, 20..23 to Cr (selected by bit 2 of i). */
 115.332 +    if(m->cbp&0x30){
 115.333 +        uint8_t *dest[2] = {dest_cb, dest_cr};
 115.334 +
 115.335 +        idct_add = d->hdsp.h264_idct_add;
 115.336 +        idct_dc_add = d->hdsp.h264_idct_dc_add;
 115.337 +        for(i=16; i<16+8; i++){
 115.338 +            if(mrs->non_zero_count_cache[ scan8[i] ])
 115.339 +                idct_add   (dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize);
 115.340 +            else if(m->mb[i*16]) /* count is zero but the first coefficient is set: DC-only add */
 115.341 +                idct_dc_add(dest[(i&4)>>2] + block_offset[i], m->mb + i*16, uvlinesize);
 115.342 +        }
 115.343 +    }
 115.344 +    /* Save this MB's border pixels before ff_h264_filter_mb() runs, and hand them to the macroblocks that will need them. */
 115.345 +#if OMPSS
 115.346 +    backup_mb_border(m, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
 115.347 +    if (mb_x+1 <d->mb_width){
 115.348 +        H264Mb *mr = m+1; /* right neighbour; assumes the H264Mb entries of a row are contiguous */
 115.349 +        memcpy(mr->left_border, m->left_border, sizeof(m->left_border));
 115.350 +    }
 115.351 +    if (mb_y +1 <d->mb_height){
 115.352 +        H264Mb *md = m + d->mb_width; /* macroblock directly below */
 115.353 +        memcpy(md->top_border, m->top_border, sizeof(m->top_border));
 115.354 +        if (mb_x>0){
 115.355 +            H264Mb *mdl = m + d->mb_width -1; /* below-left macroblock: this MB is its top-right neighbour */
 115.356 +            memcpy(mdl->top_border_next, m->top_border_next, sizeof(m->top_border_next));
 115.357 +        }
 115.358 +    }
 115.359 +#else
 115.360 +    backup_mb_border(d, m, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
 115.361 +    if (mb_y +1 <d->mb_height && d->top_next != d->top){ /* copy only when a separate top_next array is in use */
 115.362 +        memcpy(&d->top_next[mb_x],&d->top[mb_x], sizeof(TopBorder));
 115.363 +    }
 115.364 +#endif
 115.365 +
 115.366 +    ff_h264_filter_mb(d, mrs, s, m, dest_y, dest_cb, dest_cr);
 115.367 +}
 115.368 +
 115.369 +/**
 115.370 + * Release a context obtained from get_mbrec_context().
 115.371 + *
 115.372 + * Accepts NULL and contexts whose buffers were never allocated (their
 115.373 + * pointers are still NULL because the context itself is zero-initialised).
 115.374 + * NOTE(review): relies on av_free(NULL) being a no-op, as in upstream
 115.375 + * libavutil - confirm for this tree.
 115.376 + */
 115.377 +void free_mbrec_context(MBRecContext *d){
 115.378 +    if (!d)
 115.379 +        return;
 115.380 +#if !OMPSS
 115.381 +    av_free(d->mrs);
 115.382 +#endif
 115.383 +    av_free(d->scratchpad_y);
 115.384 +    av_free(d->scratchpad_cb);
 115.385 +    av_free(d->scratchpad_cr);
 115.386 +    av_free(d);
 115.387 +}
 115.388 +
 115.389 +/**
 115.390 + * Allocate a macroblock reconstruction context for the stream described by h.
 115.391 + *
 115.392 + * Initialises the DSP / prediction function tables, copies the picture
 115.393 + * geometry from h, allocates the scratchpads (and, outside OMPSS builds, the
 115.394 + * MBRecState) and precomputes block_offset[] for the luma and chroma blocks.
 115.395 + *
 115.396 + * @return the new context, to be released with free_mbrec_context(), or NULL
 115.397 + *         if any allocation fails
 115.398 + */
 115.399 +MBRecContext *get_mbrec_context(H264Context *h){
 115.400 +    MBRecContext *d = av_mallocz(sizeof(MBRecContext));
 115.401 +
 115.402 +    if (!d)
 115.403 +        return NULL;
 115.404 +
 115.405 +    ff_h264dsp_init(&d->hdsp);
 115.406 +    ff_h264_pred_init(&d->hpc);
 115.407 +    dsputil_init(&d->dsp);
 115.408 +
 115.409 +#if !OMPSS
 115.410 +    d->mrs = av_mallocz(sizeof(MBRecState));
 115.411 +    if (!d->mrs)
 115.412 +        goto fail;
 115.413 +#endif
 115.414 +    d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab;
 115.415 +    d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab;
 115.416 +    d->mb_height = h->mb_height;
 115.417 +    d->mb_width  = h->mb_width;
 115.418 +    d->mb_stride  = h->mb_stride;
 115.419 +    d->b_stride  = h->b_stride;
 115.420 +    d->height = h->height;
 115.421 +    d->width  = h->width;
 115.422 +    d->linesize = h->width + EDGE_WIDTH*2;
 115.423 +    d->uvlinesize = d->linesize>>1;
 115.424 +
 115.425 +    /* one macroblock row of luma (16 lines) and of each chroma plane (8 lines) */
 115.426 +    d->scratchpad_y = av_malloc(d->linesize*16*sizeof(uint8_t));
 115.427 +    d->scratchpad_cb= av_malloc(d->uvlinesize*8*sizeof(uint8_t));
 115.428 +    d->scratchpad_cr= av_malloc(d->uvlinesize*8*sizeof(uint8_t));
 115.429 +    if (!d->scratchpad_y || !d->scratchpad_cb || !d->scratchpad_cr)
 115.430 +        goto fail;
 115.431 +
 115.432 +    /* pixel offset of each 4x4 block inside the macroblock, derived from scan8 */
 115.433 +    for (int i=0; i<16; i++){
 115.434 +        d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3);
 115.435 +    }
 115.436 +    for (int i=0; i<4; i++){
 115.437 +        d->block_offset[16+i]=
 115.438 +        d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3);
 115.439 +    }
 115.440 +
 115.441 +    return d;
 115.442 +
 115.443 +fail:
 115.444 +    free_mbrec_context(d);
 115.445 +    return NULL;
 115.446 +}

   116.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   116.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_rec.h	Mon Aug 27 12:09:56 2012 +0200
   116.3 @@ -0,0 +1,12 @@
   116.4 +#ifndef H264_REC_H
   116.5 +#define H264_REC_H
   116.6 +
   116.7 +#include "h264_types.h"
   116.8 +/* Macroblock reconstruction context: allocation, release and per-macroblock decoding. */
   116.9 +MBRecContext *get_mbrec_context(H264Context *h); /* allocates a context and copies the picture geometry from h */
  116.10 +void free_mbrec_context( MBRecContext *d); /* releases d together with its scratchpads */
  116.11 +void h264_decode_mb_internal(MBRecContext *d, MBRecState *mrs, H264Slice *s, H264Mb *m); /* reconstructs macroblock m into the current picture of s */
  116.12 +
  116.13 +void init_mbrec_context(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, int line); /* presumably prepares mrs for macroblock row `line` of slice s - confirm against the definition */
  116.14 +
  116.15 +#endif /* H264_REC_H */

   117.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   117.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_refs.c	Mon Aug 27 12:09:56 2012 +0200
   117.3 @@ -0,0 +1,461 @@
   117.4 +/*
   117.5 + * H.26L/H.264/AVC/JVT/14496-10/... reference picture handling
   117.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   117.7 + *
   117.8 + * This file is part of FFmpeg.
   117.9 + *
  117.10 + * FFmpeg is free software; you can redistribute it and/or
  117.11 + * modify it under the terms of the GNU Lesser General Public
  117.12 + * License as published by the Free Software Foundation; either
  117.13 + * version 2.1 of the License, or (at your option) any later version.
  117.14 + *
  117.15 + * FFmpeg is distributed in the hope that it will be useful,
  117.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  117.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  117.18 + * Lesser General Public License for more details.
  117.19 + *
  117.20 + * You should have received a copy of the GNU Lesser General Public
  117.21 + * License along with FFmpeg; if not, write to the Free Software
  117.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  117.23 + */
  117.24 +
  117.25 +/**
  117.26 + * @file
  117.27 + * H.264 / AVC / MPEG4 part10  reference picture handling.
  117.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  117.29 + */
  117.30 +
  117.31 +#include "dsputil.h"
  117.32 +#include "h264_types.h"
  117.33 +#include "golomb.h"
  117.34 +
  117.35 +//#undef NDEBUG
  117.36 +#include <assert.h>
  117.37 +
  117.38 +/**
  117.39 + * Append every usable picture of in[0..len) to def, keeping the input order.
  117.40 + * A slot is usable when it is non-NULL and its reference field is non-zero.
  117.41 + * Each appended picture gets pic_id refreshed: the slot index when is_long
  117.42 + * is set, the picture's frame_num otherwise.
  117.43 + *
  117.44 + * @return number of pictures written to def
  117.45 + */
  117.46 +static int build_def_list(PictureInfo **def, PictureInfo **in, int len, int is_long){
  117.47 +    int count = 0;
  117.48 +
  117.49 +    for(int pos = 0; pos < len; pos++){
  117.50 +        PictureInfo *pic = in[pos];
  117.51 +
  117.52 +        if(!pic || !pic->reference)
  117.53 +            continue;
  117.54 +        pic->pic_id = is_long ? pos : pic->frame_num;
  117.55 +        def[count++] = pic;
  117.56 +    }
  117.57 +    return count;
  117.58 +}
  117.59 +
  117.60 +/* Collect the pictures of src lying on one side of limit into sorted, ordered by POC moving away from limit (dir=0: POC above limit, ascending; dir=1: POC not above limit, descending); returns how many were stored. */
  117.61 +static int add_sorted(PictureInfo **sorted, PictureInfo **src, int len, int limit, int dir){
  117.62 +    const int none = dir ? INT_MIN : INT_MAX; /* value of best when no candidate was found */
  117.63 +    int count = 0;
  117.64 +    for(;;){
  117.65 +        int best = none;
  117.66 +        for(int k = 0; k < len; k++){
  117.67 +            const int poc = src[k]->poc;
  117.68 +            if(((poc > limit) ^ dir) && ((poc < best) ^ dir)){
  117.69 +                best = poc;
  117.70 +                sorted[count] = src[k];
  117.71 +            }
  117.72 +        }
  117.73 +        if(best == none)
  117.74 +            break;
  117.75 +        limit = best - dir; /* best is the POC of the picture just stored in sorted[count] */
  117.76 +        count++;
  117.77 +    }
  117.78 +    return count;
  117.79 +}
  117.80 +
  117.81 +int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s){
  117.82 +    int i,len;
  117.83 +    /* Builds the default s->ref_list[] from n->short_ref and n->long_ref; slots from the list length up to ref_count are set to NULL. Always returns 0. */
  117.84 +    if(s->slice_type_nos==FF_B_TYPE){
  117.85 +        PictureInfo *sorted[32];
  117.86 +        int cur_poc, list;
  117.87 +        int lens[2];
  117.88 +
  117.89 +        cur_poc= s->poc;
  117.90 +        /* list 0: short refs with POC <= current (descending), then POC > current (ascending); list 1: the two groups in the opposite order. Long refs follow. */
  117.91 +        for(list= 0; list<2; list++){
  117.92 +            len= add_sorted(sorted, n->short_ref, n->short_ref_count, cur_poc, !list);
  117.93 +            len+=add_sorted(sorted+len, n->short_ref, n->short_ref_count, cur_poc, list);
  117.94 +            assert(len<=32);
  117.95 +            len= build_def_list(s->ref_list[list], sorted, len, 0);
  117.96 +            len+=build_def_list(s->ref_list[list] +len, n->long_ref, 16 , 1);
  117.97 +            assert(len<=32);
  117.98 +
  117.99 +            for(int k=len; k<s->ref_count[list]; k++)
 117.100 +                s->ref_list[list][k] = NULL;
 117.101 +
 117.102 +            lens[list]= len;
 117.103 +        }
 117.104 +        /* If both lists hold the same POC sequence (and more than one entry), swap the first two entries of list 1. */
 117.105 +        if(lens[0] == lens[1] && lens[1] > 1){
 117.106 +            /* The bound is tested before the entries are dereferenced: slot lens[0] was set to NULL above or lies past the list. */
 117.107 +            for(i=0; i<lens[0] && s->ref_list[0][i]->poc == s->ref_list[1][i]->poc; i++);
 117.108 +            if(i == lens[0])
 117.109 +                FFSWAP(PictureInfo *, s->ref_list[1][0], s->ref_list[1][1]);
 117.110 +        }
 117.111 +    }else{
 117.112 +        len = build_def_list(s->ref_list[0], n->short_ref, n->short_ref_count, 0);
 117.113 +        len+= build_def_list(s->ref_list[0] +len, n->long_ref, 16, 1);
 117.114 +        assert(len <= 32);
 117.115 +        for(i=len; i<s->ref_count[0]; i++)
 117.116 +            s->ref_list[0][i] = NULL;
 117.117 +    }
 117.118 +
 117.119 +    return 0;
 117.120 +}
 117.121 +
 117.122 +/**
 117.123 + * Log every entry of the short-term reference list of n at debug level.
 117.124 + */
 117.125 +static void print_short_term(NalContext *n) {
 117.126 +    av_log(AV_LOG_DEBUG, "short term list:\n");
 117.127 +    for(int idx=0; idx<n->short_ref_count; idx++){
 117.128 +        const PictureInfo *entry= n->short_ref[idx];
 117.129 +        av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d ref:%d \n", idx, entry->frame_num, entry->poc, entry->reference);
 117.130 +    }
 117.131 +}
 117.132 +
 117.133 +/**
 117.134 + * Log every occupied slot among the first 16 of n->long_ref at debug level.
 117.135 + */
 117.136 +static void print_long_term(NalContext *n) {
 117.137 +    uint32_t slot;
 117.138 +
 117.139 +    av_log(AV_LOG_DEBUG, "long term list:\n");
 117.140 +    for(slot = 0; slot < 16; slot++){
 117.141 +        const PictureInfo *entry= n->long_ref[slot];
 117.142 +        if (!entry)
 117.143 +            continue;
 117.144 +        av_log(AV_LOG_DEBUG, "%d fn:%d poc:%d\n", slot, entry->frame_num, entry->poc);
 117.145 +    }
 117.146 +}
 117.147 +
 117.148 +/**
 117.149 + * Parse ref_pic_list_reordering() from gb and apply it to s->ref_list.
 117.150 + *
 117.151 + * For each of the s->list_count lists whose reordering flag is set, every
 117.152 + * command picks a short-term picture (idc 0/1: frame_num minus/plus a coded
 117.153 + * difference) or a long-term picture (idc 2: coded index), moves it to the
 117.154 + * current list position and shifts the entries in between down by one.
 117.155 + * idc 3 ends the command sequence of a list.
 117.156 + *
 117.157 + * @return 0 on success, -1 on illegal syntax or when the selected picture
 117.158 + *         is not available as a reference
 117.159 + */
 117.160 +int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb){
 117.161 +    int list, index;
 117.162 +
 117.163 +    print_short_term(n);
 117.164 +    print_long_term(n);
 117.165 +
 117.166 +    for(list=0; list<s->list_count; list++){
 117.167 +
 117.168 +        if(get_bits1(gb)){
 117.169 +            int frame_num = n->frame_num;
 117.170 +            unsigned int abs_diff_pic_num;
 117.171 +            for(index=0; ; index++){
 117.172 +                unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(gb);
 117.173 +                int i;
 117.174 +                PictureInfo *ref = NULL;
 117.175 +
 117.176 +                if(reordering_of_pic_nums_idc==3){
 117.177 +                    break;
 117.178 +                }
 117.179 +                if(index >= s->ref_count[list]){
 117.180 +                    av_log(AV_LOG_ERROR, "reference count overflow\n");
 117.181 +                    return -1;
 117.182 +                }
 117.183 +
 117.184 +                if (reordering_of_pic_nums_idc>2){
 117.185 +                    av_log(AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
 117.186 +                    return -1;
 117.187 +                }
 117.188 +
 117.189 +                if (reordering_of_pic_nums_idc<2){
 117.190 +                    abs_diff_pic_num= get_ue_golomb(gb) + 1;
 117.191 +                    if(abs_diff_pic_num > (unsigned) n->max_pic_num){
 117.192 +                        av_log(AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
 117.193 +                        return -1;
 117.194 +                    }
 117.195 +
 117.196 +                    if(reordering_of_pic_nums_idc == 0)
 117.197 +                        frame_num-= abs_diff_pic_num;
 117.198 +                    else
 117.199 +                        frame_num+= abs_diff_pic_num;
 117.200 +                    frame_num &= n->max_pic_num - 1;
 117.201 +
 117.202 +                    /* ref is only set for a matching picture that is still marked
 117.203 +                       as a reference, so an empty or non-matching short-term list
 117.204 +                       leaves it NULL and no unrelated picture has pic_id modified. */
 117.205 +                    for(i= 0 ; i<n->short_ref_count; i++){
 117.206 +                        PictureInfo *cand = n->short_ref[i];
 117.207 +                        if(cand->frame_num == frame_num && cand->reference){
 117.208 +                            ref = cand;
 117.209 +                            break;
 117.210 +                        }
 117.211 +                    }
 117.212 +                    if(ref)
 117.213 +                        ref->pic_id= frame_num;
 117.214 +                }else{
 117.215 +                    int long_idx;
 117.216 +                    long_idx= get_ue_golomb(gb); //long_term_pic_idx
 117.217 +
 117.218 +                    /* Every other user of n->long_ref in this file treats it as a
 117.219 +                       16-slot table, so larger (or negative) indices are rejected. */
 117.220 +                    if(long_idx<0 || long_idx>=16){
 117.221 +                        av_log(AV_LOG_ERROR, "long_term_pic_idx overflow\n");
 117.222 +                        return -1;
 117.223 +                    }
 117.224 +                    ref = n->long_ref[long_idx];
 117.225 +                    if(ref && ref->reference){
 117.226 +                        ref->pic_id= long_idx;
 117.227 +                        assert(ref->long_ref);
 117.228 +                    }else{
 117.229 +                        ref = NULL; /* empty slot or picture no longer used for reference */
 117.230 +                    }
 117.231 +                }
 117.232 +
 117.233 +                if (!ref) {
 117.234 +                    av_log(AV_LOG_ERROR, "reference picture missing during reorder\n");
 117.235 +                    return -1;
 117.236 +                }
 117.237 +
 117.238 +                /* Find where the picture currently sits (or stop at the last slot),
 117.239 +                   shift the entries before it down by one and put it at index. */
 117.240 +                for(i=index; i+1 <s->ref_count[list]; i++){
 117.241 +                    PictureInfo *cur = s->ref_list[list][i];
 117.242 +                    if(cur && ref->long_ref == cur->long_ref && ref->pic_id == cur->pic_id)
 117.243 +                        break;
 117.244 +                }
 117.245 +                for(; i > index; i--){
 117.246 +                    s->ref_list[list][i]= s->ref_list[list][i-1];
 117.247 +                }
 117.248 +                s->ref_list[list][index]= ref;
 117.249 +            }
 117.250 +        }
 117.251 +    }
 117.252 +
 117.253 +    return 0;
 117.254 +}
 117.255 +
 117.256 +/* Return the first short-term reference of n whose frame_num matches, or NULL if there is none. */
 117.257 +static PictureInfo *find_short(NalContext *n, int frame_num){
 117.258 +    PictureInfo *hit = NULL;
 117.259 +    for(int k = 0; !hit && k < n->short_ref_count; k++){
 117.260 +        if(n->short_ref[k]->frame_num == frame_num)
 117.261 +            hit = n->short_ref[k];
 117.262 +    }
 117.263 +    return hit;
 117.264 +}
 117.265 +
 117.266 +/* Remove the first short-term reference with the given frame_num from n; when release is set, also record its cpn in s->release_ref_cpn and clear bit 1 of its reference field. Returns 0, or -1 if no entry matched. */
 117.267 +static int remove_short(NalContext *n, H264Slice *s, int frame_num, int release){
 117.268 +    int pos = 0;
 117.269 +    while(pos < n->short_ref_count && n->short_ref[pos]->frame_num != frame_num)
 117.270 +        pos++;
 117.271 +    if(pos >= n->short_ref_count)
 117.272 +        return -1;
 117.273 +    if(release){
 117.274 +        s->release_ref_cpn[s->release_cnt++] = n->short_ref[pos]->cpn;
 117.275 +        n->short_ref[pos]->reference &= ~2;
 117.276 +    }
 117.277 +    n->short_ref[pos] = NULL;
 117.278 +    n->short_ref_count--;
 117.279 +    if(n->short_ref_count) /* close the gap; moves nothing when the last entry was removed */
 117.280 +        memmove(&n->short_ref[pos], &n->short_ref[pos+1], (n->short_ref_count - pos)*sizeof(PictureInfo *));
 117.281 +    return 0;
 117.282 +}
 117.283 +
 117.284 +/* Empty long-term slot i of n: record the picture's cpn in s->release_ref_cpn, clear its long_ref flag and bit 1 of its reference field. Does nothing for an empty slot. */
 117.285 +static void remove_long(NalContext *n, H264Slice *s, int i){
 117.286 +    PictureInfo *pic = n->long_ref[i];
 117.287 +    if(!pic) return;
 117.288 +    s->release_ref_cpn[s->release_cnt++] = pic->cpn;
 117.289 +    pic->reference &= ~2;
 117.290 +    pic->long_ref = 0;
 117.291 +    n->long_ref_count--;
 117.292 +    n->long_ref[i] = NULL;
 117.293 +}
 117.294 +
 117.295 +/* Drop every short-term and long-term reference of n, recording each released picture in s. */
 117.296 +void ff_h264_remove_all_refs(NalContext *n, H264Slice *s){
 117.297 +    PictureInfo *head;
 117.298 +
 117.299 +    /* the loop runs until short_ref[0] is NULL, i.e. it relies on remove_short() leaving slot 0 NULL once the list is empty */
 117.300 +    while((head = n->short_ref[0]) != NULL)
 117.301 +        remove_short(n, s, head->frame_num, 1);
 117.302 +    for(int slot=0; slot<16; slot++)
 117.303 +        remove_long(n, s, slot);
 117.304 +    assert(n->short_ref_count==0);
 117.305 +    assert(n->long_ref_count==0);
 117.306 +}
 117.307 +
 117.308 +/**
 117.309 + * Parse dec_ref_pic_marking() from gb, update the reference lists of n and
 117.310 + * put the current picture of s at the head of the short-term list.
 117.311 + * @return 0 on success, -1 on an illegal memory management control operation
 117.312 + */
 117.313 +int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb){
 117.314 +    if(s->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
 117.315 +        get_bits1(gb); //get_bits1(gb) -1; //broken link
 117.316 +        if(get_bits1(gb)){
 117.317 +            av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n");
 117.318 +        }
 117.319 +    }else{
 117.320 +        if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
 117.321 +            int i,j;
 117.322 +            for(i= 0; i<MAX_MMCO_COUNT; i++) {
 117.323 +                PictureInfo *pic;
 117.324 +                int short_pic_num=0;
 117.325 +                unsigned int long_arg=0;
 117.326 +                MMCOOpcode opcode= get_ue_golomb_31(gb);
 117.327 +                if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
 117.328 +                    short_pic_num= (n->frame_num - get_ue_golomb(gb) - 1) & (n->max_pic_num - 1);
 117.329 +                }
 117.330 +                if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
 117.331 +                    long_arg= get_ue_golomb_31(gb);
 117.332 +                    if(long_arg >= 16){
 117.333 +                        av_log(AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
 117.334 +                        return -1;
 117.335 +                    }
 117.336 +                }
 117.337 +                if(opcode > (unsigned)MMCO_LONG){
 117.338 +                    av_log(AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
 117.339 +                    return -1;
 117.340 +                }
 117.341 +                if(opcode == MMCO_END)
 117.342 +                    break;
 117.343 +
 117.344 +                switch (opcode){
 117.345 +                    case MMCO_SHORT2UNUSED:
 117.346 +                        remove_short(n, s, short_pic_num, 1);
 117.347 +                        break;
 117.348 +                    case MMCO_SHORT2LONG: // move the picture from the short-term list into long-term slot long_arg
 117.349 +                        pic = find_short(n, short_pic_num);
 117.350 +                        if (n->long_ref[long_arg] != pic)
 117.351 +                            remove_long(n, s, long_arg);
 117.352 +                        remove_short(n, s, short_pic_num, 0);
 117.353 +                        n->long_ref[long_arg]= pic;
 117.354 +                        if (pic){
 117.355 +                            pic->long_ref=1;
 117.356 +                            n->long_ref_count++;
 117.357 +                        }
 117.358 +                        break;
 117.359 +                    case MMCO_LONG2UNUSED:
 117.360 +                        assert(n->long_ref[long_arg]);
 117.361 +                        remove_long(n, s, long_arg);
 117.362 +                        break;
 117.363 +                    case MMCO_SET_MAX_LONG: // drop every long-term slot from long_arg upwards
 117.364 +                        for(j=long_arg; j<16; j++)
 117.365 +                            remove_long(n, s, j);
 117.366 +                        break;
 117.367 +                    case MMCO_RESET: // drop all references and restart POC / frame_num at 0
 117.368 +                        while(n->short_ref_count)
 117.369 +                            remove_short(n, s, n->short_ref[0]->frame_num, 1);
 117.370 +                        for(j=0; j < 16; j++)
 117.371 +                            remove_long(n, s, j);
 117.372 +                        s->current_picture_info->poc=
 117.373 +                        s->poc =
 117.374 +                        n->poc_lsb=
 117.375 +                        n->poc_msb=
 117.376 +                        n->frame_num=
 117.377 +                        s->current_picture_info->frame_num= 0;
 117.378 +                        break;
 117.379 +                    case MMCO_LONG: // unsupported: reported like the IDR case above, the picture stays short-term
 117.380 +                        av_log(AV_LOG_ERROR, "MMCO_LONG reference management not supported\n");
 117.381 +                    case MMCO_END: // MMCO_LONG falls through; MMCO_END itself never reaches the switch
 117.382 +                    default:
 117.383 +                        break;
 117.384 +                }
 117.385 +            }
 117.386 +        }else{// sliding window ref picture marking
 117.387 +            /* Evict the oldest short-term ref once the window is full; the count guard keeps short_ref[-1] from being touched when ref_frame_count is 0. */
 117.388 +            if(n->short_ref_count && n->short_ref_count >= n->sps.ref_frame_count) {
 117.389 +                s->release_ref_cpn[s->release_cnt++] = n->short_ref[n->short_ref_count - 1]->cpn;
 117.390 +                n->short_ref[n->short_ref_count - 1]->reference &= ~2;
 117.391 +                n->short_ref[ n->short_ref_count - 1 ] =NULL;
 117.392 +                n->short_ref_count--;
 117.393 +            }
 117.394 +        }
 117.395 +    }
 117.396 +
 117.397 +    if(n->short_ref_count) /* NOTE(review): nothing here bounds short_ref_count against the capacity of short_ref - confirm the adaptive path cannot overfill it */
 117.398 +        memmove(&n->short_ref[1], &n->short_ref[0], n->short_ref_count*sizeof(PictureInfo *));
 117.399 +    n->short_ref[0]= s->current_picture_info;
 117.400 +    n->short_ref_count++;
 117.401 +    return 0;
 117.402 +}
 117.403 +
 117.404 +static int get_scale_factor(H264Slice *s, int poc, int poc1, int i){
 117.405 +    /* Scale factor for list-0 reference i; 256 for a zero POC distance or a long-term reference. */
 117.406 +    PictureInfo * const ref0 = s->ref_list[0][i];
 117.407 +    const int poc0 = ref0->poc;
 117.408 +    const int td = av_clip(poc1 - poc0, -128, 127);
 117.409 +    if(td == 0 || ref0->long_ref)
 117.410 +        return 256;
 117.411 +    const int tb = av_clip(poc - poc0, -128, 127);
 117.412 +    const int tx = (16384 + (FFABS(td) >> 1)) / td;
 117.413 +    return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 117.414 +}
 117.415 +
 117.416 +void ff_h264_direct_dist_scale_factor(H264Slice *s){
 117.417 +    /* Store one scale factor per list-0 reference in s->dist_scale_factor[]. */
 117.418 +    const int cur_poc = s->current_picture_info->poc;
 117.419 +    const int col_poc = s->ref_list[1][0]->poc;
 117.420 +    int idx;
 117.421 +    for(idx=0; idx<s->ref_count[0]; idx++)
 117.422 +        s->dist_scale_factor[idx] = get_scale_factor(s, cur_poc, col_poc, idx);
 117.423 +}
 117.424 +
 117.425 +static void fill_colmap(H264Slice *s, int map[2][16], int list){
 117.426 +    /* For each reference that ref_list[1][0] recorded for `list`, store the index
 117.427 +     * of the first list-0 entry of this slice with the same POC in map[list][].
 117.428 +     * One pass is enough: the lookup never reads what it writes. */
 117.429 +    PictureInfo * const ref1 = s->ref_list[1][0];
 117.430 +
 117.431 +    /* bogus; fills in for missing frames */
 117.432 +    memset(map[list], 0, sizeof(map[list]));
 117.433 +
 117.434 +    for(int old_ref=0; old_ref < ref1->ref_count[list]; old_ref++){
 117.435 +        const int poc = ref1->ref_poc[list][old_ref];
 117.436 +
 117.437 +        for(int j=0; j<s->ref_count[0]; j++){
 117.438 +            if(s->ref_list[0][j]->poc == poc){
 117.439 +                map[list][old_ref] = j;
 117.440 +                break;
 117.441 +            }
 117.442 +        }
 117.443 +    }
 117.444 +}
 117.445 +
 117.446 +void ff_h264_direct_ref_list_init(H264Slice *s){
 117.447 +    /* Copy the reference counts and POCs of both lists into the current
 117.448 +     * picture; a missing (NULL) reference is recorded with POC 0. */
 117.449 +    PictureInfo * const pic = s->current_picture_info;
 117.450 +    for(int l=0; l<2; l++){
 117.451 +        pic->ref_count[l] = s->ref_count[l];
 117.452 +        for(int k=0; k<s->ref_count[l]; k++){
 117.453 +            PictureInfo * const ref = s->ref_list[l][k];
 117.454 +            pic->ref_poc[l][k] = ref ? ref->poc : 0;
 117.455 +        }
 117.456 +    }
 117.457 +
 117.458 +    /* The column maps are filled only for B slices without spatial direct prediction. */
 117.459 +    if(s->slice_type_nos == FF_B_TYPE && !s->direct_spatial_mv_pred){
 117.460 +        fill_colmap(s, s->map_col_to_list0, 0);
 117.461 +        fill_colmap(s, s->map_col_to_list0, 1);
 117.462 +    }
 117.463 +}
 117.464 +

   118.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   118.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_refs.h	Mon Aug 27 12:09:56 2012 +0200
   118.3 @@ -0,0 +1,14 @@
   118.4 +#ifndef H264_REFS_H
   118.5 +#define H264_REFS_H
   118.6 +
   118.7 +#include "avcodec.h"
   118.8 +#include "h264_types.h"
   118.9 +
  118.10 +int ff_h264_fill_default_ref_list(NalContext *n, H264Slice *s);
  118.11 +int ff_h264_decode_ref_pic_list_reordering(NalContext *n, H264Slice *s, GetBitContext *gb);
  118.12 +void ff_h264_remove_all_refs(NalContext *n, H264Slice *s);
  118.13 +int ff_h264_ref_pic_marking(NalContext *n, H264Slice *s, GetBitContext *gb); /* returns 0; -1 e.g. on an illegal memory management control operation */
  118.14 +void ff_h264_direct_ref_list_init(H264Slice *s); /* stores ref counts/POCs in the current picture; fills map_col_to_list0 for B slices without spatial direct prediction */
  118.15 +void ff_h264_direct_dist_scale_factor(H264Slice *s); /* fills s->dist_scale_factor[] for the list-0 references */
  118.16 +
  118.17 +#endif /* H264_REFS_H */

   119.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   119.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_sei.c	Mon Aug 27 12:09:56 2012 +0200
   119.3 @@ -0,0 +1,191 @@
   119.4 +/*
   119.5 + * H.26L/H.264/AVC/JVT/14496-10/... sei decoding
   119.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   119.7 + *
   119.8 + * This file is part of FFmpeg.
   119.9 + *
  119.10 + * FFmpeg is free software; you can redistribute it and/or
  119.11 + * modify it under the terms of the GNU Lesser General Public
  119.12 + * License as published by the Free Software Foundation; either
  119.13 + * version 2.1 of the License, or (at your option) any later version.
  119.14 + *
  119.15 + * FFmpeg is distributed in the hope that it will be useful,
  119.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  119.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  119.18 + * Lesser General Public License for more details.
  119.19 + *
  119.20 + * You should have received a copy of the GNU Lesser General Public
  119.21 + * License along with FFmpeg; if not, write to the Free Software
  119.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  119.23 + */
  119.24 +
  119.25 +/**
  119.26 + * @file
  119.27 + * H.264 / AVC / MPEG4 part10 sei decoding.
  119.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  119.29 + */
  119.30 +
  119.31 +#include "avcodec.h"
  119.32 +#include "h264_types.h"
  119.33 +#include "golomb.h"
  119.34 +
  119.35 +//#undef NDEBUG
  119.36 +#include <assert.h>
  119.37 +
  119.38 +static const uint8_t sei_num_clock_ts_table[9]={ /* number of clock timestamps to parse, indexed by sei_pic_struct */
  119.39 +    1,  1,  1,  2,  2,  3,  3,  2,  3
  119.40 +};
  119.41 +
  119.42 +void ff_h264_reset_sei(NalContext *n) {
  119.43 +    /* recovery frame count and CPB removal delay are reset to -1 */
  119.44 +    n->sei_recovery_frame_cnt = n->sei_cpb_removal_delay = -1;
  119.45 +    /* DPB output delay and the buffering period flag are reset to 0 */
  119.46 +    n->sei_dpb_output_delay = n->sei_buffering_period_present = 0;
  119.47 +}
  119.48 +
  119.49 +static int decode_picture_timing(NalContext *n, GetBitContext *gb){
  119.50 +    /* Parse a picture timing SEI payload: the CPB/DPB delays (when the SPS signals
  119.51 +     * HRD parameters), then pic_struct and the clock timestamps, of which only the
  119.52 +     * ct_type bits are kept.  Returns 0, or -1 for a pic_struct beyond frame tripling. */
  119.53 +    const SPS *sps = &n->sps;
  119.54 +    unsigned int ts, ts_count;
  119.55 +
  119.56 +    if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag){
  119.57 +        n->sei_cpb_removal_delay = get_bits(gb, sps->cpb_removal_delay_length);
  119.58 +        n->sei_dpb_output_delay = get_bits(gb, sps->dpb_output_delay_length);
  119.59 +    }
  119.60 +    if(!sps->pic_struct_present_flag)
  119.61 +        return 0;
  119.62 +    n->sei_pic_struct = get_bits(gb, 4);
  119.63 +    n->sei_ct_type    = 0;
  119.64 +    if (n->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
  119.65 +        return -1;
  119.66 +    ts_count = sei_num_clock_ts_table[n->sei_pic_struct];
  119.67 +    for (ts = 0 ; ts < ts_count ; ts++){
  119.68 +        unsigned int full_timestamp_flag;
  119.69 +        if(!get_bits(gb, 1))                  /* clock_timestamp_flag */
  119.70 +            continue;
  119.71 +        n->sei_ct_type |= 1<<get_bits(gb, 2);
  119.72 +        skip_bits(gb, 1);                     /* nuit_field_based_flag */
  119.73 +        skip_bits(gb, 5);                     /* counting_type */
  119.74 +        full_timestamp_flag = get_bits(gb, 1);
  119.75 +        skip_bits(gb, 1);                     /* discontinuity_flag */
  119.76 +        skip_bits(gb, 1);                     /* cnt_dropped_flag */
  119.77 +        skip_bits(gb, 8);                     /* n_frames */
  119.78 +        if(full_timestamp_flag){
  119.79 +            skip_bits(gb, 6);                 /* seconds_value 0..59 */
  119.80 +            skip_bits(gb, 6);                 /* minutes_value 0..59 */
  119.81 +            skip_bits(gb, 5);                 /* hours_value 0..23 */
  119.82 +        }else if(get_bits(gb, 1)){            /* seconds_flag */
  119.83 +            skip_bits(gb, 6);                 /* seconds_value range 0..59 */
  119.84 +            if(get_bits(gb, 1)){              /* minutes_flag */
  119.85 +                skip_bits(gb, 6);             /* minutes_value 0..59 */
  119.86 +                if(get_bits(gb, 1))           /* hours_flag */
  119.87 +                    skip_bits(gb, 5);         /* hours_value 0..23 */
  119.88 +            }
  119.89 +        }
  119.90 +        if(sps->time_offset_length > 0)
  119.91 +            skip_bits(gb, sps->time_offset_length); /* time_offset */
  119.92 +    }
  119.93 +    return 0;
  119.94 +}
  119.95 +
  119.96 +static int decode_unregistered_user_data(GetBitContext *gb, int size){
  119.97 +    /* Consume all `size` payload bytes; the first 16 are skipped when scanning the
  119.98 +     * text for an x264 build number, which is parsed but not stored.  -1 if size<16. */
  119.99 +    char payload[16+256];
 119.100 +    const int keep = (int) sizeof(payload)-1;
 119.101 +    int build, pos = 0, rest;
 119.102 +
 119.103 +    if(size<16)
 119.104 +        return -1;
 119.105 +    while(pos<keep && pos<size)
 119.106 +        payload[pos++]= get_bits(gb, 8);
 119.107 +    payload[pos]= 0;
 119.108 +
 119.109 +    (void) sscanf(payload+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
 119.110 +    for(rest=size-pos; rest>0; rest--)
 119.111 +        skip_bits(gb, 8);             /* bytes that did not fit into payload[] */
 119.112 +
 119.113 +    return 0;
 119.114 +}
 119.115 +
 119.116 +static int decode_recovery_point(NalContext *n, GetBitContext *gb){
 119.117 +    /* Keep recovery_frame_cnt; the four flag bits that follow are discarded. */
 119.118 +    const int recovery_frame_cnt = get_ue_golomb(gb);
 119.119 +    skip_bits(gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
 119.120 +    n->sei_recovery_frame_cnt = recovery_frame_cnt;
 119.121 +    return 0;
 119.122 +}
 119.123 +
 119.124 +static int decode_buffering_period(NalContext *n, GetBitContext *gb){
 119.125 +    /* Parse a buffering period SEI: look up the referenced SPS and store the
 119.126 +     * initial CPB removal delays (the delay offsets are skipped).
 119.127 +     * Returns 0, or -1 if the SPS id is above 31 or no such SPS is stored in n. */
 119.128 +    const unsigned int sps_id = get_ue_golomb_31(gb);
 119.129 +    const SPS *sps;
 119.130 +    int pass, idx;
 119.131 +
 119.132 +    if(sps_id > 31 || !n->sps_buffers[sps_id]) {
 119.133 +        av_log(AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
 119.134 +        return -1;
 119.135 +    }
 119.136 +    sps = n->sps_buffers[sps_id];
 119.137 +
 119.138 +    // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
 119.139 +    // pass 0 reads the NAL HRD delays, pass 1 the VCL HRD delays
 119.140 +    for (pass = 0; pass < 2; pass++) {
 119.141 +        const int present = pass ? sps->vcl_hrd_parameters_present_flag
 119.142 +                                 : sps->nal_hrd_parameters_present_flag;
 119.143 +        if (!present)
 119.144 +            continue;
 119.145 +        for (idx = 0; idx < sps->cpb_cnt; idx++) {
 119.146 +            n->initial_cpb_removal_delay[idx] = get_bits(gb, sps->initial_cpb_removal_delay_length);
 119.147 +            skip_bits(gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
 119.148 +        }
 119.149 +    }
 119.150 +    n->sei_buffering_period_present = 1;
 119.151 +    return 0;
 119.152 +}
 119.153 +
 119.154 +int ff_h264_decode_sei(NalContext *n, GetBitContext *gb){
 119.155 +    /* Decode all SEI messages in gb; returns 0, or -1 on a handler error or a payload
 119.156 +     * larger than the remaining data.  Unread payload bits are skipped after each message. */
 119.157 +    while(get_bits_count(gb) + 16 < gb->size_in_bits){
 119.158 +        int size=0, type=0, payload_end;
 119.159 +        do{                                   /* payload type: sum of 0xFF-extended bytes */
 119.160 +            type+= show_bits(gb, 8);
 119.161 +        }while(get_bits(gb, 8) == 255);
 119.162 +        do{                                   /* payload size in bytes, coded the same way */
 119.163 +            size+= show_bits(gb, 8);
 119.164 +        }while(get_bits(gb, 8) == 255);
 119.165 +        /* a payload that claims more bytes than are left is corrupt */
 119.166 +        if(size > (gb->size_in_bits - get_bits_count(gb))/8)
 119.167 +            return -1;
 119.168 +        payload_end = get_bits_count(gb) + 8*size;
 119.169 +        switch(type){
 119.170 +        case SEI_TYPE_PIC_TIMING: // Picture timing SEI
 119.171 +            if(decode_picture_timing(n, gb) < 0)
 119.172 +                return -1;
 119.173 +            break;
 119.174 +        case SEI_TYPE_USER_DATA_UNREGISTERED:
 119.175 +            if(decode_unregistered_user_data(gb, size) < 0)
 119.176 +                return -1;
 119.177 +            break;
 119.178 +        case SEI_TYPE_RECOVERY_POINT:
 119.179 +            if(decode_recovery_point(n, gb) < 0)
 119.180 +                return -1;
 119.181 +            break;
 119.182 +        case SEI_BUFFERING_PERIOD:
 119.183 +            if(decode_buffering_period(n, gb) < 0)
 119.184 +                return -1;
 119.185 +            break;
 119.186 +        default: break;                       /* unknown types are skipped as a whole below */
 119.187 +        }
 119.188 +        if(payload_end > get_bits_count(gb))  /* keeps the next message header in sync */
 119.189 +            skip_bits(gb, payload_end - get_bits_count(gb));
 119.190 +        align_get_bits(gb);
 119.191 +    }
 119.192 +
 119.193 +    return 0;
 119.194 +}

   120.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   120.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_sei.h	Mon Aug 27 12:09:56 2012 +0200
   120.3 @@ -0,0 +1,7 @@
   120.4 +#ifndef H264_SEI_H
   120.5 +#define H264_SEI_H
   120.6 +#include "h264_types.h" /* NalContext (and get_bits.h for GetBitContext), so this header can be included on its own */
   120.7 +int ff_h264_decode_sei(NalContext *n, GetBitContext *gb);
   120.8 +void ff_h264_reset_sei(NalContext *n);
   120.9 +
  120.10 +#endif

   121.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   121.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_seq.c	Mon Aug 27 12:09:56 2012 +0200
   121.3 @@ -0,0 +1,220 @@
   121.4 +/*
   121.5 +* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   121.6 +* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   121.7 +*
   121.8 +* This file is part of FFmpeg.
   121.9 +*
  121.10 +* FFmpeg is free software; you can redistribute it and/or
  121.11 +* modify it under the terms of the GNU Lesser General Public
  121.12 +* License as published by the Free Software Foundation; either
  121.13 +* version 2.1 of the License, or (at your option) any later version.
  121.14 +*
  121.15 +* FFmpeg is distributed in the hope that it will be useful,
  121.16 +* but WITHOUT ANY WARRANTY; without even the implied warranty of
  121.17 +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  121.18 +* Lesser General Public License for more details.
  121.19 +*
  121.20 +* You should have received a copy of the GNU Lesser General Public
  121.21 +* License along with FFmpeg; if not, write to the Free Software
  121.22 +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  121.23 +*/
  121.24 +#include "h264_types.h"
  121.25 +#include "h264_parser.h"
  121.26 +#include "h264_nal.h"
  121.27 +#include "h264_entropy.h"
  121.28 +#include "h264_rec.h"
  121.29 +#include "h264_pred_mode.h"
  121.30 +#include "h264_misc.h"
  121.31 +// #undef NDEBUG
  121.32 +#include <assert.h>
  121.33 +
  121.34 +static int decode_slice_entropy_seq(H264Context *h, EntropyContext *ec, H264Slice *s, GetBitContext *gb, H264Mb *mbs){
  121.35 +    /* CABAC-decode every macroblock of slice s from gb into mbs, which is
  121.36 +     * addressed row by row with ec->mb_width entries per row.
  121.37 +     * h is not used by this function.
  121.38 +     * Returns 0, or -1 for a non-CABAC stream or a macroblock decode error. */
  121.39 +    CABACContext *cabac = &ec->c;
  121.40 +
  121.41 +    if( !s->pps.cabac ){
  121.42 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
  121.43 +        return -1;
  121.44 +    }
  121.45 +
  121.46 +    /* per-slice quantiser state */
  121.47 +    init_dequant_tables(s, ec);
  121.48 +    ec->curr_qscale = s->qscale;
  121.49 +    ec->last_qscale_diff = 0;
  121.50 +    for(int plane=0; plane<2; plane++)
  121.51 +        ec->chroma_qp[plane] = get_chroma_qp(s, plane, s->qscale);
  121.52 +
  121.53 +    /* the CABAC decoder reads from the next byte boundary of gb */
  121.54 +    align_get_bits( gb );
  121.55 +    ff_init_cabac_decoder( cabac, gb->buffer + get_bits_count(gb)/8, (get_bits_left(gb) + 7)/8);
  121.56 +    ff_h264_init_cabac_states(ec, s, cabac);
  121.57 +
  121.58 +    /* macroblocks are decoded row by row, left to right */
  121.59 +    for(int row=0; row<ec->mb_height; row++){
  121.60 +        init_entropy_buf(ec, s, row);
  121.61 +        for(int col=0; col<ec->mb_width; col++){
  121.62 +            H264Mb *m = &mbs[col + row*ec->mb_width];
  121.63 +            int ret;
  121.64 +
  121.65 +            /* record the position and make m the context's current macroblock */
  121.66 +            m->mb_x=col;
  121.67 +            m->mb_y=row;
  121.68 +            ec->m = m;
  121.69 +
  121.70 +            ret = ff_h264_decode_mb_cabac(ec, s, cabac);
  121.71 +            (void) get_cabac_terminate( cabac); /* still called after every macroblock; its result is not used */
  121.72 +            /* fail on a decode error or when more than 2 bytes past the end were consumed */
  121.73 +            if( ret < 0 || cabac->bytestream > cabac->bytestream_end + 2) {
  121.74 +                av_log(AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", m->mb_x, m->mb_y, cabac->bytestream_end - cabac->bytestream);
  121.75 +                return -1;
  121.76 +            }
  121.77 +        }
  121.78 +    }
  121.79 +
  121.80 +    return 0;
  121.81 +}
  121.82 +
  121.83 +
  121.84 +
  121.85 +
  121.86 +/**
  121.87 +*   Sequential version: links the slice's references to DPB entries, runs the per-macroblock stage over mbs, then releases the pictures queued in s2->release_ref_cpn.
  121.88 +*/
  121.89 +static void decode_slice_mb_seq(H264Context *h, MBRecContext *d, H264Slice *s2, H264Mb *mbs){
  121.90 +    // Resolve every used reference (cpn != -1) to the DPB entry with the same cpn and reference >= 2.
  121.91 +    for (int i=0; i<2; i++){
  121.92 +        for(int j=0; j< s2->ref_count[i]; j++){
  121.93 +            if (s2->ref_list_cpn[i][j] ==-1)
  121.94 +                continue;
  121.95 +            int k;
  121.96 +            for (k=0; k<h->max_dpb_cnt; k++){
  121.97 +                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s2->ref_list_cpn[i][j]){
  121.98 +                    s2->dp_ref_list[i][j] = &h->dpb[k];
  121.99 +                    break;
 121.100 +                }
 121.101 +            }
 121.102 +        }
 121.103 +    }
 121.104 +    // NOTE(review): presumably obtains the DPB entry the current picture is written to -- confirm in get_dpb_entry().
 121.105 +    get_dpb_entry(h, s2);
 121.106 +    // The macroblock stage is skipped entirely when h->no_mbd is set.
 121.107 +    if (!h->no_mbd){
 121.108 +        for(int j=0; j<d->mb_height; j++){
 121.109 +            init_mbrec_context(d, d->mrs, s2, j);
 121.110 +            if (h->profile) printf("\n[MBREC LINE %d ", j);
 121.111 +            for(int i=0; i<d->mb_width; i++){
 121.112 +                // NOTE(review): start_timer runs on every 8th column only, stop_timer on every macroblock -- confirm this is intended.
 121.113 +                if ((i & 0x7) == 0) start_timer(h, REC);
 121.114 +                H264Mb *m = &mbs[i + j*d->mb_width];
 121.115 +                if (h->profile==2) // profile mode 2 calls pred_motion_mb_rec instead of the full macroblock decode
 121.116 +                    pred_motion_mb_rec (d, d->mrs, s2, m);
 121.117 +                else{
 121.118 +                    h264_decode_mb_internal(d, d->mrs, s2, m);
 121.119 +                }
 121.120 +                stop_timer(h, REC);
 121.121 +            }
 121.122 +            draw_edges(d, s2, j); // called once per completed macroblock row
 121.123 +
 121.124 +        }
 121.125 +    }
 121.126 +    // Release (mode 2) the first DPB entry matching each cpn queued for release, then empty the queue.
 121.127 +    for (int i=0; i<s2->release_cnt; i++){
 121.128 +        for(int j=0; j<h->max_dpb_cnt; j++){
 121.129 +            if(h->dpb[j].cpn== s2->release_ref_cpn[i]){
 121.130 +                release_dpb_entry(h, &h->dpb[j], 2);
 121.131 +                break;
 121.132 +            }
 121.133 +        }
 121.134 +    }
 121.135 +    s2->release_cnt=0;
 121.136 +}
 121.137 +
 121.138 +/*
 121.139 +* Sequential decoding loop (one frame per iteration); returns 0, or -1 if a work buffer cannot be allocated.
 121.140 +*/
 121.141 +int h264_decode_seq( H264Context *h) {
 121.142 +    ParserContext *pc;
 121.143 +    NalContext *nc;
 121.144 +    EntropyContext *ec;
 121.145 +    MBRecContext *rc;
 121.146 +    OutputContext *oc;
 121.147 +    H264Slice slice, *s=&slice;
 121.148 +    H264Mb *mbs;
 121.149 +    DecodedPicture *out;
 121.150 +    int frames=0;
 121.151 +    int ret=0;
 121.152 +
 121.153 +#if HAVE_LIBSDL2
 121.154 +    pthread_t sdl_thr;
 121.155 +    if (h->display){
 121.156 +        pthread_create(&sdl_thr, NULL, sdl_thread, h);
 121.157 +    }
 121.158 +#endif
 121.159 +
 121.160 +    pc = get_parse_context(h->ifile);
 121.161 +    nc = get_nal_context(h->width, h->height);
 121.162 +    memset(s, 0, sizeof(H264Slice));
 121.163 +    mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb));
 121.164 +    ec = get_entropy_context( h );
 121.165 +    rc = get_mbrec_context(h);
 121.166 +    rc->top_next = rc->top = av_malloc( h->mb_width * sizeof(TopBorder));
 121.167 +    oc = get_output_context( h );
 121.168 +
 121.169 +    /* Without the macroblock array or the top border row no frame can be decoded:
 121.170 +     * skip the frame loop and leave through the common cleanup below. */
 121.171 +    if (!mbs || !rc->top){
 121.172 +        av_log(AV_LOG_ERROR, "cannot allocate decoder work buffers\n");
 121.173 +        ret = -1;
 121.174 +    }
 121.175 +
 121.176 +    av_start_timer();
 121.177 +    GetBitContext gb = {0,};
 121.178 +    while(ret == 0 && !pc->final_frame && frames++ < h->num_frames && !h->quit){
 121.179 +        if (h->profile) start_timer(h, FRONT);
 121.180 +        av_read_frame_internal(pc, &gb);
 121.181 +        decode_nal_units(nc, s, &gb);
 121.182 +        if (h->profile) stop_timer(h, FRONT);
 121.183 +        if (h->profile) start_timer(h, ED);
 121.184 +        /* NOTE(review): a -1 from the entropy stage is ignored and the frame is still reconstructed */
 121.185 +        decode_slice_entropy_seq(h, ec, s, &gb, mbs);
 121.186 +        if (h->profile) stop_timer(h, ED);
 121.187 +        if (h->profile) start_timer(h, REC);
 121.188 +        decode_slice_mb_seq(h, rc, s, mbs);
 121.189 +        if (h->profile) stop_timer(h, REC);
 121.190 +        out =output_frame(h, oc, s->curr_pic, h->ofile, h->frame_width, h->frame_height);
 121.191 +        if (out)
 121.192 +            release_dpb_entry(h, out, 1);
 121.193 +        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
 121.194 +        if (h->profile == 3){
 121.195 +            printf("[ENTROPY %.3fms] [MBREC %.3fms]\n", h->last_time[ED] , h->last_time[REC]);
 121.196 +        }
 121.197 +    }
 121.198 +    /* keep calling output_frame without a new picture until it returns NULL */
 121.199 +    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;
 121.200 +
 121.201 +    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
 121.202 +    h->num_frames = oc->frame_number;
 121.203 +    /* finished ! */
 121.204 +    av_freep(&mbs);
 121.205 +    av_freep(&gb.raw);
 121.206 +    if (gb.rbsp)
 121.207 +        av_freep(&gb.rbsp);
 121.208 +    av_freep(&rc->top);
 121.209 +    free_parse_context(pc);
 121.210 +    free_nal_context  (nc);
 121.211 +    free_entropy_context(ec);
 121.212 +    free_mbrec_context(rc);
 121.213 +    free_output_context(oc);
 121.214 +
 121.215 +#if HAVE_LIBSDL2
 121.216 +    if (h->display){
 121.217 +        signal_sdl_exit(h);
 121.218 +        pthread_join(sdl_thr, NULL);
 121.219 +    }
 121.220 +#endif
 121.221 +
 121.222 +    return ret;
 121.223 +}

   122.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   122.2 +++ b/ffmpeg_smp/h264dec/libavcodec/h264_types.h	Mon Aug 27 12:09:56 2012 +0200
   122.3 @@ -0,0 +1,658 @@
   122.4 +#ifndef H264_TYPES_H
   122.5 +#define H264_TYPES_H
   122.6 +
   122.7 +#include "config.h"
   122.8 +#ifdef HAVE_LIBSDL2
   122.9 +#include <SDL2/SDL.h>
  122.10 +#endif
  122.11 +
  122.12 +#include <pthread.h>
  122.13 +#include "avcodec.h"
  122.14 +#include "cabac.h"
  122.15 +#include "h264_dsp.h"
  122.16 +#include "h264_pred.h"
  122.17 +#include "get_bits.h"
  122.18 +
  122.19 +
  122.20 +#define MAX_REF_PIC_COUNT 16
  122.21 +#define MAX_DELAYED_PIC_COUNT 16
  122.22 +
  122.23 +#define MAX_THREADS 80
  122.24 +
  122.25 +//#define MAX_PIC_COUNT (4*(MAX_REF_PIC_COUNT+MAX_DELAYED_PIC_COUNT))
  122.26 +
  122.27 +#define DPB_SIZE 33
  122.28 +
  122.29 +
  122.30 +//potsdam machine 8xX7560 without HT
  122.31 +// static int edb_affinity [16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  122.32 +// static int edip_affinity[8] =  {16, 17, 18, 19, 20, 21, 22, 23};
  122.33 +//
  122.34 +// static int mbd_affinity[8][5] = {	{24, 32, 40, 48, 56},
  122.35 +// 							{25, 33, 41, 49, 57},
  122.36 +// 							{26, 34, 42, 50, 58},
  122.37 +// 							{27, 35, 43, 51, 59},
  122.38 +// 							{28, 36, 44, 52, 60},
  122.39 +// 							{29, 37, 45, 53, 61},
  122.40 +// 							{30, 38, 46, 54, 62},
  122.41 +// 							{31, 39, 47, 55, 63}, };
  122.42 +
  122.43 +// static int edb_affinity [22] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 58, 59, 60, 61 ,62, 63};
  122.44 +// static int edip_affinity[10] =  {16, 17, 18, 19, 20, 21, 22, 23, 56, 57 };
  122.45 +//
  122.46 +// static int mbd_affinity[8][5] = {	{24, 32, 40, 48, 56},
  122.47 +// 							{25, 33, 41, 49, 57},
  122.48 +// 							{26, 34, 42, 50, 58},
  122.49 +// 							{27, 35, 43, 51, 59},
  122.50 +// 							{28, 36, 44, 52, 60},
  122.51 +// 							{29, 37, 45, 53, 61},
  122.52 +// 							{30, 38, 46, 54, 62},
  122.53 +// 							{31, 39, 47, 55, 63}, };
  122.54 +// //4 socket
  122.55 +// static int edip_affinity[5] = {0, 1, 2, 3, 56};
  122.56 +// static int edb_affinity [12] = {8, 9, 10, 11, 16, 17, 18, 19, 59, 58, 57, 51};
  122.57 +//
  122.58 +// static int mbd_affinity[4][5] = { {24, 32, 40, 48, 56},
  122.59 +// {25, 33, 41, 49, 57},
  122.60 +// {26, 34, 42, 50, 58},
  122.61 +// {27, 35, 43, 51, 59}, };
  122.62 +
  122.63 +// static int edip_affinity[3] = {0, 1, 49};
  122.64 +// static int edb_affinity [6] = {8, 9, 16, 17, 56, 57};
  122.65 +//
  122.66 +// static int mbd_affinity[2][5] = { {24, 32, 40, 48, 56},
  122.67 +// {25, 33, 41, 49, 57}};
  122.68 +
  122.69 +// static int edip_affinity[2] = {0, 8};
  122.70 +// static int edb_affinity [3] = {16, 24, 56};
  122.71 +//
  122.72 +// static int mbd_affinity[1][4] = { {32, 40, 48, 56},
  122.73 +// };
  122.74 +
  122.75 +/// for ducks_take_off_2160p
  122.76 +// static int edip_affinity[2] = {0, 8};
  122.77 +// static int edb_affinity [3] = {16, 24, 32};
  122.78 +//
  122.79 +// static int mbd_affinity[1][4] = {{ 40, 48, 56, 32}};
  122.80 +
  122.81 +// static int edip_affinity[3] = {0, 1, 57};
  122.82 +// static int edb_affinity [7] = {8, 9, 16, 17, 24, 25, 56};
  122.83 +//
  122.84 +// static int mbd_affinity[2][4] = { {32, 40, 48, 56},
  122.85 +// {33, 41, 49, 57}};
  122.86 +
  122.87 +//4 socket
  122.88 +// static int edip_affinity[6]  = {0, 1, 2, 3, 59};
  122.89 +// static int edb_affinity [14] = {8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 58, 57};
  122.90 +//
  122.91 +// static int mbd_affinity[4][4] = { {32, 40, 48, 56},
  122.92 +// {33, 41, 49, 57},
  122.93 +// {34, 42, 50, 58},
  122.94 +// {35, 43, 51, 59}, };
  122.95 +
  122.96 +
  122.97 +// static int edb_affinity [29] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 59, 60, 61, 62, 63};
  122.98 +// static int edip_affinity[11] =  {24, 25, 26, 27, 28, 29, 30, 31, 63, 62, 61};
  122.99 +//
 122.100 +// static int mbd_affinity[8][4] = {{32, 40, 48, 56},
 122.101 +// 							{33, 41, 49, 57},
 122.102 +// 							{34, 42, 50, 58},
 122.103 +// 							{35, 43, 51, 59},
 122.104 +// 							{36, 44, 52, 60},
 122.105 +// 							{37, 45, 53, 61},
 122.106 +// 							{38, 46, 54, 62},
 122.107 +// 							{39, 47, 55, 63}, };
 122.108 +
 122.109 +//potsdam machine 4xX7550 with HT
 122.110 +// int edip_affinity[16] = {0, 8, 16, 24, 	1, 9, 17, 25, 	2, 10, 18, 26,	3, 11, 19, 27 };
 122.111 +// int edb_affinity [16] = {1, 9, 17, 25, 	2, 10, 18, 26, 	6, 14, 22, 30,	7, 15, 23, 31 };
 122.112 +// int edip_affinity[16] = {58, 50, 42, 34, 	1, 9, 17, 25, 	2, 10, 18, 26,	3, 11, 19, 27 };
 122.113 +// int edb_affinity [16] = {57, 49, 41, 33, 	56, 48, 40, 32, 	6, 14, 22, 30,	7, 15, 23, 31 };
 122.114 +// //int edb_affinity [16] = {4, 12, 20, 28, 5, 13, 21, 29, 	6, 14, 22, 30,	7, 15, 23, 31 };
 122.115 +// //mb threads affinity on logical cores moving back to keep inteference with ed threads low
 122.116 +// int mbd_affinity[4][8] = {	{63, 62, 61, 60, 59, 58, 57, 56},
 122.117 +// 							{55, 54, 53, 52, 51, 50, 49, 48},
 122.118 +// 							{47, 46, 45, 44, 43, 42, 41, 40},
 122.119 +// 							{39, 38, 37, 36, 35, 34, 33, 32},
 122.120 +// 							};
 122.121 +
 122.122 +
 122.123 +// static int edip_affinity[2] = {0, 2};
 122.124 +// static int edb_affinity [4] = {1, 3, 2, 5};
 122.125 +//
 122.126 +// static int mbd_affinity[1][4] = {{ 4, 6, 7, 5}};
 122.127 +
 122.128 +enum{ // stage identifiers, numbered from 0 (presumably one per decoder pipeline stage -- confirm against the users)
 122.129 +    PARSE=0,
 122.130 +    ENTROPY,
 122.131 +    REORDER,
 122.132 +    REORDER2,   //second mutex-cond pair used in reorder_thread
 122.133 +    MBDEC,
 122.134 +    OUTPUT,
 122.135 +    STAGES      // number of identifiers above
 122.136 +};
 122.137 +
 122.138 +// ad-hoc timer slots for profiling: passed to start_timer()/stop_timer() and used to index h->last_time[]
 122.139 +enum{
 122.140 +    TOTAL=0,
 122.141 +    FRONT,      // frame read + NAL unit decoding in h264_decode_seq
 122.142 +    ED,         // entropy decoding of a slice
 122.143 +    REC,        // macroblock stage (decode_slice_mb_seq)
 122.144 +    PROFILE_STAGES  // number of timer slots
 122.145 +};
 122.146 +
 122.147 +/* bit input */
 122.148 +/* buffer, buffer_end and size_in_bits must be present and used by every reader */
 122.149 +
 122.150 +/* frame parsing: state of the input parser, one instance per input file */
 122.151 +typedef struct ParserContext {
 122.152 +    //int64_t offset;      ///< byte offset from starting packet start
 122.153 +    int ifile;
 122.154 +    int ofile;
 122.155 +    int buffer_size;
 122.156 +    int eof_reached;
 122.157 +
 122.158 +    uint8_t *data;
 122.159 +    int   size;
 122.160 +    uint8_t *cur_ptr;
 122.161 +    int cur_len;
 122.162 +
 122.163 +    int64_t frame_offset; /* offset of the current frame */
 122.164 +    int64_t cur_offset; /* current offset (incremented by each av_parser_parse()) */
 122.165 +    int64_t next_frame_offset; /* offset of the next frame */
 122.166 +    int pict_type;
 122.167 +    int repeat_pict;     //frame_duration = (1 + repeat_pict) * time_base. It is used by codecs like H.264 to display telecined material.
 122.168 +    int key_frame;  //Set by parser to 1 for key frames and 0 for non-key frames.
 122.169 +    int64_t pos;     // Byte position of currently parsed frame in stream.
 122.170 +    int64_t last_pos;  //Previous frame byte position.
 122.171 +    int final_frame;   // once non-zero, h264_decode_seq stops reading further frames
 122.172 +
 122.173 +    uint8_t overread[5];
 122.174 +    int overread_cnt;           ///< the number of bytes which were irreversibly read from the next frame
 122.175 +    int index;
 122.176 +    int last_index;
 122.177 +    int frame_start_found;
 122.178 +    uint32_t state;             ///< contains the last few bytes in MSB order
 122.179 +} ParserContext;
 122.180 +
 122.181 +typedef struct NalContext {
 122.182 +    // Stored parameter sets; sps_buffers[] is indexed by SPS id (see decode_buffering_period).
 122.183 +    SPS *sps_buffers[MAX_SPS_COUNT];
 122.184 +    PPS *pps_buffers[MAX_PPS_COUNT];
 122.185 +    SPS sps; ///< current sps
 122.186 +
 122.187 +    PictureInfo picture[16 + 1];  ///< Ref pic buffer used for deriving lists. Later linked with pic in dpb.
 122.188 +    PictureInfo *release_ref[MAX_MMCO_COUNT];
 122.189 +    PictureInfo *short_ref[32];   ///< ff_h264_ref_pic_marking inserts the current picture at index 0
 122.190 +    PictureInfo *long_ref[32];
 122.191 +    int long_ref_count;  ///< number of actual long term references
 122.192 +    int short_ref_count; ///< number of actual short term references
 122.193 +
 122.194 +    //POC stuff
 122.195 +    uint32_t coded_pic_num;
 122.196 +    int poc_lsb;
 122.197 +    int poc_msb;
 122.198 +    uint32_t poc_offset;
 122.199 +    int delta_poc;
 122.200 +    int frame_num;
 122.201 +    int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 122.202 +    int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 122.203 +    int frame_num_offset;         ///< for POC type 2
 122.204 +    int prev_frame_num_offset;    ///< for POC type 2
 122.205 +    int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 122.206 +
 122.207 +    int max_pic_num;
 122.208 +    int redundant_pic_count;
 122.209 +    int outputed_poc;
 122.210 +    int ip_id;
 122.211 +//   int b8_stride;             ///< 2*mb_width+1 used for some 8x8 block arrays to allow simple addressing
 122.212 +    int b4_stride;             ///< 4*mb_width+1 used for some 4x4 block arrays to allow simple addressing
 122.213 +    int mb_stride;             ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11
 122.214 +    int mb_width;
 122.215 +    int mb_height;
 122.216 +    int width;
 122.217 +    int height;
 122.218 +
 122.219 +    int has_b_frames;
 122.220 +    //pic_struct in picture timing SEI message
 122.221 +    SEI_PicStructType sei_pic_struct;
 122.222 +    // Bit set of clock types for fields/frames in picture timing SEI message. For each found ct_type, appropriate bit is set (e.g., bit 1 for interlaced).
 122.223 +    int sei_ct_type;
 122.224 +    // dpb_output_delay in picture timing SEI message, see H.264 C.2.2 (0 after ff_h264_reset_sei)
 122.225 +    int sei_dpb_output_delay;
 122.226 +    //cpb_removal_delay in picture timing SEI message, see H.264 C.1.2 (-1 after ff_h264_reset_sei)
 122.227 +    int sei_cpb_removal_delay;
 122.228 +    //recovery_frame_cnt from SEI message (-1 after ff_h264_reset_sei)
 122.229 +    int sei_recovery_frame_cnt;
 122.230 +    // Timestamp stuff
 122.231 +    int sei_buffering_period_present;  ///< Buffering period SEI flag
 122.232 +    int initial_cpb_removal_delay[32]; ///< Initial timestamps for CPBs
 122.233 +
 122.234 +} NalContext;
 122.235 +
 122.236 +typedef struct EntropyContext{
 122.237 +    CABACContext c;
 122.238 +
 122.239 +    H264Mb *m;
 122.240 +    int top_cbp;
 122.241 +    int left_cbp;
 122.242 +    int neighbor_transform_size; //number of neighbors (top and/or left) that used 8x8 dct
 122.243 +
 122.244 +    uint32_t top_type;
 122.245 +    uint32_t left_type;
 122.246 +    uint32_t topright_type;
 122.247 +    uint32_t topleft_type;
 122.248 +
 122.249 +    int curr_qscale;
 122.250 +    int chroma_qp[2]; //QPc
 122.251 +    int last_qscale_diff;
 122.252 +
 122.253 +    uint32_t dequant4_buffer[6][52][16];
 122.254 +    uint32_t dequant8_buffer[2][52][64];
 122.255 +    uint32_t (*dequant4_coeff[6])[16];
 122.256 +    uint32_t (*dequant8_coeff[2])[64];
 122.257 +
 122.258 +//     uint8_t (*non_zero_count_top)[32];
 122.259 +//     uint8_t (*non_zero_count)[32];
 122.260 +//     uint8_t (*non_zero_count_row[2])[32];
 122.261 +
 122.262 +    uint8_t (*non_zero_count_top)[8];
 122.263 +    uint8_t (*non_zero_count)[8];
 122.264 +    uint8_t (*non_zero_count_row[2])[8];
 122.265 +    DECLARE_ALIGNED(8, uint8_t, non_zero_count_left[8]);
 122.266 +
 122.267 +    uint8_t (*mvd_top[2])[2];
 122.268 +    uint8_t (*mvd[2])[2];
 122.269 +    uint8_t (*mvd_table[2][2])[2];
 122.270 +
 122.271 +    uint8_t *direct_top;
 122.272 +    uint8_t *direct;
 122.273 +    uint8_t *direct_table[2];
 122.274 +
 122.275 +    uint8_t *chroma_pred_mode_top;
 122.276 +    uint8_t *chroma_pred_mode;
 122.277 +    uint8_t *chroma_pred_mode_table[2];
 122.278 +
 122.279 +    uint16_t *cbp_top;
 122.280 +    uint16_t *cbp;
 122.281 +    uint16_t *cbp_table[2];
 122.282 +
 122.283 +    int8_t *qscale_top;
 122.284 +    int8_t *qscale;
 122.285 +    int8_t *qscale_table[2];
 122.286 +
 122.287 +    int8_t *ref_index_top[2];
 122.288 +    int8_t *ref_index[2];
 122.289 +    int8_t *ref_index_table[2][2];
 122.290 +
 122.291 +    uint32_t *mb_type_top;
 122.292 +    uint32_t *mb_type;
 122.293 +    uint32_t *mb_type_table[2];
 122.294 +
 122.295 +    int b_stride;
 122.296 +    int mb_stride;
 122.297 +    int mb_width;
 122.298 +    int mb_height;
 122.299 +
 122.300 +    uint8_t *zigzag_scan;
 122.301 +    uint8_t *zigzag_scan8x8;
 122.302 +    uint8_t direct_cache[5*8];
 122.303 +
 122.304 +    DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]);
 122.305 +    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
 122.306 +    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
 122.307 +    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
 122.308 +    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
 122.309 +
 122.310 +} EntropyContext;
 122.311 +
 122.312 +typedef struct H264Slice {
 122.313 +    PPS pps;                   ///< current pps
 122.314 +    PictureInfo* current_picture_info;
 122.315 +    DecodedPicture* curr_pic;
 122.316 +    int slice_num;
 122.317 +
 122.318 +    int release_ref_cpn[MAX_MMCO_COUNT];
 122.319 +    int release_cnt;
 122.320 +
 122.321 +    int qp_thresh;      ///< QP threshold to skip loopfilter
 122.322 +    int use_weight;
 122.323 +    int use_weight_chroma;
 122.324 +    int luma_log2_weight_denom;
 122.325 +    int chroma_log2_weight_denom;
 122.326 +
 122.327 +    int16_t luma_weight[16][2][2];
 122.328 +    int16_t chroma_weight[16][2][2][2];
 122.329 +    int16_t implicit_weight[16][16][2];
 122.330 +
 122.331 +    //poc number of ref_list int ref_poc[2][16]
 122.332 +    //In edslice this must become Picture Info
 122.333 +    int ref_list_cpn[2][16];
 122.334 +    PictureInfo *ref_list[2][16];         ///Reordered version of default_ref_list according to picture reordering in slice header
 122.335 +    DecodedPicture *dp_ref_list[2][16];
 122.336 +    int ref_count[2];   ///< counts frames or fields, depending on current mb mode
 122.337 +
 122.338 +    int slice_type;
 122.339 +    int slice_type_nos;
 122.340 +    int slice_alpha_c0_offset;
 122.341 +    int slice_beta_offset;
 122.342 +    int direct_8x8_inference_flag;
 122.343 +
 122.344 +    uint8_t list_count;
 122.345 +    uint32_t coded_pic_num;
 122.346 +
 122.347 +    int poc;
 122.348 +    int key_frame;
 122.349 +    int mmco_reset; //FIXME not used?
 122.350 +
 122.351 +    ///stuff only needed for nal/entropy decoding
 122.352 +//     H264Mb *m;
 122.353 +//     GetBitContext *gb;
 122.354 +    int ip_id;
 122.355 +    int transform_bypass;
 122.356 +    int direct_spatial_mv_pred;
 122.357 +    int map_col_to_list0[2][16];
 122.358 +    int dist_scale_factor[16];
 122.359 +
 122.360 +    int cabac_init_idc;
 122.361 +    int nal_ref_idc;
 122.362 +    int nal_unit_type;
 122.363 +
 122.364 +    int ref2frm[2][64];  ///< reference to frame number lists, the first 2 are for -2,-1
 122.365 +
 122.366 +    int qscale;
 122.367 +
 122.368 +} H264Slice;
 122.369 +
 122.370 +typedef struct { // one slice-buffer slot: an H264Slice plus pointers/state used while it is processed
 122.371 +    H264Slice slice;
 122.372 +    H264Mb *mbs; // presumably the macroblock array belonging to this slice — TODO confirm
 122.373 +    DecodedPicture *dp;
 122.374 +    GetBitContext gb;
 122.375 +
 122.376 +    int lines_taken;
 122.377 +    int lines_total;
 122.378 +    int state;       // 0 free, 1 in use //1 wait for entropy, 2 wait for reconstruct. NOTE(review): the two legends disagree on the meaning of 1 — confirm against the code that sets state
 122.379 +    int initialized;
 122.380 +} SliceBufferEntry;
 122.381 +
 122.382 +typedef struct RingLineEntry{ // one entry of the ring of lines; links to its slice buffer and to a previous entry
 122.383 +    union{ // anonymous union: mb_cnt and pad share the same storage
 122.384 +    DECLARE_ALIGNED(64, volatile int32_t, mb_cnt); // volatile counter, declared with alignment argument 64
 122.385 +    DECLARE_ALIGNED(64, int32_t, pad[16]); // 16 x int32_t = 64 bytes of padding (presumably to keep mb_cnt on its own cache line — TODO confirm)
 122.386 +    };
 122.387 +    SliceBufferEntry *sbe;
 122.388 +    int id;
 122.389 +    int line;
 122.390 +    TopBorder *top;
 122.391 +    struct RingLineEntry *prev_line; // another RingLineEntry (presumably the line above this one — TODO confirm)
 122.392 +
 122.393 +} RingLineEntry;
 122.394 +
 122.395 +// #if OMPSS
 122.396 +typedef struct SuperMBTask{
 122.397 +    int smb_x;
 122.398 +    int smb_y;
 122.399 +} SuperMBTask;
 122.400 +
 122.401 +typedef struct SuperMBContext{
 122.402 +    int nsmb_width;             //number of super macroblocks in picture width
 122.403 +    int nsmb_height;            //number of super macroblocks in picture height
 122.404 +    int nsmb_3dheight;          //number of super macroblocks in picture height - max motion vertical vector
 122.405 +    int smb_width;              //width of a super macroblock
 122.406 +    int smb_height;             //height of a super macroblock
 122.407 +    int refcount;
 122.408 +    int index;
 122.409 +    SuperMBTask *smbs[2];
 122.410 +} SuperMBContext;
 122.411 +// #endif
 122.412 +
 122.413 +//scratchpad for decoding a macroblock
 122.414 +typedef struct MBRecState{
 122.415 +    int8_t *ref_index_top[2];
 122.416 +    int8_t *ref_index[2];
 122.417 +    int16_t (*motion_val_top[2])[2];
 122.418 +    int16_t (*motion_val[2])[2];
 122.419 +    uint32_t *mb_type_top;
 122.420 +    uint32_t *mb_type;
 122.421 +
 122.422 +    int8_t *list1_ref_index[2];
 122.423 +    int16_t (*list1_motion_val[2])[2];
 122.424 +    uint32_t *list1_mb_type;
 122.425 +
 122.426 +    int8_t *intra4x4_pred_mode_top;
 122.427 +    int8_t *intra4x4_pred_mode;
 122.428 +#if !OMPSS
 122.429 +    int8_t intra4x4_pred_mode_left[4];
 122.430 +#endif
 122.431 +    int8_t *non_zero_count_top;
 122.432 +    int8_t *non_zero_count;
 122.433 +//     int8_t non_zero_count_left[8];
 122.434 +
 122.435 +
 122.436 +    unsigned int topleft_samples_available;
 122.437 +    unsigned int topright_samples_available;
 122.438 +    unsigned int top_samples_available;
 122.439 +    unsigned int left_samples_available;
 122.440 +
 122.441 +    int top_type;
 122.442 +    int left_type;
 122.443 +
 122.444 +    DECLARE_ALIGNED(8, int8_t, intra4x4_pred_mode_cache[5*8]);
 122.445 +    DECLARE_ALIGNED(16, int16_t, mv_cache)[2][5*8][2];
 122.446 +    DECLARE_ALIGNED(8, int8_t, ref_cache)[2][5*8];
 122.447 +    DECLARE_ALIGNED(8, uint8_t, non_zero_count_cache)[6*8];
 122.448 +    DECLARE_ALIGNED(16, uint8_t, mvd_cache)[2][5*8][2];
 122.449 +
 122.450 +    DECLARE_ALIGNED(8, int16_t, bS)[2][4][4];
 122.451 +    uint8_t edges[2];
 122.452 +
 122.453 +}MBRecState ;
 122.454 +
 122.455 +typedef struct MBRecContext{
 122.456 +    DSPContext dsp;             ///< pointers for accelerated dsp functions
 122.457 +    H264DSPContext hdsp;
 122.458 +    H264PredContext hpc;
 122.459 +
 122.460 +    MBRecState *mrs;
 122.461 +    RingLineEntry *rle;         //debug
 122.462 +
 122.463 +    uint8_t *scratchpad_y;      ///implemented different on Cell
 122.464 +    uint8_t *scratchpad_cb;     ///implemented different on Cell
 122.465 +    uint8_t *scratchpad_cr;     ///implemented different on Cell
 122.466 +
 122.467 +    int linesize;
 122.468 +    int uvlinesize;
 122.469 +    int mb_width;
 122.470 +    int mb_height;
 122.471 +    int mb_stride;
 122.472 +    int b_stride;
 122.473 +    int width;
 122.474 +    int height;
 122.475 +
 122.476 +#if !OMPSS   // not used in OMPSS
 122.477 +    LeftBorder left;
 122.478 +    TopBorder *top;
 122.479 +    TopBorder *top_next; 	// next line top border
 122.480 +#endif
 122.481 +    /*
 122.482 +    .UU.YYYY
 122.483 +    .UU.YYYY
 122.484 +    .vv.YYYY
 122.485 +    .VV.YYYY
 122.486 +    */
 122.487 +
 122.488 +    // block_offset[ 0..23] for frame macroblocks
 122.489 +    int block_offset[16+8];
 122.490 +
 122.491 +} MBRecContext;
 122.492 +
 122.493 +#ifdef HAVE_LIBSDL2
 122.494 +typedef struct SDLContext{
 122.495 +    int display;
 122.496 +    int fullscreen;
 122.497 +    pthread_t listen_thread;
 122.498 +
 122.499 +    SDL_DisplayMode full;
 122.500 +    SDL_DisplayMode wind;
 122.501 +
 122.502 +    
 122.503 +    SDL_Renderer *renderer;
 122.504 +    SDL_Rect rect;
 122.505 +    SDL_Rect win_rect;
 122.506 +    SDL_Window *window;
 122.507 +    double aspect;
 122.508 +    int win_w;
 122.509 +    int win_h;
 122.510 +    int resized;
 122.511 +    
 122.512 +    SDL_Texture *sbmap_texture;
 122.513 +    int showmap;
 122.514 +    int updatemap;
 122.515 +    int pause;
 122.516 +    
 122.517 +} SDLContext;
 122.518 +#endif
 122.519 +
 122.520 +typedef struct OutputContext {
 122.521 +    int bit_buffer_size;
 122.522 +    uint8_t *bit_buffer;
 122.523 +    uint64_t video_size;
 122.524 +    int frame_number;
 122.525 +    DecodedPicture *delayed_pic[DPB_SIZE];
 122.526 +    int dp_cnt;
 122.527 +
 122.528 +} OutputContext;
 122.529 +
 122.530 +typedef struct {
 122.531 +    pthread_mutex_t lock;
 122.532 +    pthread_cond_t cond;
 122.533 +    SliceBufferEntry **queue;
 122.534 +    int size;
 122.535 +    int cnt;
 122.536 +    int fi;
 122.537 +    int fo;
 122.538 +} SliceBufferQueue;
 122.539 +
 122.540 +typedef struct {
 122.541 +    pthread_mutex_t wslock;
 122.542 +    pthread_cond_t wscond;
 122.543 +    pthread_mutex_t swlock;
 122.544 +    pthread_cond_t swcond;
 122.545 +    RingLineEntry **queue;
 122.546 +    int size;
 122.547 +    int ready;
 122.548 +    int free;
 122.549 +    int fi;
 122.550 +    int fo;
 122.551 +} RingLineQueue;
 122.552 +
 122.553 +#ifdef HAVE_LIBSDL2 /* same guard form as SDLContext and the sdlq member of H264Context, so the type exists whenever it is used */
 122.554 +typedef struct { // queue of SDL_Texture pointers with its own mutex/condition pair
 122.555 +    pthread_mutex_t sdl_lock;
 122.556 +    pthread_cond_t sdl_cond;
 122.557 +    SDL_Texture **queue; // array of texture pointers; size presumably gives its capacity — TODO confirm
 122.558 +    int size;
 122.559 +    int ready;
 122.560 +    int fi; // presumably the insert index — TODO confirm
 122.561 +    int fo; // presumably the remove index — TODO confirm
 122.562 +    int exit;
 122.563 +} SDLTextureQueue;
 122.564 +#endif /* HAVE_LIBSDL2 */
 122.565 +/**
 122.566 +* H264Context
 122.567 +*/
 122.568 +typedef struct H264Context{
 122.569 +    SliceBufferQueue sb_q[STAGES];
 122.570 +    RingLineQueue rl_q;
 122.571 +
 122.572 +    pthread_mutex_t lock[STAGES];
 122.573 +    pthread_cond_t cond[STAGES];
 122.574 +
 122.575 +    pthread_mutex_t task_lock;
 122.576 +    pthread_cond_t task_cond;
 122.577 +
 122.578 +    pthread_attr_t ed_rec_attr[MAX_THREADS];
 122.579 +    pthread_t ed_rec_thr[MAX_THREADS];
 122.580 +
 122.581 +    int init_threads;
 122.582 +    pthread_mutex_t ilock;
 122.583 +    pthread_cond_t icond;
 122.584 +
 122.585 +    const char *file_name;
 122.586 +    int profile;
 122.587 +    int start;
 122.588 +    int touch_start;
 122.589 +    int setaff;
 122.590 +    int touch_done;
 122.591 +    int rl_side_touch;
 122.592 +    int statmbd;
 122.593 +    pthread_mutex_t slock;
 122.594 +    pthread_cond_t scond;
 122.595 +    pthread_mutex_t tlock;
 122.596 +    pthread_cond_t tcond;
 122.597 +    pthread_mutex_t tdlock;
 122.598 +    pthread_cond_t tdcond;
 122.599 +
 122.600 +    int ed_ppe_threads;
 122.601 +    int threads;
 122.602 +    int smt;
 122.603 +
 122.604 +    int acdpb_cnt;  //debug
 122.605 +    int reldpb_cnt;
 122.606 +    
 122.607 +    int sb_size;
 122.608 +    SliceBufferEntry *sb;               ///< Slice Syntax Buffer
 122.609 +    int free_sb_cnt;
 122.610 +    int slice_bufs;
 122.611 +
 122.612 +    int max_dpb_cnt;
 122.613 +    DecodedPicture *dpb;       ///< Decoded Picture Buffer
 122.614 +    int free_dpb_cnt;
 122.615 +
 122.616 +    int ifile;
 122.617 +    int ofile;
 122.618 +    int frame_width;
 122.619 +    int frame_height;
 122.620 +    int num_frames;
 122.621 +    int width;
 122.622 +    int height;
 122.623 +    int mb_width;
 122.624 +    int mb_height;
 122.625 +    int mb_stride;          ///< mb_width+1 used for some arrays to allow simple addressing of left & top MBs without sig11
 122.626 +    int b4_stride;
 122.627 +    int b_stride;
 122.628 +
 122.629 +    int smb_height;
 122.630 +    int smb_width;
 122.631 +    pthread_mutex_t smb_lock;
 122.632 +    pthread_cond_t sdl_cond;
 122.633 +    pthread_mutex_t sdl_lock;
 122.634 +    SuperMBContext *smbc;
 122.635 +    
 122.636 +    int wave_order;
 122.637 +    int static_3d;
 122.638 +    int pipe_bufs;
 122.639 +
 122.640 +    //shared tables used in entropy decoding
 122.641 +    uint8_t zigzag_scan[16];
 122.642 +    uint8_t zigzag_scan8x8[64];
 122.643 +
 122.644 +    int verbose;
 122.645 +    int no_mbd;
 122.646 +    int display;
 122.647 +    int fullscreen;
 122.648 +    int quit;
 122.649 +#ifdef HAVE_LIBSDL2
 122.650 +    SDLTextureQueue sdlq;
 122.651 +    SDLContext *sdlc;
 122.652 +#endif
 122.653 +     
 122.654 +    struct timespec start_time[PROFILE_STAGES];
 122.655 +    struct timespec end_time[PROFILE_STAGES];
 122.656 +    double last_time[PROFILE_STAGES];
 122.657 +    double total_time[PROFILE_STAGES];
 122.658 +
 122.659 +}H264Context;
 122.660 +
 122.661 +#endif

   123.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   123.2 +++ b/ffmpeg_smp/h264dec/libavcodec/mathops.h	Mon Aug 27 12:09:56 2012 +0200
   123.3 @@ -0,0 +1,145 @@
   123.4 +/*
   123.5 + * simple math operations
   123.6 + * Copyright (c) 2001, 2002 Fabrice Bellard
   123.7 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
   123.8 + *
   123.9 + * This file is part of FFmpeg.
  123.10 + *
  123.11 + * FFmpeg is free software; you can redistribute it and/or
  123.12 + * modify it under the terms of the GNU Lesser General Public
  123.13 + * License as published by the Free Software Foundation; either
  123.14 + * version 2.1 of the License, or (at your option) any later version.
  123.15 + *
  123.16 + * FFmpeg is distributed in the hope that it will be useful,
  123.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  123.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  123.19 + * Lesser General Public License for more details.
  123.20 + *
  123.21 + * You should have received a copy of the GNU Lesser General Public
  123.22 + * License along with FFmpeg; if not, write to the Free Software
  123.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  123.24 + */
  123.25 +#ifndef AVCODEC_MATHOPS_H
  123.26 +#define AVCODEC_MATHOPS_H
  123.27 +
  123.28 +#include "libavutil/common.h"
  123.29 +#include "libavutil/internal.h"
  123.30 +
  123.31 +#if   ARCH_ARM
  123.32 +#   include "arm/mathops.h"
  123.33 +#elif ARCH_PPC
  123.34 +#   include "ppc/mathops.h"
  123.35 +#elif ARCH_X86
  123.36 +#   include "x86/mathops.h"
  123.37 +#endif
  123.38 +
  123.39 +/* generic implementation */
  123.40 +
  123.41 +#ifndef MULL
  123.42 +#   define MULL(a,b,s) (((int64_t)(a) * (int64_t)(b)) >> (s))
  123.43 +#endif
  123.44 +
  123.45 +#ifndef MULH
  123.46 +//gcc 3.4 creates an incredibly bloated mess out of this
  123.47 +//#    define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32)
  123.48 +
  123.49 +static av_always_inline int MULH(int a, int b){ // generic fallback: signed multiply in 64 bits, keep the part above bit 31
  123.50 +    return ((int64_t)(a) * (int64_t)(b))>>32; // operands are widened first so the product cannot overflow int
  123.51 +}
  123.52 +#endif
  123.53 +
  123.54 +#ifndef UMULH
  123.55 +static av_always_inline unsigned UMULH(unsigned a, unsigned b){ // unsigned counterpart of MULH
  123.56 +    return ((uint64_t)(a) * (uint64_t)(b))>>32; // 64-bit unsigned product shifted right by 32
  123.57 +}
  123.58 +#endif
  123.59 +
  123.60 +#ifndef MUL64
  123.61 +#   define MUL64(a,b) ((int64_t)(a) * (int64_t)(b))
  123.62 +#endif
  123.63 +
  123.64 +#ifndef MAC64
  123.65 +#   define MAC64(d, a, b) ((d) += MUL64(a, b))
  123.66 +#endif
  123.67 +
  123.68 +#ifndef MLS64
  123.69 +#   define MLS64(d, a, b) ((d) -= MUL64(a, b))
  123.70 +#endif
  123.71 +
  123.72 +/* signed 16x16 -> 32 multiply add accumulate; fully parenthesized (like MLS16) so the expansion is safe inside larger expressions */
  123.73 +#ifndef MAC16
  123.74 +#   define MAC16(rt, ra, rb) ((rt) += (ra) * (rb))
  123.75 +#endif
  123.76 +
  123.77 +/* signed 16x16 -> 32 multiply */
  123.78 +#ifndef MUL16
  123.79 +#   define MUL16(ra, rb) ((ra) * (rb))
  123.80 +#endif
  123.81 +
  123.82 +#ifndef MLS16
  123.83 +#   define MLS16(rt, ra, rb) ((rt) -= (ra) * (rb))
  123.84 +#endif
  123.85 +
  123.86 +/* median of 3: returns the middle value of a, b and c */
  123.87 +#ifndef mid_pred
  123.88 +#define mid_pred mid_pred
  123.89 +static inline av_const int mid_pred(int a, int b, int c)
  123.90 +{
  123.91 +#if 0
  123.92 +    int t= (a-b)&((a-b)>>31); // disabled branch-free variant, kept for reference only
  123.93 +    a-=t;
  123.94 +    b+=t;
  123.95 +    b-= (b-c)&((b-c)>>31);
  123.96 +    b+= (a-b)&((a-b)>>31);
  123.97 +
  123.98 +    return b;
  123.99 +#else
 123.100 +    if(a>b){ // a > b: the median is b unless c is also above b
 123.101 +        if(c>b){
 123.102 +            if(c>a) b=a; // b < a < c
 123.103 +            else    b=c; // b < c <= a
 123.104 +        }
 123.105 +    }else{ // a <= b: the median is b unless c is below b
 123.106 +        if(b>c){
 123.107 +            if(c>a) b=c; // a < c < b
 123.108 +            else    b=a; // c <= a <= b
 123.109 +        }
 123.110 +    }
 123.111 +    return b; // b now holds the median
 123.112 +#endif
 123.113 +}
 123.114 +#endif
 123.115 +
 123.116 +#ifndef sign_extend /* generic fallback, used only when no earlier header defined sign_extend */
 123.117 +static inline av_const int sign_extend(int val, unsigned bits) // treat the low `bits` bits of val as a signed value
 123.118 +{
 123.119 +    return (int)((unsigned)val << (INT_BIT - bits)) >> (INT_BIT - bits); // left shift done on unsigned: shifting a negative or overflowing int left is undefined behaviour
 123.120 +}
 123.121 +#endif
 123.122 +
 123.123 +#ifndef zero_extend /* generic fallback, used only when no earlier header defined zero_extend */
 123.124 +static inline av_const unsigned zero_extend(unsigned val, unsigned bits) // keep the low `bits` bits of val, clear the rest
 123.125 +{
 123.126 +    return (val << (INT_BIT - bits)) >> (INT_BIT - bits); // shift up then back down on unsigned; NOTE(review): bits == 0 would shift by INT_BIT — presumably callers never pass 0, confirm
 123.127 +}
 123.128 +#endif
 123.129 +
 123.130 +#ifndef COPY3_IF_LT
 123.131 +#define COPY3_IF_LT(x, y, a, b, c, d)\
 123.132 +if ((y) < (x)) {\
 123.133 +    (x) = (y);\
 123.134 +    (a) = (b);\
 123.135 +    (c) = (d);\
 123.136 +}
 123.137 +#endif
 123.138 +
 123.139 +#ifndef NEG_SSR32
 123.140 +#   define NEG_SSR32(a,s) ((( int32_t)(a))>>(32-(s)))
 123.141 +#endif
 123.142 +
 123.143 +#ifndef NEG_USR32
 123.144 +#   define NEG_USR32(a,s) (((uint32_t)(a))>>(32-(s)))
 123.145 +#endif
 123.146 +
 123.147 +#endif /* AVCODEC_MATHOPS_H */
 123.148 +

   124.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   124.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.c	Mon Aug 27 12:09:56 2012 +0200
   124.3 @@ -0,0 +1,619 @@
   124.4 +/*
   124.5 + * Copyright (c) 2002 Brian Foley
   124.6 + * Copyright (c) 2002 Dieter Shirley
   124.7 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
   124.8 + *
   124.9 + * This file is part of FFmpeg.
  124.10 + *
  124.11 + * FFmpeg is free software; you can redistribute it and/or
  124.12 + * modify it under the terms of the GNU Lesser General Public
  124.13 + * License as published by the Free Software Foundation; either
  124.14 + * version 2.1 of the License, or (at your option) any later version.
  124.15 + *
  124.16 + * FFmpeg is distributed in the hope that it will be useful,
  124.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  124.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  124.19 + * Lesser General Public License for more details.
  124.20 + *
  124.21 + * You should have received a copy of the GNU Lesser General Public
  124.22 + * License along with FFmpeg; if not, write to the Free Software
  124.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  124.24 + */
  124.25 +
  124.26 +#include "config.h"
  124.27 +#if HAVE_ALTIVEC_H
  124.28 +#include <altivec.h>
  124.29 +#endif
  124.30 +#include "libavcodec/dsputil.h"
  124.31 +#include "dsputil_ppc.h"
  124.32 +#include "util_altivec.h"
  124.33 +#include "types_altivec.h"
  124.34 +#include "dsputil_altivec.h"
  124.35 +
  124.36 +
  124.37 +static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
  124.38 +{   // Copies 8 rows of 8 bytes from pixels into block, widening each byte to a 16-bit value.
  124.39 +    int i;
  124.40 +    vector unsigned char perm, bytes, *pixv;
  124.41 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
  124.42 +    vector signed short shorts;
  124.43 +
  124.44 +    for (i = 0; i < 8; i++) { // one source row per iteration
  124.45 +        // Read potentially unaligned pixels.
  124.46 +        // We're reading 16 pixels, and actually only want 8,
  124.47 +        // but we simply ignore the extras.
  124.48 +        perm = vec_lvsl(0, pixels); // permute mask for this row's alignment, recomputed every row
  124.49 +        pixv = (vector unsigned char *) pixels;
  124.50 +        bytes = vec_perm(pixv[0], pixv[1], perm); // combine two adjacent vectors into the 16 bytes starting at pixels
  124.51 +
  124.52 +        // convert the bytes into shorts
  124.53 +        shorts = (vector signed short)vec_mergeh(zero, bytes); // interleave zero bytes with the first 8 pixel bytes
  124.54 +
  124.55 +        // save the data to the block, we assume the block is 16-byte aligned
  124.56 +        vec_st(shorts, i*16, (vector signed short*)block); // row i lands at byte offset i*16
  124.57 +
  124.58 +        pixels += line_size; // advance to the next source row
  124.59 +    }
  124.60 +}
  124.61 +
  124.62 +static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
  124.63 +        const uint8_t *s2, int stride)
  124.64 +{   // Stores s1 - s2 as 16-bit values for 8 rows of 8 pixels: 4 loop iterations, two rows each.
  124.65 +    int i;
  124.66 +    vector unsigned char perm, bytes, *pixv;
  124.67 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
  124.68 +    vector signed short shorts1, shorts2;
  124.69 +
  124.70 +    for (i = 0; i < 4; i++) { // body is manually unrolled twice, so 4 iterations cover 8 rows
  124.71 +        // Read potentially unaligned pixels
  124.72 +        // We're reading 16 pixels, and actually only want 8,
  124.73 +        // but we simply ignore the extras.
  124.74 +        perm = vec_lvsl(0, s1);
  124.75 +        pixv = (vector unsigned char *) s1;
  124.76 +        bytes = vec_perm(pixv[0], pixv[1], perm);
  124.77 +
  124.78 +        // convert the bytes into shorts
  124.79 +        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  124.80 +
  124.81 +        // Do the same for the second block of pixels
  124.82 +        perm = vec_lvsl(0, s2);
  124.83 +        pixv = (vector unsigned char *) s2;
  124.84 +        bytes = vec_perm(pixv[0], pixv[1], perm);
  124.85 +
  124.86 +        // convert the bytes into shorts
  124.87 +        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  124.88 +
  124.89 +        // Do the subtraction
  124.90 +        shorts1 = vec_sub(shorts1, shorts2);
  124.91 +
  124.92 +        // save the data to the block, we assume the block is 16-byte aligned
  124.93 +        vec_st(shorts1, 0, (vector signed short*)block);
  124.94 +
  124.95 +        s1 += stride;
  124.96 +        s2 += stride;
  124.97 +        block += 8; // next output row: 8 DCTELEMs further on
  124.98 +
  124.99 +
 124.100 +        // The code below is a copy of the code above... This is a manual
 124.101 +        // unroll.
 124.102 +
 124.103 +        // Read potentially unaligned pixels
 124.104 +        // We're reading 16 pixels, and actually only want 8,
 124.105 +        // but we simply ignore the extras.
 124.106 +        perm = vec_lvsl(0, s1);
 124.107 +        pixv = (vector unsigned char *) s1;
 124.108 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 124.109 +
 124.110 +        // convert the bytes into shorts
 124.111 +        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 124.112 +
 124.113 +        // Do the same for the second block of pixels
 124.114 +        perm = vec_lvsl(0, s2);
 124.115 +        pixv = (vector unsigned char *) s2;
 124.116 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 124.117 +
 124.118 +        // convert the bytes into shorts
 124.119 +        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 124.120 +
 124.121 +        // Do the subtraction
 124.122 +        shorts1 = vec_sub(shorts1, shorts2);
 124.123 +
 124.124 +        // save the data to the block, we assume the block is 16-byte aligned
 124.125 +        vec_st(shorts1, 0, (vector signed short*)block);
 124.126 +
 124.127 +        s1 += stride;
 124.128 +        s2 += stride;
 124.129 +        block += 8; // next output row: 8 DCTELEMs further on
 124.130 +    }
 124.131 +}
 124.132 +
 124.133 +
 124.134 +static void clear_block_altivec(DCTELEM *block) { // zeroes block with eight vector stores at byte offsets 0, 16, ..., 112
 124.135 +    LOAD_ZERO; // presumably declares the zero vectors such as zero_s16v — TODO confirm against the macro definition
 124.136 +    vec_st(zero_s16v,   0, block);
 124.137 +    vec_st(zero_s16v,  16, block);
 124.138 +    vec_st(zero_s16v,  32, block);
 124.139 +    vec_st(zero_s16v,  48, block);
 124.140 +    vec_st(zero_s16v,  64, block);
 124.141 +    vec_st(zero_s16v,  80, block);
 124.142 +    vec_st(zero_s16v,  96, block);
 124.143 +    vec_st(zero_s16v, 112, block); // last store: 8 x 16 = 128 bytes covered in total
 124.144 +}
 124.145 +
 124.146 +
 124.147 +
 124.148 +/* next one assumes that ((line_size % 16) == 0) */
 124.149 +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 124.150 +{   // Copies rows of 16 bytes from pixels (any alignment) to block, four rows per loop iteration.
 124.151 +POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
 124.152 +    register vector unsigned char pixelsv1, pixelsv2;
 124.153 +    register vector unsigned char pixelsv1B, pixelsv2B;
 124.154 +    register vector unsigned char pixelsv1C, pixelsv2C;
 124.155 +    register vector unsigned char pixelsv1D, pixelsv2D;
 124.156 +
 124.157 +    register vector unsigned char perm = vec_lvsl(0, pixels); // computed once from the initial address and reused for every row
 124.158 +    int i;
 124.159 +    register int line_size_2 = line_size << 1; // byte offset of row 2
 124.160 +    register int line_size_3 = line_size + line_size_2; // byte offset of row 3
 124.161 +    register int line_size_4 = line_size << 2; // advance per loop iteration (4 rows)
 124.162 +
 124.163 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
 124.164 +// hand-unrolling the loop by 4 gains about 15%
 124.165 +// minimum execution time goes from 74 to 60 cycles
 124.166 +// it's faster than -funroll-loops, but using
 124.167 +// -funroll-loops w/ this is bad - 74 cycles again.
 124.168 +// all this is on a 7450, tuning for the 7450
 124.169 +#if 0
 124.170 +    for (i = 0; i < h; i++) {
 124.171 +        pixelsv1 = vec_ld(0, pixels);
 124.172 +        pixelsv2 = vec_ld(16, pixels);
 124.173 +        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 124.174 +               0, block);
 124.175 +        pixels+=line_size;
 124.176 +        block +=line_size;
 124.177 +    }
 124.178 +#else
 124.179 +    for (i = 0; i < h; i += 4) { // rows are handled in groups of four, so an h that is not a multiple of 4 is effectively rounded up
 124.180 +        pixelsv1  = vec_ld( 0, pixels);
 124.181 +        pixelsv2  = vec_ld(15, pixels); // offset 15 rather than 16 for the second load of each row
 124.182 +        pixelsv1B = vec_ld(line_size, pixels);
 124.183 +        pixelsv2B = vec_ld(15 + line_size, pixels);
 124.184 +        pixelsv1C = vec_ld(line_size_2, pixels);
 124.185 +        pixelsv2C = vec_ld(15 + line_size_2, pixels);
 124.186 +        pixelsv1D = vec_ld(line_size_3, pixels);
 124.187 +        pixelsv2D = vec_ld(15 + line_size_3, pixels);
 124.188 +        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 124.189 +               0, (unsigned char*)block);
 124.190 +        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
 124.191 +               line_size, (unsigned char*)block);
 124.192 +        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
 124.193 +               line_size_2, (unsigned char*)block);
 124.194 +        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
 124.195 +               line_size_3, (unsigned char*)block);
 124.196 +        pixels+=line_size_4;
 124.197 +        block +=line_size_4;
 124.198 +    }
 124.199 +#endif
 124.200 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
 124.201 +}
 124.202 +
 124.203 +/* next one assumes that ((line_size % 16) == 0) */
 124.204 +#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 124.205 +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 124.206 +{   // For h rows: block row = vec_avg(block row, 16 bytes read from pixels at any alignment).
 124.207 +POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
 124.208 +    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 124.209 +    register vector unsigned char perm = vec_lvsl(0, pixels); // computed once and reused for every row
 124.210 +    int i;
 124.211 +
 124.212 +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
 124.213 +
 124.214 +    for (i = 0; i < h; i++) {
 124.215 +        pixelsv1 = vec_ld( 0, pixels);
 124.216 +        pixelsv2 = vec_ld(15, pixels); // offset 15, as in put_pixels16_altivec: same vector as offset 16 when pixels is unaligned, but no access to the following 16-byte block when it is aligned
 124.217 +        blockv = vec_ld(0, block);
 124.218 +        pixelsv = vec_perm(pixelsv1, pixelsv2, perm); // the 16 source bytes starting at pixels
 124.219 +        blockv = vec_avg(blockv,pixelsv);
 124.220 +        vec_st(blockv, 0, (unsigned char*)block);
 124.221 +        pixels+=line_size;
 124.222 +        block +=line_size;
 124.223 +    }
 124.224 +
 124.225 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
 124.226 +}
 124.227 +
 124.228 +/* next one assumes that ((line_size % 8) == 0) */
 124.229 +static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 124.230 +{   // For h rows: averages 8 source bytes from pixels into the 8-byte row at block using full 16-byte vector loads/stores.
 124.231 +POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
 124.232 +    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 124.233 +    int i;
 124.234 +
 124.235 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
 124.236 +
 124.237 +   for (i = 0; i < h; i++) {
 124.238 +       /* block is 8 bytes-aligned, so we're either in the
 124.239 +          left block (16 bytes-aligned) or in the right block (not) */
 124.240 +       int rightside = ((unsigned long)block & 0x0000000F); // non-zero when block is not 16-byte aligned
 124.241 +
 124.242 +       blockv = vec_ld(0, block);
 124.243 +       pixelsv1 = vec_ld( 0, pixels);
 124.244 +       pixelsv2 = vec_ld(16, pixels);
 124.245 +       pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); // alignment mask recomputed per row
 124.246 +
 124.247 +       if (rightside) {
 124.248 +           pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); // presumably keeps the first half of blockv and puts the 8 source bytes in the second half — TODO confirm against vcprm
 124.249 +       } else {
 124.250 +           pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); // presumably puts the 8 source bytes in the first half and keeps the second half of blockv — TODO confirm against vcprm
 124.251 +       }
 124.252 +
 124.253 +       blockv = vec_avg(blockv, pixelsv); // NOTE(review): the half copied from blockv is averaged with itself, which presumably leaves it unchanged — confirm
 124.254 +
 124.255 +       vec_st(blockv, 0, block);
 124.256 +
 124.257 +       pixels += line_size;
 124.258 +       block += line_size;
 124.259 +   }
 124.260 +
 124.261 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
 124.262 +}
 124.263 +
 124.264 +/* 8-wide 2x2 interpolation with rounding: out = (a + b + c + d + 2) >> 2 over pixel, right, below, below-right; assumes that ((line_size % 8) == 0) */
 124.265 +static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 124.266 +{
 124.267 +POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
 124.268 +    register int i;
 124.269 +    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
 124.270 +    register vector unsigned char blockv, temp1, temp2;
 124.271 +    register vector unsigned short pixelssum1, pixelssum2, temp3;
 124.272 +    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 124.273 +    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); /* used both as the rounding bias and as the shift count */
 124.274 +
 124.275 +    temp1 = vec_ld(0, pixels);
 124.276 +    temp2 = vec_ld(16, pixels);
 124.277 +    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
 124.278 +    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) { /* pixels + 1 starts the next 16-byte line */
 124.279 +        pixelsv2 = temp2;
 124.280 +    } else {
 124.281 +        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
 124.282 +    }
 124.283 +    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes so the sums below run on 16-bit lanes */
 124.284 +    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.285 +    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 124.286 +                         (vector unsigned short)pixelsv2);
 124.287 +    pixelssum1 = vec_add(pixelssum1, vctwo); /* horizontal pair sums of the first row, rounding bias already added */
 124.288 +
 124.289 +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 124.290 +    for (i = 0; i < h ; i++) {
 124.291 +        int rightside = ((unsigned long)block & 0x0000000F); /* nonzero when block is not 16-byte aligned */
 124.292 +        blockv = vec_ld(0, block);
 124.293 +
 124.294 +        temp1 = vec_ld(line_size, pixels); /* same loads as above, one row further down */
 124.295 +        temp2 = vec_ld(line_size + 16, pixels);
 124.296 +        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 124.297 +        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
 124.298 +            pixelsv2 = temp2;
 124.299 +        } else {
 124.300 +            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 124.301 +        }
 124.302 +
 124.303 +        pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.304 +        pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.305 +        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 124.306 +                             (vector unsigned short)pixelsv2);
 124.307 +        temp3 = vec_add(pixelssum1, pixelssum2); /* previous row sums (bias included) + this row sums */
 124.308 +        temp3 = vec_sra(temp3, vctwo); /* >> 2; a lane is at most 4 * 255 + 2, so the sign bit is never set */
 124.309 +        pixelssum1 = vec_add(pixelssum2, vctwo); /* carry this row, plus bias, into the next iteration */
 124.310 +        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes; the second operand only supplies zero padding */
 124.311 +
 124.312 +        if (rightside) {
 124.313 +            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); /* vcprm is defined outside this view; presumably keeps words 0-1 of blockv and appends result words 0-1 */
 124.314 +        } else {
 124.315 +            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); /* presumably result words 0-1 followed by words 2-3 of blockv */
 124.316 +        }
 124.317 +
 124.318 +        vec_st(blockv, 0, block);
 124.319 +
 124.320 +        block += line_size;
 124.321 +        pixels += line_size;
 124.322 +    }
 124.323 +
 124.324 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 124.325 +}
 124.326 +
 124.327 +/* 8-wide 2x2 interpolation, "no rounding" variant: out = (a + b + c + d + 1) >> 2; assumes that ((line_size % 8) == 0) */
 124.328 +static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 124.329 +{
 124.330 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
 124.331 +    register int i;
 124.332 +    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
 124.333 +    register vector unsigned char blockv, temp1, temp2;
 124.334 +    register vector unsigned short pixelssum1, pixelssum2, temp3;
 124.335 +    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 124.336 +    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); /* bias added to each row sum (1 here, where put_pixels8_xy2_altivec adds 2) */
 124.337 +    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); /* shift count only */
 124.338 +
 124.339 +    temp1 = vec_ld(0, pixels);
 124.340 +    temp2 = vec_ld(16, pixels);
 124.341 +    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
 124.342 +    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) { /* pixels + 1 starts the next 16-byte line */
 124.343 +        pixelsv2 = temp2;
 124.344 +    } else {
 124.345 +        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
 124.346 +    }
 124.347 +    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes so the sums below run on 16-bit lanes */
 124.348 +    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.349 +    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 124.350 +                         (vector unsigned short)pixelsv2);
 124.351 +    pixelssum1 = vec_add(pixelssum1, vcone); /* horizontal pair sums of the first row, bias already added */
 124.352 +
 124.353 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 124.354 +    for (i = 0; i < h ; i++) {
 124.355 +        int rightside = ((unsigned long)block & 0x0000000F); /* nonzero when block is not 16-byte aligned */
 124.356 +        blockv = vec_ld(0, block);
 124.357 +
 124.358 +        temp1 = vec_ld(line_size, pixels); /* same loads as above, one row further down */
 124.359 +        temp2 = vec_ld(line_size + 16, pixels);
 124.360 +        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 124.361 +        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
 124.362 +            pixelsv2 = temp2;
 124.363 +        } else {
 124.364 +            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 124.365 +        }
 124.366 +
 124.367 +        pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.368 +        pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.369 +        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 124.370 +                             (vector unsigned short)pixelsv2);
 124.371 +        temp3 = vec_add(pixelssum1, pixelssum2); /* previous row sums (bias included) + this row sums */
 124.372 +        temp3 = vec_sra(temp3, vctwo); /* >> 2; a lane is at most 4 * 255 + 1, so the sign bit is never set */
 124.373 +        pixelssum1 = vec_add(pixelssum2, vcone); /* carry this row, plus bias, into the next iteration */
 124.374 +        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes; the second operand only supplies zero padding */
 124.375 +
 124.376 +        if (rightside) {
 124.377 +            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); /* vcprm is defined outside this view; presumably keeps words 0-1 of blockv and appends result words 0-1 */
 124.378 +        } else {
 124.379 +            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); /* presumably result words 0-1 followed by words 2-3 of blockv */
 124.380 +        }
 124.381 +
 124.382 +        vec_st(blockv, 0, block);
 124.383 +
 124.384 +        block += line_size;
 124.385 +        pixels += line_size;
 124.386 +    }
 124.387 +
 124.388 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 124.389 +}
 124.390 +
 124.391 +/* 16-wide 2x2 interpolation with rounding: out = (a + b + c + d + 2) >> 2 over pixel, right, below, below-right; assumes that ((line_size % 16) == 0) */
 124.392 +static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 124.393 +{
 124.394 +POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
 124.395 +    register int i;
 124.396 +    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
 124.397 +    register vector unsigned char blockv, temp1, temp2;
 124.398 +    register vector unsigned short temp3, temp4,
 124.399 +        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
 124.400 +    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 124.401 +    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); /* used both as the rounding bias and as the shift count */
 124.402 +
 124.403 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
 124.404 +
 124.405 +    temp1 = vec_ld(0, pixels);
 124.406 +    temp2 = vec_ld(16, pixels);
 124.407 +    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
 124.408 +    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) { /* pixels + 1 starts the next 16-byte line */
 124.409 +        pixelsv2 = temp2;
 124.410 +    } else {
 124.411 +        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
 124.412 +    }
 124.413 +    pixelsv3 = vec_mergel(vczero, pixelsv1); /* mergel/mergeh with zero bytes split the 16 pixels into two halves on 16-bit lanes */
 124.414 +    pixelsv4 = vec_mergel(vczero, pixelsv2);
 124.415 +    pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.416 +    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.417 +    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
 124.418 +                         (vector unsigned short)pixelsv4);
 124.419 +    pixelssum3 = vec_add(pixelssum3, vctwo); /* mergel half: first-row pair sums plus rounding bias */
 124.420 +    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 124.421 +                         (vector unsigned short)pixelsv2);
 124.422 +    pixelssum1 = vec_add(pixelssum1, vctwo); /* mergeh half: first-row pair sums plus rounding bias */
 124.423 +
 124.424 +    for (i = 0; i < h ; i++) {
 124.425 +        blockv = vec_ld(0, block); /* NOTE(review): dead load - blockv is overwritten by vec_packsu below before it is read */
 124.426 +
 124.427 +        temp1 = vec_ld(line_size, pixels); /* same loads as above, one row further down */
 124.428 +        temp2 = vec_ld(line_size + 16, pixels);
 124.429 +        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 124.430 +        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
 124.431 +            pixelsv2 = temp2;
 124.432 +        } else {
 124.433 +            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 124.434 +        }
 124.435 +
 124.436 +        pixelsv3 = vec_mergel(vczero, pixelsv1);
 124.437 +        pixelsv4 = vec_mergel(vczero, pixelsv2);
 124.438 +        pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.439 +        pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.440 +
 124.441 +        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
 124.442 +                             (vector unsigned short)pixelsv4);
 124.443 +        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 124.444 +                             (vector unsigned short)pixelsv2);
 124.445 +        temp4 = vec_add(pixelssum3, pixelssum4); /* previous row sums (bias included) + this row sums */
 124.446 +        temp4 = vec_sra(temp4, vctwo); /* >> 2; a lane is at most 4 * 255 + 2, so the sign bit is never set */
 124.447 +        temp3 = vec_add(pixelssum1, pixelssum2);
 124.448 +        temp3 = vec_sra(temp3, vctwo);
 124.449 +
 124.450 +        pixelssum3 = vec_add(pixelssum4, vctwo); /* carry this row, plus bias, into the next iteration */
 124.451 +        pixelssum1 = vec_add(pixelssum2, vctwo);
 124.452 +
 124.453 +        blockv = vec_packsu(temp3, temp4); /* both halves back to 16 bytes: mergeh half first, mergel half second */
 124.454 +
 124.455 +        vec_st(blockv, 0, block);
 124.456 +
 124.457 +        block += line_size;
 124.458 +        pixels += line_size;
 124.459 +    }
 124.460 +
 124.461 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
 124.462 +}
 124.463 +
 124.464 +/* 16-wide 2x2 interpolation, "no rounding" variant: out = (a + b + c + d + 1) >> 2; assumes that ((line_size % 16) == 0) */
 124.465 +static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 124.466 +{
 124.467 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
 124.468 +    register int i;
 124.469 +    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
 124.470 +    register vector unsigned char blockv, temp1, temp2;
 124.471 +    register vector unsigned short temp3, temp4,
 124.472 +        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
 124.473 +    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 124.474 +    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); /* bias added to each row sum (1 here, where put_pixels16_xy2_altivec adds 2) */
 124.475 +    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); /* shift count only */
 124.476 +
 124.477 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
 124.478 +
 124.479 +    temp1 = vec_ld(0, pixels);
 124.480 +    temp2 = vec_ld(16, pixels);
 124.481 +    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
 124.482 +    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) { /* pixels + 1 starts the next 16-byte line */
 124.483 +        pixelsv2 = temp2;
 124.484 +    } else {
 124.485 +        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
 124.486 +    }
 124.487 +    pixelsv3 = vec_mergel(vczero, pixelsv1); /* mergel/mergeh with zero bytes split the 16 pixels into two halves on 16-bit lanes */
 124.488 +    pixelsv4 = vec_mergel(vczero, pixelsv2);
 124.489 +    pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.490 +    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.491 +    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
 124.492 +                         (vector unsigned short)pixelsv4);
 124.493 +    pixelssum3 = vec_add(pixelssum3, vcone); /* mergel half: first-row pair sums plus bias */
 124.494 +    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 124.495 +                         (vector unsigned short)pixelsv2);
 124.496 +    pixelssum1 = vec_add(pixelssum1, vcone); /* mergeh half: first-row pair sums plus bias */
 124.497 +
 124.498 +    for (i = 0; i < h ; i++) {
 124.499 +        blockv = vec_ld(0, block); /* NOTE(review): dead load - blockv is overwritten by vec_packsu below before it is read */
 124.500 +
 124.501 +        temp1 = vec_ld(line_size, pixels); /* same loads as above, one row further down */
 124.502 +        temp2 = vec_ld(line_size + 16, pixels);
 124.503 +        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 124.504 +        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
 124.505 +            pixelsv2 = temp2;
 124.506 +        } else {
 124.507 +            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 124.508 +        }
 124.509 +
 124.510 +        pixelsv3 = vec_mergel(vczero, pixelsv1);
 124.511 +        pixelsv4 = vec_mergel(vczero, pixelsv2);
 124.512 +        pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.513 +        pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.514 +
 124.515 +        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
 124.516 +                             (vector unsigned short)pixelsv4);
 124.517 +        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 124.518 +                             (vector unsigned short)pixelsv2);
 124.519 +        temp4 = vec_add(pixelssum3, pixelssum4); /* previous row sums (bias included) + this row sums */
 124.520 +        temp4 = vec_sra(temp4, vctwo); /* >> 2; a lane is at most 4 * 255 + 1, so the sign bit is never set */
 124.521 +        temp3 = vec_add(pixelssum1, pixelssum2);
 124.522 +        temp3 = vec_sra(temp3, vctwo);
 124.523 +
 124.524 +        pixelssum3 = vec_add(pixelssum4, vcone); /* carry this row, plus bias, into the next iteration */
 124.525 +        pixelssum1 = vec_add(pixelssum2, vcone);
 124.526 +
 124.527 +        blockv = vec_packsu(temp3, temp4); /* both halves back to 16 bytes: mergeh half first, mergel half second */
 124.528 +
 124.529 +        vec_st(blockv, 0, block);
 124.530 +
 124.531 +        block += line_size;
 124.532 +        pixels += line_size;
 124.533 +    }
 124.534 +
 124.535 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
 124.536 +}
 124.537 +
 124.538 +/* 8-wide 2x2 interpolation ((a + b + c + d + 2) >> 2) whose result is then vec_avg'ed with the existing block contents; assumes that ((line_size % 8) == 0) */
 124.539 +static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 124.540 +{
 124.541 +POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
 124.542 +    register int i;
 124.543 +    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
 124.544 +    register vector unsigned char blockv, temp1, temp2, blocktemp;
 124.545 +    register vector unsigned short pixelssum1, pixelssum2, temp3;
 124.546 +
 124.547 +    register const vector unsigned char vczero = (const vector unsigned char)
 124.548 +                                        vec_splat_u8(0);
 124.549 +    register const vector unsigned short vctwo = (const vector unsigned short)
 124.550 +                                        vec_splat_u16(2); /* used both as the rounding bias and as the shift count */
 124.551 +
 124.552 +    temp1 = vec_ld(0, pixels);
 124.553 +    temp2 = vec_ld(16, pixels);
 124.554 +    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* first row, starting at pixels */
 124.555 +    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) { /* pixels + 1 starts the next 16-byte line */
 124.556 +        pixelsv2 = temp2;
 124.557 +    } else {
 124.558 +        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* first row, starting at pixels + 1 */
 124.559 +    }
 124.560 +    pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes so the sums below run on 16-bit lanes */
 124.561 +    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.562 +    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 124.563 +                         (vector unsigned short)pixelsv2);
 124.564 +    pixelssum1 = vec_add(pixelssum1, vctwo); /* horizontal pair sums of the first row, rounding bias already added */
 124.565 +
 124.566 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
 124.567 +    for (i = 0; i < h ; i++) {
 124.568 +        int rightside = ((unsigned long)block & 0x0000000F); /* nonzero when block is not 16-byte aligned */
 124.569 +        blockv = vec_ld(0, block);
 124.570 +
 124.571 +        temp1 = vec_ld(line_size, pixels); /* same loads as above, one row further down */
 124.572 +        temp2 = vec_ld(line_size + 16, pixels);
 124.573 +        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 124.574 +        if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F) {
 124.575 +            pixelsv2 = temp2;
 124.576 +        } else {
 124.577 +            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 124.578 +        }
 124.579 +
 124.580 +        pixelsv1 = vec_mergeh(vczero, pixelsv1);
 124.581 +        pixelsv2 = vec_mergeh(vczero, pixelsv2);
 124.582 +        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 124.583 +                             (vector unsigned short)pixelsv2);
 124.584 +        temp3 = vec_add(pixelssum1, pixelssum2); /* previous row sums (bias included) + this row sums */
 124.585 +        temp3 = vec_sra(temp3, vctwo); /* >> 2; a lane is at most 4 * 255 + 2, so the sign bit is never set */
 124.586 +        pixelssum1 = vec_add(pixelssum2, vctwo); /* carry this row, plus bias, into the next iteration */
 124.587 +        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes; the second operand only supplies zero padding */
 124.588 +
 124.589 +        if (rightside) {
 124.590 +            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); /* vcprm is defined outside this view; presumably keeps words 0-1 of blockv and appends result words 0-1 */
 124.591 +        } else {
 124.592 +            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); /* presumably result words 0-1 followed by words 2-3 of blockv */
 124.593 +        }
 124.594 +
 124.595 +        blockv = vec_avg(blocktemp, blockv); /* lanes that blocktemp copied from blockv average with themselves */
 124.596 +        vec_st(blockv, 0, block);
 124.597 +
 124.598 +        block += line_size;
 124.599 +        pixels += line_size;
 124.600 +    }
 124.601 +
 124.602 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
 124.603 +}
 124.604 +
 124.605 +void dsputil_init_altivec(DSPContext* c) /* installs this file's AltiVec routines into the function-pointer slots of c */
 124.606 +{
 124.607 +    c->diff_pixels = diff_pixels_altivec;
 124.608 +    c->get_pixels = get_pixels_altivec;
 124.609 +    c->clear_block = clear_block_altivec;
 124.610 +
 124.611 +    c->put_pixels_tab[0][0] = put_pixels16_altivec; /* judging by the names assigned here: first index 0 = 16-wide, 1 = 8-wide; second index 0 = plain, 3 = xy2 */
 124.612 +    /* the two functions do the same thing, so use the same code */
 124.613 +    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
 124.614 +    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
 124.615 +    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
 124.616 +    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
 124.617 +    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
 124.618 +    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
 124.619 +    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
 124.620 +    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
 124.621 +
 124.622 +}

   125.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   125.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_altivec.h	Mon Aug 27 12:09:56 2012 +0200
   125.3 @@ -0,0 +1,52 @@
   125.4 +/*
   125.5 + * Copyright (c) 2002 Brian Foley
   125.6 + * Copyright (c) 2002 Dieter Shirley
   125.7 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
   125.8 + *
   125.9 + * This file is part of FFmpeg.
  125.10 + *
  125.11 + * FFmpeg is free software; you can redistribute it and/or
  125.12 + * modify it under the terms of the GNU Lesser General Public
  125.13 + * License as published by the Free Software Foundation; either
  125.14 + * version 2.1 of the License, or (at your option) any later version.
  125.15 + *
  125.16 + * FFmpeg is distributed in the hope that it will be useful,
  125.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  125.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  125.19 + * Lesser General Public License for more details.
  125.20 + *
  125.21 + * You should have received a copy of the GNU Lesser General Public
  125.22 + * License along with FFmpeg; if not, write to the Free Software
  125.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  125.24 + */
  125.25 +
  125.26 +#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H
  125.27 +#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H
  125.28 +
  125.29 +#include <stdint.h>
  125.30 +#include "libavcodec/dsputil.h"
  125.31 +
  125.32 +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  125.33 +
  125.34 +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  125.35 +
  125.36 +int has_altivec(void);
  125.37 +
  125.38 +void fdct_altivec(int16_t *block);
  125.39 +void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
  125.40 +                  int x16, int y16, int rounder);
  125.41 +void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
  125.42 +void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
  125.43 +
  125.44 +void ff_vp3_idct_altivec(DCTELEM *block);
  125.45 +void ff_vp3_idct_put_altivec(uint8_t *dest, int line_size, DCTELEM *block);
  125.46 +void ff_vp3_idct_add_altivec(uint8_t *dest, int line_size, DCTELEM *block);
  125.47 +
  125.48 +void dsputil_h264_init_ppc(DSPContext* c);
  125.49 +
  125.50 +void dsputil_init_altivec(DSPContext* c);
  125.51 +//void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
  125.52 +//void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
  125.53 +//void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
  125.54 +
  125.55 +#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */

   126.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   126.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.c	Mon Aug 27 12:09:56 2012 +0200
   126.3 @@ -0,0 +1,48 @@
   126.4 +/*
   126.5 + * Copyright (c) 2002 Brian Foley
   126.6 + * Copyright (c) 2002 Dieter Shirley
   126.7 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
   126.8 + *
   126.9 + * This file is part of FFmpeg.
  126.10 + *
  126.11 + * FFmpeg is free software; you can redistribute it and/or
  126.12 + * modify it under the terms of the GNU Lesser General Public
  126.13 + * License as published by the Free Software Foundation; either
  126.14 + * version 2.1 of the License, or (at your option) any later version.
  126.15 + *
  126.16 + * FFmpeg is distributed in the hope that it will be useful,
  126.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  126.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  126.19 + * Lesser General Public License for more details.
  126.20 + *
  126.21 + * You should have received a copy of the GNU Lesser General Public
  126.22 + * License along with FFmpeg; if not, write to the Free Software
  126.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  126.24 + */
  126.25 +
  126.26 +#include "libavcodec/dsputil.h"
  126.27 +#include "dsputil_ppc.h"
  126.28 +#include "dsputil_altivec.h"
  126.29 +
  126.30 +static void prefetch_ppc(void *mem, int stride, int h) /* issues one dcbt cache-touch hint per row: h rows starting at mem, stride bytes apart */
  126.31 +{
  126.32 +    register const uint8_t *p = mem;
  126.33 +    do { /* NOTE(review): do/while - the body always runs at least once, so callers must pass h >= 1 */
  126.34 +        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
  126.35 +        p+= stride;
  126.36 +    } while(--h); /* stops only when h reaches exactly 0; h <= 0 on entry would keep counting down */
  126.37 +}
  126.38 +
  126.39 +void dsputil_init_ppc(DSPContext* c) /* PowerPC entry point: installs the prefetch hook and, when built with AltiVec, the AltiVec routines */
  126.40 +{
  126.41 +    c->prefetch = prefetch_ppc;
  126.42 +
  126.43 +#if HAVE_ALTIVEC /* compile-time switch only: has_altivec(), declared in dsputil_altivec.h, is not consulted here */
  126.44 +	dsputil_h264_init_ppc(c);	
  126.45 +	dsputil_init_altivec(c);
  126.46 +
  126.47 +	c->idct_put = idct_put_altivec;
  126.48 +	c->idct_add = idct_add_altivec;
  126.49 +
  126.50 +#endif /* HAVE_ALTIVEC */
  126.51 +}

   127.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   127.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/dsputil_ppc.h	Mon Aug 27 12:09:56 2012 +0200
   127.3 @@ -0,0 +1,154 @@
   127.4 +/*
   127.5 + * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
   127.6 + *
   127.7 + * This file is part of FFmpeg.
   127.8 + *
   127.9 + * FFmpeg is free software; you can redistribute it and/or
  127.10 + * modify it under the terms of the GNU Lesser General Public
  127.11 + * License as published by the Free Software Foundation; either
  127.12 + * version 2.1 of the License, or (at your option) any later version.
  127.13 + *
  127.14 + * FFmpeg is distributed in the hope that it will be useful,
  127.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  127.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  127.17 + * Lesser General Public License for more details.
  127.18 + *
  127.19 + * You should have received a copy of the GNU Lesser General Public
  127.20 + * License along with FFmpeg; if not, write to the Free Software
  127.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  127.22 + */
  127.23 +
  127.24 +#ifndef AVCODEC_PPC_DSPUTIL_PPC_H
  127.25 +#define AVCODEC_PPC_DSPUTIL_PPC_H
  127.26 +
  127.27 +#include "config.h"
  127.28 +
  127.29 +#if CONFIG_POWERPC_PERF
  127.30 +void powerpc_display_perf_report(void);
  127.31 +/* Number of performance monitor counters (PMCs) sampled: the 604* have 2,
  127.32 +   the G3* have 4, the G4s have 6, and the G5 are completely different
  127.33 +   (they MUST use ARCH_PPC64; let's hope all future 64-bit PPC
  127.34 +   will use the same PMCs...) */
  127.35 +#define POWERPC_NUM_PMC_ENABLED 6
  127.36 +/* One index per instrumented routine. If you add to the enum below, also add to the perfname
  127.37 +   array in dsputil_ppc.c (NOTE(review): dsputil_ppc.c as added by this changeset defines no such array) */
  127.38 +enum powerpc_perf_index {
  127.39 +    altivec_fft_num = 0,
  127.40 +    altivec_gmc1_num,
  127.41 +    altivec_dct_unquantize_h263_num,
  127.42 +    altivec_fdct,
  127.43 +    altivec_idct_add_num,
  127.44 +    altivec_idct_put_num,
  127.45 +    altivec_put_pixels16_num,
  127.46 +    altivec_avg_pixels16_num,
  127.47 +    altivec_avg_pixels8_num,
  127.48 +    altivec_put_pixels8_xy2_num,
  127.49 +    altivec_put_no_rnd_pixels8_xy2_num,
  127.50 +    altivec_put_pixels16_xy2_num,
  127.51 +    altivec_put_no_rnd_pixels16_xy2_num,
  127.52 +    altivec_hadamard8_diff8x8_num,
  127.53 +    altivec_hadamard8_diff16_num,
  127.54 +    altivec_avg_pixels8_xy2_num,
  127.55 +    powerpc_clear_blocks_dcbz32,
  127.56 +    powerpc_clear_blocks_dcbz128,
  127.57 +    altivec_put_h264_chroma_mc8_num,
  127.58 +    altivec_avg_h264_chroma_mc8_num,
  127.59 +    altivec_put_h264_qpel16_h_lowpass_num,
  127.60 +    altivec_avg_h264_qpel16_h_lowpass_num,
  127.61 +    altivec_put_h264_qpel16_v_lowpass_num,
  127.62 +    altivec_avg_h264_qpel16_v_lowpass_num,
  127.63 +    altivec_put_h264_qpel16_hv_lowpass_num,
  127.64 +    altivec_avg_h264_qpel16_hv_lowpass_num,
  127.65 +    powerpc_perf_total /* count of entries above; sizes the middle dimension of perfdata */
  127.66 +};
  127.67 +enum powerpc_data_index { /* statistics kept per counter and per routine; updated by POWERPC_PERF_STOP_COUNT */
  127.68 +    powerpc_data_min = 0, /* smallest stop - start delta seen */
  127.69 +    powerpc_data_max, /* largest delta seen */
  127.70 +    powerpc_data_sum, /* running total of deltas */
  127.71 +    powerpc_data_num, /* number of deltas accumulated */
  127.72 +    powerpc_data_total /* count of entries above; sizes the last dimension of perfdata */
  127.73 +};
  127.74 +extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; /* NOTE(review): no definition appears in dsputil_ppc.c as added by this changeset - verify before enabling CONFIG_POWERPC_PERF */
  127.75 +
  127.76 +#if !ARCH_PPC64 /* non-PPC64 build: counter values are held in unsigned long */
  127.77 +#define POWERP_PMC_DATATYPE unsigned long
  127.78 +#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a)) /* each POWERPC_GET_PMCn(a) reads one special-purpose register into a */
  127.79 +#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a))
  127.80 +#if (POWERPC_NUM_PMC_ENABLED > 2)
  127.81 +#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a))
  127.82 +#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a))
  127.83 +#else /* counters that are not enabled become no-ops that never touch their argument */
  127.84 +#define POWERPC_GET_PMC3(a) do {} while (0)
  127.85 +#define POWERPC_GET_PMC4(a) do {} while (0)
  127.86 +#endif
  127.87 +#if (POWERPC_NUM_PMC_ENABLED > 4)
  127.88 +#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a))
  127.89 +#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a))
  127.90 +#else
  127.91 +#define POWERPC_GET_PMC5(a) do {} while (0)
  127.92 +#define POWERPC_GET_PMC6(a) do {} while (0)
  127.93 +#endif
  127.94 +#else /* ARCH_PPC64 */
  127.95 +#define POWERP_PMC_DATATYPE unsigned long long
  127.96 +#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a)) /* same macros, different register numbers (771-776) */
  127.97 +#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a))
  127.98 +#if (POWERPC_NUM_PMC_ENABLED > 2)
  127.99 +#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a))
 127.100 +#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a))
 127.101 +#else
 127.102 +#define POWERPC_GET_PMC3(a) do {} while (0)
 127.103 +#define POWERPC_GET_PMC4(a) do {} while (0)
 127.104 +#endif
 127.105 +#if (POWERPC_NUM_PMC_ENABLED > 4)
 127.106 +#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a))
 127.107 +#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a))
 127.108 +#else
 127.109 +#define POWERPC_GET_PMC5(a) do {} while (0)
 127.110 +#define POWERPC_GET_PMC6(a) do {} while (0)
 127.111 +#endif
 127.112 +#endif /* ARCH_PPC64 */
 127.113 +#define POWERPC_PERF_DECLARE(a, cond)       /* declares the sample arrays and loop index used by START/STOP; a and cond are unused */ \
 127.114 +    POWERP_PMC_DATATYPE                     \
 127.115 +        pmc_start[POWERPC_NUM_PMC_ENABLED], \
 127.116 +        pmc_stop[POWERPC_NUM_PMC_ENABLED],  \
 127.117 +        pmc_loop_index;
 127.118 +#define POWERPC_PERF_START_COUNT(a, cond) do { /* snapshots PMC6 down to PMC1 into pmc_start; a and cond are unused */ \
 127.119 +    POWERPC_GET_PMC6(pmc_start[5]); \
 127.120 +    POWERPC_GET_PMC5(pmc_start[4]); \
 127.121 +    POWERPC_GET_PMC4(pmc_start[3]); \
 127.122 +    POWERPC_GET_PMC3(pmc_start[2]); \
 127.123 +    POWERPC_GET_PMC2(pmc_start[1]); \
 127.124 +    POWERPC_GET_PMC1(pmc_start[0]); \
 127.125 +    } while (0)
 127.126 +#define POWERPC_PERF_STOP_COUNT(a, cond) do { /* snapshots PMC1 up to PMC6 into pmc_stop, then folds the deltas into perfdata[pmc][a] when cond is true */ \
 127.127 +    POWERPC_GET_PMC1(pmc_stop[0]);            \
 127.128 +    POWERPC_GET_PMC2(pmc_stop[1]);            \
 127.129 +    POWERPC_GET_PMC3(pmc_stop[2]);            \
 127.130 +    POWERPC_GET_PMC4(pmc_stop[3]);            \
 127.131 +    POWERPC_GET_PMC5(pmc_stop[4]);            \
 127.132 +    POWERPC_GET_PMC6(pmc_stop[5]);            \
 127.133 +    if (cond) {                               \
 127.134 +        for(pmc_loop_index = 0;               \
 127.135 +            pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
 127.136 +            pmc_loop_index++) {               \
 127.137 +            if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { /* samples with stop < start are dropped */ \
 127.138 +                POWERP_PMC_DATATYPE diff =                                \
 127.139 +                  pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index];   \
 127.140 +                if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) /* min only ever shrinks; its initial value is set outside this view */ \
 127.141 +                    perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
 127.142 +                if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
 127.143 +                    perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
 127.144 +                perfdata[pmc_loop_index][a][powerpc_data_sum] += diff;    \
 127.145 +                perfdata[pmc_loop_index][a][powerpc_data_num] ++;         \
 127.146 +            }                                 \
 127.147 +        }                                     \
 127.148 +    }                                         \
 127.149 +} while (0)
 127.150 +#else /* CONFIG_POWERPC_PERF */
 127.151 +// No-op fallbacks for builds without CONFIG_POWERPC_PERF; these are needed to avoid empty statements at the call sites.
 127.152 +#define POWERPC_PERF_DECLARE(a, cond)        int altivec_placeholder __attribute__ ((unused))
 127.153 +#define POWERPC_PERF_START_COUNT(a, cond)    do {} while (0)
 127.154 +#define POWERPC_PERF_STOP_COUNT(a, cond)     do {} while (0)
 127.155 +#endif /* CONFIG_POWERPC_PERF */
 127.156 +
 127.157 +#endif /*  AVCODEC_PPC_DSPUTIL_PPC_H */

   128.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   128.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/h264_altivec.c	Mon Aug 27 12:09:56 2012 +0200
   128.3 @@ -0,0 +1,1021 @@
   128.4 +/*
   128.5 + * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
   128.6 + *
   128.7 + * This file is part of FFmpeg.
   128.8 + *
   128.9 + * FFmpeg is free software; you can redistribute it and/or
  128.10 + * modify it under the terms of the GNU Lesser General Public
  128.11 + * License as published by the Free Software Foundation; either
  128.12 + * version 2.1 of the License, or (at your option) any later version.
  128.13 + *
  128.14 + * FFmpeg is distributed in the hope that it will be useful,
  128.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  128.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  128.17 + * Lesser General Public License for more details.
  128.18 + *
  128.19 + * You should have received a copy of the GNU Lesser General Public
  128.20 + * License along with FFmpeg; if not, write to the Free Software
  128.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  128.22 + */
  128.23 +
  128.24 +#include "libavcodec/dsputil.h"
  128.25 +#include "libavcodec/h264_data.h"
  128.26 +#include "libavcodec/h264_dsp.h"
  128.27 +
  128.28 +#include "dsputil_ppc.h"
  128.29 +#include "dsputil_altivec.h"
  128.30 +#include "util_altivec.h"
  128.31 +#include "types_altivec.h"
  128.32 +
  128.33 +#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s /* "put": d receives s unchanged; dst is unused */
  128.34 +#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) /* "avg": d receives vec_avg(dst, s) */
  128.35 +
  128.36 +#define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC /* first inclusion: bind the template names to the put_ variants */
  128.37 +#define PREFIX_h264_chroma_mc8_altivec         put_h264_chroma_mc8_altivec
  128.38 +#define PREFIX_no_rnd_vc1_chroma_mc8_altivec   put_no_rnd_vc1_chroma_mc8_altivec
  128.39 +#define PREFIX_h264_chroma_mc8_num             altivec_put_h264_chroma_mc8_num
  128.40 +#define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
  128.41 +#define PREFIX_h264_qpel16_h_lowpass_num       altivec_put_h264_qpel16_h_lowpass_num
  128.42 +#define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
  128.43 +#define PREFIX_h264_qpel16_v_lowpass_num       altivec_put_h264_qpel16_v_lowpass_num
  128.44 +#define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
  128.45 +#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_put_h264_qpel16_hv_lowpass_num
  128.46 +#include "h264_template_altivec.c" /* template body is not visible here; presumably defines the PREFIX_* functions using OP_U8_ALTIVEC */
  128.47 +#undef OP_U8_ALTIVEC /* release the put_ bindings so the same names can be rebound below */
  128.48 +#undef PREFIX_h264_chroma_mc8_altivec
  128.49 +#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
  128.50 +#undef PREFIX_h264_chroma_mc8_num
  128.51 +#undef PREFIX_h264_qpel16_h_lowpass_altivec
  128.52 +#undef PREFIX_h264_qpel16_h_lowpass_num
  128.53 +#undef PREFIX_h264_qpel16_v_lowpass_altivec
  128.54 +#undef PREFIX_h264_qpel16_v_lowpass_num
  128.55 +#undef PREFIX_h264_qpel16_hv_lowpass_altivec
  128.56 +#undef PREFIX_h264_qpel16_hv_lowpass_num
  128.57 +
  128.58 +#define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC /* second inclusion: bind the template names to the avg_ variants */
  128.59 +#define PREFIX_h264_chroma_mc8_altivec         avg_h264_chroma_mc8_altivec
  128.60 +#define PREFIX_no_rnd_vc1_chroma_mc8_altivec   avg_no_rnd_vc1_chroma_mc8_altivec
  128.61 +#define PREFIX_h264_chroma_mc8_num             altivec_avg_h264_chroma_mc8_num
  128.62 +#define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
  128.63 +#define PREFIX_h264_qpel16_h_lowpass_num       altivec_avg_h264_qpel16_h_lowpass_num
  128.64 +#define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
  128.65 +#define PREFIX_h264_qpel16_v_lowpass_num       altivec_avg_h264_qpel16_v_lowpass_num
  128.66 +#define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
  128.67 +#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_avg_h264_qpel16_hv_lowpass_num
  128.68 +#include "h264_template_altivec.c" /* same template file, now with the avg_ bindings */
  128.69 +#undef OP_U8_ALTIVEC /* clean up so the template macros do not leak into the rest of the file */
  128.70 +#undef PREFIX_h264_chroma_mc8_altivec
  128.71 +#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
  128.72 +#undef PREFIX_h264_chroma_mc8_num
  128.73 +#undef PREFIX_h264_qpel16_h_lowpass_altivec
  128.74 +#undef PREFIX_h264_qpel16_h_lowpass_num
  128.75 +#undef PREFIX_h264_qpel16_v_lowpass_altivec
  128.76 +#undef PREFIX_h264_qpel16_v_lowpass_num
  128.77 +#undef PREFIX_h264_qpel16_hv_lowpass_altivec
  128.78 +#undef PREFIX_h264_qpel16_hv_lowpass_num
  128.79 +
  128.80 +#define H264_MC(OPNAME, SIZE, CODETYPE) /* Defines the 16 static functions OPNAME##h264_qpel##SIZE##_mcXY_##CODETYPE (X, Y in 0..3); in the bodies below a non-zero X involves the h_lowpass filter and a non-zero Y the v_lowpass filter */ \
  128.81 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){ /* mc00: no filtering, forwards to OPNAME##pixels##SIZE */ \
  128.82 +    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
  128.83 +}\
  128.84 +\
  128.85 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc10: _l2 combination of src with the h_lowpass output */ \
  128.86 +    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
  128.87 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
  128.88 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
  128.89 +}\
  128.90 +\
  128.91 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc20: h_lowpass written straight to dst */ \
  128.92 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
  128.93 +}\
  128.94 +\
  128.95 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc30: like mc10 but combined with src+1 */ \
  128.96 +    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
  128.97 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
  128.98 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
  128.99 +}\
 128.100 +\
 128.101 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc01: _l2 combination of src with the v_lowpass output */ \
 128.102 +    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
 128.103 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
 128.104 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
 128.105 +}\
 128.106 +\
 128.107 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc02: v_lowpass written straight to dst */ \
 128.108 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
 128.109 +}\
 128.110 +\
 128.111 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc03: like mc01 but combined with src+stride (the next row) */ \
 128.112 +    DECLARE_ALIGNED(16, uint8_t, half)[SIZE*SIZE];\
 128.113 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
 128.114 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
 128.115 +}\
 128.116 +\
 128.117 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc11/mc31/mc13/mc33: _l2 combination of an h_lowpass and a v_lowpass block; they differ only in the src offsets fed to each filter */ \
 128.118 +    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
 128.119 +    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
 128.120 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
 128.121 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
 128.122 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 128.123 +}\
 128.124 +\
 128.125 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
 128.126 +    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
 128.127 +    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
 128.128 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
 128.129 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
 128.130 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 128.131 +}\
 128.132 +\
 128.133 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
 128.134 +    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
 128.135 +    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
 128.136 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
 128.137 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
 128.138 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 128.139 +}\
 128.140 +\
 128.141 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
 128.142 +    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
 128.143 +    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
 128.144 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
 128.145 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
 128.146 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
 128.147 +}\
 128.148 +\
 128.149 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc22: hv_lowpass written straight to dst, with tmp as its int16_t scratch buffer */ \
 128.150 +    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
 128.151 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
 128.152 +}\
 128.153 +\
 128.154 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc21/mc23: _l2 combination of an h_lowpass block (from src or src + stride) with the hv_lowpass block */ \
 128.155 +    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
 128.156 +    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
 128.157 +    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
 128.158 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
 128.159 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
 128.160 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
 128.161 +}\
 128.162 +\
 128.163 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
 128.164 +    DECLARE_ALIGNED(16, uint8_t, halfH)[SIZE*SIZE];\
 128.165 +    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
 128.166 +    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
 128.167 +    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
 128.168 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
 128.169 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
 128.170 +}\
 128.171 +\
 128.172 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ /* mc12/mc32: _l2 combination of a v_lowpass block (from src or src+1) with the hv_lowpass block */ \
 128.173 +    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
 128.174 +    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
 128.175 +    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
 128.176 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
 128.177 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
 128.178 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 128.179 +}\
 128.180 +\
 128.181 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
 128.182 +    DECLARE_ALIGNED(16, uint8_t, halfV)[SIZE*SIZE];\
 128.183 +    DECLARE_ALIGNED(16, uint8_t, halfHV)[SIZE*SIZE];\
 128.184 +    DECLARE_ALIGNED(16, int16_t, tmp)[SIZE*(SIZE+8)];\
 128.185 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
 128.186 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
 128.187 +    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
 128.188 +}\
 128.189 +
 128.190 +static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
 128.191 +                                    const uint8_t * src2, int dst_stride,
 128.192 +                                    int src_stride1, int h)
 128.193 +{ /* Writes h rows of 16 bytes to dst, each row being vec_avg(src1 row, src2 row). src1 rows are src_stride1 bytes apart, src2 rows 16 bytes apart, dst rows dst_stride bytes apart. */
 128.194 +    int i;
 128.195 +    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 128.196 +
 128.197 +    mask_ = vec_lvsl(0, src2); /* computed once: src2 advances by exactly 16 bytes per row, so its alignment never changes */
 128.198 +
 128.199 +    for (i = 0; i < h; i++) {
 128.200 +
 128.201 +        tmp1 = vec_ld(i * src_stride1, src1); /* two aligned loads + vec_perm = unaligned 16-byte read of the src1 row */
 128.202 +        mask = vec_lvsl(i * src_stride1, src1);
 128.203 +        tmp2 = vec_ld(i * src_stride1 + 15, src1);
 128.204 +
 128.205 +        a = vec_perm(tmp1, tmp2, mask);
 128.206 +
 128.207 +        tmp1 = vec_ld(i * 16, src2); /* same unaligned-read idiom for the src2 row */
 128.208 +        tmp2 = vec_ld(i * 16 + 15, src2);
 128.209 +
 128.210 +        b = vec_perm(tmp1, tmp2, mask_);
 128.211 +
 128.212 +        tmp1 = vec_ld(0, dst); /* current contents of the two aligned vectors overlapping the dst row */
 128.213 +        mask = vec_lvsl(0, dst);
 128.214 +        tmp2 = vec_ld(15, dst);
 128.215 +
 128.216 +        d = vec_avg(a, b); /* the 16 output bytes */
 128.217 +
 128.218 +        edges = vec_perm(tmp2, tmp1, mask); /* existing dst bytes; presumably the ones lying outside the 16-byte row, so they can be written back unchanged */
 128.219 +
 128.220 +        align = vec_lvsr(0, dst);
 128.221 +
 128.222 +        tmp2 = vec_perm(d, edges, align); /* build the two aligned vectors to store from d and edges */
 128.223 +        tmp1 = vec_perm(edges, d, align);
 128.224 +
 128.225 +        vec_st(tmp2, 15, dst);
 128.226 +        vec_st(tmp1, 0 , dst);
 128.227 +
 128.228 +        dst += dst_stride;
 128.229 +    }
 128.230 +}
 128.231 +
 128.232 +static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
 128.233 +                                    const uint8_t * src2, int dst_stride,
 128.234 +                                    int src_stride1, int h)
 128.235 +{ /* Same as put_pixels16_l2_altivec except that each output row is vec_avg(current dst row, vec_avg(src1 row, src2 row)). */
 128.236 +    int i;
 128.237 +    vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
 128.238 +
 128.239 +    mask_ = vec_lvsl(0, src2); /* computed once: src2 advances by exactly 16 bytes per row, so its alignment never changes */
 128.240 +
 128.241 +    for (i = 0; i < h; i++) {
 128.242 +
 128.243 +        tmp1 = vec_ld(i * src_stride1, src1); /* two aligned loads + vec_perm = unaligned 16-byte read of the src1 row */
 128.244 +        mask = vec_lvsl(i * src_stride1, src1);
 128.245 +        tmp2 = vec_ld(i * src_stride1 + 15, src1);
 128.246 +
 128.247 +        a = vec_perm(tmp1, tmp2, mask);
 128.248 +
 128.249 +        tmp1 = vec_ld(i * 16, src2); /* same unaligned-read idiom for the src2 row */
 128.250 +        tmp2 = vec_ld(i * 16 + 15, src2);
 128.251 +
 128.252 +        b = vec_perm(tmp1, tmp2, mask_);
 128.253 +
 128.254 +        tmp1 = vec_ld(0, dst); /* current contents of the two aligned vectors overlapping the dst row */
 128.255 +        mask = vec_lvsl(0, dst);
 128.256 +        tmp2 = vec_ld(15, dst);
 128.257 +
 128.258 +        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); /* inner vec_perm extracts the current dst row, which is averaged with vec_avg(a, b) */
 128.259 +
 128.260 +        edges = vec_perm(tmp2, tmp1, mask); /* existing dst bytes; presumably the ones lying outside the 16-byte row, so they can be written back unchanged */
 128.261 +
 128.262 +        align = vec_lvsr(0, dst);
 128.263 +
 128.264 +        tmp2 = vec_perm(d, edges, align); /* build the two aligned vectors to store from d and edges */
 128.265 +        tmp1 = vec_perm(edges, d, align);
 128.266 +
 128.267 +        vec_st(tmp2, 15, dst);
 128.268 +        vec_st(tmp1, 0 , dst);
 128.269 +
 128.270 +        dst += dst_stride;
 128.271 +    }
 128.272 +}
 128.273 +
 128.274 +/* Implemented but could be faster
 128.275 +#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
 128.276 +#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
 128.277 + */
 128.278 +
 128.279 +H264_MC(put_, 16, altivec) /* expands to the static put_h264_qpel16_mcXY_altivec functions */
 128.280 +H264_MC(avg_, 16, altivec) /* expands to the static avg_h264_qpel16_mcXY_altivec functions */
 128.281 +
 128.282 +
 128.283 +/****************************************************************************
 128.284 + * IDCT transform:
 128.285 + ****************************************************************************/
 128.286 +
 128.287 +#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) /* 1-D 4-point pass: inputs vb0..vb3, outputs va0..va3; vz0..vz3 must be declared by the caller */ \
 128.288 +    /* 1st stage */                                               \
 128.289 +    vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
 128.290 +    vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
 128.291 +    vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
 128.292 +    vz2 = vec_sub(vz2,vb3);       /* temp[2] = (Y[1]>>1) - Y[3] */ \
 128.293 +    vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
 128.294 +    vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + (Y[3]>>1) */ \
 128.295 +    /* 2nd stage: output */                                       \
 128.296 +    va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
 128.297 +    va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
 128.298 +    va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
 128.299 +    va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */
 128.300 +
 128.301 +#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) /* Transposes the 4x4 block held in the first four elements of a0..a3: afterwards the first four elements of bK are element K of a0, a1, a2, a3. a0..a3 are overwritten; the remaining elements of b0..b3 are filler. */ \
 128.302 +    b0 = vec_mergeh( a0, a0 ); \
 128.303 +    b1 = vec_mergeh( a1, a0 ); \
 128.304 +    b2 = vec_mergeh( a2, a0 ); \
 128.305 +    b3 = vec_mergeh( a3, a0 ); \
 128.306 +    a0 = vec_mergeh( b0, b2 ); \
 128.307 +    a1 = vec_mergel( b0, b2 ); \
 128.308 +    a2 = vec_mergeh( b1, b3 ); \
 128.309 +    a3 = vec_mergel( b1, b3 ); \
 128.310 +    b0 = vec_mergeh( a0, a2 ); \
 128.311 +    b1 = vec_mergel( a0, a2 ); \
 128.312 +    b2 = vec_mergeh( a1, a3 ); \
 128.313 +    b3 = vec_mergel( a1, a3 )
 128.314 +
 128.315 +#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) /* adds va to the bytes at dst and stores one 32-bit word back; relies on dst, vdst*, va_u8, va_u32, element and zero_* from the caller */ \
 128.316 +    vdst_orig = vec_ld(0, dst);                               \
 128.317 +    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
 128.318 +    vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); /* interleave with zero bytes to widen u8 to 16 bit */ \
 128.319 +    va = vec_add(va, vdst_ss);                                \
 128.320 +    va_u8 = vec_packsu(va, zero_s16v); /* pack back to bytes with unsigned saturation */ \
 128.321 +    va_u32 = vec_splat((vec_u32)va_u8, 0);                  \
 128.322 +    vec_ste(va_u32, element, (uint32_t*)dst);
 128.323 +
 128.324 +static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
 128.325 +{ /* Two VEC_1D_DCT passes (with a transpose in between) over the 16 coefficients in block, >>6, then the result is added to four rows of dst that are stride bytes apart. Modifies block[0]. */
 128.326 +    vec_s16 va0, va1, va2, va3;
 128.327 +    vec_s16 vz0, vz1, vz2, vz3;
 128.328 +    vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
 128.329 +    vec_u8 va_u8;
 128.330 +    vec_u32 va_u32;
 128.331 +    vec_s16 vdst_ss;
 128.332 +    const vec_u16 v6us = vec_splat_u16(6);
 128.333 +    vec_u8 vdst, vdst_orig;
 128.334 +    vec_u8 vdst_mask = vec_lvsl(0, dst);
 128.335 +    int element = ((unsigned long)dst & 0xf) >> 2; /* index of dst's 32-bit word within its 16-byte line */
 128.336 +    LOAD_ZERO;
 128.337 +
 128.338 +    block[0] += 32;  /* add 32 as a DC-level for rounding (half of the 64 divided out by the final >>6) */
 128.339 +
 128.340 +    vtmp0 = vec_ld(0,block);
 128.341 +    vtmp1 = vec_sld(vtmp0, vtmp0, 8); /* rotate by 8 bytes so block[4..7] come first */
 128.342 +    vtmp2 = vec_ld(16,block);
 128.343 +    vtmp3 = vec_sld(vtmp2, vtmp2, 8); /* rotate by 8 bytes so block[12..15] come first */
 128.344 +
 128.345 +    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
 128.346 +    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
 128.347 +    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
 128.348 +
 128.349 +    va0 = vec_sra(va0,v6us);
 128.350 +    va1 = vec_sra(va1,v6us);
 128.351 +    va2 = vec_sra(va2,v6us);
 128.352 +    va3 = vec_sra(va3,v6us);
 128.353 +
 128.354 +    VEC_LOAD_U8_ADD_S16_STORE_U8(va0); /* one output row per macro use; the macro reads the local dst, hence the manual advance */
 128.355 +    dst += stride;
 128.356 +    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
 128.357 +    dst += stride;
 128.358 +    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
 128.359 +    dst += stride;
 128.360 +    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
 128.361 +}
 128.362 +
 128.363 +#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7) { /* 1-D 8-point pass: inputs s0..s7, outputs d0..d7; the shift-count vectors onev and twov must be in scope at the expansion site */ \
 128.364 +    /*        a0  = SRC(0) + SRC(4); */ \
 128.365 +    vec_s16 a0v = vec_add(s0, s4);    \
 128.366 +    /*        a2  = SRC(0) - SRC(4); */ \
 128.367 +    vec_s16 a2v = vec_sub(s0, s4);    \
 128.368 +    /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
 128.369 +    vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6);    \
 128.370 +    /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
 128.371 +    vec_s16 a6v = vec_add(vec_sra(s6, onev), s2);    \
 128.372 +    /*        b0  =         a0 + a6; */ \
 128.373 +    vec_s16 b0v = vec_add(a0v, a6v);  \
 128.374 +    /*        b2  =         a2 + a4; */ \
 128.375 +    vec_s16 b2v = vec_add(a2v, a4v);  \
 128.376 +    /*        b4  =         a2 - a4; */ \
 128.377 +    vec_s16 b4v = vec_sub(a2v, a4v);  \
 128.378 +    /*        b6  =         a0 - a6; */ \
 128.379 +    vec_s16 b6v = vec_sub(a0v, a6v);  \
 128.380 +    /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
 128.381 +    /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
 128.382 +    vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
 128.383 +    /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
 128.384 +    /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
 128.385 +    vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
 128.386 +    /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
 128.387 +    /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */ \
 128.388 +    vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
 128.389 +    /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */ \
 128.390 +    vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
 128.391 +    /*        b1 =                  (a7>>2)  +  a1; */ \
 128.392 +    vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
 128.393 +    /*        b3 =          a3 +        (a5>>2); */ \
 128.394 +    vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
 128.395 +    /*        b5 =                  (a3>>2)  -   a5; */ \
 128.396 +    vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
 128.397 +    /*        b7 =           a7 -        (a1>>2); */ \
 128.398 +    vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
 128.399 +    /* DST(0,    b0 + b7); */ \
 128.400 +    d0 = vec_add(b0v, b7v); \
 128.401 +    /* DST(1,    b2 + b5); */ \
 128.402 +    d1 = vec_add(b2v, b5v); \
 128.403 +    /* DST(2,    b4 + b3); */ \
 128.404 +    d2 = vec_add(b4v, b3v); \
 128.405 +    /* DST(3,    b6 + b1); */ \
 128.406 +    d3 = vec_add(b6v, b1v); \
 128.407 +    /* DST(4,    b6 - b1); */ \
 128.408 +    d4 = vec_sub(b6v, b1v); \
 128.409 +    /* DST(5,    b4 - b3); */ \
 128.410 +    d5 = vec_sub(b4v, b3v); \
 128.411 +    /* DST(6,    b2 - b5); */ \
 128.412 +    d6 = vec_sub(b2v, b5v); \
 128.413 +    /* DST(7,    b0 - b7); */ \
 128.414 +    d7 = vec_sub(b0v, b7v); \
 128.415 +}
 128.416 +
 128.417 +#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { /* adds (idctv >> sixv) to the 8 bytes at dest with saturation and writes them back through an unaligned read-modify-write; sixv, zero_u8v and zero_s16v must be in scope */ \
 128.418 +    /* unaligned load */                                       \
 128.419 +    vec_u8 hv = vec_ld( 0, dest );                           \
 128.420 +    vec_u8 lv = vec_ld( 7, dest );                           \
 128.421 +    vec_u8 dstv   = vec_perm( hv, lv, (vec_u8)perm_ldv );  \
 128.422 +    vec_s16 idct_sh6 = vec_sra(idctv, sixv);                 \
 128.423 +    vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv);   \
 128.424 +    vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16);  \
 128.425 +    vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum);        \
 128.426 +    vec_u8 edgehv;                                           \
 128.427 +    /* unaligned store: sel-derived masks pick the new bytes, all other bytes keep their loaded values */ \
 128.428 +    vec_u8 bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
 128.429 +    vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
 128.430 +    lv    = vec_sel( lv, bodyv, edgelv );                      \
 128.431 +    vec_st( lv, 7, dest );                                     \
 128.432 +    hv    = vec_ld( 0, dest );                                 \
 128.433 +    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
 128.434 +    hv    = vec_sel( hv, bodyv, edgehv );                      \
 128.435 +    vec_st( hv, 0, dest );                                     \
 128.436 + }
 128.437 +
 128.438 +static void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { /* Two IDCT8_1D_ALTIVEC passes (with TRANSPOSE8 in between) over the 64 coefficients in dct; the result is >>6 and added to eight rows of dst that are stride bytes apart. Modifies dct[0]. */
 128.439 +    vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
 128.440 +    vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
 128.441 +    vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
 128.442 +
 128.443 +    vec_u8 perm_ldv = vec_lvsl(0, dst); /* permute masks are derived from dst only and reused for every row */
 128.444 +    vec_u8 perm_stv = vec_lvsr(8, dst);
 128.445 +
 128.446 +    const vec_u16 onev = vec_splat_u16(1); /* onev/twov are used inside IDCT8_1D_ALTIVEC, sixv inside ALTIVEC_STORE_SUM_CLIP */
 128.447 +    const vec_u16 twov = vec_splat_u16(2);
 128.448 +    const vec_u16 sixv = vec_splat_u16(6);
 128.449 +
 128.450 +    const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
 128.451 +    LOAD_ZERO;
 128.452 +
 128.453 +    dct[0] += 32; // rounding for the >>6 at the end
 128.454 +
 128.455 +    s0 = vec_ld(0x00, (int16_t*)dct); /* each 16-byte load fetches eight consecutive coefficients */
 128.456 +    s1 = vec_ld(0x10, (int16_t*)dct);
 128.457 +    s2 = vec_ld(0x20, (int16_t*)dct);
 128.458 +    s3 = vec_ld(0x30, (int16_t*)dct);
 128.459 +    s4 = vec_ld(0x40, (int16_t*)dct);
 128.460 +    s5 = vec_ld(0x50, (int16_t*)dct);
 128.461 +    s6 = vec_ld(0x60, (int16_t*)dct);
 128.462 +    s7 = vec_ld(0x70, (int16_t*)dct);
 128.463 +
 128.464 +    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
 128.465 +                     d0, d1, d2, d3, d4, d5, d6, d7);
 128.466 +
 128.467 +    TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );
 128.468 +
 128.469 +    IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
 128.470 +                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
 128.471 +
 128.472 +    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
 128.473 +    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
 128.474 +    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
 128.475 +    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
 128.476 +    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
 128.477 +    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
 128.478 +    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
 128.479 +    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
 128.480 +}
 128.481 +
 128.482 +static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, DCTELEM *block, int stride, int size)
 128.483 +{ /* Adds dc = (block[0] + 32) >> 6 to size rows of dst (stride bytes apart) using saturating byte arithmetic; the callers in this file pass size 4 or 8. */
 128.484 +    vec_s16 dc16;
 128.485 +    vec_u8 dcplus, dcminus, v0, v1, v2, v3, aligner;
 128.486 +    LOAD_ZERO;
 128.487 +    DECLARE_ALIGNED(16, int, dc);
 128.488 +    int i;
 128.489 +
 128.490 +    dc = (block[0] + 32) >> 6;
 128.491 +    dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); /* splat 16-bit element 1 of the loaded int; presumably its low half on big-endian PowerPC -- confirm */
 128.492 +
 128.493 +    if (size == 4)
 128.494 +        dc16 = vec_sld(dc16, zero_s16v, 8); /* keep dc in the first four lanes only, zero the other four */
 128.495 +    dcplus = vec_packsu(dc16, zero_s16v); /* unsigned saturation: dc where dc > 0, else 0 */
 128.496 +    dcminus = vec_packsu(vec_sub(zero_s16v, dc16), zero_s16v); /* -dc where dc < 0, else 0 */
 128.497 +
 128.498 +    aligner = vec_lvsr(0, dst);
 128.499 +    dcplus = vec_perm(dcplus, dcplus, aligner); /* presumably rotates the dc bytes to dst's offset inside its aligned 16-byte line */
 128.500 +    dcminus = vec_perm(dcminus, dcminus, aligner);
 128.501 +
 128.502 +    for (i = 0; i < size; i += 4) { /* four rows per iteration */
 128.503 +        v0 = vec_ld(0, dst+0*stride);
 128.504 +        v1 = vec_ld(0, dst+1*stride);
 128.505 +        v2 = vec_ld(0, dst+2*stride);
 128.506 +        v3 = vec_ld(0, dst+3*stride);
 128.507 +
 128.508 +        v0 = vec_adds(v0, dcplus); /* add the positive part ... */
 128.509 +        v1 = vec_adds(v1, dcplus);
 128.510 +        v2 = vec_adds(v2, dcplus);
 128.511 +        v3 = vec_adds(v3, dcplus);
 128.512 +
 128.513 +        v0 = vec_subs(v0, dcminus); /* ... then subtract the negative part; only one of the two is non-zero */
 128.514 +        v1 = vec_subs(v1, dcminus);
 128.515 +        v2 = vec_subs(v2, dcminus);
 128.516 +        v3 = vec_subs(v3, dcminus);
 128.517 +
 128.518 +        vec_st(v0, 0, dst+0*stride);
 128.519 +        vec_st(v1, 0, dst+1*stride);
 128.520 +        vec_st(v2, 0, dst+2*stride);
 128.521 +        vec_st(v3, 0, dst+3*stride);
 128.522 +
 128.523 +        dst += 4*stride;
 128.524 +    }
 128.525 +}
 128.526 +
 128.527 +static void h264_idct_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
 128.528 +{ /* DC-only add for a 4x4 block: h264_idct_dc_add_internal with size 4 */
 128.529 +    h264_idct_dc_add_internal(dst, block, stride, 4);
 128.530 +}
 128.531 +
 128.532 +static void ff_h264_idct8_dc_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
 128.533 +{ /* DC-only add for an 8x8 block: h264_idct_dc_add_internal with size 8 */
 128.534 +    h264_idct_dc_add_internal(dst, block, stride, 8);
 128.535 +}
 128.536 +
 128.537 +static void ff_h264_idct_add16_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ /* For each of 16 blocks (16 coefficients each) whose nnzc[scan8[i]] entry is non-zero, adds its transform to dst + block_offset[i] */
 128.538 +    int i;
 128.539 +    for(i=0; i<16; i++){
 128.540 +        int nnz = nnzc[ scan8[i] ];
 128.541 +        if(nnz){
 128.542 +            if(nnz==1 && block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride); /* count of 1 with a non-zero first coefficient: DC-only shortcut */
 128.543 +            else                      ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
 128.544 +        }
 128.545 +    }
 128.546 +}
 128.547 +
 128.548 +static void ff_h264_idct_add16intra_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ /* Like ff_h264_idct_add16_altivec, but a block with a zero nnzc entry still gets the DC-only add when its first coefficient is non-zero */
 128.549 +    int i;
 128.550 +    for(i=0; i<16; i++){
 128.551 +        if(nnzc[ scan8[i] ]) ff_h264_idct_add_altivec(dst + block_offset[i], block + i*16, stride);
 128.552 +        else if(block[i*16]) h264_idct_dc_add_altivec(dst + block_offset[i], block + i*16, stride);
 128.553 +    }
 128.554 +}
 128.555 +
 128.556 +static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ /* Visits i = 0, 4, 8, 12 (one 8x8 transform per step, 64 coefficients apart) and adds each block whose nnzc[scan8[i]] entry is non-zero */
 128.557 +    int i;
 128.558 +    for(i=0; i<16; i+=4){
 128.559 +        int nnz = nnzc[ scan8[i] ];
 128.560 +        if(nnz){
 128.561 +            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_altivec(dst + block_offset[i], block + i*16, stride); /* count of 1 with a non-zero first coefficient: DC-only shortcut */
 128.562 +            else                      ff_h264_idct8_add_altivec   (dst + block_offset[i], block + i*16, stride);
 128.563 +        }
 128.564 +    }
 128.565 +}
 128.566 +
 128.567 +static void ff_h264_idct_add8_altivec(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ /* Handles blocks 16..23: blocks 16..19 are added to dest[0], blocks 20..23 to dest[1] */
 128.568 +    int i;
 128.569 +    for(i=16; i<16+8; i++){
 128.570 +        if(nnzc[ scan8[i] ])
 128.571 +            ff_h264_idct_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); /* (i&4)>>2 is 0 for 16..19 and 1 for 20..23 */
 128.572 +        else if(block[i*16])
 128.573 +            h264_idct_dc_add_altivec(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
 128.574 +    }
 128.575 +}
 128.576 +
 128.577 +#define transpose4x16(r0, r1, r2, r3) {      /* In-place byte interleave of r0..r3 through two rounds of merges: afterwards r0..r3 hold sixteen 4-byte groups (r0[k], r1[k], r2[k], r3[k]), k = 0..15 in order; r4..r7 are block-local temporaries */ \
 128.578 +    register vec_u8 r4;                    \
 128.579 +    register vec_u8 r5;                    \
 128.580 +    register vec_u8 r6;                    \
 128.581 +    register vec_u8 r7;                    \
 128.582 +                                             \
 128.583 +    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
 128.584 +    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
 128.585 +    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
 128.586 +    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
 128.587 +                                             \
 128.588 +    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
 128.589 +    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
 128.590 +    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
 128.591 +    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
 128.592 +}
 128.593 +
 128.594 +static inline void write16x4(uint8_t *dst, int dst_stride,
 128.595 +                             register vec_u8 r0, register vec_u8 r1,
 128.596 +                             register vec_u8 r2, register vec_u8 r3) { /* Spills r0..r3 to a 64-byte aligned buffer, then copies its sixteen 32-bit words to sixteen consecutive rows of dst, one word per row */
 128.597 +    DECLARE_ALIGNED(16, unsigned char, result)[64];
 128.598 +    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
 128.599 +    int int_dst_stride = dst_stride/4; /* row stride in uint32_t units; exact only when dst_stride is a multiple of 4 */
 128.600 +
 128.601 +    vec_st(r0, 0, result);
 128.602 +    vec_st(r1, 16, result);
 128.603 +    vec_st(r2, 32, result);
 128.604 +    vec_st(r3, 48, result);
 128.605 +    /* FIXME: there has to be a better way than sixteen scalar word copies */
 128.606 +    *dst_int = *src_int;
 128.607 +    *(dst_int+   int_dst_stride) = *(src_int + 1);
 128.608 +    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
 128.609 +    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
 128.610 +    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
 128.611 +    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
 128.612 +    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
 128.613 +    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
 128.614 +    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
 128.615 +    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
 128.616 +    *(dst_int+10*int_dst_stride) = *(src_int + 10);
 128.617 +    *(dst_int+11*int_dst_stride) = *(src_int + 11);
 128.618 +    *(dst_int+12*int_dst_stride) = *(src_int + 12);
 128.619 +    *(dst_int+13*int_dst_stride) = *(src_int + 13);
 128.620 +    *(dst_int+14*int_dst_stride) = *(src_int + 14);
 128.621 +    *(dst_int+15*int_dst_stride) = *(src_int + 15);
 128.622 +}
 128.623 +
 128.624 +/** \brief performs a 6x16 transpose of data in src, and stores it to dst
 128.625 +    \todo FIXME: see if we can spare some vec_lvsl() calls by factoring them
 128.626 +    out of unaligned_load() */
 128.627 +#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
 128.628 +    register vec_u8 r0  = unaligned_load(0,             src);            \
 128.629 +    register vec_u8 r1  = unaligned_load(   src_stride, src);            \
 128.630 +    register vec_u8 r2  = unaligned_load(2* src_stride, src);            \
 128.631 +    register vec_u8 r3  = unaligned_load(3* src_stride, src);            \
 128.632 +    register vec_u8 r4  = unaligned_load(4* src_stride, src);            \
 128.633 +    register vec_u8 r5  = unaligned_load(5* src_stride, src);            \
 128.634 +    register vec_u8 r6  = unaligned_load(6* src_stride, src);            \
 128.635 +    register vec_u8 r7  = unaligned_load(7* src_stride, src);            \
 128.636 +    register vec_u8 r14 = unaligned_load(14*src_stride, src);            \
 128.637 +    register vec_u8 r15 = unaligned_load(15*src_stride, src);            \
 128.638 +                                                                           \
 128.639 +    r8  = unaligned_load( 8*src_stride, src);                              \
 128.640 +    r9  = unaligned_load( 9*src_stride, src);                              \
 128.641 +    r10 = unaligned_load(10*src_stride, src);                              \
 128.642 +    r11 = unaligned_load(11*src_stride, src);                              \
 128.643 +    r12 = unaligned_load(12*src_stride, src);                              \
 128.644 +    r13 = unaligned_load(13*src_stride, src);                              \
 128.645 +                                                                           \
 128.646 +    /*Merge first pairs*/                                                  \
 128.647 +    r0 = vec_mergeh(r0, r8);    /*0, 8*/                                   \
 128.648 +    r1 = vec_mergeh(r1, r9);    /*1, 9*/                                   \
 128.649 +    r2 = vec_mergeh(r2, r10);   /*2,10*/                                   \
 128.650 +    r3 = vec_mergeh(r3, r11);   /*3,11*/                                   \
 128.651 +    r4 = vec_mergeh(r4, r12);   /*4,12*/                                   \
 128.652 +    r5 = vec_mergeh(r5, r13);   /*5,13*/                                   \
 128.653 +    r6 = vec_mergeh(r6, r14);   /*6,14*/                                   \
 128.654 +    r7 = vec_mergeh(r7, r15);   /*7,15*/                                   \
 128.655 +                                                                           \
 128.656 +    /*Merge second pairs*/                                                 \
 128.657 +    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/                        \
 128.658 +    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/                        \
 128.659 +    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/                        \
 128.660 +    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/                        \
 128.661 +    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/                        \
 128.662 +    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/                        \
 128.663 +    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/                        \
 128.664 +    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/                        \
 128.665 +                                                                           \
 128.666 +    /*Third merge*/                                                        \
 128.667 +    r0 = vec_mergeh(r8,  r12);  /*0,2,4,6,8,10,12,14 set 0*/               \
 128.668 +    r1 = vec_mergel(r8,  r12);  /*0,2,4,6,8,10,12,14 set 1*/               \
 128.669 +    r2 = vec_mergeh(r9,  r13);  /*0,2,4,6,8,10,12,14 set 2*/               \
 128.670 +    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/               \
 128.671 +    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/               \
 128.672 +    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/               \
 128.673 +    /* Don't need to compute 3 and 7*/                                     \
 128.674 +                                                                           \
 128.675 +    /*Final merge*/                                                        \
 128.676 +    r8  = vec_mergeh(r0, r4);   /*all set 0*/                              \
 128.677 +    r9  = vec_mergel(r0, r4);   /*all set 1*/                              \
 128.678 +    r10 = vec_mergeh(r1, r5);   /*all set 2*/                              \
 128.679 +    r11 = vec_mergel(r1, r5);   /*all set 3*/                              \
 128.680 +    r12 = vec_mergeh(r2, r6);   /*all set 4*/                              \
 128.681 +    r13 = vec_mergel(r2, r6);   /*all set 5*/                              \
 128.682 +    /* Don't need to compute 14 and 15*/                                   \
 128.683 +                                                                           \
 128.684 +}
 128.685 +
 128.686 +/* Byte-wise test |x - y| < a; the comparison result is returned as a vec_u8 mask. */
 128.687 +static inline vec_u8 diff_lt_altivec(register vec_u8 x,
 128.688 +                                     register vec_u8 y,
 128.689 +                                     register vec_u8 a)
 128.690 +{
 128.691 +    /* a saturating subtraction gives 0 in the "negative" direction, */
 128.692 +    /* so OR-ing both directions leaves the absolute difference      */
 128.693 +    register vec_u8 abs_diff = vec_or(vec_subs(x, y), vec_subs(y, x));
 128.694 +
 128.695 +    return (vec_u8)vec_cmplt(abs_diff, a);
 128.696 +}
 128.697 +
 128.698 +/* Combines the three threshold tests into one byte mask:
 128.699 +   |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta. */
 128.700 +static inline vec_u8 h264_deblock_mask(register vec_u8 p0,
 128.701 +                                       register vec_u8 p1,
 128.702 +                                       register vec_u8 q0,
 128.703 +                                       register vec_u8 q1,
 128.704 +                                       register vec_u8 alpha,
 128.705 +                                       register vec_u8 beta)
 128.706 +{
 128.707 +    /* the three tests do not depend on each other */
 128.708 +    register vec_u8 edge_ok    = diff_lt_altivec(p0, q0, alpha);
 128.709 +    register vec_u8 p_side_ok  = diff_lt_altivec(p1, p0, beta);
 128.710 +    register vec_u8 q_side_ok  = diff_lt_altivec(q1, q0, beta);
 128.711 +    register vec_u8 both_sides = vec_and(p_side_ok, q_side_ok);
 128.712 +
 128.713 +    /* a byte stays set only where all three tests hold */
 128.714 +    return vec_and(edge_ok, both_sides);
 128.715 +}
 128.716 +
 128.717 +// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0), per byte; called with the q-side values it yields newq1 the same way
 128.718 +static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
 128.719 +                                       register vec_u8 p1,
 128.720 +                                       register vec_u8 p2,
 128.721 +                                       register vec_u8 q0,
 128.722 +                                       register vec_u8 tc0) {
 128.723 +    /* vec_avg rounds up, so the low bit of (avg ^ p2) is subtracted below to obtain the truncating >> 1 */
 128.724 +    register vec_u8 average = vec_avg(p0, q0);
 128.725 +    register vec_u8 temp;
 128.726 +    register vec_u8 uncliped;
 128.727 +    register vec_u8 ones;
 128.728 +    register vec_u8 max;
 128.729 +    register vec_u8 min;
 128.730 +    register vec_u8 newp1;
 128.731 +
 128.732 +    temp = vec_xor(average, p2);
 128.733 +    average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */
 128.734 +    ones = vec_splat_u8(1);
 128.735 +    temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */
 128.736 +    uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */
 128.737 +    max = vec_adds(p1, tc0);            /* p1 + tc0, saturating */
 128.738 +    min = vec_subs(p1, tc0);            /* p1 - tc0, saturating */
 128.739 +    newp1 = vec_max(min, uncliped);     /* clip from below */
 128.740 +    newp1 = vec_min(max, newp1);        /* clip from above */
 128.741 +    return newp1;
 128.742 +}
 128.743 +/* Updates the lvalues p0 and q0 in place: builds the filter delta with a +160 bias using 8-bit averages, splits it into d and -d, limits both to tc0masked and applies them with saturating adds/subs. p1 and q1 are only read. */
 128.744 +#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
 128.745 +                                                                                                  \
 128.746 +    const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                               \
 128.747 +                                                                                                  \
 128.748 +    register vec_u8 pq0bit = vec_xor(p0,q0);                                                    \
 128.749 +    register vec_u8 q1minus;                                                                    \
 128.750 +    register vec_u8 p0minus;                                                                    \
 128.751 +    register vec_u8 stage1;                                                                     \
 128.752 +    register vec_u8 stage2;                                                                     \
 128.753 +    register vec_u8 vec160;                                                                     \
 128.754 +    register vec_u8 delta;                                                                      \
 128.755 +    register vec_u8 deltaneg;                                                                   \
 128.756 +                                                                                                  \
 128.757 +    q1minus = vec_nor(q1, q1);                 /* 255 - q1 */                                     \
 128.758 +    stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
 128.759 +    stage2 = vec_sr(stage1, vec_splat_u8(1));  /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */     \
 128.760 +    p0minus = vec_nor(p0, p0);                 /* 255 - p0 */                                     \
 128.761 +    stage1 = vec_avg(q0, p0minus);             /* (q0 - p0 + 256)>>1 */                           \
 128.762 +    pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                    \
 128.763 +    stage2 = vec_avg(stage2, pq0bit);          /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
 128.764 +    stage2 = vec_adds(stage2, stage1);         /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */  \
 128.765 +    vec160 = vec_ld(0, &A0v);                                                                     \
 128.766 +    deltaneg = vec_subs(vec160, stage2);       /* -d */                                           \
 128.767 +    delta = vec_subs(stage2, vec160);          /* d */                                            \
 128.768 +    deltaneg = vec_min(tc0masked, deltaneg);                                                      \
 128.769 +    delta = vec_min(tc0masked, delta);                                                            \
 128.770 +    p0 = vec_subs(p0, deltaneg);                                                                  \
 128.771 +    q0 = vec_subs(q0, delta);                                                                     \
 128.772 +    p0 = vec_adds(p0, delta);                                                                     \
 128.773 +    q0 = vec_adds(q0, deltaneg);                                                                  \
 128.774 +}
 128.775 +/* Filters 16 edge positions at once. p2..q2 are vec_u8 lvalues (p1, p0, q0, q1 are overwritten), alpha/beta are scalars, tc0 points to 4 int8 values, each of which is spread over 4 consecutive byte lanes. */
 128.776 +#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
 128.777 +    DECLARE_ALIGNED(16, unsigned char, temp)[16];                                             \
 128.778 +    register vec_u8 alphavec;                                                              \
 128.779 +    register vec_u8 betavec;                                                               \
 128.780 +    register vec_u8 mask;                                                                  \
 128.781 +    register vec_u8 p1mask;                                                                \
 128.782 +    register vec_u8 q1mask;                                                                \
 128.783 +    register vector signed   char tc0vec;                                                    \
 128.784 +    register vec_u8 finaltc0;                                                              \
 128.785 +    register vec_u8 tc0masked;                                                             \
 128.786 +    register vec_u8 newp1;                                                                 \
 128.787 +    register vec_u8 newq1;                                                                 \
 128.788 +                                                                                             \
 128.789 +    temp[0] = alpha;                                                                         \
 128.790 +    temp[1] = beta;                                                                          \
 128.791 +    alphavec = vec_ld(0, temp);                                                              \
 128.792 +    betavec = vec_splat(alphavec, 0x1);                                                      \
 128.793 +    alphavec = vec_splat(alphavec, 0x0);                                                     \
 128.794 +    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */            \
 128.795 +    /* copy the four tc0 bytes one by one: no int* cast, so no aliasing or alignment assumptions */ \
 128.796 +    temp[0] = (tc0)[0]; temp[1] = (tc0)[1]; temp[2] = (tc0)[2]; temp[3] = (tc0)[3];          \
 128.797 +    tc0vec = vec_ld(0, (signed char*)temp);                                                  \
 128.798 +    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
 128.799 +    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
 128.800 +    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1)));  /* if tc0[i] >= 0 */         \
 128.801 +    finaltc0 = vec_and((vec_u8)tc0vec, mask);     /* tc = tc0 */                           \
 128.802 +                                                                                             \
 128.803 +    p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
 128.804 +    p1mask = vec_and(p1mask, mask);                             /* if ( |p2 - p0| < beta) */ \
 128.805 +    tc0masked = vec_and(p1mask, (vec_u8)tc0vec);                                           \
 128.806 +    finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
 128.807 +    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                      \
 128.808 +    /*end if*/                                                                               \
 128.809 +                                                                                             \
 128.810 +    q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
 128.811 +    q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
 128.812 +    tc0masked = vec_and(q1mask, (vec_u8)tc0vec);                                           \
 128.813 +    finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
 128.814 +    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                      \
 128.815 +    /*end if*/                                                                               \
 128.816 +                                                                                             \
 128.817 +    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                            \
 128.818 +    p1 = newp1;                                                                              \
 128.819 +    q1 = newq1;                                                                              \
 128.820 +}
 128.821 +/* Loads the six rows at -3*stride..2*stride around pix, filters them and stores the four rows at -2*stride..stride back. */
 128.822 +static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
 128.823 +    register vec_u8 p2, p1, p0, q0, q1, q2;
 128.824 +    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
 128.825 +        return; /* the sign bit survives the AND only if every tc0[i] is negative */
 128.826 +    p2 = vec_ld(-3*stride, pix);
 128.827 +    p1 = vec_ld(-2*stride, pix);
 128.828 +    p0 = vec_ld(-1*stride, pix);
 128.829 +    q0 = vec_ld(0, pix);
 128.830 +    q1 = vec_ld(stride, pix);
 128.831 +    q2 = vec_ld(2*stride, pix);
 128.832 +    h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
 128.833 +    vec_st(p1, -2*stride, pix);
 128.834 +    vec_st(p0, -1*stride, pix);
 128.835 +    vec_st(q0, 0, pix);
 128.836 +    vec_st(q1, stride, pix);
 128.837 +}
 128.838 +/* Same filter as the v variant, but the six input vectors come from a transposed read starting at pix-3, */
 128.839 +/* and the four modified vectors are transposed back and written starting at pix-2. */
 128.840 +static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
 128.841 +    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
 128.842 +        register vec_u8 p2, p1, p0, q0, q1, q2;
 128.843 +        readAndTranspose16x6(pix-3, stride, p2, p1, p0, q0, q1, q2);
 128.844 +        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
 128.845 +        transpose4x16(p1, p0, q0, q1);
 128.846 +        write16x4(pix-2, stride, p1, p0, q0, q1);
 128.847 +    }
 128.848 +}
 128.849 +/* In-place weighting of h rows of block: each processed pixel becomes sat_u8((pixel*weight + (offset << log2_denom) + rounding) >> log2_denom), computed in 16-bit lanes. w is the block width (16 or 8). */
 128.850 +static av_always_inline
 128.851 +void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
 128.852 +{
 128.853 +    int y, aligned;
 128.854 +    vec_u8 vblock;
 128.855 +    vec_s16 vtemp, vweight, voffset, v0, v1;
 128.856 +    vec_u16 vlog2_denom;
 128.857 +    DECLARE_ALIGNED(16, int32_t, temp)[4];
 128.858 +    LOAD_ZERO;
 128.859 +
 128.860 +    offset *= 1 << log2_denom; /* multiply rather than shift: left-shifting a negative offset is undefined */
 128.861 +    if(log2_denom) offset += 1<<(log2_denom-1);
 128.862 +    temp[0] = log2_denom;
 128.863 +    temp[1] = weight;
 128.864 +    temp[2] = offset;
 128.865 +    /* temp[] is viewed as eight 16-bit lanes; lanes 1, 3 and 5 are the second halfwords of temp[0..2] (temp[3] is never splatted) */
 128.866 +    vtemp = (vec_s16)vec_ld(0, temp);
 128.867 +    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
 128.868 +    vweight = vec_splat(vtemp, 3);
 128.869 +    voffset = vec_splat(vtemp, 5);
 128.870 +    aligned = !((unsigned long)block & 0xf);
 128.871 +
 128.872 +    for (y=0; y<h; y++) {
 128.873 +        vblock = vec_ld(0, block);
 128.874 +        /* widen the 16 bytes into two vectors of eight 16-bit values */
 128.875 +        v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
 128.876 +        v1 = (vec_s16)vec_mergel(zero_u8v, vblock);
 128.877 +        /* for w != 16 only one half is weighted, chosen by block's alignment; the other half is written back unchanged */
 128.878 +        if (w == 16 || aligned) {
 128.879 +            v0 = vec_mladd(v0, vweight, zero_s16v);
 128.880 +            v0 = vec_adds(v0, voffset);
 128.881 +            v0 = vec_sra(v0, vlog2_denom);
 128.882 +        }
 128.883 +        if (w == 16 || !aligned) {
 128.884 +            v1 = vec_mladd(v1, vweight, zero_s16v);
 128.885 +            v1 = vec_adds(v1, voffset);
 128.886 +            v1 = vec_sra(v1, vlog2_denom);
 128.887 +        }
 128.888 +        vblock = vec_packsu(v0, v1);
 128.889 +        vec_st(vblock, 0, block);
 128.890 +
 128.891 +        block += stride;
 128.892 +    }
 128.893 +}
 128.894 +/* Bi-directional weighting of h rows: each processed dst pixel becomes sat_u8((dst*weightd + src*weights + (((offset + 1) | 1) << log2_denom)) >> (log2_denom + 1)), computed in 16-bit lanes. w is the block width (16 or 8). */
 128.895 +static av_always_inline
 128.896 +void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
 128.897 +                               int weightd, int weights, int offset, int w, int h)
 128.898 +{
 128.899 +    int y, dst_aligned, src_aligned;
 128.900 +    vec_u8 vsrc, vdst;
 128.901 +    vec_s16 vtemp, vweights, vweightd, voffset, v0, v1, v2, v3;
 128.902 +    vec_u16 vlog2_denom;
 128.903 +    DECLARE_ALIGNED(16, int32_t, temp)[4];
 128.904 +    LOAD_ZERO;
 128.905 +    /* multiply rather than shift: left-shifting a negative value is undefined */
 128.906 +    offset = ((offset + 1) | 1) * (1 << log2_denom);
 128.907 +    temp[0] = log2_denom+1;
 128.908 +    temp[1] = weights;
 128.909 +    temp[2] = weightd;
 128.910 +    temp[3] = offset;
 128.911 +    /* temp[] is viewed as eight 16-bit lanes; lanes 1, 3, 5 and 7 are the second halfwords of temp[0..3] */
 128.912 +    vtemp = (vec_s16)vec_ld(0, temp);
 128.913 +    vlog2_denom = (vec_u16)vec_splat(vtemp, 1);
 128.914 +    vweights = vec_splat(vtemp, 3);
 128.915 +    vweightd = vec_splat(vtemp, 5);
 128.916 +    voffset = vec_splat(vtemp, 7);
 128.917 +    dst_aligned = !((unsigned long)dst & 0xf);
 128.918 +    src_aligned = !((unsigned long)src & 0xf);
 128.919 +
 128.920 +    for (y=0; y<h; y++) {
 128.921 +        vdst = vec_ld(0, dst);
 128.922 +        vsrc = vec_ld(0, src);
 128.923 +        /* widen both 16-byte vectors into pairs of eight 16-bit values */
 128.924 +        v0 = (vec_s16)vec_mergeh(zero_u8v, vdst);
 128.925 +        v1 = (vec_s16)vec_mergel(zero_u8v, vdst);
 128.926 +        v2 = (vec_s16)vec_mergeh(zero_u8v, vsrc);
 128.927 +        v3 = (vec_s16)vec_mergel(zero_u8v, vsrc);
 128.928 +        /* w == 8: duplicate the src half selected by src's alignment so either dst half can be paired with it */
 128.929 +        if (w == 8) {
 128.930 +            if (src_aligned)
 128.931 +                v3 = v2;
 128.932 +            else
 128.933 +                v2 = v3;
 128.934 +        }
 128.935 +        /* for w != 16 only one dst half is weighted, chosen by dst's alignment; the other half is written back unchanged */
 128.936 +        if (w == 16 || dst_aligned) {
 128.937 +            v0 = vec_mladd(v0, vweightd, zero_s16v);
 128.938 +            v2 = vec_mladd(v2, vweights, zero_s16v);
 128.939 +
 128.940 +            v0 = vec_adds(v0, voffset);
 128.941 +            v0 = vec_adds(v0, v2);
 128.942 +            v0 = vec_sra(v0, vlog2_denom);
 128.943 +        }
 128.944 +        if (w == 16 || !dst_aligned) {
 128.945 +            v1 = vec_mladd(v1, vweightd, zero_s16v);
 128.946 +            v3 = vec_mladd(v3, vweights, zero_s16v);
 128.947 +
 128.948 +            v1 = vec_adds(v1, voffset);
 128.949 +            v1 = vec_adds(v1, v3);
 128.950 +            v1 = vec_sra(v1, vlog2_denom);
 128.951 +        }
 128.952 +        vdst = vec_packsu(v0, v1);
 128.953 +        vec_st(vdst, 0, dst);
 128.954 +
 128.955 +        dst += stride;
 128.956 +        src += stride;
 128.957 +    }
 128.958 +}
 128.959 +/* H264_WEIGHT(W,H) defines the fixed-size wrappers ff_weight_h264_pixelsWxH_altivec and ff_biweight_h264_pixelsWxH_altivec, which forward to the generic WxH helpers with W and H as constants. */
 128.960 +#define H264_WEIGHT(W,H) \
 128.961 +static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
 128.962 +    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
 128.963 +}\
 128.964 +static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
 128.965 +    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
 128.966 +}
 128.967 +
 128.968 +H264_WEIGHT(16,16)
 128.969 +H264_WEIGHT(16, 8)
 128.970 +H264_WEIGHT( 8,16)
 128.971 +H264_WEIGHT( 8, 8)
 128.972 +H264_WEIGHT( 8, 4)
 128.973 +/* Installs the AltiVec chroma mc8 and 16x16 qpel functions into the DSPContext tables. */
 128.974 +void dsputil_h264_init_ppc(DSPContext *c)
 128.975 +{
 128.976 +    c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
 128.977 +    c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
 128.978 +
 128.979 +    /* Fills one group of four slots: slot 4*Y + X gets PFX NUM _mcXY_altivec, X = 0..3. */
 128.980 +    /* Every parameter is only pasted or used as an index, so no nested expansion occurs. */
 128.981 +#define dsprow(PFX, IDX, NUM, Y) \
 128.982 +    c->PFX ## _pixels_tab[IDX][4*Y + 0] = PFX ## NUM ## _mc0 ## Y ## _altivec; \
 128.983 +    c->PFX ## _pixels_tab[IDX][4*Y + 1] = PFX ## NUM ## _mc1 ## Y ## _altivec; \
 128.984 +    c->PFX ## _pixels_tab[IDX][4*Y + 2] = PFX ## NUM ## _mc2 ## Y ## _altivec; \
 128.985 +    c->PFX ## _pixels_tab[IDX][4*Y + 3] = PFX ## NUM ## _mc3 ## Y ## _altivec
 128.986 +
 128.987 +    /* put table, slots 0..15 */
 128.988 +    dsprow(put_h264_qpel, 0, 16, 0);
 128.989 +    dsprow(put_h264_qpel, 0, 16, 1);
 128.990 +    dsprow(put_h264_qpel, 0, 16, 2);
 128.991 +    dsprow(put_h264_qpel, 0, 16, 3);
 128.992 +    /* avg table, slots 0..15 */
 128.993 +    dsprow(avg_h264_qpel, 0, 16, 0);
 128.994 +    dsprow(avg_h264_qpel, 0, 16, 1);
 128.995 +    dsprow(avg_h264_qpel, 0, 16, 2);
 128.996 +    dsprow(avg_h264_qpel, 0, 16, 3);
 128.997 +
 128.998 +#undef dsprow
 128.999 +}
128.1000 +/* Points the H264DSPContext function pointers at the AltiVec implementations. */
128.1001 +void ff_h264dsp_init_ppc(H264DSPContext *c){
128.1002 +    c->h264_idct_dc_add          = h264_idct_dc_add_altivec;
128.1003 +    c->h264_idct_add             = ff_h264_idct_add_altivec;
128.1004 +    c->h264_idct_add8            = ff_h264_idct_add8_altivec;
128.1005 +    c->h264_idct_add16           = ff_h264_idct_add16_altivec;
128.1006 +    c->h264_idct_add16intra      = ff_h264_idct_add16intra_altivec;
128.1007 +    /* idct8 entry points and luma loop filters */
128.1008 +    c->h264_idct8_dc_add         = ff_h264_idct8_dc_add_altivec;
128.1009 +    c->h264_idct8_add            = ff_h264_idct8_add_altivec;
128.1010 +    c->h264_idct8_add4           = ff_h264_idct8_add4_altivec;
128.1011 +    c->h264_v_loop_filter_luma   = h264_v_loop_filter_luma_altivec;
128.1012 +    c->h264_h_loop_filter_luma   = h264_h_loop_filter_luma_altivec;
128.1013 +    /* weighting tables: slots 0..4 hold 16x16, 16x8, 8x16, 8x8, 8x4 */
128.1014 +    c->weight_h264_pixels_tab[0]   = ff_weight_h264_pixels16x16_altivec;
128.1015 +    c->weight_h264_pixels_tab[1]   = ff_weight_h264_pixels16x8_altivec;
128.1016 +    c->weight_h264_pixels_tab[2]   = ff_weight_h264_pixels8x16_altivec;
128.1017 +    c->weight_h264_pixels_tab[3]   = ff_weight_h264_pixels8x8_altivec;
128.1018 +    c->weight_h264_pixels_tab[4]   = ff_weight_h264_pixels8x4_altivec;
128.1019 +    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
128.1020 +    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
128.1021 +    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
128.1022 +    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
128.1023 +    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
128.1024 +}

   129.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   129.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/h264_template_altivec.c	Mon Aug 27 12:09:56 2012 +0200
   129.3 @@ -0,0 +1,783 @@
   129.4 +/*
   129.5 + * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
   129.6 + *
   129.7 + * This file is part of FFmpeg.
   129.8 + *
   129.9 + * FFmpeg is free software; you can redistribute it and/or
  129.10 + * modify it under the terms of the GNU Lesser General Public
  129.11 + * License as published by the Free Software Foundation; either
  129.12 + * version 2.1 of the License, or (at your option) any later version.
  129.13 + *
  129.14 + * FFmpeg is distributed in the hope that it will be useful,
  129.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  129.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  129.17 + * Lesser General Public License for more details.
  129.18 + *
  129.19 + * You should have received a copy of the GNU Lesser General Public
  129.20 + * License along with FFmpeg; if not, write to the Free Software
  129.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  129.22 + */
  129.23 +/* ASSERT_ALIGNED(ptr): with DEBUG_ALIGNMENT defined, asserts that the low four address bits of ptr are zero (16-byte aligned); otherwise it expands to an empty statement. */
  129.24 +//#define DEBUG_ALIGNMENT
  129.25 +#ifdef DEBUG_ALIGNMENT
  129.26 +#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr)&0x0000000F));
  129.27 +#else
  129.28 +#define ASSERT_ALIGNED(ptr) ;
  129.29 +#endif
  129.30 +
  129.31 +/* this code assumes that stride % 16 == 0 */
  129.32 +/* Per-row body of the 4-tap loops: widens vsrc2uc/vsrc3uc, forms A*s0 + B*s1 + C*s2 + D*s3 + BIAS1, applies BIAS2() and >> 6, merges the packed result into the dst vector via fperm and OP_U8_ALTIVEC, stores it, then reuses the lower row as the next upper row and advances dst/src by stride. */
  129.33 +#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
  129.34 +        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
  129.35 +        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
  129.36 +\
  129.37 +        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
  129.38 +        psum = vec_mladd(vB, vsrc1ssH, psum);\
  129.39 +        psum = vec_mladd(vC, vsrc2ssH, psum);\
  129.40 +        psum = vec_mladd(vD, vsrc3ssH, psum);\
  129.41 +        psum = BIAS2(psum);\
  129.42 +        psum = vec_sr(psum, v6us);\
  129.43 +\
  129.44 +        vdst = vec_ld(0, dst);\
  129.45 +        ppsum = (vec_u8)vec_pack(psum, psum);\
  129.46 +        vfdst = vec_perm(vdst, ppsum, fperm);\
  129.47 +\
  129.48 +        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
  129.49 +\
  129.50 +        vec_st(fsum, 0, dst);\
  129.51 +\
  129.52 +        vsrc0ssH = vsrc2ssH;\
  129.53 +        vsrc1ssH = vsrc3ssH;\
  129.54 +\
  129.55 +        dst += stride;\
  129.56 +        src += stride;
  129.57 +/* Per-row body of the 2-tap loops: widens vsrc0uc/vsrc1uc, forms (A*s0 + E*s1 + 32) >> 6, merges the packed result into the dst vector via fperm and OP_U8_ALTIVEC, stores it and advances dst/src by stride. */
  129.58 +#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
  129.59 +\
  129.60 +        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
  129.61 +        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
  129.62 +\
  129.63 +        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
  129.64 +        psum = vec_mladd(vE, vsrc1ssH, psum);\
  129.65 +        psum = vec_sr(psum, v6us);\
  129.66 +\
  129.67 +        vdst = vec_ld(0, dst);\
  129.68 +        ppsum = (vec_u8)vec_pack(psum, psum);\
  129.69 +        vfdst = vec_perm(vdst, ppsum, fperm);\
  129.70 +\
  129.71 +        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
  129.72 +\
  129.73 +        vec_st(fsum, 0, dst);\
  129.74 +\
  129.75 +        dst += stride;\
  129.76 +        src += stride;
  129.77 +/* Candidates for the BIAS2 argument of CHROMA_MC8_ALTIVEC_CORE: noop passes the sum through, add28 adds v28ss (v28ss is not defined in the code visible here; presumably provided by whichever function uses add28). */
  129.78 +#define noop(a) a
  129.79 +#define add28(a) vec_add(v28ss, a)
  129.80 +/* Template for 8-pixel-wide chroma interpolation over h rows: out = (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6 with A..D derived from x and y, combined with dst through OP_U8_ALTIVEC (PREFIX_ and OP_U8_ALTIVEC are presumably defined by the file that includes this template). */
  129.81 +static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
  129.82 +                                    int stride, int h, int x, int y) {
  129.83 +  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
  129.84 +    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
  129.85 +                        {((8 - x) * (8 - y)),
  129.86 +                         ((    x) * (8 - y)),
  129.87 +                         ((8 - x) * (    y)),
  129.88 +                         ((    x) * (    y))};
  129.89 +    register int i;
  129.90 +    vec_u8 fperm;
  129.91 +    const vec_s32 vABCD = vec_ld(0, ABCD);
  129.92 +    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
  129.93 +    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
  129.94 +    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
  129.95 +    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
  129.96 +    LOAD_ZERO;
  129.97 +    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
  129.98 +    const vec_u16 v6us = vec_splat_u16(6);
  129.99 +    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
 129.100 +    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
 129.101 +
 129.102 +    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
 129.103 +    vec_u8 vsrc0uc, vsrc1uc;
 129.104 +    vec_s16 vsrc0ssH, vsrc1ssH;
 129.105 +    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
 129.106 +    vec_s16 vsrc2ssH, vsrc3ssH, psum;
 129.107 +    vec_u8 vdst, ppsum, vfdst, fsum;
 129.108 +
 129.109 +  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
 129.110 +    /* fperm picks which 8 bytes of the dst vector are replaced by the result (indices 0x10+ select from ppsum): the first 8 when dst is 16-byte aligned, the last 8 otherwise */
 129.111 +    if (((unsigned long)dst) % 16 == 0) {
 129.112 +        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
 129.113 +                         0x14, 0x15, 0x16, 0x17,
 129.114 +                         0x08, 0x09, 0x0A, 0x0B,
 129.115 +                         0x0C, 0x0D, 0x0E, 0x0F};
 129.116 +    } else {
 129.117 +        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
 129.118 +                         0x04, 0x05, 0x06, 0x07,
 129.119 +                         0x18, 0x19, 0x1A, 0x1B,
 129.120 +                         0x1C, 0x1D, 0x1E, 0x1F};
 129.121 +    }
 129.122 +    /* first source row: the second aligned vector is loaded only when src % 16 > 7 (loadSecond) */
 129.123 +    vsrcAuc = vec_ld(0, src);
 129.124 +
 129.125 +    if (loadSecond)
 129.126 +        vsrcBuc = vec_ld(16, src);
 129.127 +    vsrcperm0 = vec_lvsl(0, src);
 129.128 +    vsrcperm1 = vec_lvsl(1, src);
 129.129 +
 129.130 +    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
 129.131 +    if (reallyBadAlign)
 129.132 +        vsrc1uc = vsrcBuc;
 129.133 +    else
 129.134 +        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
 129.135 +
 129.136 +    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
 129.137 +    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
 129.138 +    /* ABCD[3] = x*y: non-zero means all four taps are needed; otherwise B and C are folded into vE for a 2-tap filter */
 129.139 +    if (ABCD[3]) {
 129.140 +        if (!loadSecond) {// -> !reallyBadAlign
 129.141 +            for (i = 0 ; i < h ; i++) {
 129.142 +                vsrcCuc = vec_ld(stride + 0, src);
 129.143 +                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
 129.144 +                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 129.145 +
 129.146 +                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
 129.147 +            }
 129.148 +        } else {
 129.149 +            vec_u8 vsrcDuc;
 129.150 +            for (i = 0 ; i < h ; i++) {
 129.151 +                vsrcCuc = vec_ld(stride + 0, src);
 129.152 +                vsrcDuc = vec_ld(stride + 16, src);
 129.153 +                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
 129.154 +                if (reallyBadAlign)
 129.155 +                    vsrc3uc = vsrcDuc;
 129.156 +                else
 129.157 +                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 129.158 +
 129.159 +                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
 129.160 +            }
 129.161 +        }
 129.162 +    } else {
 129.163 +        const vec_s16 vE = vec_add(vB, vC);
 129.164 +        if (ABCD[2]) { // x == 0 B == 0
 129.165 +            if (!loadSecond) {// -> !reallyBadAlign
 129.166 +                for (i = 0 ; i < h ; i++) {
 129.167 +                    vsrcCuc = vec_ld(stride + 0, src);
 129.168 +                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
 129.169 +                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
 129.170 +
 129.171 +                    vsrc0uc = vsrc1uc;
 129.172 +                }
 129.173 +            } else {
 129.174 +                vec_u8 vsrcDuc;
 129.175 +                for (i = 0 ; i < h ; i++) {
 129.176 +                    vsrcCuc = vec_ld(stride + 0, src);
 129.177 +                    vsrcDuc = vec_ld(stride + 15, src);
 129.178 +                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
 129.179 +                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
 129.180 +
 129.181 +                    vsrc0uc = vsrc1uc;
 129.182 +                }
 129.183 +            }
 129.184 +        } else { // y == 0 C == 0
 129.185 +            if (!loadSecond) {// -> !reallyBadAlign
 129.186 +                for (i = 0 ; i < h ; i++) {
 129.187 +                    vsrcCuc = vec_ld(0, src);
 129.188 +                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
 129.189 +                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 129.190 +
 129.191 +                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
 129.192 +                }
 129.193 +            } else {
 129.194 +                vec_u8 vsrcDuc;
 129.195 +                for (i = 0 ; i < h ; i++) {
 129.196 +                    vsrcCuc = vec_ld(0, src);
 129.197 +                    vsrcDuc = vec_ld(15, src);
 129.198 +                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
 129.199 +                    if (reallyBadAlign)
 129.200 +                        vsrc1uc = vsrcDuc;
 129.201 +                    else
 129.202 +                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 129.203 +
 129.204 +                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
 129.205 +                }
 129.206 +            }
 129.207 +        }
 129.208 +    }
 129.209 +    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
 129.210 +}
 129.211 +
 129.212 +/* Sets up the four weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy and runs h row iterations over src; the per-row arithmetic and the write to dst are done by CHROMA_MC8_ALTIVEC_CORE, which is defined outside this view (presumably it also advances src and dst by stride -- TODO confirm). This code assumes that stride % 16 == 0. */
 129.213 +static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
 129.214 +   DECLARE_ALIGNED(16, signed int, ABCD)[4] =
 129.215 +                        {((8 - x) * (8 - y)), /* A */
 129.216 +                         ((    x) * (8 - y)), /* B */
 129.217 +                         ((8 - x) * (    y)), /* C */
 129.218 +                         ((    x) * (    y))}; /* D */
 129.219 +    register int i;
 129.220 +    vec_u8 fperm;
 129.221 +    const vec_s32 vABCD = vec_ld(0, ABCD); /* all four 32-bit weights in one vector */
 129.222 +    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1); /* halfwords 1/3/5/7 are the low 16 bits of ABCD[0..3] (big-endian element order assumed) */
 129.223 +    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
 129.224 +    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
 129.225 +    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
 129.226 +    LOAD_ZERO;
 129.227 +    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); /* (1 << 5) - 4 = 28; not referenced directly below, presumably used via the add28 macro argument */
 129.228 +    const vec_u16 v6us  = vec_splat_u16(6); /* 6; not referenced directly below, presumably the shift used by the core macro */
 129.229 +    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1; /* 1 when src sits at offset 8..15 of its 16-byte block */
 129.230 +    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; /* 1 when src + 1 is 16-byte aligned */
 129.231 +
 129.232 +    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
 129.233 +    vec_u8 vsrc0uc, vsrc1uc;
 129.234 +    vec_s16 vsrc0ssH, vsrc1ssH;
 129.235 +    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
 129.236 +    vec_s16 vsrc2ssH, vsrc3ssH, psum;
 129.237 +    vec_u8 vdst, ppsum, vfdst, fsum;
 129.238 +
 129.239 +    if (((unsigned long)dst) % 16 == 0) { /* fperm is not used directly in this function; presumably consumed by the core macro */
 129.240 +        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13, /* result bytes 0-7: bytes 0-7 of the second vec_perm operand */
 129.241 +                         0x14, 0x15, 0x16, 0x17,
 129.242 +                         0x08, 0x09, 0x0A, 0x0B, /* result bytes 8-15: bytes 8-15 of the first vec_perm operand */
 129.243 +                         0x0C, 0x0D, 0x0E, 0x0F};
 129.244 +    } else {
 129.245 +        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03, /* result bytes 0-7: bytes 0-7 of the first vec_perm operand */
 129.246 +                         0x04, 0x05, 0x06, 0x07,
 129.247 +                         0x18, 0x19, 0x1A, 0x1B, /* result bytes 8-15: bytes 8-15 of the second vec_perm operand */
 129.248 +                         0x1C, 0x1D, 0x1E, 0x1F};
 129.249 +    }
 129.250 +
 129.251 +    vsrcAuc = vec_ld(0, src); /* vec_ld ignores the low 4 address bits: aligned 16-byte block containing src */
 129.252 +
 129.253 +    if (loadSecond)
 129.254 +        vsrcBuc = vec_ld(16, src); /* following aligned 16-byte block */
 129.255 +    vsrcperm0 = vec_lvsl(0, src); /* permute vector that realigns data starting at src */
 129.256 +    vsrcperm1 = vec_lvsl(1, src); /* same for data starting at src + 1 */
 129.257 +
 129.258 +    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); /* when !loadSecond, vsrcBuc is uninitialised (hence av_uninit); the first 8 result bytes, the only ones used below, then come from vsrcAuc alone */
 129.259 +    if (reallyBadAlign)
 129.260 +        vsrc1uc = vsrcBuc; /* src + 1 is aligned: vec_lvsl(1, src) would select only the first operand, while the wanted bytes are exactly the second block */
 129.261 +    else
 129.262 +        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
 129.263 +
 129.264 +    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc); /* interleave zero bytes with the first 8 source bytes: eight 16-bit lanes */
 129.265 +    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
 129.266 +
 129.267 +    if (!loadSecond) {// -> !reallyBadAlign
 129.268 +        for (i = 0 ; i < h ; i++) {
 129.269 +
 129.270 +
 129.271 +            vsrcCuc = vec_ld(stride + 0, src); /* aligned block containing src + stride (the next row) */
 129.272 +
 129.273 +            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); /* single block suffices here, so it is passed as both operands */
 129.274 +            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
 129.275 +
 129.276 +            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) /* macro defined outside this view */
 129.277 +        }
 129.278 +    } else {
 129.279 +        vec_u8 vsrcDuc;
 129.280 +        for (i = 0 ; i < h ; i++) {
 129.281 +            vsrcCuc = vec_ld(stride + 0, src); /* aligned block containing src + stride */
 129.282 +            vsrcDuc = vec_ld(stride + 16, src); /* and the block after it */
 129.283 +
 129.284 +            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
 129.285 +            if (reallyBadAlign)
 129.286 +                vsrc3uc = vsrcDuc; /* same special case as for vsrc1uc above */
 129.287 +            else
 129.288 +                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
 129.289 +
 129.290 +            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) /* macro defined outside this view */
 129.291 +        }
 129.292 +    }
 129.293 +}
 129.294 +/* noop and add28 (passed as the second argument of CHROMA_MC8_ALTIVEC_CORE above) and the core macro itself are undefined here, so nothing after this point can use them. */
 129.295 +#undef noop
 129.296 +#undef add28
 129.297 +#undef CHROMA_MC8_ALTIVEC_CORE
 129.298 +
 129.299 +/* Horizontal 6-tap lowpass over 16 rows of 16 pixels: each output is (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5 saturated to 8 bits, where Mn/Pn are the source bytes n positions left/right of the pixel; the row is then combined with the current dst row by OP_U8_ALTIVEC (defined outside this view) and stored. ASSERT_ALIGNED(dst) is invoked for every row. This code assumes stride % 16 == 0. */
 129.300 +static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
 129.301 +    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
 129.302 +    register int i;
 129.303 +
 129.304 +    LOAD_ZERO;
 129.305 +    const vec_u8 permM2 = vec_lvsl(-2, src); /* realignment permutes are computed once from the initial src and reused for every row (relies on the stride % 16 == 0 assumption) */
 129.306 +    const vec_u8 permM1 = vec_lvsl(-1, src);
 129.307 +    const vec_u8 permP0 = vec_lvsl(+0, src);
 129.308 +    const vec_u8 permP1 = vec_lvsl(+1, src);
 129.309 +    const vec_u8 permP2 = vec_lvsl(+2, src);
 129.310 +    const vec_u8 permP3 = vec_lvsl(+3, src);
 129.311 +    const vec_s16 v5ss = vec_splat_s16(5);
 129.312 +    const vec_u16 v5us = vec_splat_u16(5);
 129.313 +    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 5 << 2 = 20 */
 129.314 +    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); /* 1 << 4 = 16 */
 129.315 +
 129.316 +    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
 129.317 +
 129.318 +    register int align = ((((unsigned long)src) - 2) % 16); /* offset of src - 2 (the leftmost tap) within its 16-byte block; also computed once */
 129.319 +
 129.320 +    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
 129.321 +              srcP2A, srcP2B, srcP3A, srcP3B,
 129.322 +              srcM1A, srcM1B, srcM2A, srcM2B,
 129.323 +              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
 129.324 +              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
 129.325 +              psumA, psumB, sumA, sumB;
 129.326 +
 129.327 +    vec_u8 sum, vdst, fsum;
 129.328 +
 129.329 +    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
 129.330 +
 129.331 +    for (i = 0 ; i < 16 ; i ++) {
 129.332 +        vec_u8 srcR1 = vec_ld(-2, src); /* aligned 16-byte block containing src - 2 */
 129.333 +        vec_u8 srcR2 = vec_ld(14, src); /* the block after it */
 129.334 +
 129.335 +        switch (align) {
 129.336 +        default: { /* align 0..10: all six shifted windows lie inside srcR1:srcR2 */
 129.337 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.338 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.339 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.340 +            srcP1 = vec_perm(srcR1, srcR2, permP1);
 129.341 +            srcP2 = vec_perm(srcR1, srcR2, permP2);
 129.342 +            srcP3 = vec_perm(srcR1, srcR2, permP3);
 129.343 +        } break;
 129.344 +        case 11: { /* src + 3 is 16-byte aligned: that window is exactly srcR2 */
 129.345 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.346 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.347 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.348 +            srcP1 = vec_perm(srcR1, srcR2, permP1);
 129.349 +            srcP2 = vec_perm(srcR1, srcR2, permP2);
 129.350 +            srcP3 = srcR2;
 129.351 +        } break;
 129.352 +        case 12: { /* src + 2 is aligned; windows to its right need a third block */
 129.353 +            vec_u8 srcR3 = vec_ld(30, src);
 129.354 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.355 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.356 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.357 +            srcP1 = vec_perm(srcR1, srcR2, permP1);
 129.358 +            srcP2 = srcR2;
 129.359 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.360 +        } break;
 129.361 +        case 13: { /* src + 1 is aligned */
 129.362 +            vec_u8 srcR3 = vec_ld(30, src);
 129.363 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.364 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.365 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.366 +            srcP1 = srcR2;
 129.367 +            srcP2 = vec_perm(srcR2, srcR3, permP2);
 129.368 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.369 +        } break;
 129.370 +        case 14: { /* src itself is aligned */
 129.371 +            vec_u8 srcR3 = vec_ld(30, src);
 129.372 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.373 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.374 +            srcP0 = srcR2;
 129.375 +            srcP1 = vec_perm(srcR2, srcR3, permP1);
 129.376 +            srcP2 = vec_perm(srcR2, srcR3, permP2);
 129.377 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.378 +        } break;
 129.379 +        case 15: { /* src - 1 is aligned */
 129.380 +            vec_u8 srcR3 = vec_ld(30, src);
 129.381 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.382 +            srcM1 = srcR2;
 129.383 +            srcP0 = vec_perm(srcR2, srcR3, permP0);
 129.384 +            srcP1 = vec_perm(srcR2, srcR3, permP1);
 129.385 +            srcP2 = vec_perm(srcR2, srcR3, permP2);
 129.386 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.387 +        } break;
 129.388 +        }
 129.389 +
 129.390 +        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); /* A: bytes 0-7 interleaved with zero bytes, giving 16-bit lanes */
 129.391 +        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); /* B: bytes 8-15, likewise */
 129.392 +        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
 129.393 +        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
 129.394 +
 129.395 +        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
 129.396 +        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
 129.397 +        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
 129.398 +        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
 129.399 +
 129.400 +        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
 129.401 +        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
 129.402 +        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
 129.403 +        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
 129.404 +
 129.405 +        sum1A = vec_adds(srcP0A, srcP1A); /* centre pair */
 129.406 +        sum1B = vec_adds(srcP0B, srcP1B);
 129.407 +        sum2A = vec_adds(srcM1A, srcP2A); /* middle pair */
 129.408 +        sum2B = vec_adds(srcM1B, srcP2B);
 129.409 +        sum3A = vec_adds(srcM2A, srcP3A); /* outer pair */
 129.410 +        sum3B = vec_adds(srcM2B, srcP3B);
 129.411 +
 129.412 +        pp1A = vec_mladd(sum1A, v20ss, v16ss); /* 20 * sum1 + 16 */
 129.413 +        pp1B = vec_mladd(sum1B, v20ss, v16ss);
 129.414 +
 129.415 +        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5 * sum2 */
 129.416 +        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
 129.417 +
 129.418 +        pp3A = vec_add(sum3A, pp1A); /* sum3 + 20 * sum1 + 16 */
 129.419 +        pp3B = vec_add(sum3B, pp1B);
 129.420 +
 129.421 +        psumA = vec_sub(pp3A, pp2A); /* ... - 5 * sum2 */
 129.422 +        psumB = vec_sub(pp3B, pp2B);
 129.423 +
 129.424 +        sumA = vec_sra(psumA, v5us); /* arithmetic >> 5 */
 129.425 +        sumB = vec_sra(psumB, v5us);
 129.426 +
 129.427 +        sum = vec_packsu(sumA, sumB); /* saturate to unsigned 8-bit; A lanes first, then B */
 129.428 +
 129.429 +        ASSERT_ALIGNED(dst);
 129.430 +        vdst = vec_ld(0, dst);
 129.431 +
 129.432 +        OP_U8_ALTIVEC(fsum, sum, vdst); /* macro defined outside this view; produces fsum from sum and the current dst contents */
 129.433 +
 129.434 +        vec_st(fsum, 0, dst);
 129.435 +
 129.436 +        src += srcStride;
 129.437 +        dst += dstStride;
 129.438 +    }
 129.439 +    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
 129.440 +}
 129.441 +
 129.442 +/* Vertical 6-tap lowpass over 16 rows of 16 pixels: each output is (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5 saturated to 8 bits, where Mn/Pn are the source rows n lines above/below the output row; five rows starting at src - 2*srcStride are preloaded and one further row is read per iteration (21 rows in total). The result is combined with dst by OP_U8_ALTIVEC (defined outside this view) and stored. This code assumes stride % 16 == 0. */
 129.443 +static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
 129.444 +    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
 129.445 +
 129.446 +    register int i;
 129.447 +
 129.448 +    LOAD_ZERO;
 129.449 +    const vec_u8 perm = vec_lvsl(0, src); /* one realignment permute reused for every row (relies on the stride % 16 == 0 assumption) */
 129.450 +    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 5 << 2 = 20 */
 129.451 +    const vec_u16 v5us = vec_splat_u16(5);
 129.452 +    const vec_s16 v5ss = vec_splat_s16(5);
 129.453 +    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); /* 1 << 4 = 16 */
 129.454 +
 129.455 +    uint8_t *srcbis = src - (srcStride * 2); /* start two rows above src */
 129.456 +
 129.457 +    const vec_u8 srcM2a = vec_ld(0, srcbis); /* each row: two aligned blocks, realigned with perm */
 129.458 +    const vec_u8 srcM2b = vec_ld(16, srcbis);
 129.459 +    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
 129.460 +    //srcbis += srcStride;
 129.461 +    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride); /* srcbis is advanced inside the call argument */
 129.462 +    const vec_u8 srcM1b = vec_ld(16, srcbis);
 129.463 +    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
 129.464 +    //srcbis += srcStride;
 129.465 +    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
 129.466 +    const vec_u8 srcP0b = vec_ld(16, srcbis);
 129.467 +    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
 129.468 +    //srcbis += srcStride;
 129.469 +    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
 129.470 +    const vec_u8 srcP1b = vec_ld(16, srcbis);
 129.471 +    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
 129.472 +    //srcbis += srcStride;
 129.473 +    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
 129.474 +    const vec_u8 srcP2b = vec_ld(16, srcbis);
 129.475 +    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
 129.476 +    //srcbis += srcStride;
 129.477 +
 129.478 +    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2); /* A: bytes 0-7 interleaved with zero bytes, giving 16-bit lanes */
 129.479 +    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2); /* B: bytes 8-15, likewise */
 129.480 +    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
 129.481 +    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
 129.482 +    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
 129.483 +    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
 129.484 +    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
 129.485 +    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
 129.486 +    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
 129.487 +    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
 129.488 +
 129.489 +    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
 129.490 +              psumA, psumB, sumA, sumB,
 129.491 +              srcP3ssA, srcP3ssB,
 129.492 +              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
 129.493 +
 129.494 +    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
 129.495 +
 129.496 +    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
 129.497 +
 129.498 +    for (i = 0 ; i < 16 ; i++) {
 129.499 +        srcP3a = vec_ld(0, srcbis += srcStride); /* fetch the one new row needed for this output row */
 129.500 +        srcP3b = vec_ld(16, srcbis);
 129.501 +        srcP3 = vec_perm(srcP3a, srcP3b, perm);
 129.502 +        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
 129.503 +        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
 129.504 +        //srcbis += srcStride;
 129.505 +
 129.506 +        sum1A = vec_adds(srcP0ssA, srcP1ssA); /* centre pair of rows */
 129.507 +        sum1B = vec_adds(srcP0ssB, srcP1ssB);
 129.508 +        sum2A = vec_adds(srcM1ssA, srcP2ssA); /* middle pair */
 129.509 +        sum2B = vec_adds(srcM1ssB, srcP2ssB);
 129.510 +        sum3A = vec_adds(srcM2ssA, srcP3ssA); /* outer pair */
 129.511 +        sum3B = vec_adds(srcM2ssB, srcP3ssB);
 129.512 +
 129.513 +        srcM2ssA = srcM1ssA; /* slide the six-row window down by one row for the next iteration */
 129.514 +        srcM2ssB = srcM1ssB;
 129.515 +        srcM1ssA = srcP0ssA;
 129.516 +        srcM1ssB = srcP0ssB;
 129.517 +        srcP0ssA = srcP1ssA;
 129.518 +        srcP0ssB = srcP1ssB;
 129.519 +        srcP1ssA = srcP2ssA;
 129.520 +        srcP1ssB = srcP2ssB;
 129.521 +        srcP2ssA = srcP3ssA;
 129.522 +        srcP2ssB = srcP3ssB;
 129.523 +
 129.524 +        pp1A = vec_mladd(sum1A, v20ss, v16ss); /* 20 * sum1 + 16 */
 129.525 +        pp1B = vec_mladd(sum1B, v20ss, v16ss);
 129.526 +
 129.527 +        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5 * sum2 */
 129.528 +        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
 129.529 +
 129.530 +        pp3A = vec_add(sum3A, pp1A); /* sum3 + 20 * sum1 + 16 */
 129.531 +        pp3B = vec_add(sum3B, pp1B);
 129.532 +
 129.533 +        psumA = vec_sub(pp3A, pp2A); /* ... - 5 * sum2 */
 129.534 +        psumB = vec_sub(pp3B, pp2B);
 129.535 +
 129.536 +        sumA = vec_sra(psumA, v5us); /* arithmetic >> 5 */
 129.537 +        sumB = vec_sra(psumB, v5us);
 129.538 +
 129.539 +        sum = vec_packsu(sumA, sumB); /* saturate to unsigned 8-bit; A lanes first, then B */
 129.540 +
 129.541 +        ASSERT_ALIGNED(dst);
 129.542 +        vdst = vec_ld(0, dst);
 129.543 +
 129.544 +        OP_U8_ALTIVEC(fsum, sum, vdst); /* macro defined outside this view; produces fsum from sum and the current dst contents */
 129.545 +
 129.546 +        vec_st(fsum, 0, dst);
 129.547 +
 129.548 +        dst += dstStride;
 129.549 +    }
 129.550 +    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
 129.551 +}
 129.552 +
 129.553 +/* Two-pass 6-tap lowpass producing 16 rows of 16 pixels. Pass 1 filters 21 source rows, starting at src - 2*srcStride, horizontally without rounding or shifting (20*(P0+P1) - 5*(M1+P2) + (M2+P3)) and stores them as 16-bit values in tmp. Pass 2 applies the same taps vertically to tmp in 32-bit precision, adds 512, shifts right by 10, saturates to 8 bits and combines the row with dst through OP_U8_ALTIVEC (defined outside this view). This code assumes stride % 16 == 0 *and* tmp is properly aligned. */
 129.554 +static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
 129.555 +    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
 129.556 +    register int i;
 129.557 +    LOAD_ZERO;
 129.558 +    const vec_u8 permM2 = vec_lvsl(-2, src); /* realignment permutes are computed once from the initial src and reused for every row (relies on the stride % 16 == 0 assumption) */
 129.559 +    const vec_u8 permM1 = vec_lvsl(-1, src);
 129.560 +    const vec_u8 permP0 = vec_lvsl(+0, src);
 129.561 +    const vec_u8 permP1 = vec_lvsl(+1, src);
 129.562 +    const vec_u8 permP2 = vec_lvsl(+2, src);
 129.563 +    const vec_u8 permP3 = vec_lvsl(+3, src);
 129.564 +    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); /* 5 << 2 = 20 */
 129.565 +    const vec_u32 v10ui = vec_splat_u32(10);
 129.566 +    const vec_s16 v5ss = vec_splat_s16(5);
 129.567 +    const vec_s16 v1ss = vec_splat_s16(1);
 129.568 +    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); /* 1 << 9 = 512 */
 129.569 +    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); /* 1 << 4 = 16 */
 129.570 +
 129.571 +    register int align = ((((unsigned long)src) - 2) % 16); /* offset of src - 2 (the leftmost tap) within its 16-byte block */
 129.572 +
 129.573 +    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
 129.574 +              srcP2A, srcP2B, srcP3A, srcP3B,
 129.575 +              srcM1A, srcM1B, srcM2A, srcM2B,
 129.576 +              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
 129.577 +              pp1A, pp1B, pp2A, pp2B, psumA, psumB;
 129.578 +
 129.579 +    const vec_u8 mperm = (const vec_u8) /* interleaves bytes 0-7 with bytes 8-15 of a vector */
 129.580 +        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
 129.581 +         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
 129.582 +    int16_t *tmpbis = tmp; /* remembers the start of tmp for pass 2; tmp itself is advanced in pass 1 */
 129.583 +
 129.584 +    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
 129.585 +              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
 129.586 +              tmpP2ssA, tmpP2ssB;
 129.587 +
 129.588 +    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
 129.589 +              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
 129.590 +              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
 129.591 +              ssumAe, ssumAo, ssumBe, ssumBo;
 129.592 +    vec_u8 fsum, sumv, sum, vdst;
 129.593 +    vec_s16 ssume, ssumo;
 129.594 +
 129.595 +    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
 129.596 +    src -= (2 * srcStride); /* pass 1 starts two rows above the block */
 129.597 +    for (i = 0 ; i < 21 ; i ++) { /* 21 rows = 16 output rows + 5 extra rows for the vertical 6-tap window */
 129.598 +        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
 129.599 +        vec_u8 srcR1 = vec_ld(-2, src); /* aligned 16-byte block containing src - 2 */
 129.600 +        vec_u8 srcR2 = vec_ld(14, src); /* the block after it */
 129.601 +
 129.602 +        switch (align) {
 129.603 +        default: { /* align 0..10: all six shifted windows lie inside srcR1:srcR2 */
 129.604 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.605 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.606 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.607 +            srcP1 = vec_perm(srcR1, srcR2, permP1);
 129.608 +            srcP2 = vec_perm(srcR1, srcR2, permP2);
 129.609 +            srcP3 = vec_perm(srcR1, srcR2, permP3);
 129.610 +        } break;
 129.611 +        case 11: { /* src + 3 is 16-byte aligned: that window is exactly srcR2 */
 129.612 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.613 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.614 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.615 +            srcP1 = vec_perm(srcR1, srcR2, permP1);
 129.616 +            srcP2 = vec_perm(srcR1, srcR2, permP2);
 129.617 +            srcP3 = srcR2;
 129.618 +        } break;
 129.619 +        case 12: { /* src + 2 is aligned; windows to its right need a third block */
 129.620 +            vec_u8 srcR3 = vec_ld(30, src);
 129.621 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.622 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.623 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.624 +            srcP1 = vec_perm(srcR1, srcR2, permP1);
 129.625 +            srcP2 = srcR2;
 129.626 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.627 +        } break;
 129.628 +        case 13: { /* src + 1 is aligned */
 129.629 +            vec_u8 srcR3 = vec_ld(30, src);
 129.630 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.631 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.632 +            srcP0 = vec_perm(srcR1, srcR2, permP0);
 129.633 +            srcP1 = srcR2;
 129.634 +            srcP2 = vec_perm(srcR2, srcR3, permP2);
 129.635 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.636 +        } break;
 129.637 +        case 14: { /* src itself is aligned */
 129.638 +            vec_u8 srcR3 = vec_ld(30, src);
 129.639 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.640 +            srcM1 = vec_perm(srcR1, srcR2, permM1);
 129.641 +            srcP0 = srcR2;
 129.642 +            srcP1 = vec_perm(srcR2, srcR3, permP1);
 129.643 +            srcP2 = vec_perm(srcR2, srcR3, permP2);
 129.644 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.645 +        } break;
 129.646 +        case 15: { /* src - 1 is aligned */
 129.647 +            vec_u8 srcR3 = vec_ld(30, src);
 129.648 +            srcM2 = vec_perm(srcR1, srcR2, permM2);
 129.649 +            srcM1 = srcR2;
 129.650 +            srcP0 = vec_perm(srcR2, srcR3, permP0);
 129.651 +            srcP1 = vec_perm(srcR2, srcR3, permP1);
 129.652 +            srcP2 = vec_perm(srcR2, srcR3, permP2);
 129.653 +            srcP3 = vec_perm(srcR2, srcR3, permP3);
 129.654 +        } break;
 129.655 +        }
 129.656 +
 129.657 +        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0); /* A: bytes 0-7 interleaved with zero bytes, giving 16-bit lanes */
 129.658 +        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0); /* B: bytes 8-15, likewise */
 129.659 +        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
 129.660 +        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
 129.661 +
 129.662 +        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
 129.663 +        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
 129.664 +        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
 129.665 +        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
 129.666 +
 129.667 +        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
 129.668 +        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
 129.669 +        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
 129.670 +        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
 129.671 +
 129.672 +        sum1A = vec_adds(srcP0A, srcP1A); /* centre pair */
 129.673 +        sum1B = vec_adds(srcP0B, srcP1B);
 129.674 +        sum2A = vec_adds(srcM1A, srcP2A); /* middle pair */
 129.675 +        sum2B = vec_adds(srcM1B, srcP2B);
 129.676 +        sum3A = vec_adds(srcM2A, srcP3A); /* outer pair */
 129.677 +        sum3B = vec_adds(srcM2B, srcP3B);
 129.678 +
 129.679 +        pp1A = vec_mladd(sum1A, v20ss, sum3A); /* 20 * sum1 + sum3 (no rounding constant in this pass) */
 129.680 +        pp1B = vec_mladd(sum1B, v20ss, sum3B);
 129.681 +
 129.682 +        pp2A = vec_mladd(sum2A, v5ss, zero_s16v); /* 5 * sum2 */
 129.683 +        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
 129.684 +
 129.685 +        psumA = vec_sub(pp1A, pp2A); /* unshifted horizontal result, kept at 16-bit */
 129.686 +        psumB = vec_sub(pp1B, pp2B);
 129.687 +
 129.688 +        vec_st(psumA, 0, tmp); /* 16 intermediate values per row: two vectors */
 129.689 +        vec_st(psumB, 16, tmp);
 129.690 +
 129.691 +        src += srcStride;
 129.692 +        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
 129.693 +    }
 129.694 +
 129.695 +    tmpM2ssA = vec_ld(0, tmpbis); /* pass 2: preload the first five intermediate rows */
 129.696 +    tmpM2ssB = vec_ld(16, tmpbis);
 129.697 +    tmpbis += tmpStride;
 129.698 +    tmpM1ssA = vec_ld(0, tmpbis);
 129.699 +    tmpM1ssB = vec_ld(16, tmpbis);
 129.700 +    tmpbis += tmpStride;
 129.701 +    tmpP0ssA = vec_ld(0, tmpbis);
 129.702 +    tmpP0ssB = vec_ld(16, tmpbis);
 129.703 +    tmpbis += tmpStride;
 129.704 +    tmpP1ssA = vec_ld(0, tmpbis);
 129.705 +    tmpP1ssB = vec_ld(16, tmpbis);
 129.706 +    tmpbis += tmpStride;
 129.707 +    tmpP2ssA = vec_ld(0, tmpbis);
 129.708 +    tmpP2ssB = vec_ld(16, tmpbis);
 129.709 +    tmpbis += tmpStride;
 129.710 +
 129.711 +    for (i = 0 ; i < 16 ; i++) {
 129.712 +        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis); /* the one new intermediate row needed for this output row */
 129.713 +        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
 129.714 +
 129.715 +        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA); /* these loop-local sums shadow the function-scope sum1A..sum3B used in pass 1 */
 129.716 +        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
 129.717 +        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
 129.718 +        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
 129.719 +        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
 129.720 +        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
 129.721 +
 129.722 +        tmpbis += tmpStride;
 129.723 +
 129.724 +        tmpM2ssA = tmpM1ssA; /* slide the six-row window down by one row for the next iteration */
 129.725 +        tmpM2ssB = tmpM1ssB;
 129.726 +        tmpM1ssA = tmpP0ssA;
 129.727 +        tmpM1ssB = tmpP0ssB;
 129.728 +        tmpP0ssA = tmpP1ssA;
 129.729 +        tmpP0ssB = tmpP1ssB;
 129.730 +        tmpP1ssA = tmpP2ssA;
 129.731 +        tmpP1ssB = tmpP2ssB;
 129.732 +        tmpP2ssA = tmpP3ssA;
 129.733 +        tmpP2ssB = tmpP3ssB;
 129.734 +
 129.735 +        pp1Ae = vec_mule(sum1A, v20ss); /* 20 * sum1 as 32-bit products: even lanes (e) and odd lanes (o) separately */
 129.736 +        pp1Ao = vec_mulo(sum1A, v20ss);
 129.737 +        pp1Be = vec_mule(sum1B, v20ss);
 129.738 +        pp1Bo = vec_mulo(sum1B, v20ss);
 129.739 +
 129.740 +        pp2Ae = vec_mule(sum2A, v5ss); /* 5 * sum2, likewise */
 129.741 +        pp2Ao = vec_mulo(sum2A, v5ss);
 129.742 +        pp2Be = vec_mule(sum2B, v5ss);
 129.743 +        pp2Bo = vec_mulo(sum2B, v5ss);
 129.744 +
 129.745 +        pp3Ae = vec_sra((vec_s32)sum3A, v16ui); /* arithmetic >> 16 of each 32-bit word: sign-extends the even 16-bit lanes (big-endian element order assumed) */
 129.746 +        pp3Ao = vec_mulo(sum3A, v1ss); /* multiply by 1: sign-extends the odd 16-bit lanes */
 129.747 +        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
 129.748 +        pp3Bo = vec_mulo(sum3B, v1ss);
 129.749 +
 129.750 +        pp1cAe = vec_add(pp1Ae, v512si); /* add the rounding constant 512 */
 129.751 +        pp1cAo = vec_add(pp1Ao, v512si);
 129.752 +        pp1cBe = vec_add(pp1Be, v512si);
 129.753 +        pp1cBo = vec_add(pp1Bo, v512si);
 129.754 +
 129.755 +        pp32Ae = vec_sub(pp3Ae, pp2Ae); /* sum3 - 5 * sum2 */
 129.756 +        pp32Ao = vec_sub(pp3Ao, pp2Ao);
 129.757 +        pp32Be = vec_sub(pp3Be, pp2Be);
 129.758 +        pp32Bo = vec_sub(pp3Bo, pp2Bo);
 129.759 +
 129.760 +        sumAe = vec_add(pp1cAe, pp32Ae); /* 20 * sum1 + 512 + sum3 - 5 * sum2 */
 129.761 +        sumAo = vec_add(pp1cAo, pp32Ao);
 129.762 +        sumBe = vec_add(pp1cBe, pp32Be);
 129.763 +        sumBo = vec_add(pp1cBo, pp32Bo);
 129.764 +
 129.765 +        ssumAe = vec_sra(sumAe, v10ui); /* arithmetic >> 10 */
 129.766 +        ssumAo = vec_sra(sumAo, v10ui);
 129.767 +        ssumBe = vec_sra(sumBe, v10ui);
 129.768 +        ssumBo = vec_sra(sumBo, v10ui);
 129.769 +
 129.770 +        ssume = vec_packs(ssumAe, ssumBe); /* 32 -> 16 bit with saturation: the 8 even-position results */
 129.771 +        ssumo = vec_packs(ssumAo, ssumBo); /* the 8 odd-position results */
 129.772 +
 129.773 +        sumv = vec_packsu(ssume, ssumo); /* saturate to unsigned 8-bit: evens in bytes 0-7, odds in bytes 8-15 */
 129.774 +        sum = vec_perm(sumv, sumv, mperm); /* re-interleave even/odd results into pixel order */
 129.775 +
 129.776 +        ASSERT_ALIGNED(dst);
 129.777 +        vdst = vec_ld(0, dst);
 129.778 +
 129.779 +        OP_U8_ALTIVEC(fsum, sum, vdst); /* macro defined outside this view; produces fsum from sum and the current dst contents */
 129.780 +
 129.781 +        vec_st(fsum, 0, dst);
 129.782 +
 129.783 +        dst += dstStride;
 129.784 +    }
 129.785 +    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
 129.786 +}

   130.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   130.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/idct_altivec.c	Mon Aug 27 12:09:56 2012 +0200
   130.3 @@ -0,0 +1,232 @@
   130.4 +/*
   130.5 + * Copyright (c) 2001 Michel Lespinasse
   130.6 + *
   130.7 + * This file is part of FFmpeg.
   130.8 + *
   130.9 + * FFmpeg is free software; you can redistribute it and/or
  130.10 + * modify it under the terms of the GNU Lesser General Public
  130.11 + * License as published by the Free Software Foundation; either
  130.12 + * version 2.1 of the License, or (at your option) any later version.
  130.13 + *
  130.14 + * FFmpeg is distributed in the hope that it will be useful,
  130.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  130.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  130.17 + * Lesser General Public License for more details.
  130.18 + *
  130.19 + * You should have received a copy of the GNU Lesser General Public
  130.20 + * License along with FFmpeg; if not, write to the Free Software
  130.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  130.22 + */
  130.23 +
  130.24 +/*
  130.25 + * NOTE: This code is based on GPL code from the libmpeg2 project.  The
  130.26 + * author, Michel Lespinasse, has given explicit permission to release
  130.27 + * under LGPL as part of FFmpeg.
  130.28 + */
  130.29 +
  130.30 +/*
  130.31 + * FFmpeg integration by Dieter Shirley
  130.32 + *
  130.33 + * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
  130.34 + * project.  I've deleted all of the libmpeg2-specific code, renamed the
  130.35 + * functions and reordered the function parameters.  The only change to the
  130.36 + * IDCT function itself was to factor out the partial transposition, and to
  130.37 + * perform a full transpose at the end of the function.
  130.38 + */
  130.39 +
  130.40 +
  130.41 +#include <stdlib.h>                                      /* malloc(), free() */
  130.42 +#include <string.h>
  130.43 +#include "config.h"
  130.44 +#if HAVE_ALTIVEC_H
  130.45 +#include <altivec.h>
  130.46 +#endif
  130.47 +#include "libavcodec/dsputil.h"
  130.48 +#include "types_altivec.h"
  130.49 +#include "dsputil_ppc.h"
  130.50 +#include "dsputil_altivec.h"
  130.51 +/* IDCT_HALF: four-stage butterfly that computes vy0..vy7 from vx0..vx7 with saturating adds/subtracts and vec_mradds against the constants a0, a1, a2, ma2, c4, mc4 (presumably one 1-D pass of the 8x8 IDCT -- TODO confirm). It declares nothing itself: vx*, vy*, t0..t8, zero and the constants must already be in scope where it is expanded (the IDCT macro below declares them). */
  130.52 +#define IDCT_HALF                                       \
  130.53 +    /* 1st stage */                                     \
  130.54 +    t1 = vec_mradds (a1, vx7, vx1 );                    \
  130.55 +    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
  130.56 +    t7 = vec_mradds (a2, vx5, vx3);                     \
  130.57 +    t3 = vec_mradds (ma2, vx3, vx5);                    \
  130.58 +                                                        \
  130.59 +    /* 2nd stage */                                     \
  130.60 +    t5 = vec_adds (vx0, vx4);                           \
  130.61 +    t0 = vec_subs (vx0, vx4);                           \
  130.62 +    t2 = vec_mradds (a0, vx6, vx2);                     \
  130.63 +    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
  130.64 +    t6 = vec_adds (t8, t3);                             \
  130.65 +    t3 = vec_subs (t8, t3);                             \
  130.66 +    t8 = vec_subs (t1, t7);                             \
  130.67 +    t1 = vec_adds (t1, t7);                             \
  130.68 +                                                        \
  130.69 +    /* 3rd stage */                                     \
  130.70 +    t7 = vec_adds (t5, t2);                             \
  130.71 +    t2 = vec_subs (t5, t2);                             \
  130.72 +    t5 = vec_adds (t0, t4);                             \
  130.73 +    t0 = vec_subs (t0, t4);                             \
  130.74 +    t4 = vec_subs (t8, t3);                             \
  130.75 +    t3 = vec_adds (t8, t3);                             \
  130.76 +                                                        \
  130.77 +    /* 4th stage */                                     \
  130.78 +    vy0 = vec_adds (t7, t1);                            \
  130.79 +    vy7 = vec_subs (t7, t1);                            \
  130.80 +    vy1 = vec_mradds (c4, t3, t5);                      \
  130.81 +    vy6 = vec_mradds (mc4, t3, t5);                     \
  130.82 +    vy2 = vec_mradds (c4, t4, t0);                      \
  130.83 +    vy5 = vec_mradds (mc4, t4, t0);                     \
  130.84 +    vy3 = vec_adds (t2, t6);                            \
  130.85 +    vy4 = vec_subs (t2, t6);
  130.86 +
  130.87 +
          /*
           * IDCT: complete 8x8 inverse DCT, meant to be expanded inside a function.
           * Expects `block` (indexable as eight vec_s16 rows) and the `constants`
           * table to be in scope.  The macro declares every vector it uses, so it
           * can be expanded at most once per scope.
           * Steps:
           *   1. splat the coefficients and the bias out of constants[0];
           *   2. pre-scale each input row: shift left by 4, then vec_mradds with the
           *      matching row of constants[1..4];
           *   3. first IDCT_HALF pass;
           *   4. three rounds of vec_mergeh/vec_mergel that interleave the eight
           *      result vectors (`bias` is added to the first vector in the last
           *      round);
           *   5. second IDCT_HALF pass;
           *   6. arithmetic shift right by 6 of every result.
           * The final rows are left in vx0..vx7 for the code after the expansion.
           */
  130.88 +#define IDCT                                                            \
  130.89 +    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
  130.90 +    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
  130.91 +    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
  130.92 +    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
  130.93 +    vec_u16 shift;                                                 \
  130.94 +                                                                        \
  130.95 +    c4 = vec_splat (constants[0], 0);                                   \
  130.96 +    a0 = vec_splat (constants[0], 1);                                   \
  130.97 +    a1 = vec_splat (constants[0], 2);                                   \
  130.98 +    a2 = vec_splat (constants[0], 3);                                   \
  130.99 +    mc4 = vec_splat (constants[0], 4);                                  \
 130.100 +    ma2 = vec_splat (constants[0], 5);                                  \
 130.101 +    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3);     \
 130.102 +                                                                        \
 130.103 +    zero = vec_splat_s16 (0);                                           \
 130.104 +    shift = vec_splat_u16 (4);                                          \
 130.105 +                                                                        \
 130.106 +    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
 130.107 +    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
 130.108 +    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
 130.109 +    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
 130.110 +    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
 130.111 +    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
 130.112 +    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
 130.113 +    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
 130.114 +                                                                        \
 130.115 +    IDCT_HALF                                                           \
 130.116 +                                                                        \
 130.117 +    vx0 = vec_mergeh (vy0, vy4);                                        \
 130.118 +    vx1 = vec_mergel (vy0, vy4);                                        \
 130.119 +    vx2 = vec_mergeh (vy1, vy5);                                        \
 130.120 +    vx3 = vec_mergel (vy1, vy5);                                        \
 130.121 +    vx4 = vec_mergeh (vy2, vy6);                                        \
 130.122 +    vx5 = vec_mergel (vy2, vy6);                                        \
 130.123 +    vx6 = vec_mergeh (vy3, vy7);                                        \
 130.124 +    vx7 = vec_mergel (vy3, vy7);                                        \
 130.125 +                                                                        \
 130.126 +    vy0 = vec_mergeh (vx0, vx4);                                        \
 130.127 +    vy1 = vec_mergel (vx0, vx4);                                        \
 130.128 +    vy2 = vec_mergeh (vx1, vx5);                                        \
 130.129 +    vy3 = vec_mergel (vx1, vx5);                                        \
 130.130 +    vy4 = vec_mergeh (vx2, vx6);                                        \
 130.131 +    vy5 = vec_mergel (vx2, vx6);                                        \
 130.132 +    vy6 = vec_mergeh (vx3, vx7);                                        \
 130.133 +    vy7 = vec_mergel (vx3, vx7);                                        \
 130.134 +                                                                        \
 130.135 +    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
 130.136 +    vx1 = vec_mergel (vy0, vy4);                                        \
 130.137 +    vx2 = vec_mergeh (vy1, vy5);                                        \
 130.138 +    vx3 = vec_mergel (vy1, vy5);                                        \
 130.139 +    vx4 = vec_mergeh (vy2, vy6);                                        \
 130.140 +    vx5 = vec_mergel (vy2, vy6);                                        \
 130.141 +    vx6 = vec_mergeh (vy3, vy7);                                        \
 130.142 +    vx7 = vec_mergel (vy3, vy7);                                        \
 130.143 +                                                                        \
 130.144 +    IDCT_HALF                                                           \
 130.145 +                                                                        \
 130.146 +    shift = vec_splat_u16 (6);                                          \
 130.147 +    vx0 = vec_sra (vy0, shift);                                         \
 130.148 +    vx1 = vec_sra (vy1, shift);                                         \
 130.149 +    vx2 = vec_sra (vy2, shift);                                         \
 130.150 +    vx3 = vec_sra (vy3, shift);                                         \
 130.151 +    vx4 = vec_sra (vy4, shift);                                         \
 130.152 +    vx5 = vec_sra (vy5, shift);                                         \
 130.153 +    vx6 = vec_sra (vy6, shift);                                         \
 130.154 +    vx7 = vec_sra (vy7, shift);
 130.155 +
 130.156 +
          /*
           * Coefficient tables read by the IDCT macro.
           * Row 0: lanes 0..5 are splatted into c4, a0, a1, a2, mc4 and ma2 (lanes 4
           *        and 5 hold the negated values of lanes 0 and 3).  The row is also
           *        reinterpreted as four 32-bit words and word 3 is splatted to form
           *        the `bias` vector (presumably the trailing {32, 31} pair, given
           *        AltiVec element ordering -- confirm).
           * Rows 1..4: per-lane scale factors applied with vec_mradds to the input
           *        rows before the first pass: row 1 for block[0] and block[4],
           *        row 2 for block[1] and block[7], row 3 for block[2] and block[6],
           *        row 4 for block[3] and block[5].
           */
 130.157 +static const vec_s16 constants[5] = {
 130.158 +    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
 130.159 +    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
 130.160 +    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
 130.161 +    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
 130.162 +    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
 130.163 +};
 130.164 +
          /*
           * Runs the 8x8 IDCT on blk and writes the result as 8 rows of 8 bytes.
           * dest:   first destination byte; advanced by stride after each row.
           * stride: byte distance between consecutive destination rows.
           * blk:    coefficient block, accessed as eight vec_s16 vectors.
           *         NOTE(review): the vector cast presumably requires blk to be
           *         16-byte aligned -- confirm against callers.
           */
 130.165 +void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
 130.166 +{
 130.167 +POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
 130.168 +    vec_s16 *block = (vec_s16*)blk;
 130.169 +    vec_u8 tmp;
 130.170 +
 130.171 +#if CONFIG_POWERPC_PERF
 130.172 +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
 130.173 +#endif
              /* Expands to the whole transform; the result rows end up in vx0..vx7. */
 130.174 +    IDCT
 130.175 +
              /* COPY: pack the 16-bit row to unsigned bytes with saturation
               * (vec_packsu) and write it with two 32-bit element stores at byte
               * offsets 0 and 4 of dest.  The macro is not #undef'd afterwards.
               * NOTE(review): presumably relies on dest being suitably aligned for
               * vec_ste to select the intended elements -- confirm. */
 130.176 +#define COPY(dest,src)                                          \
 130.177 +    tmp = vec_packsu (src, src);                                \
 130.178 +    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);       \
 130.179 +    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
 130.180 +
 130.181 +    COPY (dest, vx0)    dest += stride;
 130.182 +    COPY (dest, vx1)    dest += stride;
 130.183 +    COPY (dest, vx2)    dest += stride;
 130.184 +    COPY (dest, vx3)    dest += stride;
 130.185 +    COPY (dest, vx4)    dest += stride;
 130.186 +    COPY (dest, vx5)    dest += stride;
 130.187 +    COPY (dest, vx6)    dest += stride;
 130.188 +    COPY (dest, vx7)
 130.189 +
 130.190 +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
 130.191 +}
 130.192 +
          /*
           * Runs the 8x8 IDCT on blk and adds the result to the 8x8 bytes at dest,
           * saturating each sum to the unsigned byte range.
           * dest:   first destination byte; advanced by stride after each row.
           * stride: byte distance between consecutive destination rows.
           * blk:    coefficient block, accessed as eight vec_s16 vectors.
           *         NOTE(review): the vector cast presumably requires blk to be
           *         16-byte aligned -- confirm against callers.
           */
 130.193 +void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
 130.194 +{
 130.195 +POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
 130.196 +    vec_s16 *block = (vec_s16*)blk;
 130.197 +    vec_u8 tmp;
 130.198 +    vec_s16 tmp2, tmp3;
 130.199 +    vec_u8 perm0;
 130.200 +    vec_u8 perm1;
 130.201 +    vec_u8 p0, p1, p;
 130.202 +
 130.203 +#if CONFIG_POWERPC_PERF
 130.204 +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
 130.205 +#endif
 130.206 +
              /* Expands to the whole transform; the result rows end up in vx0..vx7. */
 130.207 +    IDCT
 130.208 +
              /* Build two permute masks, one from the alignment of the first row
               * (offset 0) and one from the second row (offset stride).  Each mask
               * interleaves an all-ones byte with an lvsl byte, so that vec_perm in
               * ADD pairs every selected dest byte with a byte taken from the zero
               * vector, i.e. widens the pixels to 16 bits. */
 130.209 +    p0 = vec_lvsl (0, dest);
 130.210 +    p1 = vec_lvsl (stride, dest);
 130.211 +    p = vec_splat_u8 (-1);
 130.212 +    perm0 = vec_mergeh (p, p0);
 130.213 +    perm1 = vec_mergeh (p, p1);
 130.214 +
              /* ADD: load the vector containing dest, widen the pixels with perm,
               * add the IDCT row with saturation, pack back to unsigned bytes and
               * write 8 bytes with two 32-bit element stores.  The macro is not
               * #undef'd afterwards. */
 130.215 +#define ADD(dest,src,perm)                                              \
 130.216 +    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
 130.217 +    tmp = vec_ld (0, dest);                                             \
 130.218 +    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm);       \
 130.219 +    tmp3 = vec_adds (tmp2, src);                                        \
 130.220 +    tmp = vec_packsu (tmp3, tmp3);                                      \
 130.221 +    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);               \
 130.222 +    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
 130.223 +
              /* perm0 and perm1 are reused for every other row.
               * NOTE(review): this assumes the alignment of dest repeats every two
               * rows (2*stride a multiple of 16) -- confirm against callers. */
 130.224 +    ADD (dest, vx0, perm0)      dest += stride;
 130.225 +    ADD (dest, vx1, perm1)      dest += stride;
 130.226 +    ADD (dest, vx2, perm0)      dest += stride;
 130.227 +    ADD (dest, vx3, perm1)      dest += stride;
 130.228 +    ADD (dest, vx4, perm0)      dest += stride;
 130.229 +    ADD (dest, vx5, perm1)      dest += stride;
 130.230 +    ADD (dest, vx6, perm0)      dest += stride;
 130.231 +    ADD (dest, vx7, perm1)
 130.232 +
 130.233 +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
 130.234 +}
 130.235 +

   131.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   131.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/mathops.h	Mon Aug 27 12:09:56 2012 +0200
   131.3 @@ -0,0 +1,79 @@
   131.4 +/*
   131.5 + * simple math operations
   131.6 + * Copyright (c) 2001, 2002 Fabrice Bellard
   131.7 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
   131.8 + *
   131.9 + * This file is part of FFmpeg.
  131.10 + *
  131.11 + * FFmpeg is free software; you can redistribute it and/or
  131.12 + * modify it under the terms of the GNU Lesser General Public
  131.13 + * License as published by the Free Software Foundation; either
  131.14 + * version 2.1 of the License, or (at your option) any later version.
  131.15 + *
  131.16 + * FFmpeg is distributed in the hope that it will be useful,
  131.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  131.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  131.19 + * Lesser General Public License for more details.
  131.20 + *
  131.21 + * You should have received a copy of the GNU Lesser General Public
  131.22 + * License along with FFmpeg; if not, write to the Free Software
  131.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  131.24 + */
  131.25 +
  131.26 +#ifndef AVCODEC_PPC_MATHOPS_H
  131.27 +#define AVCODEC_PPC_MATHOPS_H
  131.28 +
  131.29 +#include <stdint.h>
  131.30 +#include "config.h"
  131.31 +#include "libavutil/common.h"
  131.32 +
          /* The two macros below are only defined when HAVE_PPC4XX is set; they wrap
           * the maclhw and mullhw instructions. */
  131.33 +#if HAVE_PPC4XX
  131.34 +/* signed 16x16 -> 32 multiply add accumulate */
          /* rt is both input and output (tied through the "0" constraint), so it
           * must be an lvalue.  The expansion already ends in a semicolon. */
  131.35 +#define MAC16(rt, ra, rb) \
  131.36 +    __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
  131.37 +
  131.38 +/* signed 16x16 -> 32 multiply */
          /* Written as a GNU statement expression; evaluates to the int result. */
  131.39 +#define MUL16(ra, rb) \
  131.40 +    ({ int __rt; \
  131.41 +    __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
  131.42 +    __rt; })
  131.43 +#endif
  131.44 +
          /*
           * MULH(a, b): returns the result of the PowerPC mulhw instruction applied
           * to a and b (per the PowerPC ISA, the upper 32 bits of the signed 64-bit
           * product).
           * The self-referential #define makes the name visible to the preprocessor,
           * presumably so that generic code can test for an architecture-specific
           * MULH with #ifndef -- confirm against the including header.
           */
  131.45 +#define MULH MULH
  131.46 +static inline av_const int MULH(int a, int b){
  131.47 +    int r;
  131.48 +    __asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
  131.49 +    return r;
  131.50 +}
  131.51 +
  131.52 +#if !ARCH_PPC64
          /*
           * MAC64(d, a, b): 64-bit multiply-accumulate, d + a*b, built from 32-bit
           * instructions (this definition is only compiled when ARCH_PPC64 is 0).
           * d is split into two 32-bit halves through a union.  mullw/mulhw produce
           * the low and high words of a*b, addc adds the low words into hl[1] and
           * adde adds the high words plus the carry into hl[0].
           * Treating hl[0] as the high word relies on big-endian layout of the union.
           */
  131.53 +static inline av_const int64_t MAC64(int64_t d, int a, int b)
  131.54 +{
  131.55 +    union { uint64_t x; unsigned hl[2]; } x = { d };
  131.56 +    int h, l;
              /* h and l are early-clobber scratch outputs for the two product words. */
  131.57 +    __asm__ ("mullw %3, %4, %5   \n\t"
  131.58 +             "mulhw %2, %4, %5   \n\t"
  131.59 +             "addc  %1, %1, %3   \n\t"
  131.60 +             "adde  %0, %0, %2   \n\t"
  131.61 +             : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
  131.62 +             : "r"(a), "r"(b));
  131.63 +    return x.x;
  131.64 +}
          /* Macro form: updates d in place, so d must be an lvalue (it is evaluated
           * twice). */
  131.65 +#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
  131.66 +
          /*
           * MLS64(d, a, b): 64-bit multiply-subtract, d - a*b, built from 32-bit
           * instructions (this definition is only compiled when ARCH_PPC64 is 0).
           * d is split into two 32-bit halves through a union.  mullw/mulhw produce
           * the low and high words of a*b, subfc subtracts the low product word
           * from hl[1] and subfe subtracts the high product word from hl[0] using
           * the carry left by subfc.
           * Treating hl[0] as the high word relies on big-endian layout of the union.
           */
  131.67 +static inline av_const int64_t MLS64(int64_t d, int a, int b)
  131.68 +{
  131.69 +    union { uint64_t x; unsigned hl[2]; } x = { d };
  131.70 +    int h, l;
              /* h and l are early-clobber scratch outputs for the two product words. */
  131.71 +    __asm__ ("mullw %3, %4, %5   \n\t"
  131.72 +             "mulhw %2, %4, %5   \n\t"
  131.73 +             "subfc %1, %3, %1   \n\t"
  131.74 +             "subfe %0, %2, %0   \n\t"
  131.75 +             : "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
  131.76 +             : "r"(a), "r"(b));
  131.77 +    return x.x;
  131.78 +}
          /* Macro form: updates d in place, so d must be an lvalue (it is evaluated
           * twice). */
  131.79 +#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
  131.80 +#endif
  131.81 +
  131.82 +#endif /* AVCODEC_PPC_MATHOPS_H */

   132.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   132.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/types_altivec.h	Mon Aug 27 12:09:56 2012 +0200
   132.3 @@ -0,0 +1,46 @@
   132.4 +/*
   132.5 + * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
   132.6 + *
   132.7 + * This file is part of FFmpeg.
   132.8 + *
   132.9 + * FFmpeg is free software; you can redistribute it and/or
  132.10 + * modify it under the terms of the GNU Lesser General Public
  132.11 + * License as published by the Free Software Foundation; either
  132.12 + * version 2.1 of the License, or (at your option) any later version.
  132.13 + *
  132.14 + * FFmpeg is distributed in the hope that it will be useful,
  132.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  132.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  132.17 + * Lesser General Public License for more details.
  132.18 + *
  132.19 + * You should have received a copy of the GNU Lesser General Public
  132.20 + * License along with FFmpeg; if not, write to the Free Software
  132.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  132.22 + */
  132.23 +
  132.24 +#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H
  132.25 +#define AVCODEC_PPC_TYPES_ALTIVEC_H
  132.26 +
  132.27 +/***********************************************************************
  132.28 + * Vector types
  132.29 + **********************************************************************/
          /* Short aliases for the AltiVec vector types: u/s = unsigned/signed,
           * number = element width in bits.  These are plain macros, not typedefs. */
  132.30 +#define vec_u8  vector unsigned char
  132.31 +#define vec_s8  vector signed char
  132.32 +#define vec_u16 vector unsigned short
  132.33 +#define vec_s16 vector signed short
  132.34 +#define vec_u32 vector unsigned int
  132.35 +#define vec_s32 vector signed int
  132.36 +
  132.37 +/***********************************************************************
  132.38 + * Null vector
  132.39 + **********************************************************************/
          /* LOAD_ZERO declares a local all-zero byte vector named zerov; it carries
           * no trailing semicolon, so the user supplies one. */
  132.40 +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
  132.41 +
          /* Typed views of zerov.  They are only usable in a scope where LOAD_ZERO
           * (or another declaration of zerov) is visible.  The expansions are
           * unparenthesized casts. */
  132.42 +#define zero_u8v  (vec_u8)  zerov
  132.43 +#define zero_s8v  (vec_s8)  zerov
  132.44 +#define zero_u16v (vec_u16) zerov
  132.45 +#define zero_s16v (vec_s16) zerov
  132.46 +#define zero_u32v (vec_u32) zerov
  132.47 +#define zero_s32v (vec_s32) zerov
  132.48 +
  132.49 +#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */

   133.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   133.2 +++ b/ffmpeg_smp/h264dec/libavcodec/ppc/util_altivec.h	Mon Aug 27 12:09:56 2012 +0200
   133.3 @@ -0,0 +1,105 @@
   133.4 +/*
   133.5 + * This file is part of FFmpeg.
   133.6 + *
   133.7 + * FFmpeg is free software; you can redistribute it and/or
   133.8 + * modify it under the terms of the GNU Lesser General Public
   133.9 + * License as published by the Free Software Foundation; either
  133.10 + * version 2.1 of the License, or (at your option) any later version.
  133.11 + *
  133.12 + * FFmpeg is distributed in the hope that it will be useful,
  133.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  133.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  133.15 + * Lesser General Public License for more details.
  133.16 + *
  133.17 + * You should have received a copy of the GNU Lesser General Public
  133.18 + * License along with FFmpeg; if not, write to the Free Software
  133.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  133.20 + */
  133.21 +
  133.22 +/**
  133.23 + * @file
  133.24 + * Contains misc utility macros and inline functions
  133.25 + */
  133.26 +
  133.27 +#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H
  133.28 +#define AVCODEC_PPC_UTIL_ALTIVEC_H
  133.29 +
  133.30 +#include <stdint.h>
  133.31 +
  133.32 +#include "config.h"
  133.33 +
  133.34 +#if HAVE_ALTIVEC_H
  133.35 +#include <altivec.h>
  133.36 +#endif
  133.37 +
  133.38 +// used to build register permutation vectors (vcprm)
  133.39 +// the 's' variants select words from the _s_econd vector
          // each WORD_x expands to the four byte indices of one 32-bit word;
          // indices 0x00-0x0f address the first vector, 0x10-0x1f the second
  133.40 +#define WORD_0 0x00,0x01,0x02,0x03
  133.41 +#define WORD_1 0x04,0x05,0x06,0x07
  133.42 +#define WORD_2 0x08,0x09,0x0a,0x0b
  133.43 +#define WORD_3 0x0c,0x0d,0x0e,0x0f
  133.44 +#define WORD_s0 0x10,0x11,0x12,0x13
  133.45 +#define WORD_s1 0x14,0x15,0x16,0x17
  133.46 +#define WORD_s2 0x18,0x19,0x1a,0x1b
  133.47 +#define WORD_s3 0x1c,0x1d,0x1e,0x1f
  133.48 +
          // vcprm(a,b,c,d): byte-permute constant made of four word selectors,
          // each argument being one of 0..3 or s0..s3 (pasted onto WORD_)
          // vcii(a,b,c,d): float constant made of four signs, each argument being
          // n or p (pasted onto FLOAT_, defined further down; macros are expanded
          // at the point of use, so the later definition is fine)
  133.49 +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
  133.50 +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
  133.51 +
  133.52 +// vcprmle is used to keep the same index order as in the SSE version.
  133.53 +// it is the same as vcprm, with the argument order reversed
  133.54 +// ('le' is Little Endian)
  133.55 +#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
  133.56 +
  133.57 +// used to build inverse/identity vectors (vcii)
  133.58 +// n is _n_egative, p is _p_ositive
  133.59 +#define FLOAT_n -1.
  133.60 +#define FLOAT_p 1.
  133.61 +
  133.62 +
  133.63 +// Transpose 8x8 matrix of 16-bit elements (in-place)
          // a..h are the eight rows; each must be an lvalue of type
          // vector signed short, because every argument is read in the first merge
          // round and assigned in the last one.
          // The transpose is done in three rounds of vec_mergeh/vec_mergel, using
          // the block-local temporaries A1..H1 and A2..H2.
          // Wrapped in do { } while (0), so it is used like a single statement.
  133.64 +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
  133.65 +do { \
  133.66 +    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
  133.67 +    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
  133.68 + \
  133.69 +    A1 = vec_mergeh (a, e); \
  133.70 +    B1 = vec_mergel (a, e); \
  133.71 +    C1 = vec_mergeh (b, f); \
  133.72 +    D1 = vec_mergel (b, f); \
  133.73 +    E1 = vec_mergeh (c, g); \
  133.74 +    F1 = vec_mergel (c, g); \
  133.75 +    G1 = vec_mergeh (d, h); \
  133.76 +    H1 = vec_mergel (d, h); \
  133.77 + \
  133.78 +    A2 = vec_mergeh (A1, E1); \
  133.79 +    B2 = vec_mergel (A1, E1); \
  133.80 +    C2 = vec_mergeh (B1, F1); \
  133.81 +    D2 = vec_mergel (B1, F1); \
  133.82 +    E2 = vec_mergeh (C1, G1); \
  133.83 +    F2 = vec_mergel (C1, G1); \
  133.84 +    G2 = vec_mergeh (D1, H1); \
  133.85 +    H2 = vec_mergel (D1, H1); \
  133.86 + \
  133.87 +    a = vec_mergeh (A2, E2); \
  133.88 +    b = vec_mergel (A2, E2); \
  133.89 +    c = vec_mergeh (B2, F2); \
  133.90 +    d = vec_mergel (B2, F2); \
  133.91 +    e = vec_mergeh (C2, G2); \
  133.92 +    f = vec_mergel (C2, G2); \
  133.93 +    g = vec_mergeh (D2, H2); \
  133.94 +    h = vec_mergel (D2, H2); \
  133.95 +} while (0)
  133.96 +
  133.97 +
  133.98 +/** \brief loads unaligned vector \a *src with offset \a offset
  133.99 +    and returns it */
          /* Implementation: two vec_ld loads (at offset and at offset+15) are
           * combined with vec_perm, using the shift mask that vec_lvsl derives from
           * the same address.
           * NOTE(review): per the AltiVec documentation vec_ld ignores the low four
           * address bits, so the two loads should cover the aligned blocks that
           * contain the 16 requested bytes -- confirm. */
 133.100 +static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
 133.101 +{
 133.102 +    register vector unsigned char first = vec_ld(offset, src);
 133.103 +    register vector unsigned char second = vec_ld(offset+15, src);
 133.104 +    register vector unsigned char mask = vec_lvsl(offset, src);
 133.105 +    return vec_perm(first, second, mask);
 133.106 +}
 133.107 +
 133.108 +#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */

   134.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   134.2 +++ b/ffmpeg_smp/h264dec/libavcodec/raw.h	Mon Aug 27 12:09:56 2012 +0200
   134.3 @@ -0,0 +1,39 @@
   134.4 +/*
   134.5 + * Raw Video Codec
   134.6 + * Copyright (c) 2001 Fabrice Bellard
   134.7 + *
   134.8 + * This file is part of FFmpeg.
   134.9 + *
  134.10 + * FFmpeg is free software; you can redistribute it and/or
  134.11 + * modify it under the terms of the GNU Lesser General Public
  134.12 + * License as published by the Free Software Foundation; either
  134.13 + * version 2.1 of the License, or (at your option) any later version.
  134.14 + *
  134.15 + * FFmpeg is distributed in the hope that it will be useful,
  134.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  134.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  134.18 + * Lesser General Public License for more details.
  134.19 + *
  134.20 + * You should have received a copy of the GNU Lesser General Public
  134.21 + * License along with FFmpeg; if not, write to the Free Software
  134.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  134.23 + */
  134.24 +
  134.25 +/**
  134.26 + * @file
  134.27 + * Raw Video Codec
  134.28 + */
  134.29 +
  134.30 +#ifndef AVCODEC_RAW_H
  134.31 +#define AVCODEC_RAW_H
  134.32 +
  134.33 +#include "avcodec.h"
  134.34 +
          /** Pairs a pixel format with a fourcc code. */
  134.35 +typedef struct PixelFormatTag {
  134.36 +    enum PixelFormat pix_fmt;
  134.37 +    unsigned int fourcc;
  134.38 +} PixelFormatTag;
  134.39 +
          /* Table of PixelFormatTag entries, defined in another translation unit.
           * The declaration gives no length (how the table is terminated is not
           * visible here). */
  134.40 +extern const PixelFormatTag ff_raw_pixelFormatTags[];
          /* Declared here, defined elsewhere. */
  134.41 +int raw_init_encoder(AVCodecContext *avctx);
  134.42 +#endif /* AVCODEC_RAW_H */

   135.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   135.2 +++ b/ffmpeg_smp/h264dec/libavcodec/rectangle.h	Mon Aug 27 12:09:56 2012 +0200
   135.3 @@ -0,0 +1,92 @@
   135.4 +/*
   135.5 + * rectangle filling function
   135.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   135.7 + *
   135.8 + * This file is part of FFmpeg.
   135.9 + *
  135.10 + * FFmpeg is free software; you can redistribute it and/or
  135.11 + * modify it under the terms of the GNU Lesser General Public
  135.12 + * License as published by the Free Software Foundation; either
  135.13 + * version 2.1 of the License, or (at your option) any later version.
  135.14 + *
  135.15 + * FFmpeg is distributed in the hope that it will be useful,
  135.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  135.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  135.18 + * Lesser General Public License for more details.
  135.19 + *
  135.20 + * You should have received a copy of the GNU Lesser General Public
  135.21 + * License along with FFmpeg; if not, write to the Free Software
  135.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  135.23 + */
  135.24 +
  135.25 +/**
  135.26 + * @file
  135.27 + * useful rectangle filling function
  135.28 + * @author Michael Niedermayer <michaelni@gmx.at>
  135.29 + */
  135.30 +
  135.31 +#ifndef AVCODEC_RECTANGLE_H
  135.32 +#define AVCODEC_RECTANGLE_H
  135.33 +
  135.34 +#include <assert.h>
  135.35 +//#include "config.h"
  135.36 +#include "libavutil/common.h"
  135.37 +#include "dsputil.h"
  135.38 +
  135.39 +/**
  135.40 + * fill a rectangle.
         + * Stores val into a block of w x h elements of size bytes each, with
         + * consecutive rows stride elements apart.  Row widths of 2, 4, 8 and 16
         + * bytes (w*size) are implemented; any other width hits assert(0).
         + * h must be 1, 2 or 4 (the 16-byte-wide path always writes at least two
         + * rows, so it supports only 2 or 4).
         + * @param vp top-left element of the rectangle; asserted to be aligned to
         + *           FFMIN(w*size, STRIDE_ALIGN) bytes
  135.41 + * @param h height of the rectangle, should be a constant
  135.42 + * @param w width of the rectangle, should be a constant
         + * @param stride distance between rows, in elements of size bytes
         + * @param val value to store; when one store covers several elements the
         + *            value is replicated by multiplication
  135.43 + * @param size the size of val (1, 2 or 4), should be a constant
  135.44 + */
  135.45 +static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
  135.46 +    uint8_t *p= (uint8_t*)vp;
  135.47 +    assert(size==1 || size==2 || size==4);
  135.48 +    assert(w<=4);
  135.49 +
         +    /* from here on w and stride are measured in bytes */
  135.50 +    w      *= size;
  135.51 +    stride *= size;
  135.52 +
         +    /* uintptr_t rather than long: long may be narrower than a pointer */
  135.53 +    assert((((uintptr_t)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
  135.54 +    assert((stride&(w-1))==0);
  135.55 +    if(w==2){
  135.56 +        const uint16_t v= size==4 ? val : val*0x0101;
  135.57 +        *(uint16_t*)(p + 0*stride)= v;
  135.58 +        if(h==1) return;
  135.59 +        *(uint16_t*)(p + 1*stride)= v;
  135.60 +        if(h==2) return;
  135.61 +        *(uint16_t*)(p + 2*stride)= v;
  135.62 +        *(uint16_t*)(p + 3*stride)= v;
  135.63 +    }else if(w==4){
  135.64 +        const uint32_t v= size==4 ? val : size==2 ? val*0x00010001 : val*0x01010101;
  135.65 +        *(uint32_t*)(p + 0*stride)= v;
  135.66 +        if(h==1) return;
  135.67 +        *(uint32_t*)(p + 1*stride)= v;
  135.68 +        if(h==2) return;
  135.69 +        *(uint32_t*)(p + 2*stride)= v;
  135.70 +        *(uint32_t*)(p + 3*stride)= v;
  135.71 +    }else if(w==8){
         +        /* size is 2 or 4 here: w<=4 elements rules out size 1 */
  135.72 +        const uint64_t v=  size==2 ? val*0x0001000100010001ULL : val*0x0100000001ULL;
  135.73 +        *(uint64_t*)(p + 0*stride)= v;
  135.74 +        if(h==1) return;
  135.75 +        *(uint64_t*)(p + 1*stride)= v;
  135.76 +        if(h==2) return;
  135.77 +        *(uint64_t*)(p + 2*stride)= v;
  135.78 +        *(uint64_t*)(p + 3*stride)= v;
  135.79 +    }else if(w==16){
         +        /* only reachable with size 4; two 64-bit stores per row */
  135.80 +        const uint64_t v= val*0x0100000001ULL;
  135.81 +        *(uint64_t*)(p + 0+0*stride)= v;
  135.82 +        *(uint64_t*)(p + 8+0*stride)= v;
  135.83 +        *(uint64_t*)(p + 0+1*stride)= v;
  135.84 +        *(uint64_t*)(p + 8+1*stride)= v;
  135.85 +        if(h==2) return;
  135.86 +        *(uint64_t*)(p + 0+2*stride)= v;
  135.87 +        *(uint64_t*)(p + 8+2*stride)= v;
  135.88 +        *(uint64_t*)(p + 0+3*stride)= v;
  135.89 +        *(uint64_t*)(p + 8+3*stride)= v;
  135.90 +    }else
  135.91 +        assert(0);
         +    /* every path that falls through has written four rows */
  135.92 +    assert(h==4);
  135.93 +}
  135.94 +
  135.95 +#endif /* AVCODEC_RECTANGLE_H */

   136.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   136.2 +++ b/ffmpeg_smp/h264dec/libavcodec/scratch.c	Mon Aug 27 12:09:56 2012 +0200
   136.3 @@ -0,0 +1,295 @@
          /*
           * Worker thread: takes slices from the queue h->ed_q, runs
           * decode_slice_entropy() on them and copies the result into the queue
           * h->mbdec_q.
           * arg is the shared H264Context.  Always returns NULL, including the
           * early exit when init_cabac() fails.
           * A slice whose state is negative ends the loop; that slice is still
           * copied into mbdec_q after the loop, before the thread exits.
           */
   136.4 +static void *entropy_thread(void *arg){
   136.5 +	H264Context *h = (H264Context *) arg;
   136.6 +	EDSlice *s;
   136.7 +	
   136.8 +	H264Cabac hcabac;
   136.9 +	CABACContext cabac;
  136.10 +	
  136.11 +	ff_init_cabac_states();
  136.12 +	
          	/* NOTE(review): this early return does not put anything into mbdec_q;
          	 * a consumer waiting on that queue may need to be told -- confirm. */
  136.13 +	if (init_cabac(h, &hcabac)<0)
  136.14 +		return NULL;
  136.15 +	
  136.16 +	for(;;){
          		/* Dequeue: wait under the ENTROPY lock until ed_cnt is positive,
          		 * then take the slot at index ed_fo. */
  136.17 +		{
  136.18 +			pthread_mutex_lock(&h->lock[ENTROPY]);
  136.19 +			while (h->ed_cnt<=0)
  136.20 +				pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
  136.21 +			s= &h->ed_q[h->ed_fo];
  136.22 +			pthread_mutex_unlock(&h->lock[ENTROPY]);
          			/* NOTE(review): ed_fo is advanced after the lock is released;
          			 * that is only safe if this thread is the sole user of ed_fo
          			 * -- confirm. */
  136.23 +			h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT;
  136.24 +		}
  136.25 +		if (s->state<0)
  136.26 +			break;
  136.27 +		
  136.28 +		decode_slice_entropy(&hcabac, &cabac, s);
  136.29 +		
          		/* Enqueue for macroblock decoding: wait under the MBDEC lock while
          		 * the queue is full, then copy the slice into slot mbdec_fi.
          		 * NOTE(review): the copy reads the EDSlice through an MBSlice
          		 * pointer; presumably the two types share a layout -- confirm. */
  136.30 +		{
  136.31 +			pthread_mutex_lock(&h->lock[MBDEC]);
  136.32 +			while (h->mbdec_cnt >= MAX_SLICE_COUNT)
  136.33 +				pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
  136.34 +			h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s);
  136.35 +			h->mbdec_cnt++;
  136.36 +			h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
  136.37 +			pthread_cond_signal(&h->cond[MBDEC]);
  136.38 +			pthread_mutex_unlock(&h->lock[MBDEC]);
  136.39 +		}
          		/* ed_cnt is decremented only now, after the slice has been copied
          		 * out; presumably this keeps the ed_q slot from being reused while
          		 * s still points at it -- confirm against the producer. */
  136.40 +		{
  136.41 +			pthread_mutex_lock(&h->lock[ENTROPY]);
  136.42 +			h->ed_cnt--;
  136.43 +			pthread_cond_signal(&h->cond[ENTROPY]);
  136.44 +			pthread_mutex_unlock(&h->lock[ENTROPY]);
  136.45 +		}
  136.46 +	}
  136.47 +	
          	/* Forward the terminating slice (state < 0) to the next stage.  ed_cnt
          	 * is not decremented for it. */
  136.48 +	{
  136.49 +		pthread_mutex_lock(&h->lock[MBDEC]);
  136.50 +		while (h->mbdec_cnt >= MAX_SLICE_COUNT)
  136.51 +			pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
  136.52 +		h->mbdec_q[h->mbdec_fi] = *((MBSlice *) s);
  136.53 +		h->mbdec_cnt++;
  136.54 +		h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
  136.55 +		pthread_cond_signal(&h->cond[MBDEC]);
  136.56 +		pthread_mutex_unlock(&h->lock[MBDEC]);
  136.57 +		
  136.58 +	}
  136.59 +	
  136.60 +	free_cabac(&hcabac);
  136.61 +	
          	/* pthread_exit() does not return; the return statement below is never
          	 * reached. */
  136.62 +	pthread_exit(NULL);
  136.63 +	return NULL;
  136.64 +	
  136.65 +}
  136.66 +/*
  136.67 +* The following code is the main loop of the file converter
         +*
         +* This variant runs the whole pipeline in a single worker thread
         +* (entropy_mbd_thread), which sets up its own parser, NAL, entropy,
         +* macroblock-decode and write contexts.  The function creates the decoder
         +* context, records the start time in timer_start, runs the worker to
         +* completion and releases the decoder context.
         +*
         +* ifile, ofile, frame_width and frame_height are passed unchanged to
         +* ff_h264_decode_init().
         +* Returns 0 on success, -1 if the worker thread could not be created.
         +* NOTE(review): the result of ff_h264_decode_init() is used unchecked; how
         +* it reports failure is not visible here.
  136.68 +*/
  136.69 +int av_transcode_1ed(int ifile, int ofile, int frame_width, int frame_height) {
  136.70 +	H264Context *h;
         +	pthread_t entropy_thr;
         +	int ret = 0;
  136.72 +	
  136.73 +	h = ff_h264_decode_init(ifile, ofile, frame_width, frame_height);
  136.74 +	
  136.75 +	timer_start = av_gettime();
  136.76 +	
         +	/* Join only a thread that was actually created: after a failed
         +	 * pthread_create() the contents of entropy_thr are undefined. */
         +	if (pthread_create(&entropy_thr, NULL, entropy_mbd_thread, h) != 0)
         +		ret = -1;
         +	else
         +		pthread_join(entropy_thr, NULL);
  136.91 +	
  136.92 +	/* finished ! */
  136.93 +	ff_h264_decode_end(h);
  136.94 +	
         +	return ret;
  136.96 +}
  136.97 +
  136.98 +static void reset_h264mb(EDSlice *s, int mb_width, int mb_height){
  136.99 +	for (int i=0; i<mb_height; i++){
 136.100 +		for (int j=0; j<mb_width; j++){
 136.101 +			H264Mb *m = &s->mbs[i*mb_width + j];
 136.102 +
 136.103 +			m->left_mb_xy=0;
 136.104 +			m->top_mb_xy = 0;
 136.105 +		}
 136.106 +	}
 136.107 +}
 136.108 +
 136.109 +static void *entropy_mbd_thread(void *arg){ /* Thread entry point: each loop iteration reads one frame, parses it, entropy-decodes it, decodes its macroblocks and passes the picture to output_frame(). arg is an H264Context*. */
 136.110 +	H264Context *h = (H264Context *) arg;
 136.111 +
 136.112 +	EDSlice slice, *s=&slice; /* slice state passed to decode_slice_entropy() */
 136.113 +	MBSlice mbslice, *s2=&mbslice; /* partial copy of *s passed to decode_slice_mb_seq() */
 136.114 +	H264Cabac hcabac;
 136.115 +	CABACContext cabac;
 136.116 +	int frames =0; /* NOTE(review): not used anywhere else in this function */
 136.117 +	MBDecContext mbdec, *d=&mbdec;
 136.118 +	int size=h->width*h->height; /* pixel count; only used to size w->bit_buffer below */
 136.119 +	WriteContext write, *w=&write;
 136.120 +	AVCodecParserContext parser, *pc= &parser;
 136.121 +	NalContext nal, *n=&nal;
 136.122 +
 136.123 +
 136.124 +	memset(pc, 0, sizeof(AVCodecParserContext)); /* parser context lives on this thread's stack and starts zeroed */
 136.125 +	pc->buffer_size = 2048;
 136.126 +	pc->final_frame = 0;
 136.127 +	pc->cur_len= 0;
 136.128 +	pc->data = av_mallocz(2048 + FF_INPUT_BUFFER_PADDING_SIZE); /* NOTE(review): result is not checked, and pc->data is not freed in this function */
 136.129 +	pc->size = 2048;
 136.130 +	pc->eof_reached =0;
 136.131 +	pc->ifile = h->ifile; /* input comes from the file handle stored in the shared context */
 136.132 +
 136.133 +	//init parse
 136.134 +	memset(n, 0, sizeof(NalContext));
 136.135 +	n->width = h->width;
 136.136 +	n->height = h->height;
 136.137 +	n->mb_height = h->mb_height;
 136.138 +	n->mb_width  = h->mb_width;
 136.139 +	n->b4_stride = n->mb_width*4 + 1; /* four 4x4 blocks per macroblock column, plus one */
 136.140 +	n->mb_stride = n->mb_width + 1;
 136.141 +	n->outputed_poc = INT_MIN;
 136.142 +// 	memset(s, 0, sizeof(EDSlice));
 136.143 +// 	ff_init_slice(n, s);
 136.144 +//
 136.145 +
 136.146 +	memset(w, 0, sizeof(WriteContext));
 136.147 +	w->bit_buffer_size= FFMAX(1024*256, 6*size + 200); /* at least 256 KiB, or 6 bytes per pixel plus 200 */
 136.148 +	w->bit_buffer=  av_mallocz(w->bit_buffer_size); /* released with av_free() after the decode loop; result is not checked */
 136.149 +
 136.150 +
 136.151 +
 136.152 +	ff_h264dsp_init(&d->hdsp);
 136.153 +	ff_h264_pred_init(&d->hpc);
 136.154 +	dsputil_init(&d->dsp);
 136.155 +	d->hdsp.qpel_put= d->dsp.put_h264_qpel_pixels_tab; /* share the qpel tables set up by dsputil_init() */
 136.156 +	d->hdsp.qpel_avg= d->dsp.avg_h264_qpel_pixels_tab;
 136.157 +	d->mb_height = (h->height + 15) / 16; /* dimensions in 16-pixel macroblocks, rounded up */
 136.158 +	d->mb_width  = (h->width  + 15) / 16;
 136.159 +	d->linesize = h->width + EDGE_WIDTH*2; /* row stride: picture width plus EDGE_WIDTH on each side */
 136.160 +	d->uvlinesize = d->linesize>>1; /* second stride is half of linesize */
 136.161 +
 136.162 +	for(int i=0; i<16; i++){ /* byte offset of each of the 16 blocks: column from the low 3 bits of the scan8 delta, row from the remaining bits */
 136.163 +		d->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*d->linesize*((scan8[i] - scan8[0])>>3);
 136.164 +	}
 136.165 +	for(int i=0; i<4; i++){ /* entries 16..19 and 20..23 get the same offsets, computed with uvlinesize */
 136.166 +		d->block_offset[16+i]=
 136.167 +		d->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*d->uvlinesize*((scan8[i] - scan8[0])>>3);
 136.168 +	}
 136.169 +
 136.170 +	d->scratchpad= av_mallocz((h->width+64)*4*16*2*sizeof(uint8_t)); /* NOTE(review): result is not checked, and scratchpad is not freed in this function */
 136.171 +
 136.172 +	ff_init_cabac_states();
 136.173 +
 136.174 +	if (init_cabac(h, &hcabac)<0) /* NOTE(review): this early return leaves the buffers allocated above unreleased */
 136.175 +		return NULL;
 136.176 +
 136.177 +	while(!pc->final_frame && frames_max++ < 1000){ /* frames_max is not declared in this function (presumably file scope); loop ends at the parser's final frame or once frames_max reaches 1000 */
 136.178 +		Picture *out;
 136.179 +
 136.180 +		RawFrame *frm;
 136.181 +		Picture *pic=NULL;
 136.182 +
 136.183 +		RawFrame frm_read; /* frame storage is local to this iteration */
 136.184 +		frm_read.state =0;
 136.185 +		av_read_frame_internal(pc, &frm_read);
 136.186 +		frm = &frm_read;
 136.187 +
 136.188 +		if (frm->state < 0) /* a negative state after reading ends the decode loop */
 136.189 +			break;
 136.190 +/*
 136.191 +		{
 136.192 +			pthread_mutex_lock(&h->lock[PARSE2]);
 136.193 +			while (h->slice_cnt<=0)
 136.194 +				pthread_cond_wait(&h->cond[PARSE2], &h->lock[PARSE2]);
 136.195 +			h->slice_cnt--;
 136.196 +			s= &h->slices[h->slice_next++];
 136.197 +			h->slice_next %= MAX_SLICE_COUNT;
 136.198 +			pthread_mutex_unlock(&h->lock[PARSE2]);
 136.199 +		}*/
 136.200 +		ff_init_slice(n, s);
 136.201 +		reset_h264mb(s, n->mb_width, n->mb_height);
 136.202 +		for(int i=0; i<MAX_PIC_COUNT; i++){ /* take the first picture whose reference field is 0 */
 136.203 +			if(h->picture[i].reference==0){
 136.204 +				pic= &h->picture[i];
 136.205 +				break;
 136.206 +			}
 136.207 +		}
 136.208 +// 		{
 136.209 +// 			pthread_mutex_lock(&h->lock[PARSE3]);
 136.210 +// 			while (h->free_pic_cnt<=0)
 136.211 +// 				pthread_cond_wait(&h->cond[PARSE3], &h->lock[PARSE3]);
 136.212 +// 			h->free_pic_cnt--;
 136.213 +// 			/* use first free picture */
 136.214 +// 			for(int i=0; i<MAX_PIC_COUNT; i++){
 136.215 +// 				if(h->picture[i].reference==0){
 136.216 +// 					pic= &h->picture[i];
 136.217 +// 					break;
 136.218 +// 				}
 136.219 +// 			}
 136.220 +// 			pthread_mutex_unlock(&h->lock[PARSE3]);
 136.221 +// 		}
 136.222 +		ff_alloc_picture(n, s, pic); /* NOTE(review): pic is still NULL here if no entry had reference==0 */
 136.223 +
 136.224 +		decode_nal_units(n, s, frm, pic);
 136.225 +
 136.226 +
 136.227 +		decode_slice_entropy(&hcabac, &cabac, s);
 136.228 +		memcpy( s2, s, sizeof(MBSlice)); //this only copies the COMMON_SLICE part
 136.229 +		av_freep(&s->gb.raw); /* s->gb.raw is released before macroblock decoding starts */
 136.230 +		decode_slice_mb_seq(d, s2);
 136.231 +
 136.232 +//         if (s2->release_cnt>0) {
 136.233 +//             int i;
 136.234 +//             for (i=0; i<s2->release_cnt; i++){
 136.235 +//                 if ((s2->release_ref[i]->reference & ~2) == 0)
 136.236 +//                     default_release_buffer(h, s2->release_ref[i]);
 136.237 +//                 else
 136.238 +//                     s2->release_ref[i]->reference &= ~2;
 136.239 +//             }
 136.240 +//             s->release_cnt=0;
 136.241 +//         }
 136.242 +
 136.243 +if (s->release_cnt>0) { /* clear flag value 2 on every picture listed in release_ref, then empty the list */
 136.244 +	int i;
 136.245 +	for (i=0; i<s->release_cnt; i++){
 136.246 +		s->release_ref[i]->reference &= ~2;
 136.247 +	}
 136.248 +	s->release_cnt=0;
 136.249 +}
 136.250 +
 136.251 +
 136.252 +        {
 136.253 +			pthread_mutex_lock(&h->lock[PARSE2]); /* slice_cnt is incremented and signalled under the PARSE2 lock */
 136.254 +			h->slice_cnt++;
 136.255 +			pthread_cond_signal(&h->cond[PARSE2]);
 136.256 +			pthread_mutex_unlock(&h->lock[PARSE2]);
 136.257 +		}
 136.258 +
 136.259 +		out =output_frame(w, s2->current_picture, h->ofile, h->width, h->height); /* may return NULL; checked below */
 136.260 +		print_report(w->frame_number, w->video_size, 0);
 136.261 +
 136.262 +		if (out){
 136.263 +// 			if ((out->reference & ~1) == 0)
 136.264 +// 				default_release_buffer(h, out);
 136.265 +// 			else
 136.266 +				out->reference &= ~1; /* clear flag value 1 on the picture returned by output_frame() */
 136.267 +		}
 136.268 +
 136.269 +		{
 136.270 +			pthread_mutex_lock(&h->lock[ENTROPY]); /* ed_cnt is decremented and signalled under the ENTROPY lock */
 136.271 +			h->ed_cnt--;
 136.272 +			pthread_cond_signal(&h->cond[ENTROPY]);
 136.273 +			pthread_mutex_unlock(&h->lock[ENTROPY]);
 136.274 +		}
 136.275 +	}
 136.276 +	while (output_frame(w, NULL, h->ofile, h->width, h->height)); /* call with a NULL picture until it returns NULL; presumably drains pictures still waiting for output */
 136.277 +	print_report(w->frame_number, w->video_size, 1);
 136.278 +
 136.279 +	av_free(w->bit_buffer);
 136.280 +
 136.281 +	{//propagate exit: queue last_pic with reference = -1 on h->write_q (last_pic is not declared in this function; presumably an end marker for the queue's consumer)
 136.282 +		pthread_mutex_lock(&h->lock[WRITE]);
 136.283 +		while (h->write_cnt>= MAX_DELAYED_PIC_COUNT) /* wait while the write queue is full */
 136.284 +			pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
 136.285 +		last_pic.reference = -1;
 136.286 +		h->write_q[h->write_fi] = &last_pic;
 136.287 +		h->write_cnt++;
 136.288 +		h->write_fi++; h->write_fi %= MAX_DELAYED_PIC_COUNT; /* advance the insert index circularly */
 136.289 +		pthread_cond_signal(&h->cond[WRITE]);
 136.290 +		pthread_mutex_unlock(&h->lock[WRITE]);
 136.291 +
 136.292 +	}
 136.293 +	free_cabac(&hcabac);
 136.294 +
 136.295 +	pthread_exit(NULL); /* does not return */
 136.296 +	return NULL; /* unreachable after pthread_exit() */
 136.297 +
 136.298 +}

   137.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   137.2 +++ b/ffmpeg_smp/h264dec/libavcodec/simple_idct.c	Mon Aug 27 12:09:56 2012 +0200
   137.3 @@ -0,0 +1,372 @@
   137.4 +/*
   137.5 + * Simple IDCT
   137.6 + *
   137.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
   137.8 + *
   137.9 + * This file is part of FFmpeg.
  137.10 + *
  137.11 + * FFmpeg is free software; you can redistribute it and/or
  137.12 + * modify it under the terms of the GNU Lesser General Public
  137.13 + * License as published by the Free Software Foundation; either
  137.14 + * version 2.1 of the License, or (at your option) any later version.
  137.15 + *
  137.16 + * FFmpeg is distributed in the hope that it will be useful,
  137.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  137.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  137.19 + * Lesser General Public License for more details.
  137.20 + *
  137.21 + * You should have received a copy of the GNU Lesser General Public
  137.22 + * License along with FFmpeg; if not, write to the Free Software
  137.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  137.24 + */
  137.25 +
  137.26 +/**
  137.27 + * @file
  137.28 + * simpleidct in C.
  137.29 + */
  137.30 +
  137.31 +/*
  137.32 +  based upon some commented-out C code from mpeg2dec (idct_mmx.c
  137.33 +  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
  137.34 + */
  137.35 +#include "avcodec.h"
  137.36 +#include "dsputil.h"
  137.37 +#include "mathops.h"
  137.38 +#include "simple_idct.h"
  137.39 +
  137.40 +#if 0 /* disabled alternative: constants scaled by 2048 with matching shifts */
  137.41 +#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
  137.42 +#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
  137.43 +#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
  137.44 +#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
  137.45 +#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
  137.46 +#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
  137.47 +#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
  137.48 +#define ROW_SHIFT 8
  137.49 +#define COL_SHIFT 17
  137.50 +#else /* active set: constants scaled by 1<<14; Wi uses i in the formula below */
  137.51 +#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137.52 +#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137.53 +#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137.54 +#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -- NOTE(review): for i=4 the formula gives 16384; 16383 is one less, presumably on purpose (verify before changing)
  137.55 +#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137.56 +#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137.57 +#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  137.58 +#define ROW_SHIFT 11
  137.59 +#define COL_SHIFT 20 // 6
  137.60 +#endif
  137.61 +
  137.62 +static inline void idctRowCondDC (DCTELEM * row) /* In-place 1-D IDCT of one row of 8 coefficients, with a shortcut when only row[0] is non-zero. */
  137.63 +{
  137.64 +        int a0, a1, a2, a3, b0, b1, b2, b3; /* a* collect the even-index terms, b* the odd-index terms */
  137.65 +        uint64_t temp;
  137.66 +
  137.67 +#if HAVE_BIGENDIAN
  137.68 +#define ROW0_MASK 0xffff000000000000LL /* the 16 bits of the first 64-bit word that hold row[0] */
  137.69 +#else
  137.70 +#define ROW0_MASK 0xffffLL
  137.71 +#endif
  137.72 +        if(sizeof(DCTELEM)==2){ /* 16-bit coefficients: the whole row is two 64-bit words */
  137.73 +            if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) | /* true when row[1..7] are all zero */
  137.74 +                  ((uint64_t *)row)[1]) == 0) {
  137.75 +                temp = (row[0] << 3) & 0xffff; /* DC-only result: row[0]*8, kept to 16 bits */
  137.76 +                temp += temp << 16; /* replicate that value into all four 16-bit lanes */
  137.77 +                temp += temp << 32;
  137.78 +                ((uint64_t *)row)[0] = temp;
  137.79 +                ((uint64_t *)row)[1] = temp;
  137.80 +                return;
  137.81 +            }
  137.82 +        }else{ /* other coefficient sizes: same shortcut, element by element */
  137.83 +            if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
  137.84 +                row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
  137.85 +                return;
  137.86 +            }
  137.87 +        }
  137.88 +
  137.89 +        a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); /* DC term plus the rounding constant for the final >> ROW_SHIFT */
  137.90 +        a1 = a0;
  137.91 +        a2 = a0;
  137.92 +        a3 = a0;
  137.93 +
  137.94 +        /* no need to optimize : gcc does it */
  137.95 +        a0 += W2 * row[2];
  137.96 +        a1 += W6 * row[2];
  137.97 +        a2 -= W6 * row[2];
  137.98 +        a3 -= W2 * row[2];
  137.99 +
 137.100 +        b0 = MUL16(W1, row[1]); /* MUL16/MAC16 are defined outside this chunk; presumably multiply and multiply-accumulate */
 137.101 +        MAC16(b0, W3, row[3]);
 137.102 +        b1 = MUL16(W3, row[1]);
 137.103 +        MAC16(b1, -W7, row[3]);
 137.104 +        b2 = MUL16(W5, row[1]);
 137.105 +        MAC16(b2, -W1, row[3]);
 137.106 +        b3 = MUL16(W7, row[1]);
 137.107 +        MAC16(b3, -W5, row[3]);
 137.108 +
 137.109 +        temp = ((uint64_t*)row)[1]; /* second 64-bit word; covers row[4..7] when DCTELEM is 16-bit */
 137.110 +
 137.111 +        if (temp != 0) { /* the row[4..7] terms are only added when that word is non-zero */
 137.112 +            a0 += W4*row[4] + W6*row[6];
 137.113 +            a1 += - W4*row[4] - W2*row[6];
 137.114 +            a2 += - W4*row[4] + W2*row[6];
 137.115 +            a3 += W4*row[4] - W6*row[6];
 137.116 +
 137.117 +            MAC16(b0, W5, row[5]);
 137.118 +            MAC16(b0, W7, row[7]);
 137.119 +
 137.120 +            MAC16(b1, -W1, row[5]);
 137.121 +            MAC16(b1, -W5, row[7]);
 137.122 +
 137.123 +            MAC16(b2, W7, row[5]);
 137.124 +            MAC16(b2, W3, row[7]);
 137.125 +
 137.126 +            MAC16(b3, W3, row[5]);
 137.127 +            MAC16(b3, -W1, row[7]);
 137.128 +        }
 137.129 +
 137.130 +        row[0] = (a0 + b0) >> ROW_SHIFT; /* outputs k and 7-k are the sum and difference of the same a/b pair */
 137.131 +        row[7] = (a0 - b0) >> ROW_SHIFT;
 137.132 +        row[1] = (a1 + b1) >> ROW_SHIFT;
 137.133 +        row[6] = (a1 - b1) >> ROW_SHIFT;
 137.134 +        row[2] = (a2 + b2) >> ROW_SHIFT;
 137.135 +        row[5] = (a2 - b2) >> ROW_SHIFT;
 137.136 +        row[3] = (a3 + b3) >> ROW_SHIFT;
 137.137 +        row[4] = (a3 - b3) >> ROW_SHIFT;
 137.138 +}
 137.139 +
 137.140 +static inline void idctSparseColPut (uint8_t *dest, int line_size, /* 1-D IDCT down one column of an 8x8 block; the 8 results overwrite dest[0], dest[line_size], ... */
 137.141 +                                     DCTELEM * col)
 137.142 +{
 137.143 +        int a0, a1, a2, a3, b0, b1, b2, b3; /* a* collect the even-index terms, b* the odd-index terms */
 137.144 +        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* output lookup table, defined outside this chunk; presumably clamps to the uint8_t range */
 137.145 +
 137.146 +        /* XXX: I did that only to give same values as previous code */
 137.147 +        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); /* DC term; the rounding constant is folded in before multiplying by W4 */
 137.148 +        a1 = a0;
 137.149 +        a2 = a0;
 137.150 +        a3 = a0;
 137.151 +
 137.152 +        a0 +=  + W2*col[8*2]; /* col[8*k] is coefficient k of this column (row stride is 8) */
 137.153 +        a1 +=  + W6*col[8*2];
 137.154 +        a2 +=  - W6*col[8*2];
 137.155 +        a3 +=  - W2*col[8*2];
 137.156 +
 137.157 +        b0 = MUL16(W1, col[8*1]);
 137.158 +        b1 = MUL16(W3, col[8*1]);
 137.159 +        b2 = MUL16(W5, col[8*1]);
 137.160 +        b3 = MUL16(W7, col[8*1]);
 137.161 +
 137.162 +        MAC16(b0, + W3, col[8*3]);
 137.163 +        MAC16(b1, - W7, col[8*3]);
 137.164 +        MAC16(b2, - W1, col[8*3]);
 137.165 +        MAC16(b3, - W5, col[8*3]);
 137.166 +
 137.167 +        if(col[8*4]){ /* coefficients 4..7 are each skipped when zero */
 137.168 +            a0 += + W4*col[8*4];
 137.169 +            a1 += - W4*col[8*4];
 137.170 +            a2 += - W4*col[8*4];
 137.171 +            a3 += + W4*col[8*4];
 137.172 +        }
 137.173 +
 137.174 +        if (col[8*5]) {
 137.175 +            MAC16(b0, + W5, col[8*5]);
 137.176 +            MAC16(b1, - W1, col[8*5]);
 137.177 +            MAC16(b2, + W7, col[8*5]);
 137.178 +            MAC16(b3, + W3, col[8*5]);
 137.179 +        }
 137.180 +
 137.181 +        if(col[8*6]){
 137.182 +            a0 += + W6*col[8*6];
 137.183 +            a1 += - W2*col[8*6];
 137.184 +            a2 += + W2*col[8*6];
 137.185 +            a3 += - W6*col[8*6];
 137.186 +        }
 137.187 +
 137.188 +        if (col[8*7]) {
 137.189 +            MAC16(b0, + W7, col[8*7]);
 137.190 +            MAC16(b1, - W5, col[8*7]);
 137.191 +            MAC16(b2, + W3, col[8*7]);
 137.192 +            MAC16(b3, - W1, col[8*7]);
 137.193 +        }
 137.194 +
 137.195 +        dest[0] = cm[(a0 + b0) >> COL_SHIFT]; /* rows 0..3 use a+b, rows 4..7 use a-b in reverse order */
 137.196 +        dest += line_size; /* step to the next picture row */
 137.197 +        dest[0] = cm[(a1 + b1) >> COL_SHIFT];
 137.198 +        dest += line_size;
 137.199 +        dest[0] = cm[(a2 + b2) >> COL_SHIFT];
 137.200 +        dest += line_size;
 137.201 +        dest[0] = cm[(a3 + b3) >> COL_SHIFT];
 137.202 +        dest += line_size;
 137.203 +        dest[0] = cm[(a3 - b3) >> COL_SHIFT];
 137.204 +        dest += line_size;
 137.205 +        dest[0] = cm[(a2 - b2) >> COL_SHIFT];
 137.206 +        dest += line_size;
 137.207 +        dest[0] = cm[(a1 - b1) >> COL_SHIFT];
 137.208 +        dest += line_size;
 137.209 +        dest[0] = cm[(a0 - b0) >> COL_SHIFT];
 137.210 +}
 137.211 +
 137.212 +static inline void idctSparseColAdd (uint8_t *dest, int line_size, /* 1-D IDCT down one column of an 8x8 block; each result is added to the pixel already in dest before the table lookup */
 137.213 +                                     DCTELEM * col)
 137.214 +{
 137.215 +        int a0, a1, a2, a3, b0, b1, b2, b3; /* a* collect the even-index terms, b* the odd-index terms */
 137.216 +        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* output lookup table, defined outside this chunk; presumably clamps to the uint8_t range */
 137.217 +
 137.218 +        /* XXX: I did that only to give same values as previous code */
 137.219 +        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); /* DC term; the rounding constant is folded in before multiplying by W4 */
 137.220 +        a1 = a0;
 137.221 +        a2 = a0;
 137.222 +        a3 = a0;
 137.223 +
 137.224 +        a0 +=  + W2*col[8*2]; /* col[8*k] is coefficient k of this column (row stride is 8) */
 137.225 +        a1 +=  + W6*col[8*2];
 137.226 +        a2 +=  - W6*col[8*2];
 137.227 +        a3 +=  - W2*col[8*2];
 137.228 +
 137.229 +        b0 = MUL16(W1, col[8*1]);
 137.230 +        b1 = MUL16(W3, col[8*1]);
 137.231 +        b2 = MUL16(W5, col[8*1]);
 137.232 +        b3 = MUL16(W7, col[8*1]);
 137.233 +
 137.234 +        MAC16(b0, + W3, col[8*3]);
 137.235 +        MAC16(b1, - W7, col[8*3]);
 137.236 +        MAC16(b2, - W1, col[8*3]);
 137.237 +        MAC16(b3, - W5, col[8*3]);
 137.238 +
 137.239 +        if(col[8*4]){ /* coefficients 4..7 are each skipped when zero */
 137.240 +            a0 += + W4*col[8*4];
 137.241 +            a1 += - W4*col[8*4];
 137.242 +            a2 += - W4*col[8*4];
 137.243 +            a3 += + W4*col[8*4];
 137.244 +        }
 137.245 +
 137.246 +        if (col[8*5]) {
 137.247 +            MAC16(b0, + W5, col[8*5]);
 137.248 +            MAC16(b1, - W1, col[8*5]);
 137.249 +            MAC16(b2, + W7, col[8*5]);
 137.250 +            MAC16(b3, + W3, col[8*5]);
 137.251 +        }
 137.252 +
 137.253 +        if(col[8*6]){
 137.254 +            a0 += + W6*col[8*6];
 137.255 +            a1 += - W2*col[8*6];
 137.256 +            a2 += + W2*col[8*6];
 137.257 +            a3 += - W6*col[8*6];
 137.258 +        }
 137.259 +
 137.260 +        if (col[8*7]) {
 137.261 +            MAC16(b0, + W7, col[8*7]);
 137.262 +            MAC16(b1, - W5, col[8*7]);
 137.263 +            MAC16(b2, + W3, col[8*7]);
 137.264 +            MAC16(b3, - W1, col[8*7]);
 137.265 +        }
 137.266 +
 137.267 +        dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)]; /* existing pixel plus residual, looked up through cm */
 137.268 +        dest += line_size; /* step to the next picture row */
 137.269 +        dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
 137.270 +        dest += line_size;
 137.271 +        dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
 137.272 +        dest += line_size;
 137.273 +        dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
 137.274 +        dest += line_size;
 137.275 +        dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
 137.276 +        dest += line_size;
 137.277 +        dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
 137.278 +        dest += line_size;
 137.279 +        dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
 137.280 +        dest += line_size;
 137.281 +        dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
 137.282 +}
 137.283 +
 137.284 +static inline void idctSparseCol (DCTELEM * col) /* 1-D IDCT down one column of an 8x8 block, written back into the block itself with no table lookup */
 137.285 +{
 137.286 +        int a0, a1, a2, a3, b0, b1, b2, b3; /* a* collect the even-index terms, b* the odd-index terms */
 137.287 +
 137.288 +        /* XXX: I did that only to give same values as previous code */
 137.289 +        a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4)); /* DC term; the rounding constant is folded in before multiplying by W4 */
 137.290 +        a1 = a0;
 137.291 +        a2 = a0;
 137.292 +        a3 = a0;
 137.293 +
 137.294 +        a0 +=  + W2*col[8*2]; /* col[8*k] is coefficient k of this column (row stride is 8) */
 137.295 +        a1 +=  + W6*col[8*2];
 137.296 +        a2 +=  - W6*col[8*2];
 137.297 +        a3 +=  - W2*col[8*2];
 137.298 +
 137.299 +        b0 = MUL16(W1, col[8*1]);
 137.300 +        b1 = MUL16(W3, col[8*1]);
 137.301 +        b2 = MUL16(W5, col[8*1]);
 137.302 +        b3 = MUL16(W7, col[8*1]);
 137.303 +
 137.304 +        MAC16(b0, + W3, col[8*3]);
 137.305 +        MAC16(b1, - W7, col[8*3]);
 137.306 +        MAC16(b2, - W1, col[8*3]);
 137.307 +        MAC16(b3, - W5, col[8*3]);
 137.308 +
 137.309 +        if(col[8*4]){ /* coefficients 4..7 are each skipped when zero */
 137.310 +            a0 += + W4*col[8*4];
 137.311 +            a1 += - W4*col[8*4];
 137.312 +            a2 += - W4*col[8*4];
 137.313 +            a3 += + W4*col[8*4];
 137.314 +        }
 137.315 +
 137.316 +        if (col[8*5]) {
 137.317 +            MAC16(b0, + W5, col[8*5]);
 137.318 +            MAC16(b1, - W1, col[8*5]);
 137.319 +            MAC16(b2, + W7, col[8*5]);
 137.320 +            MAC16(b3, + W3, col[8*5]);
 137.321 +        }
 137.322 +
 137.323 +        if(col[8*6]){
 137.324 +            a0 += + W6*col[8*6];
 137.325 +            a1 += - W2*col[8*6];
 137.326 +            a2 += + W2*col[8*6];
 137.327 +            a3 += - W6*col[8*6];
 137.328 +        }
 137.329 +
 137.330 +        if (col[8*7]) {
 137.331 +            MAC16(b0, + W7, col[8*7]);
 137.332 +            MAC16(b1, - W5, col[8*7]);
 137.333 +            MAC16(b2, + W3, col[8*7]);
 137.334 +            MAC16(b3, - W1, col[8*7]);
 137.335 +        }
 137.336 +
 137.337 +        col[0 ] = ((a0 + b0) >> COL_SHIFT); /* indices 0,8,...,56 are rows 0..7 of this column */
 137.338 +        col[8 ] = ((a1 + b1) >> COL_SHIFT);
 137.339 +        col[16] = ((a2 + b2) >> COL_SHIFT);
 137.340 +        col[24] = ((a3 + b3) >> COL_SHIFT);
 137.341 +        col[32] = ((a3 - b3) >> COL_SHIFT);
 137.342 +        col[40] = ((a2 - b2) >> COL_SHIFT);
 137.343 +        col[48] = ((a1 - b1) >> COL_SHIFT);
 137.344 +        col[56] = ((a0 - b0) >> COL_SHIFT);
 137.345 +}
 137.346 +
 137.347 +void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
 137.348 +{
 137.349 +    int n;
 137.350 +    /* Row pass: transform each 8-coefficient row of the block in place. */
 137.351 +    for (n = 0; n < 64; n += 8)
 137.352 +        idctRowCondDC(&block[n]);
 137.353 +    /* Column pass: transform each column and overwrite the matching dest column. */
 137.354 +    for (n = 0; n < 8; n++)
 137.355 +        idctSparseColPut(&dest[n], line_size, &block[n]);
 137.356 +}
 137.356 +
 137.357 +void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
 137.358 +{
 137.359 +    int n;
 137.360 +    /* Row pass: transform each 8-coefficient row of the block in place. */
 137.361 +    for (n = 0; n < 64; n += 8)
 137.362 +        idctRowCondDC(&block[n]);
 137.363 +    /* Column pass: transform each column and add it onto the matching dest column. */
 137.364 +    for (n = 0; n < 8; n++)
 137.365 +        idctSparseColAdd(&dest[n], line_size, &block[n]);
 137.366 +}
 137.366 +
 137.367 +void ff_simple_idct(DCTELEM *block)
 137.368 +{
 137.369 +    int n;
 137.370 +    /* Row pass: transform each 8-coefficient row of the block in place. */
 137.371 +    for (n = 0; n < 64; n += 8)
 137.372 +        idctRowCondDC(&block[n]);
 137.373 +    /* Column pass: transform each column, results staying in the block. */
 137.374 +    for (n = 0; n < 8; n++)
 137.375 +        idctSparseCol(&block[n]);
 137.376 +}

   138.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   138.2 +++ b/ffmpeg_smp/h264dec/libavcodec/simple_idct.h	Mon Aug 27 12:09:56 2012 +0200
   138.3 @@ -0,0 +1,47 @@
   138.4 +/*
   138.5 + * Simple IDCT
   138.6 + *
   138.7 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
   138.8 + *
   138.9 + * This file is part of FFmpeg.
  138.10 + *
  138.11 + * FFmpeg is free software; you can redistribute it and/or
  138.12 + * modify it under the terms of the GNU Lesser General Public
  138.13 + * License as published by the Free Software Foundation; either
  138.14 + * version 2.1 of the License, or (at your option) any later version.
  138.15 + *
  138.16 + * FFmpeg is distributed in the hope that it will be useful,
  138.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  138.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  138.19 + * Lesser General Public License for more details.
  138.20 + *
  138.21 + * You should have received a copy of the GNU Lesser General Public
  138.22 + * License along with FFmpeg; if not, write to the Free Software
  138.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  138.24 + */
  138.25 +
  138.26 +/**
  138.27 + * @file
  138.28 + * simple idct header.
  138.29 + */
  138.30 +
  138.31 +#ifndef AVCODEC_SIMPLE_IDCT_H
  138.32 +#define AVCODEC_SIMPLE_IDCT_H
  138.33 +
  138.34 +#include <stdint.h>
  138.35 +#include "dsputil.h"
  138.36 +
  138.37 +void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block); /* 8x8 IDCT of block; result overwrites dest (row stride line_size) */
  138.38 +void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block); /* 8x8 IDCT of block; result is added to dest (row stride line_size) */
  138.39 +void ff_simple_idct_mmx(int16_t *block); /* this and the two *_mmx prototypes below are defined outside simple_idct.c; presumably MMX counterparts */
  138.40 +void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
  138.41 +void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
  138.42 +void ff_simple_idct(DCTELEM *block); /* 8x8 IDCT of block, in place */
  138.43 +
  138.44 +void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block); /* NOTE(review): no definition in the simple_idct.c of this changeset; confirm it exists elsewhere */
  138.45 +
  138.46 +void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block); /* NOTE(review): same for the 84/48/44 variants */
  138.47 +void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block);
  138.48 +void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block);
  138.49 +
  138.50 +#endif /* AVCODEC_SIMPLE_IDCT_H */

   139.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   139.2 +++ b/ffmpeg_smp/h264dec/libavcodec/utils.c	Mon Aug 27 12:09:56 2012 +0200
   139.3 @@ -0,0 +1,68 @@
   139.4 +/*
   139.5 + * utils for libavcodec
   139.6 + * Copyright (c) 2001 Fabrice Bellard
   139.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   139.8 + *
   139.9 + * This file is part of FFmpeg.
  139.10 + *
  139.11 + * FFmpeg is free software; you can redistribute it and/or
  139.12 + * modify it under the terms of the GNU Lesser General Public
  139.13 + * License as published by the Free Software Foundation; either
  139.14 + * version 2.1 of the License, or (at your option) any later version.
  139.15 + *
  139.16 + * FFmpeg is distributed in the hope that it will be useful,
  139.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  139.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  139.19 + * Lesser General Public License for more details.
  139.20 + *
  139.21 + * You should have received a copy of the GNU Lesser General Public
  139.22 + * License along with FFmpeg; if not, write to the Free Software
  139.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  139.24 + */
  139.25 +
  139.26 +/**
  139.27 + * @file
  139.28 + * utils.
  139.29 + */
  139.30 +
  139.31 +/* needed for mkstemp() */
  139.32 +#define _XOPEN_SOURCE 600
  139.33 +
  139.34 +#include "avcodec.h"
  139.35 +#include "dsputil.h"
  139.36 +
  139.37 +#include <stdlib.h>
  139.38 +#include <stdarg.h>
  139.39 +#include <limits.h>
  139.40 +#include <float.h>
  139.41 +//#undef NDEBUG
  139.42 +#include <assert.h>
  139.43 +
  139.44 +#include <fcntl.h>
  139.45 +
  139.46 +void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size)
  139.47 +{
  139.48 +    /* Grow-only realloc: *size is the current capacity, min_size the capacity needed. */
  139.49 +    void *grown;
  139.50 +    if (*size > min_size)
  139.51 +        return ptr; /* already large enough, nothing to do */
  139.52 +    /* Request min_size plus 1/16 plus 32 bytes, but never less than min_size itself. */
  139.53 +    *size = FFMAX(17*min_size/16 + 32, min_size);
  139.54 +    grown = av_realloc(ptr, *size);
  139.55 +    if (grown == NULL)
  139.56 +        *size = 0; /* safer than keeping min_size if the caller lost ptr and now uses NULL */
  139.57 +    return grown;
  139.58 +}
  139.59 +
  139.60 +void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size)
  139.61 +{
  139.62 +    void **slot = ptr; /* ptr is the address of the caller's buffer pointer */
  139.63 +    if (*size > min_size)
  139.64 +        return; /* current buffer is big enough and is left untouched */
  139.65 +    *size = FFMAX(17*min_size/16 + 32, min_size);
  139.66 +    av_free(*slot); /* the old buffer is released, not copied */
  139.67 +    *slot = av_malloc(*size);
  139.68 +    if (*slot == NULL) *size = 0;
  139.69 +}
  139.70 +
  139.71 +

   140.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   140.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/cpuid.c	Mon Aug 27 12:09:56 2012 +0200
   140.3 @@ -0,0 +1,135 @@
   140.4 +/*
   140.5 + * CPU detection code, extracted from mmx.h
   140.6 + * (c)1997-99 by H. Dietz and R. Fisher
   140.7 + * Converted to C and improved by Fabrice Bellard.
   140.8 + *
   140.9 + * This file is part of FFmpeg.
  140.10 + *
  140.11 + * FFmpeg is free software; you can redistribute it and/or
  140.12 + * modify it under the terms of the GNU Lesser General Public
  140.13 + * License as published by the Free Software Foundation; either
  140.14 + * version 2.1 of the License, or (at your option) any later version.
  140.15 + *
  140.16 + * FFmpeg is distributed in the hope that it will be useful,
  140.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  140.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  140.19 + * Lesser General Public License for more details.
  140.20 + *
  140.21 + * You should have received a copy of the GNU Lesser General Public
  140.22 + * License along with FFmpeg; if not, write to the Free Software
  140.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  140.24 + */
  140.25 +
  140.26 +#include <stdlib.h>
  140.27 +#include "libavutil/x86_cpu.h"
  140.28 +#include "libavcodec/dsputil.h"
  140.29 +
  140.30 +#undef printf
  140.31 +
  140.32 +/* cpuid(index, eax, ebx, ecx, edx): execute CPUID with index in eax and store the four result registers in the given lvalues. ebx saving is necessary for PIC and gcc seems unable to see it alone, so REG_b is copied to REG_S before CPUID and exchanged back afterwards; the ebx result is therefore delivered through REG_S ("=S"). */
  140.33 +#define cpuid(index,eax,ebx,ecx,edx)\
  140.34 +    __asm__ volatile\
  140.35 +        ("mov %%"REG_b", %%"REG_S"\n\t"\
  140.36 +         "cpuid\n\t"\
  140.37 +         "xchg %%"REG_b", %%"REG_S\
  140.38 +         : "=a" (eax), "=S" (ebx),\
  140.39 +           "=c" (ecx), "=d" (edx)\
  140.40 +         : "0" (index));
  140.41 +
  140.42 +/* Detect the SIMD extensions of the running x86 CPU with CPUID. Returns a bitmask of FF_MM_* flags; 0 when none are reported or, on 32-bit x86, when CPUID itself is unavailable. */
  140.43 +int mm_support(void)
  140.44 +{
  140.45 +    int rval = 0; /* accumulated FF_MM_* flags */
  140.46 +    int eax, ebx, ecx, edx;
  140.47 +    int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
  140.48 +
  140.49 +#if ARCH_X86_32
  140.50 +    x86_reg a, c;
  140.51 +    __asm__ volatile (
  140.52 +        /* See if CPUID instruction is supported ... */
  140.53 +        /* ... Get copies of EFLAGS into eax and ecx */
  140.54 +        "pushfl\n\t"
  140.55 +        "pop %0\n\t"
  140.56 +        "mov %0, %1\n\t"
  140.57 +
  140.58 +        /* ... Toggle the ID bit in one copy and store */
  140.59 +        /*     to the EFLAGS reg */
  140.60 +        "xor $0x200000, %0\n\t"
  140.61 +        "push %0\n\t"
  140.62 +        "popfl\n\t"
  140.63 +
  140.64 +        /* ... Get the (hopefully modified) EFLAGS */
  140.65 +        "pushfl\n\t"
  140.66 +        "pop %0\n\t"
  140.67 +        : "=a" (a), "=c" (c)
  140.68 +        :
  140.69 +        : "cc"
  140.70 +        );
  140.71 +
  140.72 +    if (a == c) /* the ID bit could not be toggled */
  140.73 +        return 0; /* CPUID not supported */
  140.74 +#endif
  140.75 +
  140.76 +    cpuid(0, max_std_level, ebx, ecx, edx); /* leaf 0: eax receives the highest standard leaf */
  140.77 +
  140.78 +    if(max_std_level >= 1){
  140.79 +        cpuid(1, eax, ebx, ecx, std_caps); /* leaf 1: edx goes to std_caps, ecx carries the newer feature bits */
  140.80 +        if (std_caps & (1<<23))
  140.81 +            rval |= FF_MM_MMX;
  140.82 +        if (std_caps & (1<<25))
  140.83 +            rval |= FF_MM_MMX2 /* this statement is finished by "| FF_MM_SSE;" under HAVE_SSE, otherwise by the lone ';' after #endif */
  140.84 +#if HAVE_SSE
  140.85 +                  | FF_MM_SSE;
  140.86 +        if (std_caps & (1<<26))
  140.87 +            rval |= FF_MM_SSE2;
  140.88 +        if (ecx & 1)
  140.89 +            rval |= FF_MM_SSE3;
  140.90 +        if (ecx & 0x00000200 )
  140.91 +            rval |= FF_MM_SSSE3;
  140.92 +        if (ecx & 0x00080000 )
  140.93 +            rval |= FF_MM_SSE4;
  140.94 +        if (ecx & 0x00100000 )
  140.95 +            rval |= FF_MM_SSE42;
  140.96 +#endif
  140.97 +                  ; /* terminates the MMX2 statement without HAVE_SSE; an empty statement with it */
  140.98 +    }
  140.99 +
 140.100 +    cpuid(0x80000000, max_ext_level, ebx, ecx, edx); /* eax receives the highest extended leaf */
 140.101 +
 140.102 +    if(max_ext_level >= 0x80000001){ /* 0x80000001 is an unsigned constant, so the comparison is done in unsigned arithmetic */
 140.103 +        cpuid(0x80000001, eax, ebx, ecx, ext_caps); /* extended leaf: edx goes to ext_caps */
 140.104 +        if (ext_caps & (1U<<31)) /* 1U: shifting a signed 1 into bit 31 is undefined behaviour */
 140.105 +            rval |= FF_MM_3DNOW;
 140.106 +        if (ext_caps & (1<<30))
 140.107 +            rval |= FF_MM_3DNOWEXT;
 140.108 +        if (ext_caps & (1<<23))
 140.109 +            rval |= FF_MM_MMX;
 140.110 +        if (ext_caps & (1<<22))
 140.111 +            rval |= FF_MM_MMX2;
 140.112 +    }
 140.113 +
 140.114 +#if 0
 140.115 +    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
 140.116 +        (rval&FF_MM_MMX) ? "MMX ":"",
 140.117 +        (rval&FF_MM_MMX2) ? "MMX2 ":"",
 140.118 +        (rval&FF_MM_SSE) ? "SSE ":"",
 140.119 +        (rval&FF_MM_SSE2) ? "SSE2 ":"",
 140.120 +        (rval&FF_MM_SSE3) ? "SSE3 ":"",
 140.121 +        (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
 140.122 +        (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
 140.123 +        (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
 140.124 +        (rval&FF_MM_3DNOW) ? "3DNow ":"",
 140.125 +        (rval&FF_MM_3DNOWEXT) ? "3DNowExt ":"");
 140.126 +#endif
 140.127 +    return rval;
 140.128 +}
 140.129 +
 140.130 +#ifdef TEST
 140.131 +int main(void)
 140.132 +{
 140.133 +    /* Stand-alone check: print the flag mask that mm_support() reports. */
 140.134 +    const int detected = mm_support();
 140.135 +    printf("mm_support = 0x%08X\n", detected);
 140.136 +    return 0;
 140.137 +}
 140.138 +#endif

   141.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   141.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_mmx.c	Mon Aug 27 12:09:56 2012 +0200
   141.3 @@ -0,0 +1,304 @@
   141.4 +/*
   141.5 + * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
   141.6 + *                    Loren Merritt
   141.7 + *
   141.8 + * This file is part of FFmpeg.
   141.9 + *
  141.10 + * FFmpeg is free software; you can redistribute it and/or
  141.11 + * modify it under the terms of the GNU Lesser General Public
  141.12 + * License as published by the Free Software Foundation; either
  141.13 + * version 2.1 of the License, or (at your option) any later version.
  141.14 + *
  141.15 + * FFmpeg is distributed in the hope that it will be useful,
  141.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  141.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  141.18 + * Lesser General Public License for more details.
  141.19 + *
  141.20 + * You should have received a copy of the GNU Lesser General Public
  141.21 + * License along with FFmpeg; if not, write to the Free Software
  141.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  141.23 + */
  141.24 +
  141.25 +/**
  141.26 + * MMX optimized version of (put|avg)_h264_chroma_mc8.
  141.27 + * H264_CHROMA_MC8_TMPL must be defined to the desired function name
  141.28 + * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
  141.29 + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  141.30 + */
  141.31 +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
  141.32 +{
  141.33 +    DECLARE_ALIGNED(8, uint64_t, AA);
  141.34 +    DECLARE_ALIGNED(8, uint64_t, DD);
  141.35 +    int i;
  141.36 +
  141.37 +    if(y==0 && x==0) {
  141.38 +        /* no filter needed */
  141.39 +        H264_CHROMA_MC8_MV0(dst, src, stride, h);
  141.40 +        return;
  141.41 +    }
  141.42 +
  141.43 +    assert(x<8 && y<8 && x>=0 && y>=0);
  141.44 +
  141.45 +    if(y==0 || x==0)
  141.46 +    {
  141.47 +        /* 1 dimensional filter only */
  141.48 +        const int dxy = x ? 1 : stride;
  141.49 +
  141.50 +        __asm__ volatile(
  141.51 +            "movd %0, %%mm5\n\t"
  141.52 +            "movq %1, %%mm4\n\t"
  141.53 +            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
  141.54 +            "punpcklwd %%mm5, %%mm5\n\t"
  141.55 +            "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
  141.56 +            "pxor %%mm7, %%mm7\n\t"
  141.57 +            "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
  141.58 +            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
  141.59 +
  141.60 +        for(i=0; i<h; i++) {
  141.61 +            __asm__ volatile(
  141.62 +                /* mm0 = src[0..7], mm2 = src[dxy..dxy+7] */
  141.63 +                "movq %0, %%mm0\n\t"
  141.64 +                "movq %1, %%mm2\n\t"
  141.65 +                :: "m"(src[0]), "m"(src[dxy]));
  141.66 +
  141.67 +            __asm__ volatile(
  141.68 +                /* [mm0,mm1] = A * src[0..7] */
  141.69 +                /* [mm2,mm3] = B * src[1..8] */
  141.70 +                "movq %%mm0, %%mm1\n\t"
  141.71 +                "movq %%mm2, %%mm3\n\t"
  141.72 +                "punpcklbw %%mm7, %%mm0\n\t"
  141.73 +                "punpckhbw %%mm7, %%mm1\n\t"
  141.74 +                "punpcklbw %%mm7, %%mm2\n\t"
  141.75 +                "punpckhbw %%mm7, %%mm3\n\t"
  141.76 +                "pmullw %%mm4, %%mm0\n\t"
  141.77 +                "pmullw %%mm4, %%mm1\n\t"
  141.78 +                "pmullw %%mm5, %%mm2\n\t"
  141.79 +                "pmullw %%mm5, %%mm3\n\t"
  141.80 +
  141.81 +                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
  141.82 +                "paddw %%mm6, %%mm0\n\t"
  141.83 +                "paddw %%mm6, %%mm1\n\t"
  141.84 +                "paddw %%mm2, %%mm0\n\t"
  141.85 +                "paddw %%mm3, %%mm1\n\t"
  141.86 +                "psrlw $3, %%mm0\n\t"
  141.87 +                "psrlw $3, %%mm1\n\t"
  141.88 +                "packuswb %%mm1, %%mm0\n\t"
  141.89 +                H264_CHROMA_OP(%0, %%mm0)
  141.90 +                "movq %%mm0, %0\n\t"
  141.91 +                : "=m" (dst[0]));
  141.92 +
  141.93 +            src += stride;
  141.94 +            dst += stride;
  141.95 +        }
  141.96 +        return;
  141.97 +    }
  141.98 +
  141.99 +    /* general case, bilinear */
 141.100 +    __asm__ volatile("movd %2, %%mm4\n\t"
 141.101 +                 "movd %3, %%mm6\n\t"
 141.102 +                 "punpcklwd %%mm4, %%mm4\n\t"
 141.103 +                 "punpcklwd %%mm6, %%mm6\n\t"
 141.104 +                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
 141.105 +                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
 141.106 +                 "movq %%mm4, %%mm5\n\t"
 141.107 +                 "pmullw %%mm6, %%mm4\n\t"    /* mm4 = x * y */
 141.108 +                 "psllw $3, %%mm5\n\t"
 141.109 +                 "psllw $3, %%mm6\n\t"
 141.110 +                 "movq %%mm5, %%mm7\n\t"
 141.111 +                 "paddw %%mm6, %%mm7\n\t"
 141.112 +                 "movq %%mm4, %1\n\t"         /* DD = x * y */
 141.113 +                 "psubw %%mm4, %%mm5\n\t"     /* mm5 = B = 8x - xy */
 141.114 +                 "psubw %%mm4, %%mm6\n\t"     /* mm6 = C = 8y - xy */
 141.115 +                 "paddw %4, %%mm4\n\t"
 141.116 +                 "psubw %%mm7, %%mm4\n\t"     /* mm4 = A = xy - (8x+8y) + 64 */
 141.117 +                 "pxor %%mm7, %%mm7\n\t"
 141.118 +                 "movq %%mm4, %0\n\t"
 141.119 +                 : "=m" (AA), "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
 141.120 +
 141.121 +    __asm__ volatile(
 141.122 +        /* mm0 = src[0..7], mm1 = src[1..8] */
 141.123 +        "movq %0, %%mm0\n\t"
 141.124 +        "movq %1, %%mm1\n\t"
 141.125 +        : : "m" (src[0]), "m" (src[1]));
 141.126 +
 141.127 +    for(i=0; i<h; i++) {
 141.128 +        src += stride;
 141.129 +
 141.130 +        __asm__ volatile(
 141.131 +            /* mm2 = A * src[0..3] + B * src[1..4] */
 141.132 +            /* mm3 = A * src[4..7] + B * src[5..8] */
 141.133 +            "movq %%mm0, %%mm2\n\t"
 141.134 +            "movq %%mm1, %%mm3\n\t"
 141.135 +            "punpckhbw %%mm7, %%mm0\n\t"
 141.136 +            "punpcklbw %%mm7, %%mm1\n\t"
 141.137 +            "punpcklbw %%mm7, %%mm2\n\t"
 141.138 +            "punpckhbw %%mm7, %%mm3\n\t"
 141.139 +            "pmullw %0, %%mm0\n\t"
 141.140 +            "pmullw %0, %%mm2\n\t"
 141.141 +            "pmullw %%mm5, %%mm1\n\t"
 141.142 +            "pmullw %%mm5, %%mm3\n\t"
 141.143 +            "paddw %%mm1, %%mm2\n\t"
 141.144 +            "paddw %%mm0, %%mm3\n\t"
 141.145 +            : : "m" (AA));
 141.146 +
 141.147 +        __asm__ volatile(
 141.148 +            /* [mm2,mm3] += C * src[0..7] */
 141.149 +            "movq %0, %%mm0\n\t"
 141.150 +            "movq %%mm0, %%mm1\n\t"
 141.151 +            "punpcklbw %%mm7, %%mm0\n\t"
 141.152 +            "punpckhbw %%mm7, %%mm1\n\t"
 141.153 +            "pmullw %%mm6, %%mm0\n\t"
 141.154 +            "pmullw %%mm6, %%mm1\n\t"
 141.155 +            "paddw %%mm0, %%mm2\n\t"
 141.156 +            "paddw %%mm1, %%mm3\n\t"
 141.157 +            : : "m" (src[0]));
 141.158 +
 141.159 +        __asm__ volatile(
 141.160 +            /* [mm2,mm3] += D * src[1..8] */
 141.161 +            "movq %1, %%mm1\n\t"
 141.162 +            "movq %%mm1, %%mm0\n\t"
 141.163 +            "movq %%mm1, %%mm4\n\t"
 141.164 +            "punpcklbw %%mm7, %%mm0\n\t"
 141.165 +            "punpckhbw %%mm7, %%mm4\n\t"
 141.166 +            "pmullw %2, %%mm0\n\t"
 141.167 +            "pmullw %2, %%mm4\n\t"
 141.168 +            "paddw %%mm0, %%mm2\n\t"
 141.169 +            "paddw %%mm4, %%mm3\n\t"
 141.170 +            "movq %0, %%mm0\n\t"
 141.171 +            : : "m" (src[0]), "m" (src[1]), "m" (DD));
 141.172 +
 141.173 +        __asm__ volatile(
 141.174 +            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
 141.175 +            "paddw %1, %%mm2\n\t"
 141.176 +            "paddw %1, %%mm3\n\t"
 141.177 +            "psrlw $6, %%mm2\n\t"
 141.178 +            "psrlw $6, %%mm3\n\t"
 141.179 +            "packuswb %%mm3, %%mm2\n\t"
 141.180 +            H264_CHROMA_OP(%0, %%mm2)
 141.181 +            "movq %%mm2, %0\n\t"
 141.182 +            : "=m" (dst[0]) : "m" (*rnd_reg));
 141.183 +        dst+= stride;
 141.184 +    }
 141.185 +}
 141.186 +
 141.187 +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
 141.188 +{
 141.189 +    __asm__ volatile(
 141.190 +        "pxor   %%mm7, %%mm7        \n\t"
 141.191 +        "movd %5, %%mm2             \n\t"
 141.192 +        "movd %6, %%mm3             \n\t"
 141.193 +        "movq "MANGLE(ff_pw_8)", %%mm4\n\t"
 141.194 +        "movq "MANGLE(ff_pw_8)", %%mm5\n\t"
 141.195 +        "punpcklwd %%mm2, %%mm2     \n\t"
 141.196 +        "punpcklwd %%mm3, %%mm3     \n\t"
 141.197 +        "punpcklwd %%mm2, %%mm2     \n\t"
 141.198 +        "punpcklwd %%mm3, %%mm3     \n\t"
 141.199 +        "psubw %%mm2, %%mm4         \n\t"
 141.200 +        "psubw %%mm3, %%mm5         \n\t"
 141.201 +
 141.202 +        "movd  (%1), %%mm0          \n\t"
 141.203 +        "movd 1(%1), %%mm6          \n\t"
 141.204 +        "add %3, %1                 \n\t"
 141.205 +        "punpcklbw %%mm7, %%mm0     \n\t"
 141.206 +        "punpcklbw %%mm7, %%mm6     \n\t"
 141.207 +        "pmullw %%mm4, %%mm0        \n\t"
 141.208 +        "pmullw %%mm2, %%mm6        \n\t"
 141.209 +        "paddw %%mm0, %%mm6         \n\t"
 141.210 +
 141.211 +        "1:                         \n\t"
 141.212 +        "movd  (%1), %%mm0          \n\t"
 141.213 +        "movd 1(%1), %%mm1          \n\t"
 141.214 +        "add %3, %1                 \n\t"
 141.215 +        "punpcklbw %%mm7, %%mm0     \n\t"
 141.216 +        "punpcklbw %%mm7, %%mm1     \n\t"
 141.217 +        "pmullw %%mm4, %%mm0        \n\t"
 141.218 +        "pmullw %%mm2, %%mm1        \n\t"
 141.219 +        "paddw %%mm0, %%mm1         \n\t"
 141.220 +        "movq %%mm1, %%mm0          \n\t"
 141.221 +        "pmullw %%mm5, %%mm6        \n\t"
 141.222 +        "pmullw %%mm3, %%mm1        \n\t"
 141.223 +        "paddw %4, %%mm6            \n\t"
 141.224 +        "paddw %%mm6, %%mm1         \n\t"
 141.225 +        "psrlw $6, %%mm1            \n\t"
 141.226 +        "packuswb %%mm1, %%mm1      \n\t"
 141.227 +        H264_CHROMA_OP4((%0), %%mm1, %%mm6)
 141.228 +        "movd %%mm1, (%0)           \n\t"
 141.229 +        "add %3, %0                 \n\t"
 141.230 +        "movd  (%1), %%mm6          \n\t"
 141.231 +        "movd 1(%1), %%mm1          \n\t"
 141.232 +        "add %3, %1                 \n\t"
 141.233 +        "punpcklbw %%mm7, %%mm6     \n\t"
 141.234 +        "punpcklbw %%mm7, %%mm1     \n\t"
 141.235 +        "pmullw %%mm4, %%mm6        \n\t"
 141.236 +        "pmullw %%mm2, %%mm1        \n\t"
 141.237 +        "paddw %%mm6, %%mm1         \n\t"
 141.238 +        "movq %%mm1, %%mm6          \n\t"
 141.239 +        "pmullw %%mm5, %%mm0        \n\t"
 141.240 +        "pmullw %%mm3, %%mm1        \n\t"
 141.241 +        "paddw %4, %%mm0            \n\t"
 141.242 +        "paddw %%mm0, %%mm1         \n\t"
 141.243 +        "psrlw $6, %%mm1            \n\t"
 141.244 +        "packuswb %%mm1, %%mm1      \n\t"
 141.245 +        H264_CHROMA_OP4((%0), %%mm1, %%mm0)
 141.246 +        "movd %%mm1, (%0)           \n\t"
 141.247 +        "add %3, %0                 \n\t"
 141.248 +        "sub $2, %2                 \n\t"
 141.249 +        "jnz 1b                     \n\t"
 141.250 +        : "+r"(dst), "+r"(src), "+r"(h)
 141.251 +        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
 141.252 +    );
 141.253 +}
 141.254 +
 141.255 +#ifdef H264_CHROMA_MC2_TMPL
 141.256 +static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 141.257 +{
 141.258 +    int tmp = ((1<<16)-1)*x + 8;
 141.259 +    int CD= tmp*y;
 141.260 +    int AB= (tmp<<3) - CD;
 141.261 +    __asm__ volatile(
 141.262 +        /* mm5 = {A,B,A,B} */
 141.263 +        /* mm6 = {C,D,C,D} */
 141.264 +        "movd %0, %%mm5\n\t"
 141.265 +        "movd %1, %%mm6\n\t"
 141.266 +        "punpckldq %%mm5, %%mm5\n\t"
 141.267 +        "punpckldq %%mm6, %%mm6\n\t"
 141.268 +        "pxor %%mm7, %%mm7\n\t"
 141.269 +        /* mm2 = src[0,1,1,2] */
 141.270 +        "movd %2, %%mm2\n\t"
 141.271 +        "punpcklbw %%mm7, %%mm2\n\t"
 141.272 +        "pshufw $0x94, %%mm2, %%mm2\n\t"
 141.273 +        :: "r"(AB), "r"(CD), "m"(src[0]));
 141.274 +
 141.275 +
 141.276 +    __asm__ volatile(
 141.277 +        "1:\n\t"
 141.278 +        "add %4, %1\n\t"
 141.279 +        /* mm1 = A * src[0,1] + B * src[1,2] */
 141.280 +        "movq    %%mm2, %%mm1\n\t"
 141.281 +        "pmaddwd %%mm5, %%mm1\n\t"
 141.282 +        /* mm0 = src[0,1,1,2] */
 141.283 +        "movd (%1), %%mm0\n\t"
 141.284 +        "punpcklbw %%mm7, %%mm0\n\t"
 141.285 +        "pshufw $0x94, %%mm0, %%mm0\n\t"
 141.286 +        /* mm1 += C * src[0,1] + D * src[1,2] */
 141.287 +        "movq    %%mm0, %%mm2\n\t"
 141.288 +        "pmaddwd %%mm6, %%mm0\n\t"
 141.289 +        "paddw      %3, %%mm1\n\t"
 141.290 +        "paddw   %%mm0, %%mm1\n\t"
 141.291 +        /* dst[0,1] = pack((mm1 + 32) >> 6) */
 141.292 +        "psrlw $6, %%mm1\n\t"
 141.293 +        "packssdw %%mm7, %%mm1\n\t"
 141.294 +        "packuswb %%mm7, %%mm1\n\t"
 141.295 +        H264_CHROMA_OP4((%0), %%mm1, %%mm3)
 141.296 +        "movd %%mm1, %%esi\n\t"
 141.297 +        "movw %%si, (%0)\n\t"
 141.298 +        "add %4, %0\n\t"
 141.299 +        "sub $1, %2\n\t"
 141.300 +        "jnz 1b\n\t"
 141.301 +        : "+r" (dst), "+r"(src), "+r"(h)
 141.302 +        : "m" (ff_pw_32), "r"((x86_reg)stride)
 141.303 +        : "%esi");
 141.304 +
 141.305 +}
 141.306 +#endif
 141.307 +

   142.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   142.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_h264_template_ssse3.c	Mon Aug 27 12:09:56 2012 +0200
   142.3 @@ -0,0 +1,208 @@
   142.4 +/*
   142.5 + * Copyright (c) 2008 Loren Merritt
   142.6 + *
   142.7 + * This file is part of FFmpeg.
   142.8 + *
   142.9 + * FFmpeg is free software; you can redistribute it and/or
  142.10 + * modify it under the terms of the GNU Lesser General Public
  142.11 + * License as published by the Free Software Foundation; either
  142.12 + * version 2.1 of the License, or (at your option) any later version.
  142.13 + *
  142.14 + * FFmpeg is distributed in the hope that it will be useful,
  142.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  142.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  142.17 + * Lesser General Public License for more details.
  142.18 + *
  142.19 + * You should have received a copy of the GNU Lesser General Public
  142.20 + * License along with FFmpeg; if not, write to the Free Software
  142.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  142.22 + */
  142.23 +
  142.24 +/**
  142.25 + * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
  142.26 + * H264_CHROMA_MC8_TMPL must be defined to the desired function name
  142.27 + * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
  142.28 + * AVG_OP must be defined to empty for put and the identity for avg
  142.29 + */
  142.30 +static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
  142.31 +{
  142.32 +    if(y==0 && x==0) {
  142.33 +        /* no filter needed */
  142.34 +        H264_CHROMA_MC8_MV0(dst, src, stride, h);
  142.35 +        return;
  142.36 +    }
  142.37 +
  142.38 +    assert(x<8 && y<8 && x>=0 && y>=0);
  142.39 +
  142.40 +    if(y==0 || x==0)
  142.41 +    {
  142.42 +        /* 1 dimensional filter only */
  142.43 +        __asm__ volatile(
  142.44 +            "movd %0, %%xmm7 \n\t"
  142.45 +            "movq %1, %%xmm6 \n\t"
  142.46 +            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
  142.47 +            "movlhps %%xmm6, %%xmm6 \n\t"
  142.48 +            "movlhps %%xmm7, %%xmm7 \n\t"
  142.49 +            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
  142.50 +        );
  142.51 +
  142.52 +        if(x) {
  142.53 +            __asm__ volatile(
  142.54 +                "1: \n\t"
  142.55 +                "movq (%1), %%xmm0 \n\t"
  142.56 +                "movq 1(%1), %%xmm1 \n\t"
  142.57 +                "movq (%1,%3), %%xmm2 \n\t"
  142.58 +                "movq 1(%1,%3), %%xmm3 \n\t"
  142.59 +                "punpcklbw %%xmm1, %%xmm0 \n\t"
  142.60 +                "punpcklbw %%xmm3, %%xmm2 \n\t"
  142.61 +                "pmaddubsw %%xmm7, %%xmm0 \n\t"
  142.62 +                "pmaddubsw %%xmm7, %%xmm2 \n\t"
  142.63 +         AVG_OP("movq (%0), %%xmm4 \n\t")
  142.64 +         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
  142.65 +                "paddw %%xmm6, %%xmm0 \n\t"
  142.66 +                "paddw %%xmm6, %%xmm2 \n\t"
  142.67 +                "psrlw $3, %%xmm0 \n\t"
  142.68 +                "psrlw $3, %%xmm2 \n\t"
  142.69 +                "packuswb %%xmm2, %%xmm0 \n\t"
  142.70 +         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
  142.71 +                "movq %%xmm0, (%0) \n\t"
  142.72 +                "movhps %%xmm0, (%0,%3) \n\t"
  142.73 +                "sub $2, %2 \n\t"
  142.74 +                "lea (%1,%3,2), %1 \n\t"
  142.75 +                "lea (%0,%3,2), %0 \n\t"
  142.76 +                "jg 1b \n\t"
  142.77 +                :"+r"(dst), "+r"(src), "+r"(h)
  142.78 +                :"r"((x86_reg)stride)
  142.79 +            );
  142.80 +        } else {
  142.81 +            __asm__ volatile(
  142.82 +                "1: \n\t"
  142.83 +                "movq (%1), %%xmm0 \n\t"
  142.84 +                "movq (%1,%3), %%xmm1 \n\t"
  142.85 +                "movdqa %%xmm1, %%xmm2 \n\t"
  142.86 +                "movq (%1,%3,2), %%xmm3 \n\t"
  142.87 +                "punpcklbw %%xmm1, %%xmm0 \n\t"
  142.88 +                "punpcklbw %%xmm3, %%xmm2 \n\t"
  142.89 +                "pmaddubsw %%xmm7, %%xmm0 \n\t"
  142.90 +                "pmaddubsw %%xmm7, %%xmm2 \n\t"
  142.91 +         AVG_OP("movq (%0), %%xmm4 \n\t")
  142.92 +         AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
  142.93 +                "paddw %%xmm6, %%xmm0 \n\t"
  142.94 +                "paddw %%xmm6, %%xmm2 \n\t"
  142.95 +                "psrlw $3, %%xmm0 \n\t"
  142.96 +                "psrlw $3, %%xmm2 \n\t"
  142.97 +                "packuswb %%xmm2, %%xmm0 \n\t"
  142.98 +         AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
  142.99 +                "movq %%xmm0, (%0) \n\t"
 142.100 +                "movhps %%xmm0, (%0,%3) \n\t"
 142.101 +                "sub $2, %2 \n\t"
 142.102 +                "lea (%1,%3,2), %1 \n\t"
 142.103 +                "lea (%0,%3,2), %0 \n\t"
 142.104 +                "jg 1b \n\t"
 142.105 +                :"+r"(dst), "+r"(src), "+r"(h)
 142.106 +                :"r"((x86_reg)stride)
 142.107 +            );
 142.108 +        }
 142.109 +        return;
 142.110 +    }
 142.111 +
 142.112 +    /* general case, bilinear */
 142.113 +    __asm__ volatile(
 142.114 +        "movd %0, %%xmm7 \n\t"
 142.115 +        "movd %1, %%xmm6 \n\t"
 142.116 +        "movdqa %2, %%xmm5 \n\t"
 142.117 +        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
 142.118 +        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
 142.119 +        "movlhps %%xmm7, %%xmm7 \n\t"
 142.120 +        "movlhps %%xmm6, %%xmm6 \n\t"
 142.121 +        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
 142.122 +    );
 142.123 +
 142.124 +    __asm__ volatile(
 142.125 +        "movq (%1), %%xmm0 \n\t"
 142.126 +        "movq 1(%1), %%xmm1 \n\t"
 142.127 +        "punpcklbw %%xmm1, %%xmm0 \n\t"
 142.128 +        "add %3, %1 \n\t"
 142.129 +        "1: \n\t"
 142.130 +        "movq (%1), %%xmm1 \n\t"
 142.131 +        "movq 1(%1), %%xmm2 \n\t"
 142.132 +        "movq (%1,%3), %%xmm3 \n\t"
 142.133 +        "movq 1(%1,%3), %%xmm4 \n\t"
 142.134 +        "lea (%1,%3,2), %1 \n\t"
 142.135 +        "punpcklbw %%xmm2, %%xmm1 \n\t"
 142.136 +        "punpcklbw %%xmm4, %%xmm3 \n\t"
 142.137 +        "movdqa %%xmm1, %%xmm2 \n\t"
 142.138 +        "movdqa %%xmm3, %%xmm4 \n\t"
 142.139 +        "pmaddubsw %%xmm7, %%xmm0 \n\t"
 142.140 +        "pmaddubsw %%xmm6, %%xmm1 \n\t"
 142.141 +        "pmaddubsw %%xmm7, %%xmm2 \n\t"
 142.142 +        "pmaddubsw %%xmm6, %%xmm3 \n\t"
 142.143 +        "paddw %%xmm5, %%xmm0 \n\t"
 142.144 +        "paddw %%xmm5, %%xmm2 \n\t"
 142.145 +        "paddw %%xmm0, %%xmm1 \n\t"
 142.146 +        "paddw %%xmm2, %%xmm3 \n\t"
 142.147 +        "movdqa %%xmm4, %%xmm0 \n\t"
 142.148 +        "psrlw $6, %%xmm1 \n\t"
 142.149 +        "psrlw $6, %%xmm3 \n\t"
 142.150 + AVG_OP("movq (%0), %%xmm2 \n\t")
 142.151 + AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
 142.152 +        "packuswb %%xmm3, %%xmm1 \n\t"
 142.153 + AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
 142.154 +        "movq %%xmm1, (%0)\n\t"
 142.155 +        "movhps %%xmm1, (%0,%3)\n\t"
 142.156 +        "sub $2, %2 \n\t"
 142.157 +        "lea (%0,%3,2), %0 \n\t"
 142.158 +        "jg 1b \n\t"
 142.159 +        :"+r"(dst), "+r"(src), "+r"(h)
 142.160 +        :"r"((x86_reg)stride)
 142.161 +    );
 142.162 +}
 142.163 +
 142.164 +static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
 142.165 +{
 142.166 +    __asm__ volatile(
 142.167 +        "movd %0, %%mm7 \n\t"
 142.168 +        "movd %1, %%mm6 \n\t"
 142.169 +        "movq %2, %%mm5 \n\t"
 142.170 +        "pshufw $0, %%mm7, %%mm7 \n\t"
 142.171 +        "pshufw $0, %%mm6, %%mm6 \n\t"
 142.172 +        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
 142.173 +    );
 142.174 +
 142.175 +    __asm__ volatile(
 142.176 +        "movd (%1), %%mm0 \n\t"
 142.177 +        "punpcklbw 1(%1), %%mm0 \n\t"
 142.178 +        "add %3, %1 \n\t"
 142.179 +        "1: \n\t"
 142.180 +        "movd (%1), %%mm1 \n\t"
 142.181 +        "movd (%1,%3), %%mm3 \n\t"
 142.182 +        "punpcklbw 1(%1), %%mm1 \n\t"
 142.183 +        "punpcklbw 1(%1,%3), %%mm3 \n\t"
 142.184 +        "lea (%1,%3,2), %1 \n\t"
 142.185 +        "movq %%mm1, %%mm2 \n\t"
 142.186 +        "movq %%mm3, %%mm4 \n\t"
 142.187 +        "pmaddubsw %%mm7, %%mm0 \n\t"
 142.188 +        "pmaddubsw %%mm6, %%mm1 \n\t"
 142.189 +        "pmaddubsw %%mm7, %%mm2 \n\t"
 142.190 +        "pmaddubsw %%mm6, %%mm3 \n\t"
 142.191 +        "paddw %%mm5, %%mm0 \n\t"
 142.192 +        "paddw %%mm5, %%mm2 \n\t"
 142.193 +        "paddw %%mm0, %%mm1 \n\t"
 142.194 +        "paddw %%mm2, %%mm3 \n\t"
 142.195 +        "movq %%mm4, %%mm0 \n\t"
 142.196 +        "psrlw $6, %%mm1 \n\t"
 142.197 +        "psrlw $6, %%mm3 \n\t"
 142.198 +        "packuswb %%mm1, %%mm1 \n\t"
 142.199 +        "packuswb %%mm3, %%mm3 \n\t"
 142.200 + AVG_OP("pavgb (%0), %%mm1 \n\t")
 142.201 + AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
 142.202 +        "movd %%mm1, (%0)\n\t"
 142.203 +        "movd %%mm3, (%0,%3)\n\t"
 142.204 +        "sub $2, %2 \n\t"
 142.205 +        "lea (%0,%3,2), %0 \n\t"
 142.206 +        "jg 1b \n\t"
 142.207 +        :"+r"(dst), "+r"(src), "+r"(h)
 142.208 +        :"r"((x86_reg)stride)
 142.209 +    );
 142.210 +}
 142.211 +

   143.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   143.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.c	Mon Aug 27 12:09:56 2012 +0200
   143.3 @@ -0,0 +1,821 @@
   143.4 +/*
   143.5 + * MMX optimized DSP utils
   143.6 + * Copyright (c) 2000, 2001 Fabrice Bellard
   143.7 + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   143.8 + *
   143.9 + * This file is part of FFmpeg.
  143.10 + *
  143.11 + * FFmpeg is free software; you can redistribute it and/or
  143.12 + * modify it under the terms of the GNU Lesser General Public
  143.13 + * License as published by the Free Software Foundation; either
  143.14 + * version 2.1 of the License, or (at your option) any later version.
  143.15 + *
  143.16 + * FFmpeg is distributed in the hope that it will be useful,
  143.17 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  143.18 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  143.19 + * Lesser General Public License for more details.
  143.20 + *
  143.21 + * You should have received a copy of the GNU Lesser General Public
  143.22 + * License along with FFmpeg; if not, write to the Free Software
  143.23 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  143.24 + *
  143.25 + * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  143.26 + */
  143.27 +
  143.28 +#include "libavutil/x86_cpu.h"
  143.29 +#include "libavutil/internal.h"
  143.30 +#include "libavcodec/dsputil.h"
  143.31 +#include "libavcodec/h264_dsp.h"
  143.32 +#include "dsputil_mmx.h"
  143.33 +
  143.34 +
  143.35 +//#undef NDEBUG
  143.36 +//#include <assert.h>
  143.37 +
  143.38 +int mm_flags; /* multimedia extension flags */
  143.39 +
  143.40 +/* pixel operations */
  143.41 +DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
  143.42 +DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
  143.43 +
  143.44 +DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
  143.45 +{0x8000000080000000ULL, 0x8000000080000000ULL};
  143.46 +
  143.47 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
  143.48 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
  143.49 +DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
  143.50 +DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
  143.51 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
  143.52 +DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
  143.53 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
  143.54 +DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
  143.55 +DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
  143.56 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
  143.57 +DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
  143.58 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
  143.59 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
  143.60 +DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
  143.61 +
  143.62 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
  143.63 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
  143.64 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
  143.65 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
  143.66 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
  143.67 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
  143.68 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
  143.69 +DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
  143.70 +
  143.71 +DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
  143.72 +DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
  143.73 +
  143.74 +#define ASMALIGN(ZEROBITS) ".align 1 << " #ZEROBITS "\n\t"
  143.75 +#define JUMPALIGN() __asm__ volatile (ASMALIGN(3)::)
  143.76 +#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
  143.77 +
  143.78 +#define MOVQ_BFE(regd) \
  143.79 +    __asm__ volatile ( \
  143.80 +    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
  143.81 +    "paddb %%" #regd ", %%" #regd " \n\t" ::)
  143.82 +
  143.83 +#ifndef PIC
  143.84 +#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
  143.85 +#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
  143.86 +#else
  143.87 +// for a shared library (PIC) it is better to generate these constants in-register than to load them from memory
  143.88 +// pcmpeqd of a register with itself sets every bit (-1)
  143.89 +#define MOVQ_BONE(regd) \
  143.90 +    __asm__ volatile ( \
  143.91 +    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  143.92 +    "psrlw $15, %%" #regd " \n\t" \
  143.93 +    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
  143.94 +
  143.95 +#define MOVQ_WTWO(regd) \
  143.96 +    __asm__ volatile ( \
  143.97 +    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  143.98 +    "psrlw $15, %%" #regd " \n\t" \
  143.99 +    "psllw $1, %%" #regd " \n\t"::)
 143.100 +
 143.101 +#endif
 143.102 +
 143.103 +// using regr as temporary and for the output result
 143.104 +// first argument is unmodified and second is trashed
 143.105 +// regfe is supposed to contain 0xfefefefefefefefe
 143.106 +#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
 143.107 +    "movq " #rega ", " #regr "  \n\t"\
 143.108 +    "pand " #regb ", " #regr "  \n\t"\
 143.109 +    "pxor " #rega ", " #regb "  \n\t"\
 143.110 +    "pand " #regfe "," #regb "  \n\t"\
 143.111 +    "psrlq $1, " #regb "        \n\t"\
 143.112 +    "paddb " #regb ", " #regr " \n\t"
 143.113 +
 143.114 +#define PAVGB_MMX(rega, regb, regr, regfe) \
 143.115 +    "movq " #rega ", " #regr "  \n\t"\
 143.116 +    "por  " #regb ", " #regr "  \n\t"\
 143.117 +    "pxor " #rega ", " #regb "  \n\t"\
 143.118 +    "pand " #regfe "," #regb "  \n\t"\
 143.119 +    "psrlq $1, " #regb "        \n\t"\
 143.120 +    "psubb " #regb ", " #regr " \n\t"
 143.121 +
 143.122 +// mm6 is supposed to contain 0xfefefefefefefefe
 143.123 +#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
 143.124 +    "movq " #rega ", " #regr "  \n\t"\
 143.125 +    "movq " #regc ", " #regp "  \n\t"\
 143.126 +    "pand " #regb ", " #regr "  \n\t"\
 143.127 +    "pand " #regd ", " #regp "  \n\t"\
 143.128 +    "pxor " #rega ", " #regb "  \n\t"\
 143.129 +    "pxor " #regc ", " #regd "  \n\t"\
 143.130 +    "pand %%mm6, " #regb "      \n\t"\
 143.131 +    "pand %%mm6, " #regd "      \n\t"\
 143.132 +    "psrlq $1, " #regb "        \n\t"\
 143.133 +    "psrlq $1, " #regd "        \n\t"\
 143.134 +    "paddb " #regb ", " #regr " \n\t"\
 143.135 +    "paddb " #regd ", " #regp " \n\t"
 143.136 +
 143.137 +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
 143.138 +    "movq " #rega ", " #regr "  \n\t"\
 143.139 +    "movq " #regc ", " #regp "  \n\t"\
 143.140 +    "por  " #regb ", " #regr "  \n\t"\
 143.141 +    "por  " #regd ", " #regp "  \n\t"\
 143.142 +    "pxor " #rega ", " #regb "  \n\t"\
 143.143 +    "pxor " #regc ", " #regd "  \n\t"\
 143.144 +    "pand %%mm6, " #regb "      \n\t"\
 143.145 +    "pand %%mm6, " #regd "      \n\t"\
 143.146 +    "psrlq $1, " #regd "        \n\t"\
 143.147 +    "psrlq $1, " #regb "        \n\t"\
 143.148 +    "psubb " #regb ", " #regr " \n\t"\
 143.149 +    "psubb " #regd ", " #regp " \n\t"
 143.150 +
 143.151 +/***********************************/
 143.152 +/* MMX2 specific */
 143.153 +
 143.154 +#define DEF(x) x ## _mmx2
 143.155 +
 143.156 +/* Introduced only in MMX2 set */
 143.157 +#define PAVGB "pavgb"
 143.158 +#define OP_AVG PAVGB
 143.159 +
 143.160 +#include "dsputil_mmx_avg_template.c"
 143.161 +
 143.162 +#undef DEF
 143.163 +#undef PAVGB
 143.164 +#undef OP_AVG
 143.165 +
 143.166 +#define put_no_rnd_pixels16_mmx put_pixels16_mmx
 143.167 +#define put_no_rnd_pixels8_mmx put_pixels8_mmx
 143.168 +#define put_pixels16_mmx2 put_pixels16_mmx
 143.169 +#define put_pixels8_mmx2 put_pixels8_mmx
 143.170 +#define put_pixels4_mmx2 put_pixels4_mmx
 143.171 +#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
 143.172 +#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
 143.173 +#define put_pixels16_3dnow put_pixels16_mmx
 143.174 +#define put_pixels8_3dnow put_pixels8_mmx
 143.175 +#define put_pixels4_3dnow put_pixels4_mmx
 143.176 +#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
 143.177 +#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
 143.178 +
 143.179 +/***********************************/
 143.180 +/* standard MMX */
 143.181 +
 143.182 +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 143.183 +{
 143.184 +    const DCTELEM *p;
 143.185 +    uint8_t *pix;
 143.186 +
 143.187 +    /* read the pixels */
 143.188 +    p = block;
 143.189 +    pix = pixels;
 143.190 +    /* unrolled loop */
 143.191 +        __asm__ volatile(
 143.192 +                "movq   %3, %%mm0               \n\t"
 143.193 +                "movq   8%3, %%mm1              \n\t"
 143.194 +                "movq   16%3, %%mm2             \n\t"
 143.195 +                "movq   24%3, %%mm3             \n\t"
 143.196 +                "movq   32%3, %%mm4             \n\t"
 143.197 +                "movq   40%3, %%mm5             \n\t"
 143.198 +                "movq   48%3, %%mm6             \n\t"
 143.199 +                "movq   56%3, %%mm7             \n\t"
 143.200 +                "packuswb %%mm1, %%mm0          \n\t"
 143.201 +                "packuswb %%mm3, %%mm2          \n\t"
 143.202 +                "packuswb %%mm5, %%mm4          \n\t"
 143.203 +                "packuswb %%mm7, %%mm6          \n\t"
 143.204 +                "movq   %%mm0, (%0)             \n\t"
 143.205 +                "movq   %%mm2, (%0, %1)         \n\t"
 143.206 +                "movq   %%mm4, (%0, %1, 2)      \n\t"
 143.207 +                "movq   %%mm6, (%0, %2)         \n\t"
 143.208 +                ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
 143.209 +                :"memory");
 143.210 +        pix += line_size*4;
 143.211 +        p += 32;
 143.212 +
 143.213 +    // if an exact copy of the code above were placed here,
 143.214 +    // the compiler would generate some very strange code,
 143.215 +    // hence the "r" constraint on the block pointer here instead of "m"
 143.216 +    __asm__ volatile(
 143.217 +            "movq       (%3), %%mm0             \n\t"
 143.218 +            "movq       8(%3), %%mm1            \n\t"
 143.219 +            "movq       16(%3), %%mm2           \n\t"
 143.220 +            "movq       24(%3), %%mm3           \n\t"
 143.221 +            "movq       32(%3), %%mm4           \n\t"
 143.222 +            "movq       40(%3), %%mm5           \n\t"
 143.223 +            "movq       48(%3), %%mm6           \n\t"
 143.224 +            "movq       56(%3), %%mm7           \n\t"
 143.225 +            "packuswb %%mm1, %%mm0              \n\t"
 143.226 +            "packuswb %%mm3, %%mm2              \n\t"
 143.227 +            "packuswb %%mm5, %%mm4              \n\t"
 143.228 +            "packuswb %%mm7, %%mm6              \n\t"
 143.229 +            "movq       %%mm0, (%0)             \n\t"
 143.230 +            "movq       %%mm2, (%0, %1)         \n\t"
 143.231 +            "movq       %%mm4, (%0, %1, 2)      \n\t"
 143.232 +            "movq       %%mm6, (%0, %2)         \n\t"
 143.233 +            ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
 143.234 +            :"memory");
 143.235 +}
 143.236 +/* Eight 0x80 bytes; put_signed_pixels_clamped_mmx loads this into mm0 and adds it to every packed byte. First macro argument is presumably the alignment - confirm against DECLARE_ASM_CONST. */
 143.237 +DECLARE_ASM_CONST(8, uint8_t, ff_vector128)[8] =
 143.238 +  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
 143.239 +/* Asm text for 4 output rows: reads 64 bytes at byte offset off from %2, packs words to signed-saturated bytes, adds mm0 bytewise, stores at %0. As used below: %0 dst, %1 3 * stride, %2 src, %3 stride. */
 143.240 +#define put_signed_pixels_clamped_mmx_half(off) \
 143.241 +            "movq    "#off"(%2), %%mm1          \n\t" /* first 4 words of each 16-byte input row */ \
 143.242 +            "movq 16+"#off"(%2), %%mm2          \n\t"\
 143.243 +            "movq 32+"#off"(%2), %%mm3          \n\t"\
 143.244 +            "movq 48+"#off"(%2), %%mm4          \n\t"\
 143.245 +            "packsswb  8+"#off"(%2), %%mm1      \n\t" /* pack together with the next 4 words, read straight from memory */ \
 143.246 +            "packsswb 24+"#off"(%2), %%mm2      \n\t"\
 143.247 +            "packsswb 40+"#off"(%2), %%mm3      \n\t"\
 143.248 +            "packsswb 56+"#off"(%2), %%mm4      \n\t"\
 143.249 +            "paddb %%mm0, %%mm1                 \n\t" /* wrapping byte add of the value held in mm0 */ \
 143.250 +            "paddb %%mm0, %%mm2                 \n\t"\
 143.251 +            "paddb %%mm0, %%mm3                 \n\t"\
 143.252 +            "paddb %%mm0, %%mm4                 \n\t"\
 143.253 +            "movq %%mm1, (%0)                   \n\t"\
 143.254 +            "movq %%mm2, (%0, %3)               \n\t"\
 143.255 +            "movq %%mm3, (%0, %3, 2)            \n\t"\
 143.256 +            "movq %%mm4, (%0, %1)               \n\t"
 143.257 +/* Stores 8 rows of 8 bytes at pixels: each word from block is saturated to a signed byte (packsswb) and then has 0x80 added bytewise (paddb, wrapping). */
 143.258 +void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 143.259 +{
 143.260 +    x86_reg line_skip = line_size;
 143.261 +    x86_reg line_skip3;
 143.262 +
 143.263 +    __asm__ volatile (
 143.264 +            "movq "MANGLE(ff_vector128)", %%mm0 \n\t" /* mm0 = eight 0x80 bytes */
 143.265 +            "lea (%3, %3, 2), %1                \n\t" /* line_skip3 = 3 * line_skip */
 143.266 +            put_signed_pixels_clamped_mmx_half(0) /* rows 0-3, input byte offset 0 */
 143.267 +            "lea (%0, %3, 4), %0                \n\t" /* pixels += 4 * line_skip */
 143.268 +            put_signed_pixels_clamped_mmx_half(64) /* rows 4-7, input byte offset 64 */
 143.269 +            :"+&r" (pixels), "=&r" (line_skip3) /* early-clobber: both are written while the inputs are still needed */
 143.270 +            :"r" (block), "r"(line_skip)
 143.271 +            :"memory");
 143.272 +}
 143.273 +/* Adds the 16-bit words at block to the bytes already at pixels (paddsw), saturates the sums to 0..255 (packuswb) and writes them back: 4 passes of 2 rows of 8 bytes. */
 143.274 +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 143.275 +{
 143.276 +    const DCTELEM *p;
 143.277 +    uint8_t *pix;
 143.278 +    int i;
 143.279 +
 143.280 +    /* working cursors: p walks the input words, pix walks the pixel rows */
 143.281 +    p = block;
 143.282 +    pix = pixels;
 143.283 +    MOVQ_ZERO(mm7); /* presumably clears mm7, which the unpack instructions below interleave with the pixel bytes - confirm macro */
 143.284 +    i = 4;
 143.285 +    do {
 143.286 +        __asm__ volatile(
 143.287 +                "movq   (%2), %%mm0     \n\t" /* 32 input bytes: words for two rows */
 143.288 +                "movq   8(%2), %%mm1    \n\t"
 143.289 +                "movq   16(%2), %%mm2   \n\t"
 143.290 +                "movq   24(%2), %%mm3   \n\t"
 143.291 +                "movq   %0, %%mm4       \n\t" /* 8 existing bytes of the first row */
 143.292 +                "movq   %1, %%mm6       \n\t" /* 8 existing bytes of the second row */
 143.293 +                "movq   %%mm4, %%mm5    \n\t"
 143.294 +                "punpcklbw %%mm7, %%mm4 \n\t" /* low 4 bytes -> words, interleaved with mm7 */
 143.295 +                "punpckhbw %%mm7, %%mm5 \n\t" /* high 4 bytes -> words */
 143.296 +                "paddsw %%mm4, %%mm0    \n\t" /* signed saturating word add */
 143.297 +                "paddsw %%mm5, %%mm1    \n\t"
 143.298 +                "movq   %%mm6, %%mm5    \n\t"
 143.299 +                "punpcklbw %%mm7, %%mm6 \n\t"
 143.300 +                "punpckhbw %%mm7, %%mm5 \n\t"
 143.301 +                "paddsw %%mm6, %%mm2    \n\t"
 143.302 +                "paddsw %%mm5, %%mm3    \n\t"
 143.303 +                "packuswb %%mm1, %%mm0  \n\t" /* words -> bytes, saturated to 0..255 */
 143.304 +                "packuswb %%mm3, %%mm2  \n\t"
 143.305 +                "movq   %%mm0, %0       \n\t"
 143.306 +                "movq   %%mm2, %1       \n\t"
 143.307 +                :"+m"(*pix), "+m"(*(pix+line_size))
 143.308 +                :"r"(p)
 143.309 +                :"memory");
 143.310 +        pix += line_size*2; /* two rows handled per pass */
 143.311 +        p += 16; /* the asm read 32 bytes; 16 elements matches that only if DCTELEM is 2 bytes wide - assumed, confirm */
 143.312 +    } while (--i);
 143.313 +}
 143.314 +/* Copies h rows of 8 bytes from pixels to block, rows line_size bytes apart in both; 4 rows per pass and the loop exits only when h reaches exactly 0, so h must be a positive multiple of 4. */
 143.315 +static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 143.316 +{
 143.317 +    __asm__ volatile(
 143.318 +         "lea (%3, %3), %%"REG_a"       \n\t" /* REG_a = 2 * line_size */
 143.319 +         ASMALIGN(3)
 143.320 +         "1:                            \n\t"
 143.321 +         "movq (%1), %%mm0              \n\t"
 143.322 +         "movq (%1, %3), %%mm1          \n\t"
 143.323 +         "movq %%mm0, (%2)              \n\t"
 143.324 +         "movq %%mm1, (%2, %3)          \n\t"
 143.325 +         "add %%"REG_a", %1             \n\t" /* both pointers advance two rows */
 143.326 +         "add %%"REG_a", %2             \n\t"
 143.327 +         "movq (%1), %%mm0              \n\t"
 143.328 +         "movq (%1, %3), %%mm1          \n\t"
 143.329 +         "movq %%mm0, (%2)              \n\t"
 143.330 +         "movq %%mm1, (%2, %3)          \n\t"
 143.331 +         "add %%"REG_a", %1             \n\t"
 143.332 +         "add %%"REG_a", %2             \n\t"
 143.333 +         "subl $4, %0                   \n\t" /* h -= 4; the zero flag drives the jnz */
 143.334 +         "jnz 1b                        \n\t"
 143.335 +         : "+g"(h), "+r" (pixels),  "+r" (block)
 143.336 +         : "r"((x86_reg)line_size)
 143.337 +         : "%"REG_a, "memory"
 143.338 +        );
 143.339 +}
 143.340 +/* Copies h rows of 16 bytes from pixels (movdqu, any alignment) to block (movdqa, so every destination row must be 16-byte aligned); 4 rows per pass, so h must be a positive multiple of 4. */
 143.341 +static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 143.342 +{
 143.343 +    __asm__ volatile(
 143.344 +         "1:                            \n\t"
 143.345 +         "movdqu (%1), %%xmm0           \n\t"
 143.346 +         "movdqu (%1,%3), %%xmm1        \n\t"
 143.347 +         "movdqu (%1,%3,2), %%xmm2      \n\t"
 143.348 +         "movdqu (%1,%4), %%xmm3        \n\t"
 143.349 +         "movdqa %%xmm0, (%2)           \n\t"
 143.350 +         "movdqa %%xmm1, (%2,%3)        \n\t"
 143.351 +         "movdqa %%xmm2, (%2,%3,2)      \n\t"
 143.352 +         "movdqa %%xmm3, (%2,%4)        \n\t"
 143.353 +         "subl $4, %0                   \n\t" /* sets the zero flag for the jnz; the two lea in between do not touch flags */
 143.354 +         "lea (%1,%3,4), %1             \n\t" /* pixels += 4 * line_size */
 143.355 +         "lea (%2,%3,4), %2             \n\t" /* block  += 4 * line_size */
 143.356 +         "jnz 1b                        \n\t"
 143.357 +         : "+g"(h), "+r" (pixels),  "+r" (block)
 143.358 +         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) /* %3 = stride, %4 = 3 * stride */
 143.359 +         : "memory"
 143.360 +        );
 143.361 +}
 143.362 +/* Replaces h rows of 16 bytes at block with the pavgb (rounded-up per-byte) average of block and pixels; block is touched only by aligned instructions, so its rows must be 16-byte aligned; h must be a positive multiple of 4. */
 143.363 +static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 143.364 +{
 143.365 +    __asm__ volatile(
 143.366 +         "1:                            \n\t"
 143.367 +         "movdqu (%1), %%xmm0           \n\t" /* source rows: unaligned loads */
 143.368 +         "movdqu (%1,%3), %%xmm1        \n\t"
 143.369 +         "movdqu (%1,%3,2), %%xmm2      \n\t"
 143.370 +         "movdqu (%1,%4), %%xmm3        \n\t"
 143.371 +         "pavgb  (%2), %%xmm0           \n\t" /* average with the bytes currently in block */
 143.372 +         "pavgb  (%2,%3), %%xmm1        \n\t"
 143.373 +         "pavgb  (%2,%3,2), %%xmm2      \n\t"
 143.374 +         "pavgb  (%2,%4), %%xmm3        \n\t"
 143.375 +         "movdqa %%xmm0, (%2)           \n\t"
 143.376 +         "movdqa %%xmm1, (%2,%3)        \n\t"
 143.377 +         "movdqa %%xmm2, (%2,%3,2)      \n\t"
 143.378 +         "movdqa %%xmm3, (%2,%4)        \n\t"
 143.379 +         "subl $4, %0                   \n\t" /* sets the zero flag for the jnz; the two lea in between do not touch flags */
 143.380 +         "lea (%1,%3,4), %1             \n\t"
 143.381 +         "lea (%2,%3,4), %2             \n\t"
 143.382 +         "jnz 1b                        \n\t"
 143.383 +         : "+g"(h), "+r" (pixels),  "+r" (block)
 143.384 +         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) /* %3 = stride, %4 = 3 * stride */
 143.385 +         : "memory"
 143.386 +        );
 143.387 +}
 143.388 +/* Zeroes 128 bytes at block with eight 16-byte movaps stores; movaps requires block to be 16-byte aligned. */
 143.389 +static void clear_block_sse(DCTELEM *block)
 143.390 +{
 143.391 +    __asm__ volatile(
 143.392 +        "xorps  %%xmm0, %%xmm0  \n" /* xmm0 = 0 */
 143.393 +        "movaps %%xmm0,    (%0) \n"
 143.394 +        "movaps %%xmm0,  16(%0) \n"
 143.395 +        "movaps %%xmm0,  32(%0) \n"
 143.396 +        "movaps %%xmm0,  48(%0) \n"
 143.397 +        "movaps %%xmm0,  64(%0) \n"
 143.398 +        "movaps %%xmm0,  80(%0) \n"
 143.399 +        "movaps %%xmm0,  96(%0) \n"
 143.400 +        "movaps %%xmm0, 112(%0) \n"
 143.401 +        :: "r"(block)
 143.402 +        : "memory" /* the stores go through a register pointer, so the compiler must be told memory changes */
 143.403 +    );
 143.404 +}
 143.405 +/* Zeroes 768 bytes (6 x 128) starting at blocks using 16-byte movaps stores; movaps requires blocks to be 16-byte aligned. */
 143.406 +static void clear_blocks_sse(DCTELEM *blocks)
 143.407 +{
 143.408 +    __asm__ volatile(
 143.409 +        "xorps  %%xmm0, %%xmm0  \n"
 143.410 +        "mov     %1, %%"REG_a"  \n" /* index starts at -768 and counts up towards 0 */
 143.411 +        "1:                     \n"
 143.412 +        "movaps %%xmm0,    (%0, %%"REG_a") \n"
 143.413 +        "movaps %%xmm0,  16(%0, %%"REG_a") \n"
 143.414 +        "movaps %%xmm0,  32(%0, %%"REG_a") \n"
 143.415 +        "movaps %%xmm0,  48(%0, %%"REG_a") \n"
 143.416 +        "movaps %%xmm0,  64(%0, %%"REG_a") \n"
 143.417 +        "movaps %%xmm0,  80(%0, %%"REG_a") \n"
 143.418 +        "movaps %%xmm0,  96(%0, %%"REG_a") \n"
 143.419 +        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
 143.420 +        "add $128, %%"REG_a"    \n"
 143.421 +        " js 1b                 \n" /* keep looping while the index is still negative */
 143.422 +        : : "r" (((uint8_t *)blocks)+128*6), /* base is one past the end, so base + index walks forward */
 143.423 +            "i" (-128*6)
 143.424 +        : "%"REG_a, "memory" /* the asm writes through %0; without this clobber the compiler may assume blocks[] is unchanged */
 143.425 +    );
 143.426 +}
 143.427 +/* Transposes a 4x4 byte block: row i written to dst holds column i of src. Rows are dst_stride / src_stride bytes apart and each row is accessed as one 32-bit word. */
 143.428 +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
 143.429 +    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
 143.430 +        "movd  %4, %%mm0                \n\t" /* mm0..mm3 = source rows 0..3 */
 143.431 +        "movd  %5, %%mm1                \n\t"
 143.432 +        "movd  %6, %%mm2                \n\t"
 143.433 +        "movd  %7, %%mm3                \n\t"
 143.434 +        "punpcklbw %%mm1, %%mm0         \n\t" /* interleave bytes of rows 0 and 1 */
 143.435 +        "punpcklbw %%mm3, %%mm2         \n\t" /* interleave bytes of rows 2 and 3 */
 143.436 +        "movq %%mm0, %%mm1              \n\t"
 143.437 +        "punpcklwd %%mm2, %%mm0         \n\t" /* mm0 = source columns 0 and 1 */
 143.438 +        "punpckhwd %%mm2, %%mm1         \n\t" /* mm1 = source columns 2 and 3 */
 143.439 +        "movd  %%mm0, %0                \n\t"
 143.440 +        "punpckhdq %%mm0, %%mm0         \n\t" /* bring the upper dword down for the next store */
 143.441 +        "movd  %%mm0, %1                \n\t"
 143.442 +        "movd  %%mm1, %2                \n\t"
 143.443 +        "punpckhdq %%mm1, %%mm1         \n\t"
 143.444 +        "movd  %%mm1, %3                \n\t"
 143.445 +/* operands %0-%3: the four destination rows; %4-%7: the four source rows */
 143.446 +        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
 143.447 +          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
 143.448 +          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
 143.449 +          "=m" (*(uint32_t*)(dst + 3*dst_stride))
 143.450 +        :  "m" (*(uint32_t*)(src + 0*src_stride)),
 143.451 +           "m" (*(uint32_t*)(src + 1*src_stride)),
 143.452 +           "m" (*(uint32_t*)(src + 2*src_stride)),
 143.453 +           "m" (*(uint32_t*)(src + 3*src_stride))
 143.454 +    );
 143.455 +}
 143.456 +/* QPEL_OP generates 16 qpel8_mcXY and 16 qpel16_mcXY wrappers for one OPNAME/MMX pair, built from the lowpass and pixelsN_l2 helpers named in the body. ROUNDER and OP are not referenced in the body. */
 143.457 +#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
 143.458 +/* 8x8 variants; the two digits after "mc" are presumably the horizontal and vertical quarter-sample offsets - confirm */\
 143.459 +static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
 143.460 +    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
 143.461 +}\
 143.462 +/* mc10 / mc30: horizontal lowpass into a 64-byte temporary, then pixels8_l2 with src or src+1 */\
 143.463 +static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.464 +    uint64_t temp[8];\
 143.465 +    uint8_t * const half= (uint8_t*)temp;\
 143.466 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
 143.467 +    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
 143.468 +}\
 143.469 +\
 143.470 +static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.471 +    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
 143.472 +}\
 143.473 +\
 143.474 +static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.475 +    uint64_t temp[8];\
 143.476 +    uint8_t * const half= (uint8_t*)temp;\
 143.477 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
 143.478 +    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
 143.479 +}\
 143.480 +/* mc01 / mc03: same pattern with the vertical lowpass and src or src+stride */\
 143.481 +static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.482 +    uint64_t temp[8];\
 143.483 +    uint8_t * const half= (uint8_t*)temp;\
 143.484 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
 143.485 +    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
 143.486 +}\
 143.487 +\
 143.488 +static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.489 +    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
 143.490 +}\
 143.491 +\
 143.492 +static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.493 +    uint64_t temp[8];\
 143.494 +    uint8_t * const half= (uint8_t*)temp;\
 143.495 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
 143.496 +    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
 143.497 +}\
 143.498 +static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.499 +    uint64_t half[8 + 9];\
 143.500 +    uint8_t * const halfH= ((uint8_t*)half) + 64;\
 143.501 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.502 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.503 +    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
 143.504 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
 143.505 +    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
 143.506 +}\
 143.507 +static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.508 +    uint64_t half[8 + 9];\
 143.509 +    uint8_t * const halfH= ((uint8_t*)half) + 64;\
 143.510 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.511 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.512 +    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
 143.513 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
 143.514 +    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
 143.515 +}\
 143.516 +static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.517 +    uint64_t half[8 + 9];\
 143.518 +    uint8_t * const halfH= ((uint8_t*)half) + 64;\
 143.519 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.520 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.521 +    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
 143.522 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
 143.523 +    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
 143.524 +}\
 143.525 +static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.526 +    uint64_t half[8 + 9];\
 143.527 +    uint8_t * const halfH= ((uint8_t*)half) + 64;\
 143.528 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.529 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.530 +    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
 143.531 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
 143.532 +    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
 143.533 +}\
 143.534 +static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.535 +    uint64_t half[8 + 9];\
 143.536 +    uint8_t * const halfH= ((uint8_t*)half) + 64;\
 143.537 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.538 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.539 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
 143.540 +    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
 143.541 +}\
 143.542 +static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.543 +    uint64_t half[8 + 9];\
 143.544 +    uint8_t * const halfH= ((uint8_t*)half) + 64;\
 143.545 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.546 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.547 +    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
 143.548 +    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
 143.549 +}\
 143.550 +static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.551 +    uint64_t half[8 + 9];\
 143.552 +    uint8_t * const halfH= ((uint8_t*)half);\
 143.553 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.554 +    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
 143.555 +    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 143.556 +}\
 143.557 +static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.558 +    uint64_t half[8 + 9];\
 143.559 +    uint8_t * const halfH= ((uint8_t*)half);\
 143.560 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.561 +    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
 143.562 +    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 143.563 +}\
 143.564 +static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.565 +    uint64_t half[9];\
 143.566 +    uint8_t * const halfH= ((uint8_t*)half);\
 143.567 +    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
 143.568 +    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 143.569 +}\
 143.570 +static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
 143.571 +    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
 143.572 +}\
 143.573 +/* the 16x16 variants below follow the 8x8 ones, with 256-byte (uint64_t[32]) temporaries and 17-row intermediates */\
 143.574 +static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.575 +    uint64_t temp[32];\
 143.576 +    uint8_t * const half= (uint8_t*)temp;\
 143.577 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
 143.578 +    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
 143.579 +}\
 143.580 +\
 143.581 +static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.582 +    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
 143.583 +}\
 143.584 +\
 143.585 +static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.586 +    uint64_t temp[32];\
 143.587 +    uint8_t * const half= (uint8_t*)temp;\
 143.588 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
 143.589 +    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
 143.590 +}\
 143.591 +\
 143.592 +static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.593 +    uint64_t temp[32];\
 143.594 +    uint8_t * const half= (uint8_t*)temp;\
 143.595 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
 143.596 +    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
 143.597 +}\
 143.598 +\
 143.599 +static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.600 +    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
 143.601 +}\
 143.602 +\
 143.603 +static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.604 +    uint64_t temp[32];\
 143.605 +    uint8_t * const half= (uint8_t*)temp;\
 143.606 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
 143.607 +    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
 143.608 +}\
 143.609 +static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.610 +    uint64_t half[16*2 + 17*2];\
 143.611 +    uint8_t * const halfH= ((uint8_t*)half) + 256;\
 143.612 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.613 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.614 +    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
 143.615 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
 143.616 +    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
 143.617 +}\
 143.618 +static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.619 +    uint64_t half[16*2 + 17*2];\
 143.620 +    uint8_t * const halfH= ((uint8_t*)half) + 256;\
 143.621 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.622 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.623 +    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
 143.624 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
 143.625 +    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
 143.626 +}\
 143.627 +static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.628 +    uint64_t half[16*2 + 17*2];\
 143.629 +    uint8_t * const halfH= ((uint8_t*)half) + 256;\
 143.630 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.631 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.632 +    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
 143.633 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
 143.634 +    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
 143.635 +}\
 143.636 +static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.637 +    uint64_t half[16*2 + 17*2];\
 143.638 +    uint8_t * const halfH= ((uint8_t*)half) + 256;\
 143.639 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.640 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.641 +    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
 143.642 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
 143.643 +    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
 143.644 +}\
 143.645 +static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.646 +    uint64_t half[16*2 + 17*2];\
 143.647 +    uint8_t * const halfH= ((uint8_t*)half) + 256;\
 143.648 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.649 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.650 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
 143.651 +    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
 143.652 +}\
 143.653 +static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.654 +    uint64_t half[16*2 + 17*2];\
 143.655 +    uint8_t * const halfH= ((uint8_t*)half) + 256;\
 143.656 +    uint8_t * const halfHV= ((uint8_t*)half);\
 143.657 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.658 +    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
 143.659 +    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
 143.660 +}\
 143.661 +static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.662 +    uint64_t half[17*2];\
 143.663 +    uint8_t * const halfH= ((uint8_t*)half);\
 143.664 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.665 +    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
 143.666 +    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 143.667 +}\
 143.668 +static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.669 +    uint64_t half[17*2];\
 143.670 +    uint8_t * const halfH= ((uint8_t*)half);\
 143.671 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.672 +    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
 143.673 +    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 143.674 +}\
 143.675 +static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
 143.676 +    uint64_t half[17*2];\
 143.677 +    uint8_t * const halfH= ((uint8_t*)half);\
 143.678 +    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
 143.679 +    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 143.680 +}
 143.681 +/* Asm-text builders taking (a, b, temp, size): PUT_OP emits "mov<size> a, b"; the AVG variants load b into temp, average temp into a (pavgusb / pavgb), then store a to b. */
 143.682 +#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
 143.683 +#define AVG_3DNOW_OP(a,b,temp, size) \
 143.684 +"mov" #size " " #b ", " #temp "   \n\t"\
 143.685 +"pavgusb " #temp ", " #a "        \n\t"\
 143.686 +"mov" #size " " #a ", " #b "      \n\t"
 143.687 +#define AVG_MMX2_OP(a,b,temp, size) \
 143.688 +"mov" #size " " #b ", " #temp "   \n\t"\
 143.689 +"pavgb " #temp ", " #a "          \n\t"\
 143.690 +"mov" #size " " #a ", " #b "      \n\t"
 143.691 +/* PREFETCH(name, op) defines name(mem, stride, h), which issues instruction op on h addresses starting at mem and stride bytes apart. */
 143.692 +#define PREFETCH(name, op) \
 143.693 +static void name(void *mem, int stride, int h){\
 143.694 +    const uint8_t *p= mem;\
 143.695 +    do{\
 143.696 +        __asm__ volatile(#op" %0" :: "m"(*p));\
 143.697 +        p+= stride;\
 143.698 +    }while(--h); /* do-while with pre-decrement: the body runs at least once, so h must be >= 1 */\
 143.699 +}
 143.700 +PREFETCH(prefetch_mmx2,  prefetcht0)
 143.701 +#undef PREFETCH 
 143.702 +
 143.703 +#include "h264dsp_mmx.c"
 143.704 +/* Prototypes only: the two init functions below do not reference these deblocking routines; they are presumably defined in external assembly - confirm. */
 143.705 +void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 143.706 +void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
 143.707 +void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
 143.708 +void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
 143.709 +void ff_x264_deblock_h_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
 143.710 +/* Fills DSPContext c with the SIMD routines from this file that match the CPU flags returned by mm_support(). */
 143.711 +void dsputil_init_mmx(DSPContext* c)
 143.712 +{
 143.713 +/* Sets the [0] (qpel16) and [1] (qpel8) put/avg H.264 qpel table slots at index x+y*4 to the CPU-specific mcXY routines. */
 143.714 +#define H264_QPEL_FUNCS(x, y, CPU)\
 143.715 +            c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
 143.716 +            c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
 143.717 +            c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
 143.718 +            c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
 143.719 +
 143.720 +    mm_flags = mm_support();
 143.721 +
 143.722 +    /* Without MMX nothing at all is installed. */
 143.723 +    if (!(mm_flags & FF_MM_MMX))
 143.724 +        return;
 143.725 +
 143.726 +    /* NOTE(review): these three are SSE/MMX2-named routines gated only on the MMX flag - confirm this is intended. */
 143.727 +    c->clear_block  = clear_block_sse;
 143.728 +    c->clear_blocks = clear_blocks_sse;
 143.729 +    c->prefetch     = prefetch_mmx2;
 143.730 +
 143.731 +    if (mm_flags & FF_MM_SSE2) {
 143.732 +        c->put_pixels_tab[0][0] = put_pixels16_sse2;
 143.733 +        c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
 143.734 +
 143.735 +        H264_QPEL_FUNCS(0, 1, sse2);
 143.736 +        H264_QPEL_FUNCS(0, 2, sse2);
 143.737 +        H264_QPEL_FUNCS(0, 3, sse2);
 143.738 +        H264_QPEL_FUNCS(1, 1, sse2);
 143.739 +        H264_QPEL_FUNCS(1, 2, sse2);
 143.740 +        H264_QPEL_FUNCS(1, 3, sse2);
 143.741 +        H264_QPEL_FUNCS(2, 1, sse2);
 143.742 +        H264_QPEL_FUNCS(2, 2, sse2);
 143.743 +        H264_QPEL_FUNCS(2, 3, sse2);
 143.744 +        H264_QPEL_FUNCS(3, 1, sse2);
 143.745 +        H264_QPEL_FUNCS(3, 2, sse2);
 143.746 +        H264_QPEL_FUNCS(3, 3, sse2);
 143.747 +    }
 143.748 +#if HAVE_SSSE3
 143.749 +    /* Applied after the SSE2 entries, so the SSSE3 versions win wherever both exist. */
 143.750 +    if (mm_flags & FF_MM_SSSE3) {
 143.751 +        H264_QPEL_FUNCS(1, 0, ssse3);
 143.752 +        H264_QPEL_FUNCS(1, 1, ssse3);
 143.753 +        H264_QPEL_FUNCS(1, 2, ssse3);
 143.754 +        H264_QPEL_FUNCS(1, 3, ssse3);
 143.755 +        H264_QPEL_FUNCS(2, 0, ssse3);
 143.756 +        H264_QPEL_FUNCS(2, 1, ssse3);
 143.757 +        H264_QPEL_FUNCS(2, 2, ssse3);
 143.758 +        H264_QPEL_FUNCS(2, 3, ssse3);
 143.759 +        H264_QPEL_FUNCS(3, 0, ssse3);
 143.760 +        H264_QPEL_FUNCS(3, 1, ssse3);
 143.761 +        H264_QPEL_FUNCS(3, 2, ssse3);
 143.762 +        H264_QPEL_FUNCS(3, 3, ssse3);
 143.763 +
 143.764 +        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_ssse3_rnd;
 143.765 +        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_ssse3_rnd;
 143.766 +        c->put_h264_chroma_pixels_tab[1] = put_h264_chroma_mc4_ssse3;
 143.767 +        c->avg_h264_chroma_pixels_tab[1] = avg_h264_chroma_mc4_ssse3;
 143.768 +    }
 143.769 +#endif
 143.770 +}
 143.771 +/* Fills H264DSPContext c with the x86 IDCT, loop-filter and weighting routines permitted by the CPU flags from mm_support(). */
 143.772 +void ff_h264dsp_init_x86(H264DSPContext *c)
 143.773 +{
 143.774 +    mm_flags = mm_support();
 143.775 +
 143.776 +    if (!(mm_flags & FF_MM_MMX))
 143.777 +        return; /* nothing is installed without MMX */
 143.778 +
 143.779 +    /* MMX baseline: each *_dc_add slot gets the same routine as its *_add slot. */
 143.780 +    c->h264_idct_add     = ff_h264_idct_add_mmx;
 143.781 +    c->h264_idct_dc_add  = c->h264_idct_add;
 143.782 +    c->h264_idct8_add    = ff_h264_idct8_add_mmx;
 143.783 +    c->h264_idct8_dc_add = c->h264_idct8_add;
 143.784 +
 143.785 +    if (mm_flags & FF_MM_MMX2) {
 143.786 +        c->h264_idct_dc_add     = ff_h264_idct_dc_add_mmx2;
 143.787 +        c->h264_idct_add8       = ff_h264_idct_add8_mmx2;
 143.788 +        c->h264_idct_add16      = ff_h264_idct_add16_mmx2;
 143.789 +        c->h264_idct_add16intra = ff_h264_idct_add16intra_mmx2;
 143.790 +        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_mmx2;
 143.791 +        c->h264_idct8_add4      = ff_h264_idct8_add4_mmx2;
 143.792 +
 143.793 +        c->h264_v_loop_filter_luma         = h264_v_loop_filter_luma_mmx2;
 143.794 +        c->h264_h_loop_filter_luma         = h264_h_loop_filter_luma_mmx2;
 143.795 +        c->h264_v_loop_filter_chroma       = h264_v_loop_filter_chroma_mmx2;
 143.796 +        c->h264_h_loop_filter_chroma       = h264_h_loop_filter_chroma_mmx2;
 143.797 +        c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_mmx2;
 143.798 +        c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_mmx2;
 143.799 +        c->h264_loop_filter_strength       = h264_loop_filter_strength_mmx2;
 143.800 +
 143.801 +        c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_mmx2;
 143.802 +        c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_mmx2;
 143.803 +        c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_mmx2;
 143.804 +        c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_mmx2;
 143.805 +        c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_mmx2;
 143.806 +        c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_mmx2;
 143.807 +        c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_mmx2;
 143.808 +        c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_mmx2;
 143.809 +
 143.810 +        c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_mmx2;
 143.811 +        c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_mmx2;
 143.812 +        c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_mmx2;
 143.813 +        c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_mmx2;
 143.814 +        c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_mmx2;
 143.815 +        c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_mmx2;
 143.816 +        c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_mmx2;
 143.817 +        c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_mmx2;
 143.818 +    }
 143.819 +    if (mm_flags & FF_MM_SSE2) { /* runs last, so it overrides the MMX / MMX2 choices for these two slots */
 143.820 +        c->h264_idct8_add  = ff_h264_idct8_add_sse2;
 143.821 +        c->h264_idct8_add4 = ff_h264_idct8_add4_sse2;
 143.822 +    }
 143.823 +}
 143.824 +

   144.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   144.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx.h	Mon Aug 27 12:09:56 2012 +0200
   144.3 @@ -0,0 +1,170 @@
   144.4 +/*
   144.5 + * MMX optimized DSP utils
   144.6 + * Copyright (c) 2007  Aurelien Jacobs <aurel@gnuage.org>
   144.7 + *
   144.8 + * This file is part of FFmpeg.
   144.9 + *
  144.10 + * FFmpeg is free software; you can redistribute it and/or
  144.11 + * modify it under the terms of the GNU Lesser General Public
  144.12 + * License as published by the Free Software Foundation; either
  144.13 + * version 2.1 of the License, or (at your option) any later version.
  144.14 + *
  144.15 + * FFmpeg is distributed in the hope that it will be useful,
  144.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  144.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  144.18 + * Lesser General Public License for more details.
  144.19 + *
  144.20 + * You should have received a copy of the GNU Lesser General Public
  144.21 + * License along with FFmpeg; if not, write to the Free Software
  144.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  144.23 + */
  144.24 +
  144.25 +#ifndef AVCODEC_X86_DSPUTIL_MMX_H
  144.26 +#define AVCODEC_X86_DSPUTIL_MMX_H
  144.27 +
  144.28 +#include <stdint.h>
  144.29 +#include "libavcodec/dsputil.h"
  144.30 +/* 128-bit value expressed as two 64-bit halves; it is the declared type of the
         + * wider ff_pw_* constants below (presumably so they can be used as XMM-sized
         + * memory operands -- confirm against their definitions). */
  144.31 +typedef struct { uint64_t a, b; } xmm_reg;
  144.32 +/* Constant tables used by the x86 DSP code. They are only declared here; the
         + * definitions live in another translation unit.
         + * NOTE(review): the prefixes look like pb = packed bytes, pw = packed words,
         + * pd = packed doubles, with the suffix naming the repeated value -- confirm
         + * against the definitions before relying on that. */
  144.33 +extern const uint64_t ff_bone;
  144.34 +extern const uint64_t ff_wtwo;
  144.35 +
  144.36 +extern const uint64_t ff_pdw_80000000[2];
  144.37 +
  144.38 +extern const uint64_t ff_pw_3;
  144.39 +extern const uint64_t ff_pw_4;
  144.40 +extern const xmm_reg  ff_pw_5;
  144.41 +extern const xmm_reg  ff_pw_8;
  144.42 +extern const uint64_t ff_pw_15;
  144.43 +extern const xmm_reg  ff_pw_16;
  144.44 +extern const uint64_t ff_pw_20;
  144.45 +extern const xmm_reg  ff_pw_28;
  144.46 +extern const xmm_reg  ff_pw_32;
  144.47 +extern const uint64_t ff_pw_42;
  144.48 +extern const xmm_reg  ff_pw_64;
  144.49 +extern const uint64_t ff_pw_96;
  144.50 +extern const uint64_t ff_pw_128;
  144.51 +extern const uint64_t ff_pw_255;
  144.52 +
  144.53 +extern const uint64_t ff_pb_1;
  144.54 +extern const uint64_t ff_pb_3;
  144.55 +extern const uint64_t ff_pb_7;
  144.56 +extern const uint64_t ff_pb_1F;
  144.57 +extern const uint64_t ff_pb_3F;
  144.58 +extern const uint64_t ff_pb_81;
  144.59 +extern const uint64_t ff_pb_A1;
  144.60 +extern const uint64_t ff_pb_FC;
  144.61 +
  144.62 +extern const double ff_pd_1[2];
  144.63 +extern const double ff_pd_2[2];
  144.64 +/* LOAD4: asm text for four movq loads, from in + k*stride (k = 0..3) into the
         + * operands a, b, c, d. All arguments are pasted into the asm string verbatim. */
  144.65 +#define LOAD4(stride,in,a,b,c,d)\
  144.66 +    "movq 0*"#stride"+"#in", "#a"\n\t"\
  144.67 +    "movq 1*"#stride"+"#in", "#b"\n\t"\
  144.68 +    "movq 2*"#stride"+"#in", "#c"\n\t"\
  144.69 +    "movq 3*"#stride"+"#in", "#d"\n\t"
  144.70 +/* STORE4: the mirror of LOAD4 -- four movq stores of a, b, c, d to
         + * out + k*stride (k = 0..3). */
  144.71 +#define STORE4(stride,out,a,b,c,d)\
  144.72 +    "movq "#a", 0*"#stride"+"#out"\n\t"\
  144.73 +    "movq "#b", 1*"#stride"+"#out"\n\t"\
  144.74 +    "movq "#c", 2*"#stride"+"#out"\n\t"\
  144.75 +    "movq "#d", 3*"#stride"+"#out"\n\t"
  144.76 +
   144.77 +/* in/out: a = a + b, b = b - a (both right-hand sides use the values on entry); b is formed as 2*b - (a + b) so no temporary is needed */
   144.78 +#define SUMSUB_BA( a, b ) \
   144.79 +    "paddw "#b", "#a" \n\t"\
   144.80 +    "paddw "#b", "#b" \n\t"\
   144.81 +    "psubw "#a", "#b" \n\t"
   144.82 +/* One interleave step: copy a into t with mov<m>, then a = punpckl<n>(a, b) and
          + * t = punpckh<n>(t, b). On exit a holds the low-half interleave of the original
          + * a and b, t holds the high-half interleave, and b is unchanged.
          + * Note the last line ends in a backslash, so the following empty line is still
          + * part of the macro. */
   144.83 +#define SBUTTERFLY(a,b,t,n,m)\
   144.84 +    "mov" #m " " #a ", " #t "         \n\t" /* abcd */\
   144.85 +    "punpckl" #n " " #b ", " #a "     \n\t" /* aebf */\
   144.86 +    "punpckh" #n " " #b ", " #t "     \n\t" /* cgdh */\
   144.87 +
   144.88 +#define TRANSPOSE4(a,b,c,d,t) /* 4x4 transpose of 16-bit elements held in a,b,c,d (movq-sized registers); the result rows end up in a,d,t,c and b is clobbered */\
   144.89 +    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
   144.90 +    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
   144.91 +    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
   144.92 +    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
   144.93 +/* Byte transpose: the low four bytes of a..d are interleaved with the low four
          + * bytes of e..h, then three rounds of SBUTTERFLY turn that into four 8-byte rows
          + * (a0..h0, a1..h1, a2..h2, a3..h3). b is clobbered along the way. */
   144.94 +// e,f,g,h can be memory
   144.95 +// out: a,d,t,c
   144.96 +#define TRANSPOSE8x4(a,b,c,d,e,f,g,h,t)\
   144.97 +    "punpcklbw " #e ", " #a " \n\t" /* a0 e0 a1 e1 a2 e2 a3 e3 */\
   144.98 +    "punpcklbw " #f ", " #b " \n\t" /* b0 f0 b1 f1 b2 f2 b3 f3 */\
   144.99 +    "punpcklbw " #g ", " #c " \n\t" /* c0 g0 c1 g1 c2 g2 c3 g3 */\
  144.100 +    "punpcklbw " #h ", " #d " \n\t" /* d0 h0 d1 h1 d2 h2 d3 h3 */\
  144.101 +    SBUTTERFLY(a, b, t, bw, q)   /* a= a0 b0 e0 f0 a1 b1 e1 f1 */\
  144.102 +                                 /* t= a2 b2 e2 f2 a3 b3 e3 f3 */\
  144.103 +    SBUTTERFLY(c, d, b, bw, q)   /* c= c0 d0 g0 h0 c1 d1 g1 h1 */\
  144.104 +                                 /* b= c2 d2 g2 h2 c3 d3 g3 h3 */\
  144.105 +    SBUTTERFLY(a, c, d, wd, q)   /* a= a0 b0 c0 d0 e0 f0 g0 h0 */\
  144.106 +                                 /* d= a1 b1 c1 d1 e1 f1 g1 h1 */\
  144.107 +    SBUTTERFLY(t, b, c, wd, q)   /* t= a2 b2 c2 d2 e2 f2 g2 h2 */\
  144.108 +                                 /* c= a3 b3 c3 d3 e3 f3 g3 h3 */
  144.109 +/* 8x8 transpose of 16-bit elements held in eight registers a..h, built from
          + * SBUTTERFLY steps at word (wd), dword (dq) and qword (qdq) granularity with
          + * movdqa copies.
          + * With ARCH_X86_64 the extra register xmm8 serves as scratch and the t argument
          + * is not used. Otherwise t is used as a memory operand and two 16-byte slots,
          + * t and 16+t, are written to spill registers, so t must name writable memory
          + * with room for 32 bytes in a form that accepts a leading displacement. */
  144.110 +#if ARCH_X86_64
  144.111 +// permutes 01234567 -> 05736421
  144.112 +#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
  144.113 +    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
  144.114 +    SBUTTERFLY(c,d,b,wd,dqa)\
  144.115 +    SBUTTERFLY(e,f,d,wd,dqa)\
  144.116 +    SBUTTERFLY(g,h,f,wd,dqa)\
  144.117 +    SBUTTERFLY(a,c,h,dq,dqa)\
  144.118 +    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
  144.119 +    SBUTTERFLY(e,g,b,dq,dqa)\
  144.120 +    SBUTTERFLY(d,f,g,dq,dqa)\
  144.121 +    SBUTTERFLY(a,e,f,qdq,dqa)\
  144.122 +    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
  144.123 +    SBUTTERFLY(h,b,d,qdq,dqa)\
  144.124 +    SBUTTERFLY(c,g,b,qdq,dqa)\
  144.125 +    "movdqa %%xmm8, "#g"              \n\t"
  144.126 +#else
  144.127 +#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
  144.128 +    "movdqa "#h", "#t"                \n\t"\
  144.129 +    SBUTTERFLY(a,b,h,wd,dqa)\
  144.130 +    "movdqa "#h", 16"#t"              \n\t"\
  144.131 +    "movdqa "#t", "#h"                \n\t"\
  144.132 +    SBUTTERFLY(c,d,b,wd,dqa)\
  144.133 +    SBUTTERFLY(e,f,d,wd,dqa)\
  144.134 +    SBUTTERFLY(g,h,f,wd,dqa)\
  144.135 +    SBUTTERFLY(a,c,h,dq,dqa)\
  144.136 +    "movdqa "#h", "#t"                \n\t"\
  144.137 +    "movdqa 16"#t", "#h"              \n\t"\
  144.138 +    SBUTTERFLY(h,b,c,dq,dqa)\
  144.139 +    SBUTTERFLY(e,g,b,dq,dqa)\
  144.140 +    SBUTTERFLY(d,f,g,dq,dqa)\
  144.141 +    SBUTTERFLY(a,e,f,qdq,dqa)\
  144.142 +    SBUTTERFLY(h,d,e,qdq,dqa)\
  144.143 +    "movdqa "#h", 16"#t"              \n\t"\
  144.144 +    "movdqa "#t", "#h"                \n\t"\
  144.145 +    SBUTTERFLY(h,b,d,qdq,dqa)\
  144.146 +    SBUTTERFLY(c,g,b,qdq,dqa)\
  144.147 +    "movdqa 16"#t", "#g"              \n\t"
  144.148 +#endif
  144.149 +/* Emits a standalone asm statement that sets every 16-bit word of register regd
          + * (given without the % prefix) to 1: pcmpeqd of a register with itself yields
          + * all-ones, and the logical right shift of each word by 15 leaves only bit 0.
          + * The statement declares no operands or clobbers. */
  144.150 +#define MOVQ_WONE(regd) \
  144.151 +    __asm__ volatile ( \
  144.152 +    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  144.153 +    "psrlw $15, %%" #regd ::)
  144.154 +/* Prototypes only: the routines below are implemented in other x86 source files.
          + * Of these, this changeset's h264dsp_mmx.c is seen calling
          + * add_pixels_clamped_mmx(block, pixels, line_size). */
  144.155 +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
  144.156 +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
  144.157 +void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
  144.158 +
  144.159 +void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
  144.160 +void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
  144.161 +void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
  144.162 +void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride);
  144.163 +
  144.164 +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
  144.165 +void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd);
  144.166 +
  144.167 +void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
  144.168 +                                   double *autoc);
  144.169 +
  144.170 +void ff_mmx_idct(DCTELEM *block);
  144.171 +void ff_mmxext_idct(DCTELEM *block);
 144.172 +
 144.173 +#endif /* AVCODEC_X86_DSPUTIL_MMX_H */

   145.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   145.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/dsputil_mmx_avg_template.c	Mon Aug 27 12:09:56 2012 +0200
   145.3 @@ -0,0 +1,250 @@
   145.4 +/*
   145.5 + * DSP utils : average functions are compiled twice for 3dnow/mmx2
   145.6 + * Copyright (c) 2000, 2001 Fabrice Bellard
   145.7 + * Copyright (c) 2002-2004 Michael Niedermayer
   145.8 + *
   145.9 + * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  145.10 + * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  145.11 + * and improved by Zdenek Kabelac <kabi@users.sf.net>
  145.12 + *
  145.13 + * This file is part of FFmpeg.
  145.14 + *
  145.15 + * FFmpeg is free software; you can redistribute it and/or
  145.16 + * modify it under the terms of the GNU Lesser General Public
  145.17 + * License as published by the Free Software Foundation; either
  145.18 + * version 2.1 of the License, or (at your option) any later version.
  145.19 + *
  145.20 + * FFmpeg is distributed in the hope that it will be useful,
  145.21 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  145.22 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  145.23 + * Lesser General Public License for more details.
  145.24 + *
  145.25 + * You should have received a copy of the GNU Lesser General Public
  145.26 + * License along with FFmpeg; if not, write to the Free Software
  145.27 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  145.28 + */
  145.29 +/* dst = PAVGB(src1, src2) for h rows of 8 bytes. PAVGB and DEF() are macros
         + * supplied by the file that includes this template.
         + * src1 advances by src1Stride and dst by dstStride per row; src2 is read as a
         + * packed buffer with a fixed pitch of 8 bytes per row.
         + * If h is odd, one row is handled before the loop. The loop then does four rows
         + * per pass and exits only when the counter reaches exactly zero, so the rows
         + * left for the loop must be a non-zero multiple of 4.
         + * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride. */
  145.30 +static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  145.31 +{
  145.32 +    __asm__ volatile(
  145.33 +        "testl $1, %0                   \n\t" /* odd row count? */
  145.34 +            " jz 1f                     \n\t"
  145.35 +        "movq   (%1), %%mm0             \n\t"
  145.36 +        "movq   (%2), %%mm1             \n\t"
  145.37 +        "add    %4, %1                  \n\t"
  145.38 +        "add    $8, %2                  \n\t"
  145.39 +        PAVGB" %%mm1, %%mm0             \n\t"
  145.40 +        "movq   %%mm0, (%3)             \n\t"
  145.41 +        "add    %5, %3                  \n\t"
  145.42 +        "decl   %0                      \n\t"
  145.43 +        "1:                             \n\t" /* main loop: four rows per pass */
  145.44 +        "movq   (%1), %%mm0             \n\t"
  145.45 +        "add    %4, %1                  \n\t"
  145.46 +        "movq   (%1), %%mm1             \n\t"
  145.47 +        "add    %4, %1                  \n\t"
  145.48 +        PAVGB" (%2), %%mm0              \n\t"
  145.49 +        PAVGB" 8(%2), %%mm1             \n\t"
  145.50 +        "movq   %%mm0, (%3)             \n\t"
  145.51 +        "add    %5, %3                  \n\t"
  145.52 +        "movq   %%mm1, (%3)             \n\t"
  145.53 +        "add    %5, %3                  \n\t"
  145.54 +        "movq   (%1), %%mm0             \n\t"
  145.55 +        "add    %4, %1                  \n\t"
  145.56 +        "movq   (%1), %%mm1             \n\t"
  145.57 +        "add    %4, %1                  \n\t"
  145.58 +        PAVGB" 16(%2), %%mm0            \n\t"
  145.59 +        PAVGB" 24(%2), %%mm1            \n\t"
  145.60 +        "movq   %%mm0, (%3)             \n\t"
  145.61 +        "add    %5, %3                  \n\t"
  145.62 +        "movq   %%mm1, (%3)             \n\t"
  145.63 +        "add    %5, %3                  \n\t"
  145.64 +        "add    $32, %2                 \n\t" /* four packed 8-byte src2 rows consumed */
  145.65 +        "subl   $4, %0                  \n\t"
  145.66 +        "jnz    1b                      \n\t"
  145.67 +
  145.68 +        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
  145.69 +        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
  145.70 +        :"memory");
  145.71 +//the following should be used, though better not with gcc ...
  145.72 +/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
  145.73 +        :"r"(src1Stride), "r"(dstStride)
  145.74 +        :"memory");*/
  145.75 +}
  145.76 +/* dst = PAVGB(PAVGB(src1, src2), dst) for h rows of 8 bytes: the two sources are
         + * averaged first and the result is then averaged with what dst already holds.
         + * PAVGB and DEF() are macros supplied by the file that includes this template.
         + * src1 advances by src1Stride and dst by dstStride per row; src2 is read as a
         + * packed buffer with a fixed pitch of 8 bytes per row.
         + * If h is odd, one row is handled before the loop. The loop then does four rows
         + * per pass and exits only when the counter reaches exactly zero, so the rows
         + * left for the loop must be a non-zero multiple of 4.
         + * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride. */
  145.77 +static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
  145.78 +{
  145.79 +    __asm__ volatile(
  145.80 +        "testl $1, %0                   \n\t" /* odd row count? */
  145.81 +            " jz 1f                     \n\t"
  145.82 +        "movq   (%1), %%mm0             \n\t"
  145.83 +        "movq   (%2), %%mm1             \n\t"
  145.84 +        "add    %4, %1                  \n\t"
  145.85 +        "add    $8, %2                  \n\t"
  145.86 +        PAVGB" %%mm1, %%mm0             \n\t"
  145.87 +        PAVGB" (%3), %%mm0              \n\t"
  145.88 +        "movq   %%mm0, (%3)             \n\t"
  145.89 +        "add    %5, %3                  \n\t"
  145.90 +        "decl   %0                      \n\t"
  145.91 +        "1:                             \n\t" /* main loop: four rows per pass */
  145.92 +        "movq   (%1), %%mm0             \n\t"
  145.93 +        "add    %4, %1                  \n\t"
  145.94 +        "movq   (%1), %%mm1             \n\t"
  145.95 +        "add    %4, %1                  \n\t"
  145.96 +        PAVGB" (%2), %%mm0              \n\t"
  145.97 +        PAVGB" 8(%2), %%mm1             \n\t"
  145.98 +        PAVGB" (%3), %%mm0              \n\t"
  145.99 +        "movq   %%mm0, (%3)             \n\t"
 145.100 +        "add    %5, %3                  \n\t"
 145.101 +        PAVGB" (%3), %%mm1              \n\t"
 145.102 +        "movq   %%mm1, (%3)             \n\t"
 145.103 +        "add    %5, %3                  \n\t"
 145.104 +        "movq   (%1), %%mm0             \n\t"
 145.105 +        "add    %4, %1                  \n\t"
 145.106 +        "movq   (%1), %%mm1             \n\t"
 145.107 +        "add    %4, %1                  \n\t"
 145.108 +        PAVGB" 16(%2), %%mm0            \n\t"
 145.109 +        PAVGB" 24(%2), %%mm1            \n\t"
 145.110 +        PAVGB" (%3), %%mm0              \n\t"
 145.111 +        "movq   %%mm0, (%3)             \n\t"
 145.112 +        "add    %5, %3                  \n\t"
 145.113 +        PAVGB" (%3), %%mm1              \n\t"
 145.114 +        "movq   %%mm1, (%3)             \n\t"
 145.115 +        "add    %5, %3                  \n\t"
 145.116 +        "add    $32, %2                 \n\t" /* four packed 8-byte src2 rows consumed */
 145.117 +        "subl   $4, %0                  \n\t"
 145.118 +        "jnz    1b                      \n\t"
 145.119 +
 145.120 +        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
 145.121 +        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
 145.122 +        :"memory");
 145.123 +//the following should be used, though better not with gcc ...
 145.124 +/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
 145.125 +        :"r"(src1Stride), "r"(dstStride)
 145.126 +        :"memory");*/
 145.127 +}
 145.128 +
 145.129 +/* dst = PAVGB(src1, src2) for h rows of 16 bytes (two movq halves per row).
         + * PAVGB and DEF() are macros supplied by the file that includes this template.
         + * src1 advances by src1Stride and dst by dstStride per row; src2 is read as a
         + * packed buffer with a fixed pitch of 16 bytes per row.
         + * If h is odd, one row is handled before the loop. The loop then does two rows
         + * per pass and exits only when the counter reaches exactly zero, so the rows
         + * left for the loop must be a non-zero even number.
         + * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride. */
 145.130 +static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
 145.131 +{
 145.132 +    __asm__ volatile(
 145.133 +        "testl $1, %0                   \n\t" /* odd row count? */
 145.134 +            " jz 1f                     \n\t"
 145.135 +        "movq   (%1), %%mm0             \n\t"
 145.136 +        "movq   8(%1), %%mm1            \n\t"
 145.137 +        PAVGB" (%2), %%mm0              \n\t"
 145.138 +        PAVGB" 8(%2), %%mm1             \n\t"
 145.139 +        "add    %4, %1                  \n\t"
 145.140 +        "add    $16, %2                 \n\t"
 145.141 +        "movq   %%mm0, (%3)             \n\t"
 145.142 +        "movq   %%mm1, 8(%3)            \n\t"
 145.143 +        "add    %5, %3                  \n\t"
 145.144 +        "decl   %0                      \n\t"
 145.145 +        "1:                             \n\t" /* main loop: two rows per pass */
 145.146 +        "movq   (%1), %%mm0             \n\t"
 145.147 +        "movq   8(%1), %%mm1            \n\t"
 145.148 +        "add    %4, %1                  \n\t"
 145.149 +        PAVGB" (%2), %%mm0              \n\t"
 145.150 +        PAVGB" 8(%2), %%mm1             \n\t"
 145.151 +        "movq   %%mm0, (%3)             \n\t"
 145.152 +        "movq   %%mm1, 8(%3)            \n\t"
 145.153 +        "add    %5, %3                  \n\t"
 145.154 +        "movq   (%1), %%mm0             \n\t"
 145.155 +        "movq   8(%1), %%mm1            \n\t"
 145.156 +        "add    %4, %1                  \n\t"
 145.157 +        PAVGB" 16(%2), %%mm0            \n\t"
 145.158 +        PAVGB" 24(%2), %%mm1            \n\t"
 145.159 +        "movq   %%mm0, (%3)             \n\t"
 145.160 +        "movq   %%mm1, 8(%3)            \n\t"
 145.161 +        "add    %5, %3                  \n\t"
 145.162 +        "add    $32, %2                 \n\t" /* two packed 16-byte src2 rows consumed */
 145.163 +        "subl   $2, %0                  \n\t"
 145.164 +        "jnz    1b                      \n\t"
 145.165 +
 145.166 +        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
 145.167 +
 145.168 +        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
 145.169 +        :"memory");
 145.170 +//the following should be used, though better not with gcc ...
 145.171 +/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
 145.172 +        :"r"(src1Stride), "r"(dstStride)
 145.173 +        :"memory");*/
 145.174 +}
 145.175 +/* dst = PAVGB(PAVGB(src1, src2), dst) for h rows of 16 bytes: the two sources are
         + * averaged first and the result is then averaged with what dst already holds.
         + * PAVGB and DEF() are macros supplied by the file that includes this template.
         + * src1 advances by src1Stride and dst by dstStride per row; src2 is read as a
         + * packed buffer with a fixed pitch of 16 bytes per row.
         + * If h is odd, one row is handled before the loop. The loop then does two rows
         + * per pass and exits only when the counter reaches exactly zero, so the rows
         + * left for the loop must be a non-zero even number.
         + * Asm operands: %0=h, %1=src1, %2=src2, %3=dst, %4=src1Stride, %5=dstStride. */
 145.176 +static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
 145.177 +{
 145.178 +    __asm__ volatile(
 145.179 +        "testl $1, %0                   \n\t" /* odd row count? */
 145.180 +            " jz 1f                     \n\t"
 145.181 +        "movq   (%1), %%mm0             \n\t"
 145.182 +        "movq   8(%1), %%mm1            \n\t"
 145.183 +        PAVGB" (%2), %%mm0              \n\t"
 145.184 +        PAVGB" 8(%2), %%mm1             \n\t"
 145.185 +        "add    %4, %1                  \n\t"
 145.186 +        "add    $16, %2                 \n\t"
 145.187 +        PAVGB" (%3), %%mm0              \n\t"
 145.188 +        PAVGB" 8(%3), %%mm1             \n\t"
 145.189 +        "movq   %%mm0, (%3)             \n\t"
 145.190 +        "movq   %%mm1, 8(%3)            \n\t"
 145.191 +        "add    %5, %3                  \n\t"
 145.192 +        "decl   %0                      \n\t"
 145.193 +        "1:                             \n\t" /* main loop: two rows per pass */
 145.194 +        "movq   (%1), %%mm0             \n\t"
 145.195 +        "movq   8(%1), %%mm1            \n\t"
 145.196 +        "add    %4, %1                  \n\t"
 145.197 +        PAVGB" (%2), %%mm0              \n\t"
 145.198 +        PAVGB" 8(%2), %%mm1             \n\t"
 145.199 +        PAVGB" (%3), %%mm0              \n\t"
 145.200 +        PAVGB" 8(%3), %%mm1             \n\t"
 145.201 +        "movq   %%mm0, (%3)             \n\t"
 145.202 +        "movq   %%mm1, 8(%3)            \n\t"
 145.203 +        "add    %5, %3                  \n\t"
 145.204 +        "movq   (%1), %%mm0             \n\t"
 145.205 +        "movq   8(%1), %%mm1            \n\t"
 145.206 +        "add    %4, %1                  \n\t"
 145.207 +        PAVGB" 16(%2), %%mm0            \n\t"
 145.208 +        PAVGB" 24(%2), %%mm1            \n\t"
 145.209 +        PAVGB" (%3), %%mm0              \n\t"
 145.210 +        PAVGB" 8(%3), %%mm1             \n\t"
 145.211 +        "movq   %%mm0, (%3)             \n\t"
 145.212 +        "movq   %%mm1, 8(%3)            \n\t"
 145.213 +        "add    %5, %3                  \n\t"
 145.214 +        "add    $32, %2                 \n\t" /* two packed 16-byte src2 rows consumed */
 145.215 +        "subl   $2, %0                  \n\t"
 145.216 +        "jnz    1b                      \n\t"
 145.217 +
 145.218 +        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
 145.219 +        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
 145.220 +        :"memory");
 145.221 +//the following should be used, though better not with gcc ...
 145.222 +/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
 145.223 +        :"r"(src1Stride), "r"(dstStride)
 145.224 +        :"memory");*/
 145.225 +}
 145.226 +/* block = PAVGB(block, pixels), in place, for h rows of 8 bytes; both pointers
         + * step by line_size per row. PAVGB, DEF() and REG_a are macros supplied by the
         + * including file.
         + * The loop handles four rows per pass (two pairs addressed as base and
         + * base+line_size) and exits only when the counter reaches exactly zero, so h
         + * must be a non-zero multiple of 4.
         + * Asm operands: %0=h, %1=pixels, %2=block, %3=line_size; REG_a is clobbered. */
 145.227 +static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 145.228 +{
 145.229 +    __asm__ volatile(
 145.230 +        "lea (%3, %3), %%"REG_a"        \n\t" /* REG_a = 2 * line_size */
 145.231 +        "1:                             \n\t"
 145.232 +        "movq (%2), %%mm0               \n\t"
 145.233 +        "movq (%2, %3), %%mm1           \n\t"
 145.234 +        PAVGB" (%1), %%mm0              \n\t"
 145.235 +        PAVGB" (%1, %3), %%mm1          \n\t"
 145.236 +        "movq %%mm0, (%2)               \n\t"
 145.237 +        "movq %%mm1, (%2, %3)           \n\t"
 145.238 +        "add %%"REG_a", %1              \n\t"
 145.239 +        "add %%"REG_a", %2              \n\t"
 145.240 +        "movq (%2), %%mm0               \n\t"
 145.241 +        "movq (%2, %3), %%mm1           \n\t"
 145.242 +        PAVGB" (%1), %%mm0              \n\t"
 145.243 +        PAVGB" (%1, %3), %%mm1          \n\t"
 145.244 +        "add %%"REG_a", %1              \n\t"
 145.245 +        "movq %%mm0, (%2)               \n\t"
 145.246 +        "movq %%mm1, (%2, %3)           \n\t"
 145.247 +        "add %%"REG_a", %2              \n\t"
 145.248 +        "subl $4, %0                    \n\t"
 145.249 +        "jnz 1b                         \n\t"
 145.250 +        :"+g"(h), "+S"(pixels), "+D"(block)
 145.251 +        :"r" ((x86_reg)line_size)
 145.252 +        :"%"REG_a, "memory");
 145.253 +}

   146.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   146.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/h264dsp_mmx.c	Mon Aug 27 12:09:56 2012 +0200
   146.3 @@ -0,0 +1,1741 @@
   146.4 +/*
   146.5 + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
   146.6 + *
   146.7 + * This file is part of FFmpeg.
   146.8 + *
   146.9 + * FFmpeg is free software; you can redistribute it and/or
  146.10 + * modify it under the terms of the GNU Lesser General Public
  146.11 + * License as published by the Free Software Foundation; either
  146.12 + * version 2.1 of the License, or (at your option) any later version.
  146.13 + *
  146.14 + * FFmpeg is distributed in the hope that it will be useful,
  146.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  146.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  146.17 + * Lesser General Public License for more details.
  146.18 + *
  146.19 + * You should have received a copy of the GNU Lesser General Public
  146.20 + * License along with FFmpeg; if not, write to the Free Software
  146.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  146.22 + */
  146.23 +
  146.24 +#include "dsputil_mmx.h"
  146.25 +/* File-local constants declared through DECLARE_ALIGNED(8, ...): every 16-bit
         + * word of ff_pb_3_1 is 0x0103 and every 16-bit word of ff_pb_7_3 is 0x0307. */
  146.26 +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;
  146.27 +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL;
  146.28 +
  146.29 +/***********************************/
  146.30 +/* IDCT */
  146.31 +/* Two interleaved sum/difference pairs on packed words:
         + *   a = a + b,  b = b - a,  c = c + d,  d = d - c
         + * where every right-hand side uses the values on entry. Each difference is
         + * formed as 2*x - (sum), so no temporary register is needed. */
  146.32 +#define SUMSUB_BADC( a, b, c, d ) \
  146.33 +    "paddw "#b", "#a" \n\t"\
  146.34 +    "paddw "#d", "#c" \n\t"\
  146.35 +    "paddw "#b", "#b" \n\t"\
  146.36 +    "paddw "#d", "#d" \n\t"\
  146.37 +    "psubw "#a", "#b" \n\t"\
  146.38 +    "psubw "#c", "#d" \n\t"
  146.39 +/* Half-weighted sum/difference on packed words, using the values on entry:
         + *   a = (a >> 1) - b,  b = (b >> 1) + a
         + * The shifts are arithmetic (psraw). t is clobbered: it receives a copy of the
         + * original b. */
  146.40 +#define SUMSUBD2_AB( a, b, t ) \
  146.41 +    "movq  "#b", "#t" \n\t"\
  146.42 +    "psraw  $1 , "#b" \n\t"\
  146.43 +    "paddw "#a", "#b" \n\t"\
  146.44 +    "psraw  $1 , "#a" \n\t"\
  146.45 +    "psubw "#t", "#a" \n\t"
  146.46 +/* One 4-point 1-D inverse-transform pass over packed words, all in registers.
         + * With the entry values z0 = d02 + s02, z1 = d02 - s02, z2 = (s13 >> 1) - d13 and
         + * z3 = s13 + (d13 >> 1), the macro leaves
         + *   d13 = z0 + z3,  s13 = z1 + z2,  d02 = z1 - z2,  s02 = z0 - z3
         + * and clobbers t. */
  146.47 +#define IDCT4_1D( s02, s13, d02, d13, t ) \
  146.48 +    SUMSUB_BA  ( s02, d02 )\
  146.49 +    SUMSUBD2_AB( s13, d13, t )\
  146.50 +    SUMSUB_BADC( d13, s02, s13, d02 )
  146.51 +/* Adds four residual words to four pixels at the address in asm operand %0:
         + * p is shifted right arithmetically by 6, the 4 bytes at (%0) are loaded into t
         + * and widened to words by interleaving with z, the two are added with signed
         + * saturation, packed back to bytes with unsigned saturation and stored to (%0).
         + * z is expected to hold zero for the widening to be a zero-extension (the
         + * caller in this file clears mm7 with pxor before use). t is clobbered. */
  146.52 +#define STORE_DIFF_4P( p, t, z ) \
  146.53 +    "psraw      $6,     "#p" \n\t"\
  146.54 +    "movd       (%0),   "#t" \n\t"\
  146.55 +    "punpcklbw "#z",    "#t" \n\t"\
  146.56 +    "paddsw    "#t",    "#p" \n\t"\
  146.57 +    "packuswb  "#z",    "#p" \n\t"\
  146.58 +    "movd      "#p",    (%0) \n\t"
  146.59 +/* 4x4 inverse transform of block (read as four 8-byte rows of int16) added to the
         + * 4x4 pixel area at dst, whose rows are stride bytes apart.
         + * The work is split over three asm statements that hand data to each other in
         + * MMX registers without declaring them: load the coefficients, run IDCT4_1D,
         + * TRANSPOSE4 and IDCT4_1D again, then update the four pixel rows with
         + * STORE_DIFF_4P. ff_pw_32 is added to the first transposed row before the
         + * second pass, i.e. ahead of the >>6 applied when storing. block is only read. */
  146.60 +static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
  146.61 +{
  146.62 +    /* Load dct coeffs */
  146.63 +    __asm__ volatile(
  146.64 +        "movq   (%0), %%mm0 \n\t"
  146.65 +        "movq  8(%0), %%mm1 \n\t"
  146.66 +        "movq 16(%0), %%mm2 \n\t"
  146.67 +        "movq 24(%0), %%mm3 \n\t"
  146.68 +    :: "r"(block) );
  146.69 +
  146.70 +    __asm__ volatile(
  146.71 +        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
  146.72 +        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
  146.73 +
  146.74 +        "movq      %0,    %%mm6 \n\t" /* mm6 = first 8 bytes of ff_pw_32 */
  146.75 +        /* in: 1,4,0,2  out: 1,2,3,0 */
  146.76 +        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 ) /* transposed rows end up in mm3, mm2, mm4, mm0 */
  146.77 +
  146.78 +        "paddw     %%mm6, %%mm3 \n\t"
  146.79 +
  146.80 +        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
  146.81 +        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) /* output rows 0..3 are left in mm0, mm2, mm3, mm4 */
  146.82 +
  146.83 +        "pxor %%mm7, %%mm7    \n\t" /* mm7 = 0, the zero operand for STORE_DIFF_4P */
  146.84 +    :: "m"(ff_pw_32));
  146.85 +
  146.86 +    __asm__ volatile(
  146.87 +    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
  146.88 +        "add %1, %0             \n\t" /* next dst row */
  146.89 +    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
  146.90 +        "add %1, %0             \n\t"
  146.91 +    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
  146.92 +        "add %1, %0             \n\t"
  146.93 +    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
  146.94 +        : "+r"(dst)
  146.95 +        : "r" ((x86_reg)stride)
  146.96 +    );
  146.97 +}
  146.98 +/* One 1-D pass of the 8-point inverse transform over a slice four int16 columns
         + * wide. Eight 8-byte rows are read from block at a pitch of 16 bytes (byte
         + * offsets 0..112); nothing is written back to memory.
         + * The eight results are left in mm0..mm7 for the caller to consume in a
         + * following asm statement. The second pass in ff_h264_idct8_add_mmx stores them
         + * as rows 0..7 in the order mm7, mm5, mm3, mm1, mm0, mm2, mm4, mm6. */
  146.99 +static inline void h264_idct8_1d(int16_t *block)
 146.100 +{
 146.101 +    __asm__ volatile(
 146.102 +        "movq 112(%0), %%mm7  \n\t" /* odd rows 7, 5, 3, 1 (16-byte row pitch) */
 146.103 +        "movq  80(%0), %%mm0  \n\t"
 146.104 +        "movq  48(%0), %%mm3  \n\t"
 146.105 +        "movq  16(%0), %%mm5  \n\t"
 146.106 +
 146.107 +        "movq   %%mm0, %%mm4  \n\t"
 146.108 +        "movq   %%mm5, %%mm1  \n\t"
 146.109 +        "psraw  $1,    %%mm4  \n\t"
 146.110 +        "psraw  $1,    %%mm1  \n\t"
 146.111 +        "paddw  %%mm0, %%mm4  \n\t"
 146.112 +        "paddw  %%mm5, %%mm1  \n\t"
 146.113 +        "paddw  %%mm7, %%mm4  \n\t"
 146.114 +        "paddw  %%mm0, %%mm1  \n\t"
 146.115 +        "psubw  %%mm5, %%mm4  \n\t"
 146.116 +        "paddw  %%mm3, %%mm1  \n\t"
 146.117 +
 146.118 +        "psubw  %%mm3, %%mm5  \n\t"
 146.119 +        "psubw  %%mm3, %%mm0  \n\t"
 146.120 +        "paddw  %%mm7, %%mm5  \n\t"
 146.121 +        "psubw  %%mm7, %%mm0  \n\t"
 146.122 +        "psraw  $1,    %%mm3  \n\t"
 146.123 +        "psraw  $1,    %%mm7  \n\t"
 146.124 +        "psubw  %%mm3, %%mm5  \n\t"
 146.125 +        "psubw  %%mm7, %%mm0  \n\t"
 146.126 +
 146.127 +        "movq   %%mm4, %%mm3  \n\t"
 146.128 +        "movq   %%mm1, %%mm7  \n\t"
 146.129 +        "psraw  $2,    %%mm1  \n\t"
 146.130 +        "psraw  $2,    %%mm3  \n\t"
 146.131 +        "paddw  %%mm5, %%mm3  \n\t"
 146.132 +        "psraw  $2,    %%mm5  \n\t"
 146.133 +        "paddw  %%mm0, %%mm1  \n\t"
 146.134 +        "psraw  $2,    %%mm0  \n\t"
 146.135 +        "psubw  %%mm4, %%mm5  \n\t"
 146.136 +        "psubw  %%mm0, %%mm7  \n\t"
 146.137 +
 146.138 +        "movq  32(%0), %%mm2  \n\t" /* rows 2 and 6 */
 146.139 +        "movq  96(%0), %%mm6  \n\t"
 146.140 +        "movq   %%mm2, %%mm4  \n\t"
 146.141 +        "movq   %%mm6, %%mm0  \n\t"
 146.142 +        "psraw  $1,    %%mm4  \n\t"
 146.143 +        "psraw  $1,    %%mm6  \n\t"
 146.144 +        "psubw  %%mm0, %%mm4  \n\t"
 146.145 +        "paddw  %%mm2, %%mm6  \n\t"
 146.146 +
 146.147 +        "movq    (%0), %%mm2  \n\t" /* rows 0 and 4 */
 146.148 +        "movq  64(%0), %%mm0  \n\t"
 146.149 +        SUMSUB_BA( %%mm0, %%mm2 )
 146.150 +        SUMSUB_BA( %%mm6, %%mm0 )
 146.151 +        SUMSUB_BA( %%mm4, %%mm2 )
 146.152 +        SUMSUB_BA( %%mm7, %%mm6 )
 146.153 +        SUMSUB_BA( %%mm5, %%mm4 )
 146.154 +        SUMSUB_BA( %%mm3, %%mm2 )
 146.155 +        SUMSUB_BA( %%mm1, %%mm0 )
 146.156 +        :: "r"(block)
 146.157 +    );
 146.158 +}
 146.159 +/* 8x8 inverse transform of block, added to the pixels at dst (row pitch stride)
         + * through add_pixels_clamped_mmx.
         + * Side effect: block[0] is modified in place (+32, the rounding bias for the
         + * final >>6).
         + * Pass 1 runs h264_idct8_1d on each four-column half of block and writes the
         + * results, transposed in 4x4 tiles, into the local buffer b2.
         + * Pass 2 runs h264_idct8_1d on each half of b2, shifts every result right by 6
         + * and writes it back into b2, which is then added to dst.
         + * Both passes rely on mm0..mm7 still holding h264_idct8_1d's results when the
         + * following asm statement starts. */
 146.160 +static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
 146.161 +{
 146.162 +    int i;
 146.163 +    DECLARE_ALIGNED(8, int16_t, b2)[64];
 146.164 +
 146.165 +    block[0] += 32;
 146.166 +
 146.167 +    for(i=0; i<2; i++){
 146.168 +        DECLARE_ALIGNED(8, uint64_t, tmp);
 146.169 +
 146.170 +        h264_idct8_1d(block+4*i);
 146.171 +
 146.172 +        __asm__ volatile(
 146.173 +            "movq   %%mm7,    %0   \n\t" /* spill mm7 to tmp: TRANSPOSE4 needs it as scratch */
 146.174 +            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
 146.175 +            "movq   %%mm0,  8(%1)  \n\t"
 146.176 +            "movq   %%mm6, 24(%1)  \n\t"
 146.177 +            "movq   %%mm7, 40(%1)  \n\t"
 146.178 +            "movq   %%mm4, 56(%1)  \n\t"
 146.179 +            "movq    %0,    %%mm7  \n\t" /* reload the spilled mm7 */
 146.180 +            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
 146.181 +            "movq   %%mm7,   (%1)  \n\t"
 146.182 +            "movq   %%mm1, 16(%1)  \n\t"
 146.183 +            "movq   %%mm0, 32(%1)  \n\t"
 146.184 +            "movq   %%mm3, 48(%1)  \n\t"
 146.185 +            : "=m"(tmp)
 146.186 +            : "r"(b2+32*i)
 146.187 +            : "memory"
 146.188 +        );
 146.189 +    }
 146.190 +
 146.191 +    for(i=0; i<2; i++){
 146.192 +        h264_idct8_1d(b2+4*i);
 146.193 +
 146.194 +        __asm__ volatile(
 146.195 +            "psraw     $6, %%mm7  \n\t"
 146.196 +            "psraw     $6, %%mm6  \n\t"
 146.197 +            "psraw     $6, %%mm5  \n\t"
 146.198 +            "psraw     $6, %%mm4  \n\t"
 146.199 +            "psraw     $6, %%mm3  \n\t"
 146.200 +            "psraw     $6, %%mm2  \n\t"
 146.201 +            "psraw     $6, %%mm1  \n\t"
 146.202 +            "psraw     $6, %%mm0  \n\t"
 146.203 +
 146.204 +            "movq   %%mm7,    (%0)  \n\t" /* rows 0..7 of this half, 16-byte row pitch */
 146.205 +            "movq   %%mm5,  16(%0)  \n\t"
 146.206 +            "movq   %%mm3,  32(%0)  \n\t"
 146.207 +            "movq   %%mm1,  48(%0)  \n\t"
 146.208 +            "movq   %%mm0,  64(%0)  \n\t"
 146.209 +            "movq   %%mm2,  80(%0)  \n\t"
 146.210 +            "movq   %%mm4,  96(%0)  \n\t"
 146.211 +            "movq   %%mm6, 112(%0)  \n\t"
 146.212 +            :: "r"(b2+4*i)
 146.213 +            : "memory"
 146.214 +        );
 146.215 +    }
 146.216 +
 146.217 +    add_pixels_clamped_mmx(b2, dst, stride);
 146.218 +}
 146.219 +/* Adds eight residual words in p to the eight pixels at memory operand d:
         + * the 8 bytes at d are loaded into t and widened to words by interleaving with
         + * z, p is shifted right arithmetically by 6, the two are added with signed
         + * saturation, packed to bytes with unsigned saturation and the low 8 bytes are
         + * stored back to d. z is expected to hold zero; t is clobbered. */
 146.220 +#define STORE_DIFF_8P( p, d, t, z )\
 146.221 +        "movq       "#d", "#t" \n"\
 146.222 +        "psraw       $6,  "#p" \n"\
 146.223 +        "punpcklbw  "#z", "#t" \n"\
 146.224 +        "paddsw     "#t", "#p" \n"\
 146.225 +        "packuswb   "#p", "#p" \n"\
 146.226 +        "movq       "#p", "#d" \n"
 146.227 +/* One 1-D pass of the 8-point inverse transform on eight 128-bit registers.
         + * On entry b, c, d, f, g, h hold input rows; a and e are overwritten right away
         + * and so act as scratch. The two remaining rows are loaded by the macro itself
         + * from 0x00(%1) and 0x40(%1) of the enclosing asm statement, into a and f.
         + * In the caller visible in this file the registers are preloaded from offsets
         + * 0x10, 0x20, 0x30, 0x50, 0x60, 0x70 of %1, and the results are then handed to
         + * TRANSPOSE8 in the register order e, b, h, d, f, a, c, g. */
 146.228 +#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
 146.229 +        "movdqa     "#c", "#a" \n"\
 146.230 +        "movdqa     "#g", "#e" \n"\
 146.231 +        "psraw       $1,  "#c" \n"\
 146.232 +        "psraw       $1,  "#g" \n"\
 146.233 +        "psubw      "#e", "#c" \n"\
 146.234 +        "paddw      "#a", "#g" \n"\
 146.235 +        "movdqa     "#b", "#e" \n"\
 146.236 +        "psraw       $1,  "#e" \n"\
 146.237 +        "paddw      "#b", "#e" \n"\
 146.238 +        "paddw      "#d", "#e" \n"\
 146.239 +        "paddw      "#f", "#e" \n"\
 146.240 +        "movdqa     "#f", "#a" \n"\
 146.241 +        "psraw       $1,  "#a" \n"\
 146.242 +        "paddw      "#f", "#a" \n"\
 146.243 +        "paddw      "#h", "#a" \n"\
 146.244 +        "psubw      "#b", "#a" \n"\
 146.245 +        "psubw      "#d", "#b" \n"\
 146.246 +        "psubw      "#d", "#f" \n"\
 146.247 +        "paddw      "#h", "#b" \n"\
 146.248 +        "psubw      "#h", "#f" \n"\
 146.249 +        "psraw       $1,  "#d" \n"\
 146.250 +        "psraw       $1,  "#h" \n"\
 146.251 +        "psubw      "#d", "#b" \n"\
 146.252 +        "psubw      "#h", "#f" \n"\
 146.253 +        "movdqa     "#e", "#d" \n"\
 146.254 +        "movdqa     "#a", "#h" \n"\
 146.255 +        "psraw       $2,  "#d" \n"\
 146.256 +        "psraw       $2,  "#h" \n"\
 146.257 +        "paddw      "#f", "#d" \n"\
 146.258 +        "paddw      "#b", "#h" \n"\
 146.259 +        "psraw       $2,  "#f" \n"\
 146.260 +        "psraw       $2,  "#b" \n"\
 146.261 +        "psubw      "#f", "#e" \n"\
 146.262 +        "psubw      "#a", "#b" \n"\
 146.263 +        "movdqa 0x00(%1), "#a" \n"\
 146.264 +        "movdqa 0x40(%1), "#f" \n"\
 146.265 +        SUMSUB_BA(f, a)\
 146.266 +        SUMSUB_BA(g, f)\
 146.267 +        SUMSUB_BA(c, a)\
 146.268 +        SUMSUB_BA(e, g)\
 146.269 +        SUMSUB_BA(b, c)\
 146.270 +        SUMSUB_BA(h, a)\
 146.271 +        SUMSUB_BA(d, f)
 146.272 +
 146.273 +static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) // 8x8 inverse transform of block[] added to the 8 rows of 8 pixels at dst (rows are 'stride' bytes apart); block[] is accessed with movdqa (16-byte alignment required) and is overwritten as scratch
 146.274 +{
 146.275 +    __asm__ volatile(
 146.276 +        "movdqa   0x10(%1), %%xmm1 \n" // six of the eight coefficient rows; 0x00 and 0x40 are read inside the 1-D macro
 146.277 +        "movdqa   0x20(%1), %%xmm2 \n"
 146.278 +        "movdqa   0x30(%1), %%xmm3 \n"
 146.279 +        "movdqa   0x50(%1), %%xmm5 \n"
 146.280 +        "movdqa   0x60(%1), %%xmm6 \n"
 146.281 +        "movdqa   0x70(%1), %%xmm7 \n"
 146.282 +        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7) // first 1-D pass
 146.283 +        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1)) // transpose between the passes; the macro is also handed (%1)
 146.284 +        "paddw          %4, %%xmm4 \n" // add ff_pw_32 before the second pass (rounding for the >>6 in STORE_DIFF_8P)
 146.285 +        "movdqa     %%xmm4, 0x00(%1) \n" // the second pass reloads these two rows from 0x00/0x40
 146.286 +        "movdqa     %%xmm2, 0x40(%1) \n"
 146.287 +        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1) // second 1-D pass
 146.288 +        "movdqa     %%xmm6, 0x60(%1) \n" // spill two result rows so xmm6/xmm7 can serve as scratch and zero
 146.289 +        "movdqa     %%xmm7, 0x70(%1) \n"
 146.290 +        "pxor       %%xmm7, %%xmm7 \n"
 146.291 +        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7) // rows 0..3 of dst
 146.292 +        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
 146.293 +        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
 146.294 +        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
 146.295 +        "lea     (%0,%2,4), %0 \n" // dst += 4*stride
 146.296 +        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7) // rows 4..7 of dst
 146.297 +        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
 146.298 +        "movdqa   0x60(%1), %%xmm0 \n" // reload the two spilled rows
 146.299 +        "movdqa   0x70(%1), %%xmm1 \n"
 146.300 +        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
 146.301 +        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
 146.302 +        :"+r"(dst)
 146.303 +        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32) : "memory" // block[] and the dst rows are read and written through pointers that are not memory operands, so the compiler must be told memory changes
 146.304 +    );
 146.305 +}
 146.306 +
 146.307 +static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) // DC-only path for a 4x4 block: adds the rounded DC value to 4 rows of 4 pixels at dst, saturating to 0..255
 146.308 +{
 146.309 +    int dc = (block[0] + 32) >> 6; // only block[0] is read
 146.310 +    __asm__ volatile(
 146.311 +        "movd          %0, %%mm0 \n\t"
 146.312 +        "pshufw $0, %%mm0, %%mm0 \n\t" // mm0 = dc in all four words
 146.313 +        "pxor       %%mm1, %%mm1 \n\t"
 146.314 +        "psubw      %%mm0, %%mm1 \n\t" // mm1 = -dc in all four words
 146.315 +        "packuswb   %%mm0, %%mm0 \n\t" // bytes of mm0 = dc clamped to 0..255 (0 when dc is negative)
 146.316 +        "packuswb   %%mm1, %%mm1 \n\t" // bytes of mm1 = -dc clamped to 0..255 (0 when dc is positive)
 146.317 +        ::"r"(dc)
 146.318 +    );
 146.319 +    __asm__ volatile( // relies on mm0/mm1 as left by the asm statement above
 146.320 +        "movd          %0, %%mm2 \n\t"
 146.321 +        "movd          %1, %%mm3 \n\t"
 146.322 +        "movd          %2, %%mm4 \n\t"
 146.323 +        "movd          %3, %%mm5 \n\t"
 146.324 +        "paddusb    %%mm0, %%mm2 \n\t" // add the positive part of dc, saturating at 255
 146.325 +        "paddusb    %%mm0, %%mm3 \n\t"
 146.326 +        "paddusb    %%mm0, %%mm4 \n\t"
 146.327 +        "paddusb    %%mm0, %%mm5 \n\t"
 146.328 +        "psubusb    %%mm1, %%mm2 \n\t" // subtract the magnitude of a negative dc, saturating at 0
 146.329 +        "psubusb    %%mm1, %%mm3 \n\t"
 146.330 +        "psubusb    %%mm1, %%mm4 \n\t"
 146.331 +        "psubusb    %%mm1, %%mm5 \n\t"
 146.332 +        "movd       %%mm2, %0    \n\t"
 146.333 +        "movd       %%mm3, %1    \n\t"
 146.334 +        "movd       %%mm4, %2    \n\t"
 146.335 +        "movd       %%mm5, %3    \n\t"
 146.336 +        :"+m"(*(uint32_t*)(dst+0*stride)), // four rows, each accessed as one 32-bit read/write memory operand
 146.337 +         "+m"(*(uint32_t*)(dst+1*stride)),
 146.338 +         "+m"(*(uint32_t*)(dst+2*stride)),
 146.339 +         "+m"(*(uint32_t*)(dst+3*stride))
 146.340 +    );
 146.341 +}
 146.342 +
 146.343 +static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) // DC-only path for an 8x8 block: adds the rounded DC value to 8 rows of 8 pixels at dst, saturating to 0..255
 146.344 +{
 146.345 +    int dc = (block[0] + 32) >> 6; // only block[0] is read
 146.346 +    int y;
 146.347 +    __asm__ volatile(
 146.348 +        "movd          %0, %%mm0 \n\t"
 146.349 +        "pshufw $0, %%mm0, %%mm0 \n\t" // mm0 = dc in all four words
 146.350 +        "pxor       %%mm1, %%mm1 \n\t"
 146.351 +        "psubw      %%mm0, %%mm1 \n\t" // mm1 = -dc in all four words
 146.352 +        "packuswb   %%mm0, %%mm0 \n\t" // bytes of mm0 = dc clamped to 0..255 (0 when dc is negative)
 146.353 +        "packuswb   %%mm1, %%mm1 \n\t" // bytes of mm1 = -dc clamped to 0..255 (0 when dc is positive)
 146.354 +        ::"r"(dc)
 146.355 +    );
 146.356 +    for(y=2; y--; dst += 4*stride){ // two iterations, four rows each
 146.357 +    __asm__ volatile( // relies on mm0/mm1 as left by the asm statement above
 146.358 +        "movq          %0, %%mm2 \n\t"
 146.359 +        "movq          %1, %%mm3 \n\t"
 146.360 +        "movq          %2, %%mm4 \n\t"
 146.361 +        "movq          %3, %%mm5 \n\t"
 146.362 +        "paddusb    %%mm0, %%mm2 \n\t" // add the positive part of dc, saturating at 255
 146.363 +        "paddusb    %%mm0, %%mm3 \n\t"
 146.364 +        "paddusb    %%mm0, %%mm4 \n\t"
 146.365 +        "paddusb    %%mm0, %%mm5 \n\t"
 146.366 +        "psubusb    %%mm1, %%mm2 \n\t" // subtract the magnitude of a negative dc, saturating at 0
 146.367 +        "psubusb    %%mm1, %%mm3 \n\t"
 146.368 +        "psubusb    %%mm1, %%mm4 \n\t"
 146.369 +        "psubusb    %%mm1, %%mm5 \n\t"
 146.370 +        "movq       %%mm2, %0    \n\t"
 146.371 +        "movq       %%mm3, %1    \n\t"
 146.372 +        "movq       %%mm4, %2    \n\t"
 146.373 +        "movq       %%mm5, %3    \n\t"
 146.374 +        :"+m"(*(uint64_t*)(dst+0*stride)), // four rows, each accessed as one 64-bit read/write memory operand
 146.375 +         "+m"(*(uint64_t*)(dst+1*stride)),
 146.376 +         "+m"(*(uint64_t*)(dst+2*stride)),
 146.377 +         "+m"(*(uint64_t*)(dst+3*stride))
 146.378 +    );
 146.379 +    }
 146.380 +}
 146.381 +
 146.382 +//FIXME this table is a duplicate of the one in h264data.h, and will be removed once the tables from h264 have been split
 146.383 +static const uint8_t scan8[16 + 2*4]={ // maps a block index to an index into the nnzc[6*8] arrays used below; entries are written as x + y*8
 146.384 + 4+1*8, 5+1*8, 4+2*8, 5+2*8, // entries 0..15: the blocks iterated by the add16/add4 functions
 146.385 + 6+1*8, 7+1*8, 6+2*8, 7+2*8,
 146.386 + 4+3*8, 5+3*8, 4+4*8, 5+4*8,
 146.387 + 6+3*8, 7+3*8, 6+4*8, 7+4*8,
 146.388 + 1+1*8, 2+1*8, // entries 16..19: used with dest[0] in ff_h264_idct_add8_mmx2
 146.389 + 1+2*8, 2+2*8,
 146.390 + 1+4*8, 2+4*8, // entries 20..23: used with dest[1] in ff_h264_idct_add8_mmx2
 146.391 + 1+5*8, 2+5*8,
 146.392 +};
 146.393 +
 146.394 +static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
 146.395 +    int blk; // 4x4 block index; block blk has its 16 coefficients at block + blk*16
 146.396 +    for(blk=0; blk<16; blk++){
 146.397 +        const int count = nnzc[ scan8[blk] ];
 146.398 +        if(!count) continue; // nothing is added for a block whose nnzc entry is zero
 146.399 +        DCTELEM *const coeffs = block + blk*16;
 146.400 +        if(count!=1 || !coeffs[0]) ff_h264_idct_add_mmx    (dst + block_offset[blk], coeffs, stride); // general case: full 4x4 transform
 146.401 +        else                       ff_h264_idct_dc_add_mmx2(dst + block_offset[blk], coeffs, stride); // nnzc entry is 1 and the DC term is nonzero: DC-only shortcut
 146.402 +    }
 146.403 +}
 146.404 +
 146.405 +static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
 146.406 +    int blk; // 4x4 block index; block blk has its 16 coefficients at block + blk*16
 146.407 +    for(blk=0; blk<16; blk++){
 146.408 +        if(nnzc[ scan8[blk] ] == 0){ if(block[blk*16] != 0) ff_h264_idct_dc_add_mmx2(dst + block_offset[blk], block + blk*16, stride); } // zero nnzc entry: only a nonzero DC term is added
 146.409 +        else                                                ff_h264_idct_add_mmx    (dst + block_offset[blk], block + blk*16, stride);   // nonzero nnzc entry: full 4x4 transform
 146.410 +    }
 146.411 +}
 146.412 +
 146.413 +static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
 146.414 +    int blk; // visits indices 0, 4, 8, 12: one entry per 8x8 block
 146.415 +    for(blk=0; blk<16; blk+=4){
 146.416 +        const int count = nnzc[ scan8[blk] ];
 146.417 +        if(!count) continue; // nothing is added for a block whose nnzc entry is zero
 146.418 +        DCTELEM *const coeffs = block + blk*16;
 146.419 +        if(count!=1 || !coeffs[0]) ff_h264_idct8_add_mmx    (dst + block_offset[blk], coeffs, stride); // general case: full 8x8 transform
 146.420 +        else                       ff_h264_idct8_dc_add_mmx2(dst + block_offset[blk], coeffs, stride); // nnzc entry is 1 and the DC term is nonzero: DC-only shortcut
 146.421 +    }
 146.422 +}
 146.423 +
 146.424 +static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
 146.425 +    int blk; // visits indices 0, 4, 8, 12: one entry per 8x8 block
 146.426 +    for(blk=0; blk<16; blk+=4){
 146.427 +        const int count = nnzc[ scan8[blk] ];
 146.428 +        if(!count) continue; // nothing is added for a block whose nnzc entry is zero
 146.429 +        DCTELEM *const coeffs = block + blk*16;
 146.430 +        if(count!=1 || !coeffs[0]) ff_h264_idct8_add_sse2   (dst + block_offset[blk], coeffs, stride); // general case: full 8x8 transform (SSE2)
 146.431 +        else                       ff_h264_idct8_dc_add_mmx2(dst + block_offset[blk], coeffs, stride); // nnzc entry is 1 and the DC term is nonzero: DC-only shortcut
 146.432 +    }
 146.433 +}
 146.434 +
 146.435 +static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
 146.436 +    int blk; // block indices 16..23, the last eight entries of scan8
 146.437 +    for(blk=16; blk<16+8; blk++){
 146.438 +        const int plane = (blk>>2)&1; // 0 for blocks 16..19, 1 for blocks 20..23
 146.439 +        DCTELEM *const coeffs = block + blk*16;
 146.440 +        if(nnzc[ scan8[blk] ]) ff_h264_idct_add_mmx    (dest[plane] + block_offset[blk], coeffs, stride); // nonzero nnzc entry: full 4x4 transform
 146.441 +        else if(coeffs[0])     ff_h264_idct_dc_add_mmx2(dest[plane] + block_offset[blk], coeffs, stride); // otherwise only a nonzero DC term is added
 146.442 +    }
 146.443 +}
 146.444 +
 146.445 +/***********************************/
 146.446 +/* deblocking */
 146.447 +
 146.448 +// out: o = |x-y|>a (each byte of o is nonzero where the condition holds; it is max(|x-y|-a, 0), not a 0xFF mask)
 146.449 +// clobbers: t
 146.450 +#define DIFF_GT_MMX(x,y,a,o,t)\
 146.451 +    "movq     "#y", "#t"  \n\t"\
 146.452 +    "movq     "#x", "#o"  \n\t"\
 146.453 +    "psubusb  "#x", "#t"  \n\t" /* t = max(y-x, 0) */\
 146.454 +    "psubusb  "#y", "#o"  \n\t" /* o = max(x-y, 0) */\
 146.455 +    "por      "#t", "#o"  \n\t" /* o = |x-y| (one of the two terms is always 0) */\
 146.456 +    "psubusb  "#a", "#o"  \n\t" /* o = max(|x-y|-a, 0) */
 146.457 +
 146.458 +// out: o = 0xFF in each byte where |x-y|<=a, 0x00 where |x-y|>a (i.e. the inverted "greater than" mask)
 146.459 +// clobbers: t
 146.460 +#define DIFF_GT2_MMX(x,y,a,o,t)\
 146.461 +    "movq     "#y", "#t"  \n\t"\
 146.462 +    "movq     "#x", "#o"  \n\t"\
 146.463 +    "psubusb  "#x", "#t"  \n\t" /* t = max(y-x, 0) */\
 146.464 +    "psubusb  "#y", "#o"  \n\t" /* o = max(x-y, 0) */\
 146.465 +    "psubusb  "#a", "#t"  \n\t"\
 146.466 +    "psubusb  "#a", "#o"  \n\t" /* t and o are now both 0 exactly when |x-y|<=a; otherwise exactly one of them is nonzero */\
 146.467 +    "pcmpeqb  "#t", "#o"  \n\t"\
 146.468 +
 146.469 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1; alpha1/beta1 are operands holding alpha-1 and beta-1 (their low 16-bit word is used)
 146.470 +// out: mm5=beta-1 in every byte, mm7=mask (0xFF in each byte where none of the three differences exceeds its threshold)
 146.471 +// clobbers: mm4,mm6
 146.472 +#define H264_DEBLOCK_MASK(alpha1, beta1) \
 146.473 +    "pshufw $0, "#alpha1", %%mm4 \n\t"\
 146.474 +    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
 146.475 +    "packuswb  %%mm4, %%mm4      \n\t" /* thresholds as unsigned bytes (saturated) */\
 146.476 +    "packuswb  %%mm5, %%mm5      \n\t"\
 146.477 +    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
 146.478 +    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
 146.479 +    "por       %%mm4, %%mm7      \n\t"\
 146.480 +    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
 146.481 +    "por       %%mm4, %%mm7      \n\t"\
 146.482 +    "pxor      %%mm6, %%mm6      \n\t"\
 146.483 +    "pcmpeqb   %%mm6, %%mm7      \n\t" /* mm7 = 0xFF where no comparison above was nonzero */
 146.484 +
 146.485 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
 146.486 +// out: mm1=p0' mm2=q0'
 146.487 +// clobbers: mm0,3-6
 146.488 +#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f) /* pb_01: operand with 1 in every byte; pb_3f is never referenced in the expansion (ff_pb_3 and ff_pb_A1 are addressed through MANGLE instead) */\
 146.489 +        "movq    %%mm1              , %%mm5 \n\t"\
 146.490 +        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
 146.491 +        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
 146.492 +        "pcmpeqb %%mm4              , %%mm4 \n\t" /* mm4 = all ones */\
 146.493 +        "pxor    %%mm4              , %%mm3 \n\t" /* mm3 = ~q1 */\
 146.494 +        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
 146.495 +        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
 146.496 +        "pxor    %%mm1              , %%mm4 \n\t" /* mm4 = ~p0 */\
 146.497 +        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
 146.498 +        "pavgb   %%mm5              , %%mm3 \n\t"\
 146.499 +        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
 146.500 +        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
 146.501 +        "psubusb %%mm3              , %%mm6 \n\t" /* mm6 = magnitude of a negative d, else 0 */\
 146.502 +        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t" /* mm3 = magnitude of a positive d, else 0 */\
 146.503 +        "pminub  %%mm7              , %%mm6 \n\t" /* limit both magnitudes to tc&mask */\
 146.504 +        "pminub  %%mm7              , %%mm3 \n\t"\
 146.505 +        "psubusb %%mm6              , %%mm1 \n\t" /* p0' = p0 + d, q0' = q0 - d, with unsigned saturation */\
 146.506 +        "psubusb %%mm3              , %%mm2 \n\t"\
 146.507 +        "paddusb %%mm3              , %%mm1 \n\t"\
 146.508 +        "paddusb %%mm6              , %%mm2 \n\t"
 146.509 +
 146.510 +// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %9=ff_bone
 146.511 +// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
 146.512 +// clobbers: q2, tmp, tc0
 146.513 +#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp) /* p1: register holding the row that is being replaced (the clip centre); q2/q2addr: the next outer row as register and as memory operand; the same macro serves the p side and the q side */\
 146.514 +        "movq     %%mm1,  "#tmp"   \n\t"\
 146.515 +        "pavgb    %%mm2,  "#tmp"   \n\t"\
 146.516 +        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
 146.517 +        "pxor   "q2addr", "#tmp"   \n\t"\
 146.518 +        "pand     %9,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
 146.519 +        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
 146.520 +        "movq     "#p1",  "#tmp"   \n\t"\
 146.521 +        "psubusb  "#tc0", "#tmp"   \n\t" /* lower clip bound, saturating at 0 */\
 146.522 +        "paddusb  "#p1",  "#tc0"   \n\t" /* upper clip bound, saturating at 255 */\
 146.523 +        "pmaxub   "#tmp", "#q2"    \n\t"\
 146.524 +        "pminub   "#tc0", "#q2"    \n\t"\
 146.525 +        "movq     "#q2",  "q1addr" \n\t" /* store the clipped row */
 146.526 +
 146.527 +static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) // tc0-clipped luma deblocking of 8 pixels across the edge between rows pix-stride (p0) and pix (q0); alpha1/beta1 are alpha-1/beta-1; rows p1, p0, q0, q1 may be rewritten in place
 146.528 +{
 146.529 +    DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; // spill slots written by the asm: tmp0[0] = filter mask, tmp0[1] = expanded tc0
 146.530 +
 146.531 +    __asm__ volatile(
 146.532 +        "movq    (%2,%4), %%mm0    \n\t" //p1
 146.533 +        "movq    (%2,%4,2), %%mm1  \n\t" //p0
 146.534 +        "movq    (%3),    %%mm2    \n\t" //q0
 146.535 +        "movq    (%3,%4), %%mm3    \n\t" //q1
 146.536 +        H264_DEBLOCK_MASK(%7, %8)
 146.537 +
 146.538 +        "movd      %6,    %%mm4    \n\t" // four tc0 bytes
 146.539 +        "punpcklbw %%mm4, %%mm4    \n\t"
 146.540 +        "punpcklwd %%mm4, %%mm4    \n\t" // mm4 = tc0[0] x4, tc0[1] x4 (only the first two entries are used)
 146.541 +        "pcmpeqb   %%mm3, %%mm3    \n\t" // mm3 = -1 in every byte
 146.542 +        "movq      %%mm4, %%mm6    \n\t"
 146.543 +        "pcmpgtb   %%mm3, %%mm4    \n\t" // mm4 = 0xFF where tc0 > -1
 146.544 +        "movq      %%mm6, %1       \n\t" // tmp0[1] = expanded tc0
 146.545 +        "pand      %%mm4, %%mm7    \n\t" // drop pixels whose tc0 is negative
 146.546 +        "movq      %%mm7, %0       \n\t" // tmp0[0] = mask
 146.547 +
 146.548 +        /* filter p1 */
 146.549 +        "movq     (%2),   %%mm3    \n\t" //p2
 146.550 +        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
 146.551 +        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
 146.552 +        "pand     %1,     %%mm7    \n\t" // mask & tc0
 146.553 +        "movq     %%mm7,  %%mm4    \n\t"
 146.554 +        "psubb    %%mm6,  %%mm7    \n\t" // subtracting 0xFF adds 1: tc is incremented where p1 is filtered
 146.555 +        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
 146.556 +        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%2)", "(%2,%4)", %%mm6, %%mm4) // stores the new p1 row
 146.557 +
 146.558 +        /* filter q1 */
 146.559 +        "movq    (%3,%4,2), %%mm4  \n\t" //q2
 146.560 +        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
 146.561 +        "pand     %0,     %%mm6    \n\t" // mask & |q2-q0|<beta
 146.562 +        "movq     %1,     %%mm5    \n\t" // can be merged with the and below but is slower then
 146.563 +        "pand     %%mm6,  %%mm5    \n\t"
 146.564 +        "psubb    %%mm6,  %%mm7    \n\t" // tc is incremented again where q1 is filtered
 146.565 +        "movq    (%3,%4), %%mm3    \n\t" // reload q1, mm3 was used as scratch above
 146.566 +        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%3,%4,2)", "(%3,%4)", %%mm5, %%mm6) // stores the new q1 row
 146.567 +
 146.568 +        /* filter p0, q0 */
 146.569 +        H264_DEBLOCK_P0_Q0(%9, unused)
 146.570 +        "movq      %%mm1, (%2,%4,2) \n\t"
 146.571 +        "movq      %%mm2, (%3)      \n\t"
 146.572 +
 146.573 +        : "=m"(tmp0[0]), "=m"(tmp0[1])
 146.574 +        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
 146.575 +          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
 146.576 +          "m"(ff_bone) : "memory" // the pixel rows are loaded and stored through %2/%3, which are plain register operands, so the compiler must be told memory is accessed
 146.577 +    );
 146.578 +}
 146.579 +
 146.580 +static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 146.581 +{
 146.582 +    int half; // the 16-pixel edge is handled as two 8-pixel halves, each using two tc0 entries
 146.583 +    for(half=0; half<2; half++)
 146.584 +        if(!((tc0[2*half] & tc0[2*half+1]) < 0)) // a half is skipped only when both of its tc0 values are negative
 146.585 +            h264_loop_filter_luma_mmx2(pix+8*half, stride, alpha-1, beta-1, tc0+2*half);
 146.586 +}
 146.587 +static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 146.588 +{
 146.589 +    //FIXME: could cut some load/stores by merging transpose with filter; also, it only needs to transpose 6x8
 146.590 +    DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; // 8x8 scratch tile (row pitch 8) that receives the transposed pixels around the edge
 146.591 +    int half;
 146.592 +    for(half=0; half<2; half++) { // two groups of 8 picture rows, each with its own pair of tc0 entries
 146.593 +        uint8_t *const rows = pix + half*8*stride;
 146.594 +        int8_t  *const tc   = tc0 + half*2;
 146.595 +        if((tc[0] & tc[1]) < 0) continue; // both tc0 values negative: this group is left untouched
 146.596 +        transpose4x4(trans,       rows-4,          8, stride);
 146.597 +        transpose4x4(trans  +4*8, rows,            8, stride);
 146.598 +        transpose4x4(trans+4,     rows-4+4*stride, 8, stride);
 146.599 +        transpose4x4(trans+4+4*8, rows  +4*stride, 8, stride);
 146.600 +        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc); // filter the tile as if the edge were horizontal
 146.601 +        transpose4x4(rows-2,          trans  +2*8, stride, 8); // copy back only tile rows 2..5
 146.602 +        transpose4x4(rows-2+4*stride, trans+4+2*8, stride, 8);
 146.603 +    }
 146.604 +}
 146.605 +
 146.606 +static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) // tc0-clipped chroma deblocking of 8 pixels across the edge between rows pix-stride (p0) and pix (q0); alpha1/beta1 are alpha-1/beta-1; only p0 and q0 are rewritten
 146.607 +{
 146.608 +    __asm__ volatile(
 146.609 +        "movq    (%0),    %%mm0     \n\t" //p1
 146.610 +        "movq    (%0,%2), %%mm1     \n\t" //p0
 146.611 +        "movq    (%1),    %%mm2     \n\t" //q0
 146.612 +        "movq    (%1,%2), %%mm3     \n\t" //q1
 146.613 +        H264_DEBLOCK_MASK(%4, %5)
 146.614 +        "movd      %3,    %%mm6     \n\t" // four tc0 bytes, read as one 32-bit value
 146.615 +        "punpcklbw %%mm6, %%mm6     \n\t" // each tc0 entry duplicated for two adjacent pixels
 146.616 +        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
 146.617 +        H264_DEBLOCK_P0_Q0(%6, %7)
 146.618 +        "movq      %%mm1, (%0,%2)   \n\t"
 146.619 +        "movq      %%mm2, (%1)      \n\t"
 146.620 +
 146.621 +        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
 146.622 +           "r"(*(uint32_t*)tc0),
 146.623 +           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F) : "memory" // the pixel rows are loaded and stored through %0/%1, which are plain register operands, so the compiler must be told memory is accessed
 146.624 +    );
 146.625 +}
 146.626 +
 146.627 +static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) // chroma filter entry point that works on picture rows directly
 146.628 +{
 146.629 +    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); // the worker expects alpha-1 and beta-1
 146.630 +}
 146.631 +
 146.632 +static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 146.633 +{   //FIXME: could cut some load/stores by merging transpose with filter
 146.634 +    DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; // 4 rows of 8 bytes: the transposed 4 columns around the edge for 8 picture rows
 146.635 +    uint8_t *const upper = pix-2, *const lower = upper+4*stride; // top-left corners of the two 4x4 tiles
 146.636 +    transpose4x4(trans,   upper, 8, stride);
 146.637 +    transpose4x4(trans+4, lower, 8, stride);
 146.638 +    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); // filter the scratch copy as if the edge were horizontal
 146.639 +    transpose4x4(upper, trans,   stride, 8);
 146.640 +    transpose4x4(lower, trans+4, stride, 8);
 146.641 +}
 146.642 +
 146.643 +// p0 = (p0 + q1 + 2*p1 + 2) >> 2   (the result replaces register p0; clobbers mm4; 'one' is an operand with 1 in every byte)
 146.644 +#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
 146.645 +    "movq    "#p0", %%mm4  \n\t"\
 146.646 +    "pxor    "#q1", %%mm4  \n\t"\
 146.647 +    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
 146.648 +    "pavgb   "#q1", "#p0"  \n\t" /* rounds up ... */\
 146.649 +    "psubusb %%mm4, "#p0"  \n\t" /* ... so remove the rounding bit to get (p0+q1)>>1 */\
 146.650 +    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
 146.651 +
 146.652 +static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) // chroma deblocking without tc0 clipping for 8 pixels across the edge between rows pix-stride (p0) and pix (q0); alpha1/beta1 are alpha-1/beta-1
 146.653 +{
 146.654 +    __asm__ volatile(
 146.655 +        "movq    (%0),    %%mm0     \n\t" // p1
 146.656 +        "movq    (%0,%2), %%mm1     \n\t" // p0
 146.657 +        "movq    (%1),    %%mm2     \n\t" // q0
 146.658 +        "movq    (%1,%2), %%mm3     \n\t" // q1
 146.659 +        H264_DEBLOCK_MASK(%3, %4)
 146.660 +        "movq    %%mm1,   %%mm5     \n\t" // keep the unfiltered p0/q0 for the masked blend below
 146.661 +        "movq    %%mm2,   %%mm6     \n\t"
 146.662 +        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
 146.663 +        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
 146.664 +        "psubb   %%mm5,   %%mm1     \n\t" // difference filtered - original
 146.665 +        "psubb   %%mm6,   %%mm2     \n\t"
 146.666 +        "pand    %%mm7,   %%mm1     \n\t" // keep the difference only where the mask is set
 146.667 +        "pand    %%mm7,   %%mm2     \n\t"
 146.668 +        "paddb   %%mm5,   %%mm1     \n\t" // original + masked difference
 146.669 +        "paddb   %%mm6,   %%mm2     \n\t"
 146.670 +        "movq    %%mm1,   (%0,%2)   \n\t"
 146.671 +        "movq    %%mm2,   (%1)      \n\t"
 146.672 +        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
 146.673 +           "m"(alpha1), "m"(beta1), "m"(ff_bone) : "memory" // the pixel rows are loaded and stored through %0/%1, which are plain register operands, so the compiler must be told memory is accessed
 146.674 +    );
 146.675 +}
 146.676 +
 146.677 +static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) // intra chroma filter entry point that works on picture rows directly
 146.678 +{
 146.679 +    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); // the worker expects alpha-1 and beta-1
 146.680 +}
 146.681 +
 146.682 +static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
 146.683 +{   //FIXME: could cut some load/stores by merging transpose with filter
 146.684 +    DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; // 4 rows of 8 bytes: the transposed 4 columns around the edge for 8 picture rows
 146.685 +    uint8_t *const upper = pix-2, *const lower = upper+4*stride; // top-left corners of the two 4x4 tiles
 146.686 +    transpose4x4(trans,   upper, 8, stride);
 146.687 +    transpose4x4(trans+4, lower, 8, stride);
 146.688 +    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); // filter the scratch copy as if the edge were horizontal
 146.689 +    transpose4x4(upper, trans,   stride, 8);
 146.690 +    transpose4x4(lower, trans+4, stride, 8);
 146.691 +}
 146.692 +
 146.693 +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], // writes bS[dir][edge][0..3]; each value is 2 if either block has a nonzero nnz entry, else 1 if the ref/mv test fires, else 0
 146.694 +                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { // edges/step only apply to the first pass (dir=1); the dir=0 pass always uses 4 edges with step 1
 146.695 +    int dir;
 146.696 +    __asm__ volatile( // register setup that every later asm statement in this function relies on
 146.697 +        "movq %0, %%mm7 \n" // mm7 = ff_pb_1
 146.698 +        "movq %1, %%mm6 \n" // mm6 = ff_pb_3
 146.699 +        ::"m"(ff_pb_1), "m"(ff_pb_3)
 146.700 +    );
 146.701 +    if(field)
 146.702 +        __asm__ volatile(
 146.703 +            "movq %0, %%mm6 \n" // field case: mm6 = ff_pb_3_1 instead
 146.704 +            ::"m"(ff_pb_3_1)
 146.705 +        );
 146.706 +    __asm__ volatile(
 146.707 +        "movq  %%mm6, %%mm5 \n"
 146.708 +        "paddb %%mm5, %%mm5 \n" // mm5 = 2*mm6, used together with mm6 for the mv range test below
 146.709 +    :);
 146.710 +
 146.711 +    // could do a special case for dir==0 && edges==1, but it only reduces the
 146.712 +    // average filter time by 1.2%
 146.713 +    for( dir=1; dir>=0; dir-- ) {
 146.714 +        const x86_reg d_idx = dir ? -8 : -1; // offset from block b to its neighbour bn inside the 8-wide arrays
 146.715 +        const int mask_mv = dir ? mask_mv1 : mask_mv0;
 146.716 +        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
 146.717 +        int b_idx, edge;
 146.718 +        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
 146.719 +            __asm__ volatile(
 146.720 +                "pand %0, %%mm0 \n\t" // dir=1: clears mm0; dir=0: leaves mm0 as it was
 146.721 +                ::"m"(mask_dir)
 146.722 +            );
 146.723 +            if(!(mask_mv & edge)) {
 146.724 +                if(bidir) {
 146.725 +                    __asm__ volatile( // NOTE(review): %0 is declared input-only but is modified (add 40 twice, then sub 80); it ends with its original value
 146.726 +                        "movd         (%1,%0), %%mm2 \n"
 146.727 +                        "punpckldq  40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] }
 146.728 +                        "pshufw $0x44,   (%1), %%mm0 \n" // { ref0[b], ref0[b] }
 146.729 +                        "pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] }
 146.730 +                        "pshufw $0x4E, %%mm2, %%mm3 \n"
 146.731 +                        "psubb         %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
 146.732 +                        "psubb         %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
 146.733 +                        "1: \n" // loop body runs twice: second time with the neighbour offset advanced by 40 entries
 146.734 +                        "por           %%mm1, %%mm0 \n"
 146.735 +                        "movq      (%2,%0,4), %%mm1 \n"
 146.736 +                        "movq     8(%2,%0,4), %%mm2 \n"
 146.737 +                        "movq          %%mm1, %%mm3 \n"
 146.738 +                        "movq          %%mm2, %%mm4 \n"
 146.739 +                        "psubw          (%2), %%mm1 \n"
 146.740 +                        "psubw         8(%2), %%mm2 \n"
 146.741 +                        "psubw       160(%2), %%mm3 \n" // 160 bytes = 40 entries of int16_t[2], i.e. the second mv list
 146.742 +                        "psubw       168(%2), %%mm4 \n"
 146.743 +                        "packsswb      %%mm2, %%mm1 \n"
 146.744 +                        "packsswb      %%mm4, %%mm3 \n"
 146.745 +                        "paddb         %%mm6, %%mm1 \n"
 146.746 +                        "paddb         %%mm6, %%mm3 \n"
 146.747 +                        "psubusb       %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
 146.748 +                        "psubusb       %%mm5, %%mm3 \n"
 146.749 +                        "packsswb      %%mm3, %%mm1 \n"
 146.750 +                        "add $40, %0 \n"
 146.751 +                        "cmp $40, %0 \n"
 146.752 +                        "jl 1b \n"
 146.753 +                        "sub $80, %0 \n" // undo the two increments
 146.754 +                        "pshufw $0x4E, %%mm1, %%mm1 \n"
 146.755 +                        "por           %%mm1, %%mm0 \n"
 146.756 +                        "pshufw $0x4E, %%mm0, %%mm1 \n"
 146.757 +                        "pminub        %%mm1, %%mm0 \n" // keep the smaller of the two halves of mm0
 146.758 +                        ::"r"(d_idx),
 146.759 +                          "r"(ref[0]+b_idx),
 146.760 +                          "r"(mv[0]+b_idx)
 146.761 +                    );
 146.762 +                } else {
 146.763 +                    __asm__ volatile(
 146.764 +                        "movd        (%1), %%mm0 \n"
 146.765 +                        "psubb    (%1,%0), %%mm0 \n" // ref[b] != ref[bn]
 146.766 +                        "movq        (%2), %%mm1 \n"
 146.767 +                        "movq       8(%2), %%mm2 \n"
 146.768 +                        "psubw  (%2,%0,4), %%mm1 \n"
 146.769 +                        "psubw 8(%2,%0,4), %%mm2 \n"
 146.770 +                        "packsswb   %%mm2, %%mm1 \n"
 146.771 +                        "paddb      %%mm6, %%mm1 \n"
 146.772 +                        "psubusb    %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
 146.773 +                        "packsswb   %%mm1, %%mm1 \n"
 146.774 +                        "por        %%mm1, %%mm0 \n"
 146.775 +                        ::"r"(d_idx),
 146.776 +                          "r"(ref[0]+b_idx),
 146.777 +                          "r"(mv[0]+b_idx)
 146.778 +                    );
 146.779 +                }
 146.780 +            }
 146.781 +            __asm__ volatile(
 146.782 +                "movd %0, %%mm1 \n"
 146.783 +                "por  %1, %%mm1 \n" // nnz[b] || nnz[bn]
 146.784 +                ::"m"(nnz[b_idx]),
 146.785 +                  "m"(nnz[b_idx+d_idx])
 146.786 +            );
 146.787 +            __asm__ volatile(
 146.788 +                "pminub    %%mm7, %%mm1 \n" // clamp both indicators to 0/1
 146.789 +                "pminub    %%mm7, %%mm0 \n"
 146.790 +                "psllw        $1, %%mm1 \n" // nnz indicator becomes 0/2
 146.791 +                "pxor      %%mm2, %%mm2 \n"
 146.792 +                "pmaxub    %%mm0, %%mm1 \n" // strength = max(2*nnz indicator, ref/mv indicator)
 146.793 +                "punpcklbw %%mm2, %%mm1 \n" // widen the four bytes to int16_t
 146.794 +                "movq      %%mm1, %0    \n"
 146.795 +                :"=m"(*bS[dir][edge])
 146.796 +                ::"memory"
 146.797 +            );
 146.798 +        }
 146.799 +        edges = 4; // the remaining pass (dir=0) always covers all four edges
 146.800 +        step = 1;
 146.801 +    }
 146.802 +    __asm__ volatile( // load the 4x4 int16_t matrix bS[0], run TRANSPOSE4 on it and store it back in place
 146.803 +        "movq   (%0), %%mm0 \n\t"
 146.804 +        "movq  8(%0), %%mm1 \n\t"
 146.805 +        "movq 16(%0), %%mm2 \n\t"
 146.806 +        "movq 24(%0), %%mm3 \n\t"
 146.807 +        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
 146.808 +        "movq %%mm0,   (%0) \n\t"
 146.809 +        "movq %%mm3,  8(%0) \n\t"
 146.810 +        "movq %%mm4, 16(%0) \n\t"
 146.811 +        "movq %%mm2, 24(%0) \n\t"
 146.812 +        ::"r"(bS[0])
 146.813 +        :"memory"
 146.814 +    );
 146.815 +}
 146.816 +
 146.817 +/***********************************/
 146.818 +/* motion compensation */
 146.819 +
 146.820 +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q) /* one output row of a vertical 6-tap filter: T = (A + F + %5 + %4*(4*(C+D) - B - E)) >> 5, packed to bytes and handed to OP for (%1); F is loaded from (%0) here, A..E must already hold rows as 16-bit words, Z is expected to hold zero; A is modified. Presumably %4 and %5 are the constants 5 and 16 - confirm at the use sites */\
 146.821 +        "mov"#q" "#C", "#T"         \n\t"\
 146.822 +        "mov"#d" (%0), "#F"         \n\t" /* fetch the next source row */\
 146.823 +        "paddw "#D", "#T"           \n\t"\
 146.824 +        "psllw $2, "#T"             \n\t" /* T = 4*(C+D) */\
 146.825 +        "psubw "#B", "#T"           \n\t"\
 146.826 +        "psubw "#E", "#T"           \n\t" /* T = 4*(C+D) - B - E */\
 146.827 +        "punpcklbw "#Z", "#F"       \n\t" /* widen the new row to 16-bit words */\
 146.828 +        "pmullw %4, "#T"            \n\t"\
 146.829 +        "paddw %5, "#A"             \n\t"\
 146.830 +        "add %2, %0                 \n\t" /* advance the source pointer by operand %2 */\
 146.831 +        "paddw "#F", "#A"           \n\t"\
 146.832 +        "paddw "#A", "#T"           \n\t"\
 146.833 +        "psraw $5, "#T"             \n\t"\
 146.834 +        "packuswb "#T", "#T"        \n\t" /* saturate to unsigned bytes */\
 146.835 +        OP(T, (%1), A, d)\
 146.836 +        "add %3, %1                 \n\t" /* advance the destination pointer by operand %3 */
 146.837 +
 146.838 +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q) /* vertical 6-tap step that keeps full precision: T = A + F + %4 + %3*(4*(C+D) - B - E) is stored as 16-bit words at offset OF from (%1), with no shift or packing; F is loaded from (%0) here, Z is expected to hold zero; A is modified */\
 146.839 +        "mov"#q" "#C", "#T"         \n\t"\
 146.840 +        "mov"#d" (%0), "#F"         \n\t" /* fetch the next source row */\
 146.841 +        "paddw "#D", "#T"           \n\t"\
 146.842 +        "psllw $2, "#T"             \n\t" /* T = 4*(C+D) */\
 146.843 +        "paddw %4, "#A"             \n\t"\
 146.844 +        "psubw "#B", "#T"           \n\t"\
 146.845 +        "psubw "#E", "#T"           \n\t" /* T = 4*(C+D) - B - E */\
 146.846 +        "punpcklbw "#Z", "#F"       \n\t" /* widen the new row to 16-bit words */\
 146.847 +        "pmullw %3, "#T"            \n\t"\
 146.848 +        "paddw "#F", "#A"           \n\t"\
 146.849 +        "add %2, %0                 \n\t" /* advance the source pointer by operand %2 */\
 146.850 +        "paddw "#A", "#T"           \n\t"\
 146.851 +        "mov"#q" "#T", "#OF"(%1)    \n\t" /* store the unrounded 16-bit intermediate */
 146.852 +
 146.853 +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) /* MMX form: T=mm6, Z=mm7, row load with movd, register copy with movq */
 146.854 +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) /* MMX form: T=mm6, Z=mm7, row load with movd, copy/store with movq */
 146.855 +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) /* SSE2 form: T=xmm6, Z=xmm7, row load with movq, register copy with movdqa */
 146.856 +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) /* SSE2 form: T=xmm6, Z=xmm7, row load with movq, copy/store with movdqa (the store needs a 16-byte aligned address) */
 146.857 +
 146.858 +
 146.859 +#define QPEL_H264(OPNAME, OP, MMX)/* Generates the mm-register qpel helpers below: OPNAME is pasted in front of and MMX behind every function name; OP (defined outside this chunk) is expanded inside the asm with a result register and a dst address */\
 146.860 +/* 8x8 block: 6-tap horizontal filter over src (taps at x-2 .. x+3), shifted right by 5, combined with src2 through PAVGB and handed to OP for dst. src and dst both advance by dstStride per row, src2 by src2Stride. */\
 146.861 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
 146.862 +    int h=8;/* row counter: 8 rows */\
 146.863 +    __asm__ volatile(/* NOTE(review): mm6/mm7 are set here and read by the separate asm statement in the loop below; nothing in the constraints tells the compiler about that */\
 146.864 +        "pxor %%mm7, %%mm7          \n\t"/* mm7 = 0, used to widen bytes to 16-bit words */\
 146.865 +        "movq %0, %%mm6             \n\t"/* mm6 = ff_pw_5 */\
 146.866 +        :: "m"(ff_pw_5)\
 146.867 +    );\
 146.868 +    do{\
 146.869 +    __asm__ volatile(/* operands: %0 src, %1 dst, %2 src2, %3 src2Stride, %4 dstStride, %5 ff_pw_16 */\
 146.870 +        "movq    (%0), %%mm0        \n\t"/* mm0 = src[0..7] */\
 146.871 +        "movq   1(%0), %%mm2        \n\t"/* mm2 = src[1..8] */\
 146.872 +        "movq %%mm0, %%mm1          \n\t"\
 146.873 +        "movq %%mm2, %%mm3          \n\t"\
 146.874 +        "punpcklbw %%mm7, %%mm0     \n\t"/* low four bytes -> words */\
 146.875 +        "punpckhbw %%mm7, %%mm1     \n\t"/* high four bytes -> words */\
 146.876 +        "punpcklbw %%mm7, %%mm2     \n\t"\
 146.877 +        "punpckhbw %%mm7, %%mm3     \n\t"\
 146.878 +        "paddw %%mm2, %%mm0         \n\t"/* mm0 = src[x] + src[x+1], x = 0..3 */\
 146.879 +        "paddw %%mm3, %%mm1         \n\t"/* mm1 = src[x] + src[x+1], x = 4..7 */\
 146.880 +        "psllw $2, %%mm0            \n\t"/* times 4 */\
 146.881 +        "psllw $2, %%mm1            \n\t"\
 146.882 +        "movq   -1(%0), %%mm2       \n\t"/* mm2 = src[-1..6] */\
 146.883 +        "movq    2(%0), %%mm4       \n\t"/* mm4 = src[2..9] */\
 146.884 +        "movq %%mm2, %%mm3          \n\t"\
 146.885 +        "movq %%mm4, %%mm5          \n\t"\
 146.886 +        "punpcklbw %%mm7, %%mm2     \n\t"\
 146.887 +        "punpckhbw %%mm7, %%mm3     \n\t"/* mm3 = src[3..6] as words, reused further down */\
 146.888 +        "punpcklbw %%mm7, %%mm4     \n\t"/* mm4 = src[2..5] as words, reused further down */\
 146.889 +        "punpckhbw %%mm7, %%mm5     \n\t"\
 146.890 +        "paddw %%mm4, %%mm2         \n\t"/* mm2 = src[x-1] + src[x+2], x = 0..3 */\
 146.891 +        "paddw %%mm3, %%mm5         \n\t"/* mm5 = src[x-1] + src[x+2], x = 4..7 */\
 146.892 +        "psubw %%mm2, %%mm0         \n\t"/* 4*(centre pair) - (inner pair) */\
 146.893 +        "psubw %%mm5, %%mm1         \n\t"\
 146.894 +        "pmullw %%mm6, %%mm0        \n\t"/* multiplied by mm6 (ff_pw_5, presumably words of 5): 20*centre - 5*inner */\
 146.895 +        "pmullw %%mm6, %%mm1        \n\t"\
 146.896 +        "movd   -2(%0), %%mm2       \n\t"/* mm2 = src[-2..1] */\
 146.897 +        "movd    7(%0), %%mm5       \n\t"/* mm5 = src[7..10] */\
 146.898 +        "punpcklbw %%mm7, %%mm2     \n\t"\
 146.899 +        "punpcklbw %%mm7, %%mm5     \n\t"\
 146.900 +        "paddw %%mm3, %%mm2         \n\t"/* mm2 = src[x-2] + src[x+3], x = 0..3 */\
 146.901 +        "paddw %%mm5, %%mm4         \n\t"/* mm4 = src[x-2] + src[x+3], x = 4..7 */\
 146.902 +        "movq %5, %%mm5             \n\t"/* mm5 = ff_pw_16, added before the >>5 (presumably the rounding term) */\
 146.903 +        "paddw %%mm5, %%mm2         \n\t"\
 146.904 +        "paddw %%mm5, %%mm4         \n\t"\
 146.905 +        "paddw %%mm2, %%mm0         \n\t"/* add the outer pair to the scaled sum */\
 146.906 +        "paddw %%mm4, %%mm1         \n\t"\
 146.907 +        "psraw $5, %%mm0            \n\t"/* arithmetic shift right by 5 */\
 146.908 +        "psraw $5, %%mm1            \n\t"\
 146.909 +        "movq (%2), %%mm4           \n\t"/* mm4 = 8 bytes of src2 */\
 146.910 +        "packuswb %%mm1, %%mm0      \n\t"/* pack both halves to 8 unsigned-saturated bytes */\
 146.911 +        PAVGB" %%mm4, %%mm0         \n\t"/* PAVGB is a macro defined elsewhere; combines the src2 bytes into mm0 (byte average, going by its name) */\
 146.912 +        OP(%%mm0, (%1),%%mm5, q)/* OP handles the write of mm0 to dst; mm5 is presumably its scratch register - TODO confirm against the OP definitions */\
 146.913 +        "add %4, %0                 \n\t"/* src += dstStride (there is no separate source stride parameter) */\
 146.914 +        "add %4, %1                 \n\t"/* dst += dstStride */\
 146.915 +        "add %3, %2                 \n\t"/* src2 += src2Stride */\
 146.916 +        : "+a"(src), "+c"(dst), "+d"(src2)\
 146.917 +        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
 146.918 +          "m"(ff_pw_16)\
 146.919 +        : "memory"\
 146.920 +    );\
 146.921 +    }while(--h);\
 146.922 +}\
 146.923 +/* Second (horizontal) pass of the hv filter: reads 16-bit rows from tmp (48 bytes per row), applies the 6-tap combination, packs to bytes and hands them to OP for dst. Works in strips of 8 columns. tmpStride is not used in this body. */\
 146.924 +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
 146.925 +    int w = size>>4;/* 0 for size 8, 1 for size 16; the do/while below therefore runs w+1 strips */\
 146.926 +    do{\
 146.927 +    int h = size;/* rows per strip */\
 146.928 +    __asm__ volatile(/* operands: %0 tmp, %1 dst, %2 h, %3 dstStride; t[] below denotes the int16 row at %0 */\
 146.929 +        "1:                         \n\t"\
 146.930 +        "movq     (%0), %%mm0       \n\t"/* mm0 = t[0..3] */\
 146.931 +        "movq    8(%0), %%mm3       \n\t"/* mm3 = t[4..7] */\
 146.932 +        "movq    2(%0), %%mm1       \n\t"/* mm1 = t[1..4] */\
 146.933 +        "movq   10(%0), %%mm4       \n\t"/* mm4 = t[5..8] */\
 146.934 +        "paddw   %%mm4, %%mm0       \n\t"/* mm0 = t[x] + t[x+5], x = 0..3 (outer pair) */\
 146.935 +        "paddw   %%mm3, %%mm1       \n\t"/* mm1 = t[x+1] + t[x+4], x = 0..3 (inner pair) */\
 146.936 +        "paddw  18(%0), %%mm3       \n\t"/* mm3 = t[x] + t[x+5], x = 4..7 */\
 146.937 +        "paddw  16(%0), %%mm4       \n\t"/* mm4 = t[x+1] + t[x+4], x = 4..7 */\
 146.938 +        "movq    4(%0), %%mm2       \n\t"\
 146.939 +        "movq   12(%0), %%mm5       \n\t"\
 146.940 +        "paddw   6(%0), %%mm2       \n\t"/* mm2 = t[x+2] + t[x+3], x = 0..3 (centre pair) */\
 146.941 +        "paddw  14(%0), %%mm5       \n\t"/* mm5 = t[x+2] + t[x+3], x = 4..7 */\
 146.942 +        "psubw %%mm1, %%mm0         \n\t"/* outer - inner */\
 146.943 +        "psubw %%mm4, %%mm3         \n\t"\
 146.944 +        "psraw $2, %%mm0            \n\t"/* >> 2 */\
 146.945 +        "psraw $2, %%mm3            \n\t"\
 146.946 +        "psubw %%mm1, %%mm0         \n\t"/* - inner again */\
 146.947 +        "psubw %%mm4, %%mm3         \n\t"\
 146.948 +        "paddsw %%mm2, %%mm0        \n\t"/* + centre (saturating add) */\
 146.949 +        "paddsw %%mm5, %%mm3        \n\t"\
 146.950 +        "psraw $2, %%mm0            \n\t"/* >> 2 */\
 146.951 +        "psraw $2, %%mm3            \n\t"\
 146.952 +        "paddw %%mm2, %%mm0         \n\t"/* + centre */\
 146.953 +        "paddw %%mm5, %%mm3         \n\t"\
 146.954 +        "psraw $6, %%mm0            \n\t"/* >> 6: overall ((((outer-inner)>>2) - inner + centre) >> 2) + centre) >> 6 */\
 146.955 +        "psraw $6, %%mm3            \n\t"\
 146.956 +        "packuswb %%mm3, %%mm0      \n\t"/* 8 unsigned-saturated bytes */\
 146.957 +        OP(%%mm0, (%1),%%mm7, q)/* OP handles the write of mm0 to dst */\
 146.958 +        "add $48, %0                \n\t"/* next tmp row: 48 bytes further */\
 146.959 +        "add %3, %1                 \n\t"/* dst += dstStride */\
 146.960 +        "decl %2                    \n\t"\
 146.961 +        " jnz 1b                    \n\t"/* loop until h reaches 0 */\
 146.962 +        : "+a"(tmp), "+c"(dst), "+g"(h)\
 146.963 +        : "S"((x86_reg)dstStride)\
 146.964 +        : "memory"\
 146.965 +    );\
 146.966 +    tmp += 8 - size*24;/* rewind size rows (24 int16 each) and step 8 columns right */\
 146.967 +    dst += 8 - size*dstStride;/* same for dst: back to the top row, 8 bytes right */\
 146.968 +    }while(w--);\
 146.969 +}\
 146.970 +/* 16x16 version of h_lowpass_l2, built from four calls of the 8x8 function: left and right halves of the top 8 rows, then of the bottom 8 rows. */\
 146.971 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
 146.972 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
 146.973 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
 146.974 +    src += 8*dstStride;/* src is stepped with dstStride, matching the 8x8 function which also advances src by dstStride */\
 146.975 +    dst += 8*dstStride;\
 146.976 +    src2 += 8*src2Stride;\
 146.977 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
 146.978 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
 146.979 +}\
 146.980 +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)/* 8 columns, two rows per iteration: src16 words are shifted right by 5, packed to bytes, combined with src8 through PAVGB and handed to OP for dst */\
 146.981 +{\
 146.982 +    do{\
 146.983 +    __asm__ volatile(/* operands: %0 src8, %1 src16, %2 dst, %3 src8Stride, %4 dstStride */\
 146.984 +        "movq      (%1), %%mm0          \n\t"/* row 0, words 0..3 */\
 146.985 +        "movq     8(%1), %%mm1          \n\t"/* row 0, words 4..7 */\
 146.986 +        "movq    48(%1), %%mm2          \n\t"/* row 1 starts 48 bytes after row 0 */\
 146.987 +        "movq  8+48(%1), %%mm3          \n\t"\
 146.988 +        "psraw      $5,  %%mm0          \n\t"/* arithmetic >> 5 on all four registers */\
 146.989 +        "psraw      $5,  %%mm1          \n\t"\
 146.990 +        "psraw      $5,  %%mm2          \n\t"\
 146.991 +        "psraw      $5,  %%mm3          \n\t"\
 146.992 +        "packuswb %%mm1, %%mm0          \n\t"/* row 0 as 8 unsigned-saturated bytes */\
 146.993 +        "packuswb %%mm3, %%mm2          \n\t"/* row 1 as 8 unsigned-saturated bytes */\
 146.994 +        PAVGB"     (%0), %%mm0          \n\t"/* combine with src8 row 0 (PAVGB macro is defined elsewhere) */\
 146.995 +        PAVGB"  (%0,%3), %%mm2          \n\t"/* combine with src8 row 1, at src8 + src8Stride */\
 146.996 +        OP(%%mm0, (%2), %%mm5, q)/* row 0 -> dst */\
 146.997 +        OP(%%mm2, (%2,%4), %%mm5, q)/* row 1 -> dst + dstStride */\
 146.998 +        ::"a"(src8), "c"(src16), "d"(dst),\
 146.999 +          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
146.1000 +        :"memory");\
146.1001 +        src8 += 2L*src8Stride;/* pointers are advanced in C because the asm has input operands only */\
146.1002 +        src16 += 48;/* 48 int16 = two rows of 48 bytes */\
146.1003 +        dst += 2L*dstStride;\
146.1004 +    }while(h-=2);/* stops when h hits exactly 0, so h has to be a positive even number */\
146.1005 +}\
146.1006 +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)/* 16-column version: runs the 8-column function on the left half, then on the right half */\
146.1007 +{\
146.1008 +    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
146.1009 +    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);/* +8 elements on every pointer: 8 bytes for dst/src8, 8 int16 for src16 */\
146.1010 +}\
146.1011 +
146.1012 +
146.1013 +#if ARCH_X86_64
146.1014 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)/* x86_64 build: one asm loop filters all 16 columns of a row, using xmm8..xmm15 which are only available in 64-bit mode */\
146.1015 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
146.1016 +    int h=16;/* 16 rows */\
146.1017 +    __asm__ volatile(/* operands: %0 src, %1 dst, %2 src2, %3 h, %4 src2Stride, %5 dstStride, %6 ff_pw_5, %7 ff_pw_16. NOTE(review): no xmm register is listed as clobbered */\
146.1018 +        "pxor %%xmm15, %%xmm15      \n\t"/* xmm15 = 0 for byte -> word widening */\
146.1019 +        "movdqa %6, %%xmm14         \n\t"/* xmm14 = ff_pw_5 */\
146.1020 +        "movdqa %7, %%xmm13         \n\t"/* xmm13 = ff_pw_16 */\
146.1021 +        "1:                         \n\t"\
146.1022 +        "lddqu    6(%0), %%xmm1     \n\t"/* xmm1 = src[6..21] (unaligned load) */\
146.1023 +        "lddqu   -2(%0), %%xmm7     \n\t"/* xmm7 = src[-2..13] */\
146.1024 +        "movdqa  %%xmm1, %%xmm0     \n\t"\
146.1025 +        "punpckhbw %%xmm15, %%xmm1  \n\t"/* xmm1 = words src[14..21] */\
146.1026 +        "punpcklbw %%xmm15, %%xmm0  \n\t"/* xmm0 = words src[6..13] */\
146.1027 +        "punpcklbw %%xmm15, %%xmm7  \n\t"/* xmm7 = words src[-2..5] */\
146.1028 +        "movdqa  %%xmm1, %%xmm2     \n\t"/* copies that the palignr steps below turn into shifted windows */\
146.1029 +        "movdqa  %%xmm0, %%xmm6     \n\t"\
146.1030 +        "movdqa  %%xmm1, %%xmm3     \n\t"\
146.1031 +        "movdqa  %%xmm0, %%xmm8     \n\t"\
146.1032 +        "movdqa  %%xmm1, %%xmm4     \n\t"\
146.1033 +        "movdqa  %%xmm0, %%xmm9     \n\t"\
146.1034 +        "movdqa  %%xmm0, %%xmm12    \n\t"\
146.1035 +        "movdqa  %%xmm1, %%xmm11    \n\t"\
146.1036 +        "palignr $10,%%xmm0, %%xmm11\n\t"/* xmm11 = src[11..18] */\
146.1037 +        "palignr $10,%%xmm7, %%xmm12\n\t"/* xmm12 = src[3..10] */\
146.1038 +        "palignr $2, %%xmm0, %%xmm4 \n\t"/* xmm4 = src[7..14] */\
146.1039 +        "palignr $2, %%xmm7, %%xmm9 \n\t"/* xmm9 = src[-1..6] */\
146.1040 +        "palignr $4, %%xmm0, %%xmm3 \n\t"/* xmm3 = src[8..15] */\
146.1041 +        "palignr $4, %%xmm7, %%xmm8 \n\t"/* xmm8 = src[0..7] */\
146.1042 +        "palignr $6, %%xmm0, %%xmm2 \n\t"/* xmm2 = src[9..16] */\
146.1043 +        "palignr $6, %%xmm7, %%xmm6 \n\t"/* xmm6 = src[1..8] */\
146.1044 +        "paddw   %%xmm0 ,%%xmm11    \n\t"/* xmm11 = src[x-2] + src[x+3], x = 8..15 */\
146.1045 +        "palignr $8, %%xmm0, %%xmm1 \n\t"/* xmm1 = src[10..17] */\
146.1046 +        "palignr $8, %%xmm7, %%xmm0 \n\t"/* xmm0 = src[2..9] */\
146.1047 +        "paddw   %%xmm12,%%xmm7     \n\t"/* xmm7 = src[x-2] + src[x+3], x = 0..7 */\
146.1048 +        "paddw   %%xmm3, %%xmm2     \n\t"/* xmm2 = src[x] + src[x+1], x = 8..15 */\
146.1049 +        "paddw   %%xmm8, %%xmm6     \n\t"/* xmm6 = src[x] + src[x+1], x = 0..7 */\
146.1050 +        "paddw   %%xmm4, %%xmm1     \n\t"/* xmm1 = src[x-1] + src[x+2], x = 8..15 */\
146.1051 +        "paddw   %%xmm9, %%xmm0     \n\t"/* xmm0 = src[x-1] + src[x+2], x = 0..7 */\
146.1052 +        "psllw   $2,     %%xmm2     \n\t"/* centre pair times 4 */\
146.1053 +        "psllw   $2,     %%xmm6     \n\t"\
146.1054 +        "psubw   %%xmm1, %%xmm2     \n\t"/* minus inner pair */\
146.1055 +        "psubw   %%xmm0, %%xmm6     \n\t"\
146.1056 +        "paddw   %%xmm13,%%xmm11    \n\t"/* outer pair + ff_pw_16 */\
146.1057 +        "paddw   %%xmm13,%%xmm7     \n\t"\
146.1058 +        "pmullw  %%xmm14,%%xmm2     \n\t"/* times ff_pw_5 */\
146.1059 +        "pmullw  %%xmm14,%%xmm6     \n\t"\
146.1060 +        "lddqu   (%2),   %%xmm3     \n\t"/* xmm3 = 16 bytes of src2 */\
146.1061 +        "paddw   %%xmm11,%%xmm2     \n\t"\
146.1062 +        "paddw   %%xmm7, %%xmm6     \n\t"\
146.1063 +        "psraw   $5,     %%xmm2     \n\t"/* >> 5 */\
146.1064 +        "psraw   $5,     %%xmm6     \n\t"\
146.1065 +        "packuswb %%xmm2,%%xmm6     \n\t"/* xmm6 = 16 result bytes: columns 0..7 low, 8..15 high */\
146.1066 +        "pavgb   %%xmm3, %%xmm6     \n\t"/* byte average with src2 */\
146.1067 +        OP(%%xmm6, (%1), %%xmm4, dqa)/* OP handles the write to dst; the dqa suffix suggests an aligned 16-byte access - TODO confirm against the OP definitions */\
146.1068 +        "add %5, %0                 \n\t"/* src += dstStride */\
146.1069 +        "add %5, %1                 \n\t"/* dst += dstStride */\
146.1070 +        "add %4, %2                 \n\t"/* src2 += src2Stride */\
146.1071 +        "decl %3                    \n\t"\
146.1072 +        "jg 1b                      \n\t"/* repeat while h > 0 */\
146.1073 +        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
146.1074 +        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
146.1075 +          "m"(ff_pw_5), "m"(ff_pw_16)\
146.1076 +        : "memory"\
146.1077 +    );\
146.1078 +}
146.1079 +#else // ARCH_X86_64
146.1080 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)/* build without ARCH_X86_64: the 16x16 l2 function is assembled from four calls of the 8x8 one */\
146.1081 +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
146.1082 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);/* top-left 8x8 */\
146.1083 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);/* top-right 8x8 */\
146.1084 +    src += 8*dstStride;/* src uses dstStride, as in the 8x8 function */\
146.1085 +    dst += 8*dstStride;\
146.1086 +    src2 += 8*src2Stride;\
146.1087 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);/* bottom-left 8x8 */\
146.1088 +    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);/* bottom-right 8x8 */\
146.1089 +}
146.1090 +#endif // ARCH_X86_64
146.1091 +
146.1092 +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)/* Generates the xmm-register horizontal filters; uses palignr, so the target needs SSSE3 */\
146.1093 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){/* 8x8: 6-tap horizontal filter of src, >>5, byte average with src2, result handed to OP for dst */\
146.1094 +    int h=8;\
146.1095 +    __asm__ volatile(/* NOTE(review): xmm6/xmm7 set here are consumed by the separate asm statement inside the loop */\
146.1096 +        "pxor %%xmm7, %%xmm7        \n\t"/* xmm7 = 0 */\
146.1097 +        "movdqa %0, %%xmm6          \n\t"/* xmm6 = ff_pw_5 */\
146.1098 +        :: "m"(ff_pw_5)\
146.1099 +    );\
146.1100 +    do{\
146.1101 +    __asm__ volatile(/* operands: %0 src, %1 dst, %2 src2, %3 src2Stride, %4 dstStride, %5 ff_pw_16 */\
146.1102 +        "lddqu   -2(%0), %%xmm1     \n\t"/* xmm1 = src[-2..13] */\
146.1103 +        "movdqa  %%xmm1, %%xmm0     \n\t"\
146.1104 +        "punpckhbw %%xmm7, %%xmm1   \n\t"/* xmm1 = words src[6..13] */\
146.1105 +        "punpcklbw %%xmm7, %%xmm0   \n\t"/* xmm0 = words src[-2..5] */\
146.1106 +        "movdqa  %%xmm1, %%xmm2     \n\t"\
146.1107 +        "movdqa  %%xmm1, %%xmm3     \n\t"\
146.1108 +        "movdqa  %%xmm1, %%xmm4     \n\t"\
146.1109 +        "movdqa  %%xmm1, %%xmm5     \n\t"\
146.1110 +        "palignr $2, %%xmm0, %%xmm4 \n\t"/* xmm4 = src[-1..6] */\
146.1111 +        "palignr $4, %%xmm0, %%xmm3 \n\t"/* xmm3 = src[0..7] */\
146.1112 +        "palignr $6, %%xmm0, %%xmm2 \n\t"/* xmm2 = src[1..8] */\
146.1113 +        "palignr $8, %%xmm0, %%xmm1 \n\t"/* xmm1 = src[2..9] */\
146.1114 +        "palignr $10,%%xmm0, %%xmm5 \n\t"/* xmm5 = src[3..10] */\
146.1115 +        "paddw   %%xmm5, %%xmm0     \n\t"/* outer pair: src[x-2] + src[x+3] */\
146.1116 +        "paddw   %%xmm3, %%xmm2     \n\t"/* centre pair: src[x] + src[x+1] */\
146.1117 +        "paddw   %%xmm4, %%xmm1     \n\t"/* inner pair: src[x-1] + src[x+2] */\
146.1118 +        "psllw   $2,     %%xmm2     \n\t"/* centre times 4 */\
146.1119 +        "movq    (%2),   %%xmm3     \n\t"/* xmm3 = 8 bytes of src2 */\
146.1120 +        "psubw   %%xmm1, %%xmm2     \n\t"/* minus inner */\
146.1121 +        "paddw   %5,     %%xmm0     \n\t"/* outer + ff_pw_16 */\
146.1122 +        "pmullw  %%xmm6, %%xmm2     \n\t"/* times ff_pw_5 */\
146.1123 +        "paddw   %%xmm0, %%xmm2     \n\t"\
146.1124 +        "psraw   $5,     %%xmm2     \n\t"/* >> 5 */\
146.1125 +        "packuswb %%xmm2, %%xmm2    \n\t"/* 8 unsigned-saturated bytes in the low half */\
146.1126 +        "pavgb   %%xmm3, %%xmm2     \n\t"/* byte average with src2 */\
146.1127 +        OP(%%xmm2, (%1), %%xmm4, q)/* OP handles the 8-byte write to dst */\
146.1128 +        "add %4, %0                 \n\t"/* src += dstStride */\
146.1129 +        "add %4, %1                 \n\t"/* dst += dstStride */\
146.1130 +        "add %3, %2                 \n\t"/* src2 += src2Stride */\
146.1131 +        : "+a"(src), "+c"(dst), "+d"(src2)\
146.1132 +        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
146.1133 +          "m"(ff_pw_16)\
146.1134 +        : "memory"\
146.1135 +    );\
146.1136 +    }while(--h);\
146.1137 +}\
146.1138 +QPEL_H264_H16_XMM(OPNAME, OP, MMX)/* also emit the 16-wide l2 function: single asm loop on x86_64, four 8x8 calls otherwise */\
146.1139 +\
146.1140 +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 8x8: 6-tap horizontal filter of src, >>5, result handed to OP for dst; unlike the _l2 variant there is no second source and src has its own stride */\
146.1141 +    int h=8;\
146.1142 +    __asm__ volatile(/* operands: %0 src, %1 dst, %2 h, %3 srcStride, %4 dstStride, %5 ff_pw_5, %6 ff_pw_16 */\
146.1143 +        "pxor %%xmm7, %%xmm7        \n\t"/* xmm7 = 0 */\
146.1144 +        "movdqa %5, %%xmm6          \n\t"/* xmm6 = ff_pw_5 */\
146.1145 +        "1:                         \n\t"\
146.1146 +        "lddqu   -2(%0), %%xmm1     \n\t"/* xmm1 = src[-2..13] */\
146.1147 +        "movdqa  %%xmm1, %%xmm0     \n\t"\
146.1148 +        "punpckhbw %%xmm7, %%xmm1   \n\t"/* xmm1 = words src[6..13] */\
146.1149 +        "punpcklbw %%xmm7, %%xmm0   \n\t"/* xmm0 = words src[-2..5] */\
146.1150 +        "movdqa  %%xmm1, %%xmm2     \n\t"\
146.1151 +        "movdqa  %%xmm1, %%xmm3     \n\t"\
146.1152 +        "movdqa  %%xmm1, %%xmm4     \n\t"\
146.1153 +        "movdqa  %%xmm1, %%xmm5     \n\t"\
146.1154 +        "palignr $2, %%xmm0, %%xmm4 \n\t"/* xmm4 = src[-1..6] */\
146.1155 +        "palignr $4, %%xmm0, %%xmm3 \n\t"/* xmm3 = src[0..7] */\
146.1156 +        "palignr $6, %%xmm0, %%xmm2 \n\t"/* xmm2 = src[1..8] */\
146.1157 +        "palignr $8, %%xmm0, %%xmm1 \n\t"/* xmm1 = src[2..9] */\
146.1158 +        "palignr $10,%%xmm0, %%xmm5 \n\t"/* xmm5 = src[3..10] */\
146.1159 +        "paddw   %%xmm5, %%xmm0     \n\t"/* outer pair */\
146.1160 +        "paddw   %%xmm3, %%xmm2     \n\t"/* centre pair */\
146.1161 +        "paddw   %%xmm4, %%xmm1     \n\t"/* inner pair */\
146.1162 +        "psllw   $2,     %%xmm2     \n\t"\
146.1163 +        "psubw   %%xmm1, %%xmm2     \n\t"/* 4*centre - inner */\
146.1164 +        "paddw   %6,     %%xmm0     \n\t"/* outer + ff_pw_16 */\
146.1165 +        "pmullw  %%xmm6, %%xmm2     \n\t"/* times ff_pw_5 */\
146.1166 +        "paddw   %%xmm0, %%xmm2     \n\t"\
146.1167 +        "psraw   $5,     %%xmm2     \n\t"/* >> 5 */\
146.1168 +        "packuswb %%xmm2, %%xmm2    \n\t"/* 8 unsigned-saturated bytes */\
146.1169 +        OP(%%xmm2, (%1), %%xmm4, q)/* OP handles the 8-byte write to dst */\
146.1170 +        "add %3, %0                 \n\t"/* src += srcStride */\
146.1171 +        "add %4, %1                 \n\t"/* dst += dstStride */\
146.1172 +        "decl %2                    \n\t"\
146.1173 +        " jnz 1b                    \n\t"/* loop for all 8 rows */\
146.1174 +        : "+a"(src), "+c"(dst), "+g"(h)\
146.1175 +        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
146.1176 +          "m"(ff_pw_5), "m"(ff_pw_16)\
146.1177 +        : "memory"\
146.1178 +    );\
146.1179 +}\
146.1180 +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 16x16 horizontal filter assembled from four 8x8 calls */\
146.1181 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);/* top-left */\
146.1182 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);/* top-right */\
146.1183 +    src += 8*srcStride;/* move both pointers down 8 rows, each with its own stride */\
146.1184 +    dst += 8*dstStride;\
146.1185 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);/* bottom-left */\
146.1186 +    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);/* bottom-right */\
146.1187 +}\
146.1188 +
146.1189 +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)/* Generates the xmm-register vertical filters for one 8-column strip of height 8 or 16, plus the 8 and 16 entry points */\
146.1190 +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){/* h is compared against 16 only; any other value gives the 8-row path */\
146.1191 +    src -= 2*srcStride;/* start two rows above the first output row */\
146.1192 +    \
146.1193 +    __asm__ volatile(/* operands: %0 src, %1 dst, %2 srcStride, %3 dstStride, %4 ff_pw_5, %5 ff_pw_16 */\
146.1194 +        "pxor %%xmm7, %%xmm7        \n\t"/* xmm7 = 0 */\
146.1195 +        "movq (%0), %%xmm0          \n\t"/* preload five source rows of 8 bytes into xmm0..xmm4 */\
146.1196 +        "add %2, %0                 \n\t"\
146.1197 +        "movq (%0), %%xmm1          \n\t"\
146.1198 +        "add %2, %0                 \n\t"\
146.1199 +        "movq (%0), %%xmm2          \n\t"\
146.1200 +        "add %2, %0                 \n\t"\
146.1201 +        "movq (%0), %%xmm3          \n\t"\
146.1202 +        "add %2, %0                 \n\t"\
146.1203 +        "movq (%0), %%xmm4          \n\t"\
146.1204 +        "add %2, %0                 \n\t"\
146.1205 +        "punpcklbw %%xmm7, %%xmm0   \n\t"/* widen the five rows to 16-bit words */\
146.1206 +        "punpcklbw %%xmm7, %%xmm1   \n\t"\
146.1207 +        "punpcklbw %%xmm7, %%xmm2   \n\t"\
146.1208 +        "punpcklbw %%xmm7, %%xmm3   \n\t"\
146.1209 +        "punpcklbw %%xmm7, %%xmm4   \n\t"\
146.1210 +        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)/* eight filter steps with the six row registers rotated by one each time; the step body (QPEL_H264V_MM) is defined above this chunk, presumably one output row per step */\
146.1211 +        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
146.1212 +        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
146.1213 +        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
146.1214 +        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
146.1215 +        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
146.1216 +        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
146.1217 +        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
146.1218 +         \
146.1219 +        : "+a"(src), "+c"(dst)\
146.1220 +        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
146.1221 +        : "memory"\
146.1222 +    );\
146.1223 +    if(h==16){\
146.1224 +        __asm__ volatile(/* eight more steps continuing the rotation; NOTE(review): relies on the xmm registers keeping their contents from the previous asm statement */\
146.1225 +            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
146.1226 +            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
146.1227 +            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
146.1228 +            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
146.1229 +            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
146.1230 +            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
146.1231 +            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
146.1232 +            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
146.1233 +            \
146.1234 +            : "+a"(src), "+c"(dst)\
146.1235 +            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
146.1236 +            : "memory"\
146.1237 +        );\
146.1238 +    }\
146.1239 +}\
146.1240 +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 8x8 vertical filter: one 8-column strip with h = 8 */\
146.1241 +    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
146.1242 +}\
146.1243 +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){/* 16x16 vertical filter: two 8-column strips with h = 16 */\
146.1244 +    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);/* columns 0..7 */\
146.1245 +    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);/* columns 8..15 */\
146.1246 +}
146.1247 +
146.1248 +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ /* First pass of the hv filter: fills tmp in strips of 8 columns, one QPEL_H264HV_XMM step per row at 48-byte row spacing. tmpStride is not used in this body. */
146.1249 +    int w = (size+8)>>3; /* number of 8-column strips: 2 for size 8, 3 for size 16 */
146.1250 +    src -= 2*srcStride+2; /* start two rows up and two columns left */
146.1251 +    while(w--){
146.1252 +        __asm__ volatile( /* operands: %0 src, %1 tmp, %2 srcStride, %3 ff_pw_5, %4 ff_pw_16 */
146.1253 +            "pxor %%xmm7, %%xmm7        \n\t" /* xmm7 = 0 */
146.1254 +            "movq (%0), %%xmm0          \n\t" /* preload five source rows of 8 bytes into xmm0..xmm4 */
146.1255 +            "add %2, %0                 \n\t"
146.1256 +            "movq (%0), %%xmm1          \n\t"
146.1257 +            "add %2, %0                 \n\t"
146.1258 +            "movq (%0), %%xmm2          \n\t"
146.1259 +            "add %2, %0                 \n\t"
146.1260 +            "movq (%0), %%xmm3          \n\t"
146.1261 +            "add %2, %0                 \n\t"
146.1262 +            "movq (%0), %%xmm4          \n\t"
146.1263 +            "add %2, %0                 \n\t"
146.1264 +            "punpcklbw %%xmm7, %%xmm0   \n\t" /* widen the five rows to 16-bit words */
146.1265 +            "punpcklbw %%xmm7, %%xmm1   \n\t"
146.1266 +            "punpcklbw %%xmm7, %%xmm2   \n\t"
146.1267 +            "punpcklbw %%xmm7, %%xmm3   \n\t"
146.1268 +            "punpcklbw %%xmm7, %%xmm4   \n\t"
146.1269 +            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) /* rows 0..7: registers rotate by one per step, last argument is the byte offset of the row inside tmp */
146.1270 +            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
146.1271 +            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
146.1272 +            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
146.1273 +            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
146.1274 +            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
146.1275 +            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
146.1276 +            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
146.1277 +            : "+a"(src)
146.1278 +            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
146.1279 +            : "memory"
146.1280 +        );
146.1281 +        if(size==16){
146.1282 +            __asm__ volatile( /* rows 8..15; NOTE(review): relies on the xmm registers keeping their contents from the previous asm statement */
146.1283 +                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
146.1284 +                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
146.1285 +                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
146.1286 +                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
146.1287 +                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
146.1288 +                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
146.1289 +                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
146.1290 +                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
146.1291 +                : "+a"(src)
146.1292 +                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
146.1293 +                : "memory"
146.1294 +            );
146.1295 +        }
146.1296 +        tmp += 8; /* next strip: 8 int16 columns to the right */
146.1297 +        src += 8 - (size+5)*srcStride; /* 8 bytes right and back up size+5 rows (5 preloaded rows plus size filter steps) */
146.1298 +    }
146.1299 +}
146.1300 +
146.1301 +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)/* Generates the xmm-register second (horizontal) pass of the hv filter; uses palignr, so the target needs SSSE3 */\
146.1302 +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){/* reads 16-bit rows from tmp (48 bytes apart, loaded with movdqa, so tmp has to be 16-byte aligned); tmpStride is not used in this body */\
146.1303 +    int h = size;/* row counter */\
146.1304 +    if(size == 16){\
146.1305 +        __asm__ volatile(/* 16 columns per row. operands: %0 tmp, %1 dst, %2 h, %3 dstStride; t[] denotes the int16 row at %0 */\
146.1306 +            "1:                         \n\t"\
146.1307 +            "movdqa 32(%0), %%xmm4      \n\t"/* xmm4 = t[16..23] */\
146.1308 +            "movdqa 16(%0), %%xmm5      \n\t"/* xmm5 = t[8..15] */\
146.1309 +            "movdqa   (%0), %%xmm7      \n\t"/* xmm7 = t[0..7] */\
146.1310 +            "movdqa %%xmm4, %%xmm3      \n\t"\
146.1311 +            "movdqa %%xmm4, %%xmm2      \n\t"\
146.1312 +            "movdqa %%xmm4, %%xmm1      \n\t"\
146.1313 +            "movdqa %%xmm4, %%xmm0      \n\t"\
146.1314 +            "palignr $10, %%xmm5, %%xmm0 \n\t"/* xmm0 = t[13..20] */\
146.1315 +            "palignr  $8, %%xmm5, %%xmm1 \n\t"/* xmm1 = t[12..19] */\
146.1316 +            "palignr  $6, %%xmm5, %%xmm2 \n\t"/* xmm2 = t[11..18] */\
146.1317 +            "palignr  $4, %%xmm5, %%xmm3 \n\t"/* xmm3 = t[10..17] */\
146.1318 +            "palignr  $2, %%xmm5, %%xmm4 \n\t"/* xmm4 = t[9..16] */\
146.1319 +            "paddw  %%xmm5, %%xmm0      \n\t"/* columns 8..15: outer pair t[x] + t[x+5] */\
146.1320 +            "paddw  %%xmm4, %%xmm1      \n\t"/* columns 8..15: inner pair t[x+1] + t[x+4] */\
146.1321 +            "paddw  %%xmm3, %%xmm2      \n\t"/* columns 8..15: centre pair t[x+2] + t[x+3] */\
146.1322 +            "movdqa %%xmm5, %%xmm6      \n\t"\
146.1323 +            "movdqa %%xmm5, %%xmm4      \n\t"\
146.1324 +            "movdqa %%xmm5, %%xmm3      \n\t"\
146.1325 +            "palignr  $8, %%xmm7, %%xmm4 \n\t"/* xmm4 = t[4..11] */\
146.1326 +            "palignr  $2, %%xmm7, %%xmm6 \n\t"/* xmm6 = t[1..8] */\
146.1327 +            "palignr $10, %%xmm7, %%xmm3 \n\t"/* xmm3 = t[5..12] */\
146.1328 +            "paddw  %%xmm6, %%xmm4      \n\t"/* columns 0..7: inner pair */\
146.1329 +            "movdqa %%xmm5, %%xmm6      \n\t"\
146.1330 +            "palignr  $6, %%xmm7, %%xmm5 \n\t"/* xmm5 = t[3..10] */\
146.1331 +            "palignr  $4, %%xmm7, %%xmm6 \n\t"/* xmm6 = t[2..9] */\
146.1332 +            "paddw  %%xmm7, %%xmm3      \n\t"/* columns 0..7: outer pair */\
146.1333 +            "paddw  %%xmm6, %%xmm5      \n\t"/* columns 0..7: centre pair */\
146.1334 +            \
146.1335 +            "psubw  %%xmm1, %%xmm0      \n\t"/* outer - inner (xmm0: columns 8..15, xmm3: columns 0..7) */\
146.1336 +            "psubw  %%xmm4, %%xmm3      \n\t"\
146.1337 +            "psraw      $2, %%xmm0      \n\t"/* >> 2 */\
146.1338 +            "psraw      $2, %%xmm3      \n\t"\
146.1339 +            "psubw  %%xmm1, %%xmm0      \n\t"/* - inner again */\
146.1340 +            "psubw  %%xmm4, %%xmm3      \n\t"\
146.1341 +            "paddw  %%xmm2, %%xmm0      \n\t"/* + centre (wrapping add here) */\
146.1342 +            "paddw  %%xmm5, %%xmm3      \n\t"\
146.1343 +            "psraw      $2, %%xmm0      \n\t"/* >> 2 */\
146.1344 +            "psraw      $2, %%xmm3      \n\t"\
146.1345 +            "paddw  %%xmm2, %%xmm0      \n\t"/* + centre */\
146.1346 +            "paddw  %%xmm5, %%xmm3      \n\t"\
146.1347 +            "psraw      $6, %%xmm0      \n\t"/* >> 6 */\
146.1348 +            "psraw      $6, %%xmm3      \n\t"\
146.1349 +            "packuswb %%xmm0, %%xmm3    \n\t"/* xmm3 = 16 bytes: columns 0..7 low, 8..15 high */\
146.1350 +            OP(%%xmm3, (%1), %%xmm7, dqa)/* OP handles the 16-byte write to dst */\
146.1351 +            "add $48, %0                \n\t"/* next tmp row */\
146.1352 +            "add %3, %1                 \n\t"/* dst += dstStride */\
146.1353 +            "decl %2                    \n\t"\
146.1354 +            " jnz 1b                    \n\t"\
146.1355 +            : "+a"(tmp), "+c"(dst), "+g"(h)\
146.1356 +            : "S"((x86_reg)dstStride)\
146.1357 +            : "memory"\
146.1358 +        );\
146.1359 +    }else{\
146.1360 +        __asm__ volatile(/* any other size: 8 columns per row, same arithmetic on a single register set */\
146.1361 +            "1:                         \n\t"\
146.1362 +            "movdqa 16(%0), %%xmm1      \n\t"/* xmm1 = t[8..15] */\
146.1363 +            "movdqa   (%0), %%xmm0      \n\t"/* xmm0 = t[0..7] */\
146.1364 +            "movdqa %%xmm1, %%xmm2      \n\t"\
146.1365 +            "movdqa %%xmm1, %%xmm3      \n\t"\
146.1366 +            "movdqa %%xmm1, %%xmm4      \n\t"\
146.1367 +            "movdqa %%xmm1, %%xmm5      \n\t"\
146.1368 +            "palignr $10, %%xmm0, %%xmm5 \n\t"/* xmm5 = t[5..12] */\
146.1369 +            "palignr  $8, %%xmm0, %%xmm4 \n\t"/* xmm4 = t[4..11] */\
146.1370 +            "palignr  $6, %%xmm0, %%xmm3 \n\t"/* xmm3 = t[3..10] */\
146.1371 +            "palignr  $4, %%xmm0, %%xmm2 \n\t"/* xmm2 = t[2..9] */\
146.1372 +            "palignr  $2, %%xmm0, %%xmm1 \n\t"/* xmm1 = t[1..8] */\
146.1373 +            "paddw  %%xmm5, %%xmm0      \n\t"/* outer pair */\
146.1374 +            "paddw  %%xmm4, %%xmm1      \n\t"/* inner pair */\
146.1375 +            "paddw  %%xmm3, %%xmm2      \n\t"/* centre pair */\
146.1376 +            "psubw  %%xmm1, %%xmm0      \n\t"\
146.1377 +            "psraw      $2, %%xmm0      \n\t"\
146.1378 +            "psubw  %%xmm1, %%xmm0      \n\t"\
146.1379 +            "paddw  %%xmm2, %%xmm0      \n\t"\
146.1380 +            "psraw      $2, %%xmm0      \n\t"\
146.1381 +            "paddw  %%xmm2, %%xmm0      \n\t"\
146.1382 +            "psraw      $6, %%xmm0      \n\t"/* overall ((((outer-inner)>>2) - inner + centre) >> 2) + centre) >> 6 */\
146.1383 +            "packuswb %%xmm0, %%xmm0    \n\t"/* 8 unsigned-saturated bytes in the low half */\
146.1384 +            OP(%%xmm0, (%1), %%xmm7, q)/* OP handles the 8-byte write to dst */\
146.1385 +            "add $48, %0                \n\t"/* next tmp row */\
146.1386 +            "add %3, %1                 \n\t"/* dst += dstStride */\
146.1387 +            "decl %2                    \n\t"\
146.1388 +            " jnz 1b                    \n\t"\
146.1389 +            : "+a"(tmp), "+c"(dst), "+g"(h)\
146.1390 +            : "S"((x86_reg)dstStride)\
146.1391 +            : "memory"\
146.1392 +        );\
146.1393 +    }\
146.1394 +}
146.1395 +
146.1396 +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)/* Generates the combined hv filter entry points for sizes 8 and 16; OP is not referenced in this macro body */\
146.1397 +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){/* two passes through the caller-provided tmp buffer */\
146.1398 +          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);/* pass 1: src -> tmp; always the put/sse2 function, whatever OPNAME and MMX are */\
146.1399 +    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);/* pass 2: tmp -> dst, using the OPNAME/MMX specific function */\
146.1400 +}\
146.1401 +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* size 8 entry point */\
146.1402 +    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
146.1403 +}\
146.1404 +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){/* size 16 entry point */\
146.1405 +    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
146.1406 +}\
146.1407 +
146.1408 +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 /* name aliases: the sse2 and ssse3 spellings of pixels{8,16}_l2 resolve to the mmx2 functions, so token-pasted names built with MMX = sse2 or ssse3 still find a definition */
146.1409 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
146.1410 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
146.1411 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
146.1412 +#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
146.1413 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
146.1414 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
146.1415 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
146.1416 +/* pixels{8,16}_l2_shift5: sse2 and ssse3 names resolve to the mmx2 functions */
146.1417 +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
146.1418 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
146.1419 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
146.1420 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
146.1421 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
146.1422 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
146.1423 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
146.1424 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
146.1425 +/* h_lowpass_l2: only the sse2 names are aliased to mmx2 (QPEL_H264_H_XMM defines this function for whatever MMX suffix it is instantiated with) */
146.1426 +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
146.1427 +#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
146.1428 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
146.1429 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
146.1430 +/* v_lowpass: the ssse3 names resolve to the sse2 functions */
146.1431 +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
146.1432 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
146.1433 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
146.1434 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
146.1435 +/* hv2_lowpass: the sse2 names resolve to the mmx2 functions */
146.1436 +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
146.1437 +#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
146.1438 +
146.1439 +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) /* expands all four H264_MC_* groups (C, V, H, HV) for one OPNAME/SIZE/MMX/ALIGN combination */\
146.1440 +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
146.1441 +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
146.1442 +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
146.1443 +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
146.1444 +
146.1445 +// static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
146.1446 +//     put_pixels16_sse2(dst, src, stride, 16);
146.1447 +// }
146.1448 +// static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
146.1449 +//     avg_pixels16_sse2(dst, src, stride, 16);
146.1450 +// }
146.1451 +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 /* the 8-wide mc00 sse2 names resolve to the mmx2 functions */
146.1452 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
146.1453 +
146.1454 +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) /* mc00: no filtering, forwards to the plain OPNAME pixels SIZE function with SIZE rows; ALIGN is unused here */\
146.1455 +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
146.1456 +    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
146.1457 +}\
146.1458 +
146.1459 +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) /* horizontal-only positions mc10, mc20, mc30; one stride is used for both source and destination; ALIGN is unused here */\
146.1460 +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* horizontal filter combined with src itself as the second source */\
146.1461 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
146.1462 +}\
146.1463 +\
146.1464 +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* horizontal filter only */\
146.1465 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
146.1466 +}\
146.1467 +\
146.1468 +static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){/* horizontal filter combined with src+1, the sample one column to the right */\
146.1469 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
146.1470 +}\
146.1471 +
146.1472 +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) /* defines the _mc01_, _mc02_ and _mc03_ wrappers, all built on the vertical lowpass helper; ALIGN is the alignment of the scratch buffer */ \
146.1473 +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1474 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
146.1475 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride); /* always the put_ variant: temp is scratch with a row pitch of SIZE */\
146.1476 +    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE); /* combines src with the filtered block into dst */\
146.1477 +}\
146.1478 +/* mc02: vertical lowpass written directly to dst, no scratch buffer */\
146.1479 +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1480 +    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
146.1481 +}\
146.1482 +/* mc03: like mc01, but the unfiltered source is src+stride (one row down) */\
146.1483 +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1484 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
146.1485 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
146.1486 +    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
146.1487 +}\
146.1488 +
146.1489 +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) /* defines the nine wrappers that combine both filter directions: _mc11_ _mc31_ _mc13_ _mc33_ _mc22_ _mc21_ _mc23_ _mc12_ _mc32_; ALIGN is the alignment of the scratch buffers */ \
146.1490 +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1491 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
146.1492 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride); /* vertical pass into scratch, row pitch SIZE */\
146.1493 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE); /* horizontal pass on src, with temp as the second source */\
146.1494 +}\
146.1495 +/* mc31: as mc11, but the vertical pass reads from src+1 */\
146.1496 +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1497 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
146.1498 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
146.1499 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
146.1500 +}\
146.1501 +/* mc13: as mc11, but the horizontal pass reads from src+stride */\
146.1502 +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1503 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
146.1504 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
146.1505 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
146.1506 +}\
146.1507 +/* mc33: vertical pass from src+1, horizontal pass from src+stride */\
146.1508 +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1509 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
146.1510 +    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
146.1511 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
146.1512 +}\
146.1513 +/* mc22: combined hv lowpass written directly to dst; temp is 16-bit scratch for the helper */\
146.1514 +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1515 +    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
146.1516 +    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
146.1517 +}\
146.1518 +/* mc21: hv lowpass into halfHV, then a horizontal pass on src with halfHV as the second source */\
146.1519 +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1520 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
146.1521 +    uint8_t * const halfHV= temp; /* first SIZE*SIZE bytes: 8-bit hv result */\
146.1522 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE); /* remainder: 16-bit intermediate of the hv helper */\
146.1523 +    assert(((uintptr_t)temp & 7) == 0); /* uintptr_t, not int: a pointer does not fit in int on 64-bit targets */\
146.1524 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
146.1525 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
146.1526 +}\
146.1527 +/* mc23: as mc21, but the horizontal pass reads from src+stride */\
146.1528 +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1529 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
146.1530 +    uint8_t * const halfHV= temp;\
146.1531 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
146.1532 +    assert(((uintptr_t)temp & 7) == 0);\
146.1533 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
146.1534 +    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
146.1535 +}\
146.1536 +/* mc12: hv lowpass, then the _l2_shift5_ helper combines the 16-bit intermediate at halfV+2 with halfHV */\
146.1537 +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1538 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
146.1539 +    uint8_t * const halfHV= temp;\
146.1540 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
146.1541 +    assert(((uintptr_t)temp & 7) == 0);\
146.1542 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
146.1543 +    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
146.1544 +}\
146.1545 +/* mc32: as mc12, but starting at halfV+3 */\
146.1546 +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
146.1547 +    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
146.1548 +    uint8_t * const halfHV= temp;\
146.1549 +    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
146.1550 +    assert(((uintptr_t)temp & 7) == 0);\
146.1551 +    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
146.1552 +    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
146.1553 +}\
146.1554 +
146.1555 +#define H264_MC_4816(MMX) /* instantiates H264_MC for put_ and avg_ at sizes 4, 8 and 16, each with ALIGN = 8 */\
146.1556 +H264_MC(put_, 4, MMX, 8)\
146.1557 +H264_MC(put_, 8, MMX, 8)\
146.1558 +H264_MC(put_, 16,MMX, 8)\
146.1559 +H264_MC(avg_, 4, MMX, 8)\
146.1560 +H264_MC(avg_, 8, MMX, 8)\
146.1561 +H264_MC(avg_, 16,MMX, 8)\
146.1562 +
146.1563 +#define H264_MC_816(QPEL, XMM) /* invokes the generator macro QPEL for put_ and avg_ at sizes 8 and 16, each with ALIGN = 16 */\
146.1564 +QPEL(put_, 8, XMM, 16)\
146.1565 +QPEL(put_, 16,XMM, 16)\
146.1566 +QPEL(avg_, 8, XMM, 16)\
146.1567 +QPEL(avg_, 16,XMM, 16)\
146.1568 +
146.1569 +/* Asm-text fragments: load b into temp, average temp into a (a is overwritten), then store a back to b. "size" is the mov suffix. */
146.1570 +#define AVG_3DNOW_OP(a,b,temp, size) \
146.1571 +"mov" #size " " #b ", " #temp "   \n\t"\
146.1572 +"pavgusb " #temp ", " #a "        \n\t"\
146.1573 +"mov" #size " " #a ", " #b "      \n\t"
146.1574 +#define AVG_MMX2_OP(a,b,temp, size) /* same sequence as AVG_3DNOW_OP, using pavgb instead of pavgusb */ \
146.1575 +"mov" #size " " #b ", " #temp "   \n\t"\
146.1576 +"pavgb " #temp ", " #a "          \n\t"\
146.1577 +"mov" #size " " #a ", " #b "      \n\t"
146.1578 +
146.1579 +/// NOTE: 3DNow! availability is not detected correctly; uncomment the two QPEL_H264(..., 3dnow) lines below when building on an AMD machine
146.1580 +#ifdef HAVE_AMD3DNOW
146.1581 +#define PAVGB "pavgusb"
146.1582 +//QPEL_H264(put_,       PUT_OP, 3dnow)
146.1583 +//QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
146.1584 +#undef PAVGB
146.1585 +#endif
146.1586 +
146.1587 +#define PAVGB "pavgb"
146.1588 +QPEL_H264(put_,       PUT_OP, mmx2)
146.1589 +QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
146.1590 +QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
146.1591 +QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
146.1592 +QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
146.1593 +QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
146.1594 +#if HAVE_SSSE3
146.1595 +QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
146.1596 +QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
146.1597 +QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
146.1598 +QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
146.1599 +QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
146.1600 +QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
146.1601 +#endif
146.1602 +#undef PAVGB
146.1603 +
146.1604 +H264_MC_816(H264_MC_V, sse2)
146.1605 +H264_MC_816(H264_MC_HV, sse2)
146.1606 +#if HAVE_SSSE3
146.1607 +H264_MC_816(H264_MC_H, ssse3)
146.1608 +H264_MC_816(H264_MC_HV, ssse3)
146.1609 +#endif
146.1610 +
146.1611 +/* Rounding constants, each replicated into four 16-bit lanes: [0] = 32 followed by [1] = 32/8 = 4, and [2] = 28 followed by [3] = 28/8 = 3. Given a pointer p to a rounder, p+1 is its "div 8" counterpart. */
146.1612 +DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
146.1613 +    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
146.1614 +};
146.1615 +
146.1616 +#if HAVE_SSSE3
146.1617 +#define AVG_OP(X)
146.1618 +#undef H264_CHROMA_MC8_TMPL
146.1619 +#undef H264_CHROMA_MC4_TMPL
146.1620 +#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
146.1621 +#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
146.1622 +#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
146.1623 +#include "dsputil_h264_template_ssse3.c"
146.1624 +static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
146.1625 +{
146.1626 +    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
146.1627 +}
146.1628 +
146.1629 +#undef AVG_OP
146.1630 +#undef H264_CHROMA_MC8_TMPL
146.1631 +#undef H264_CHROMA_MC4_TMPL
146.1632 +#undef H264_CHROMA_MC8_MV0
146.1633 +#define AVG_OP(X) X
146.1634 +#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
146.1635 +#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
146.1636 +#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
146.1637 +#include "dsputil_h264_template_ssse3.c"
146.1638 +static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
146.1639 +{
146.1640 +    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
146.1641 +}
146.1642 +#undef AVG_OP
146.1643 +#undef H264_CHROMA_MC8_TMPL
146.1644 +#undef H264_CHROMA_MC4_TMPL
146.1645 +#undef H264_CHROMA_MC8_MV0
146.1646 +#endif
146.1647 +
146.1648 +/***********************************/
146.1649 +/* weighted prediction */
146.1650 +/* In-place weighting of a w x h block: dst = sat_u8((dst*weight + (offset << log2_denom) + round) >> log2_denom), in 16-bit lanes. Each asm statement handles 4 pixels on 2 rows, so w is stepped by 4 and h by 2. mm4-mm7 are loaded once and are assumed to stay intact between the asm statements (no clobbers are declared). */
146.1651 +static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
146.1652 +{
146.1653 +    int x, y;
146.1654 +    offset *= 1 << log2_denom;            /* multiply instead of <<=: offset may be negative, and left-shifting a negative int is undefined */
146.1655 +    offset += (1 << log2_denom) >> 1;     /* rounding term: half of the divisor */
146.1656 +    __asm__ volatile(
146.1657 +        "movd    %0, %%mm4        \n\t"
146.1658 +        "movd    %1, %%mm5        \n\t"
146.1659 +        "movd    %2, %%mm6        \n\t"
146.1660 +        "pshufw  $0, %%mm4, %%mm4 \n\t" /* broadcast weight to all four words */
146.1661 +        "pshufw  $0, %%mm5, %%mm5 \n\t" /* broadcast offset to all four words */
146.1662 +        "pxor    %%mm7, %%mm7     \n\t" /* mm7 = 0, used for byte<->word (un)packing */
146.1663 +        :: "rm"(weight), "rm"(offset), "rm"(log2_denom) /* "rm" rather than "g": movd cannot take an immediate source */
146.1664 +    );
146.1665 +    for(y=0; y<h; y+=2){
146.1666 +        for(x=0; x<w; x+=4){
146.1667 +            __asm__ volatile(
146.1668 +                "movd      %0,    %%mm0 \n\t"
146.1669 +                "movd      %1,    %%mm1 \n\t"
146.1670 +                "punpcklbw %%mm7, %%mm0 \n\t"
146.1671 +                "punpcklbw %%mm7, %%mm1 \n\t"
146.1672 +                "pmullw    %%mm4, %%mm0 \n\t"
146.1673 +                "pmullw    %%mm4, %%mm1 \n\t"
146.1674 +                "paddsw    %%mm5, %%mm0 \n\t"
146.1675 +                "paddsw    %%mm5, %%mm1 \n\t"
146.1676 +                "psraw     %%mm6, %%mm0 \n\t"
146.1677 +                "psraw     %%mm6, %%mm1 \n\t"
146.1678 +                "packuswb  %%mm7, %%mm0 \n\t"
146.1679 +                "packuswb  %%mm7, %%mm1 \n\t"
146.1680 +                "movd      %%mm0, %0    \n\t"
146.1681 +                "movd      %%mm1, %1    \n\t"
146.1682 +                : "+m"(*(uint32_t*)(dst+x)),
146.1683 +                  "+m"(*(uint32_t*)(dst+x+stride))
146.1684 +            );
146.1685 +        }
146.1686 +        dst += 2*stride;
146.1687 +    }
146.1688 +}
146.1689 +/* Bi-directional weighting of a w x h block: dst = sat_u8((dst*weightd + src*weights + (((offset+1)|1) << log2_denom)) >> (log2_denom+1)), in 16-bit lanes. Each asm statement handles 4 pixels of one row, so w is stepped by 4; src and dst share the same stride. mm3-mm7 are loaded once and are assumed to stay intact between the asm statements (no clobbers are declared). */
146.1690 +static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
146.1691 +{
146.1692 +    int x, y;
146.1693 +    offset = ((offset + 1) | 1) * (1 << log2_denom); /* multiply instead of <<: the value may be negative, and left-shifting a negative int is undefined */
146.1694 +    __asm__ volatile(
146.1695 +        "movd    %0, %%mm3        \n\t"
146.1696 +        "movd    %1, %%mm4        \n\t"
146.1697 +        "movd    %2, %%mm5        \n\t"
146.1698 +        "movd    %3, %%mm6        \n\t"
146.1699 +        "pshufw  $0, %%mm3, %%mm3 \n\t" /* broadcast weightd */
146.1700 +        "pshufw  $0, %%mm4, %%mm4 \n\t" /* broadcast weights */
146.1701 +        "pshufw  $0, %%mm5, %%mm5 \n\t" /* broadcast offset */
146.1702 +        "pxor    %%mm7, %%mm7     \n\t" /* mm7 = 0, used for byte->word unpacking */
146.1703 +        :: "rm"(weightd), "rm"(weights), "rm"(offset), "rm"(log2_denom+1) /* "rm" rather than "g": movd cannot take an immediate source */
146.1704 +    );
146.1705 +    for(y=0; y<h; y++){
146.1706 +        for(x=0; x<w; x+=4){
146.1707 +            __asm__ volatile(
146.1708 +                "movd      %0,    %%mm0 \n\t"
146.1709 +                "movd      %1,    %%mm1 \n\t"
146.1710 +                "punpcklbw %%mm7, %%mm0 \n\t"
146.1711 +                "punpcklbw %%mm7, %%mm1 \n\t"
146.1712 +                "pmullw    %%mm3, %%mm0 \n\t"
146.1713 +                "pmullw    %%mm4, %%mm1 \n\t"
146.1714 +                "paddsw    %%mm1, %%mm0 \n\t"
146.1715 +                "paddsw    %%mm5, %%mm0 \n\t"
146.1716 +                "psraw     %%mm6, %%mm0 \n\t"
146.1717 +                "packuswb  %%mm0, %%mm0 \n\t"
146.1718 +                "movd      %%mm0, %0    \n\t"
146.1719 +                : "+m"(*(uint32_t*)(dst+x))
146.1720 +                :  "m"(*(uint32_t*)(src+x))
146.1721 +            );
146.1722 +        }
146.1723 +        src += stride;
146.1724 +        dst += stride;
146.1725 +    }
146.1726 +}
146.1727 +/* H264_WEIGHT(W,H): defines fixed-size entry points ff_h264_biweight_WxH_mmx2 and ff_h264_weight_WxH_mmx2 that forward to the generic helpers with W and H as constant width and height. */
146.1728 +#define H264_WEIGHT(W,H) \
146.1729 +static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
146.1730 +    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
146.1731 +} \
146.1732 +static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
146.1733 +    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
146.1734 +}
146.1735 +
146.1736 +H264_WEIGHT(16,16)
146.1737 +H264_WEIGHT(16, 8)
146.1738 +H264_WEIGHT( 8,16)
146.1739 +H264_WEIGHT( 8, 8)
146.1740 +H264_WEIGHT( 8, 4)
146.1741 +H264_WEIGHT( 4, 8)
146.1742 +H264_WEIGHT( 4, 4)
146.1743 +H264_WEIGHT( 4, 2)
146.1744 +

   147.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   147.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/mathops.h	Mon Aug 27 12:09:56 2012 +0200
   147.3 @@ -0,0 +1,67 @@
   147.4 +/*
   147.5 + * simple math operations
   147.6 + * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
   147.7 + *
   147.8 + * This file is part of FFmpeg.
   147.9 + *
  147.10 + * FFmpeg is free software; you can redistribute it and/or
  147.11 + * modify it under the terms of the GNU Lesser General Public
  147.12 + * License as published by the Free Software Foundation; either
  147.13 + * version 2.1 of the License, or (at your option) any later version.
  147.14 + *
  147.15 + * FFmpeg is distributed in the hope that it will be useful,
  147.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  147.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  147.18 + * Lesser General Public License for more details.
  147.19 + *
  147.20 + * You should have received a copy of the GNU Lesser General Public
  147.21 + * License along with FFmpeg; if not, write to the Free Software
  147.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  147.23 + */
  147.24 +
  147.25 +#ifndef AVCODEC_X86_MATHOPS_H
  147.26 +#define AVCODEC_X86_MATHOPS_H
  147.27 +
  147.28 +#include "config.h"
  147.29 +#include "libavutil/common.h"
  147.30 +
  147.31 +#if ARCH_X86_32 /* the macros below use one-operand imull, which leaves the 64-bit product in edx:eax */
  147.32 +#define MULL(ra, rb, shift) /* low 32 bits of the 64-bit product ra*rb shifted right by shift; shift must be a compile-time constant ("i"), intended for 0 <= shift < 32 */ \
  147.33 +        ({ int rt, dummy; __asm__ (\
  147.34 +            "imull %3               \n\t"\
  147.35 +            "shrdl %4, %%edx, %%eax \n\t"\
  147.36 +            : "=a"(rt), "=d"(dummy)\
  147.37 +            : "a" ((int)(ra)), "rm" ((int)(rb)), "i"(shift));\
  147.38 +         rt; })
  147.39 +
  147.40 +#define MULH(ra, rb) /* high 32 bits (edx) of the 64-bit product ra*rb */ \
  147.41 +    ({ int rt, dummy;\
  147.42 +     __asm__ ("imull %3\n\t" : "=d"(rt), "=a"(dummy): "a" ((int)(ra)), "rm" ((int)(rb)));\
  147.43 +     rt; })
  147.44 +
  147.45 +#define MUL64(ra, rb) /* full 64-bit product ra*rb, returned through the edx:eax pair ("=A"); "rm" like its siblings, since one-operand imull has no immediate form */ \
  147.46 +    ({ int64_t rt;\
  147.47 +     __asm__ ("imull %2\n\t" : "=A"(rt) : "a" ((int)(ra)), "rm" ((int)(rb)));\
  147.48 +     rt; })
  147.49 +#endif
  147.50 +
  147.51 +// Shift right by (32 - s) using -s as the count: x86 shifts use only the low 5 bits of the count, so the +32 can be dropped (gcc should do that ...)
  147.52 +#define NEG_SSR32 NEG_SSR32 // defined to its own name, presumably so a generic fallback can be skipped with #ifndef -- confirm in the including header
  147.53 +static inline  int32_t NEG_SSR32( int32_t a, int8_t s){ // arithmetic (sign-propagating) right shift of a
  147.54 +    __asm__ ("sarl %1, %0\n\t"
  147.55 +         : "+r" (a)
  147.56 +         : "ic" ((uint8_t)(-s)) // count as an immediate or in the c register
  147.57 +    );
  147.58 +    return a;
  147.59 +}
  147.60 +// Unsigned counterpart of NEG_SSR32: logical right shift by (32 - s), again passing -s as the count
  147.61 +#define NEG_USR32 NEG_USR32 // defined to its own name, presumably so a generic fallback can be skipped with #ifndef -- confirm in the including header
  147.62 +static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
  147.63 +    __asm__ ("shrl %1, %0\n\t"
  147.64 +         : "+r" (a)
  147.65 +         : "ic" ((uint8_t)(-s)) // count as an immediate or in the c register
  147.66 +    );
  147.67 +    return a;
  147.68 +}
  147.69 +
  147.70 +#endif /* AVCODEC_X86_MATHOPS_H */

   148.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   148.2 +++ b/ffmpeg_smp/h264dec/libavcodec/x86/mmx.h	Mon Aug 27 12:09:56 2012 +0200
   148.3 @@ -0,0 +1,267 @@
   148.4 +/*
   148.5 + * mmx.h
   148.6 + * Copyright (C) 1997-2001 H. Dietz and R. Fisher
   148.7 + *
   148.8 + * This file is part of FFmpeg.
   148.9 + *
  148.10 + * FFmpeg is free software; you can redistribute it and/or
  148.11 + * modify it under the terms of the GNU Lesser General Public
  148.12 + * License as published by the Free Software Foundation; either
  148.13 + * version 2.1 of the License, or (at your option) any later version.
  148.14 + *
  148.15 + * FFmpeg is distributed in the hope that it will be useful,
  148.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  148.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  148.18 + * Lesser General Public License for more details.
  148.19 + *
  148.20 + * You should have received a copy of the GNU Lesser General Public
  148.21 + * License along with FFmpeg; if not, write to the Free Software
  148.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  148.23 + */
  148.24 +#ifndef AVCODEC_X86_MMX_H
  148.25 +#define AVCODEC_X86_MMX_H
  148.26 +
  148.27 +#warning Everything in this header is deprecated, use plain __asm__()! New code using this header will be rejected.
  148.28 +
  148.29 +/* mmx_i2r: emits "op imm, %reg"; imm is passed as an immediate ("i") operand, reg is pasted into the template by name. None of these helpers declare register clobbers. */
  148.30 +#define         mmx_i2r(op,imm,reg) \
  148.31 +        __asm__ volatile (#op " %0, %%" #reg \
  148.32 +                              : /* nothing */ \
  148.33 +                              : "i" (imm) )
  148.34 +/* mmx_m2r: emits "op mem, %reg"; mem is an input memory ("m") operand */
  148.35 +#define         mmx_m2r(op,mem,reg) \
  148.36 +        __asm__ volatile (#op " %0, %%" #reg \
  148.37 +                              : /* nothing */ \
  148.38 +                              : "m" (mem))
  148.39 +/* mmx_r2m: emits "op %reg, mem"; mem is an output-only memory ("=m") operand */
  148.40 +#define         mmx_r2m(op,reg,mem) \
  148.41 +        __asm__ volatile (#op " %%" #reg ", %0" \
  148.42 +                              : "=m" (mem) \
  148.43 +                              : /* nothing */ )
  148.44 +/* mmx_r2r: emits "op %regs, %regd"; no operands are declared, so this is basic asm and register names take a single % */
  148.45 +#define         mmx_r2r(op,regs,regd) \
  148.46 +        __asm__ volatile (#op " %" #regs ", %" #regd)
  148.47 +
  148.48 +
  148.49 +#define         emms() __asm__ volatile ("emms")
  148.50 +
  148.51 +#define         movd_m2r(var,reg)           mmx_m2r (movd, var, reg)
  148.52 +#define         movd_r2m(reg,var)           mmx_r2m (movd, reg, var)
  148.53 +#define         movd_r2r(regs,regd)         mmx_r2r (movd, regs, regd)
  148.54 +
  148.55 +#define         movq_m2r(var,reg)           mmx_m2r (movq, var, reg)
  148.56 +#define         movq_r2m(reg,var)           mmx_r2m (movq, reg, var)
  148.57 +#define         movq_r2r(regs,regd)         mmx_r2r (movq, regs, regd)
  148.58 +
  148.59 +#define         packssdw_m2r(var,reg)       mmx_m2r (packssdw, var, reg)
  148.60 +#define         packssdw_r2r(regs,regd)     mmx_r2r (packssdw, regs, regd)
  148.61 +#define         packsswb_m2r(var,reg)       mmx_m2r (packsswb, var, reg)
  148.62 +#define         packsswb_r2r(regs,regd)     mmx_r2r (packsswb, regs, regd)
  148.63 +
  148.64 +#define         packuswb_m2r(var,reg)       mmx_m2r (packuswb, var, reg)
  148.65 +#define         packuswb_r2r(regs,regd)     mmx_r2r (packuswb, regs, regd)
  148.66 +
  148.67 +#define         paddb_m2r(var,reg)          mmx_m2r (paddb, var, reg)
  148.68 +#define         paddb_r2r(regs,regd)        mmx_r2r (paddb, regs, regd)
  148.69 +#define         paddd_m2r(var,reg)          mmx_m2r (paddd, var, reg)
  148.70 +#define         paddd_r2r(regs,regd)        mmx_r2r (paddd, regs, regd)
  148.71 +#define         paddw_m2r(var,reg)          mmx_m2r (paddw, var, reg)
  148.72 +#define         paddw_r2r(regs,regd)        mmx_r2r (paddw, regs, regd)
  148.73 +
  148.74 +#define         paddsb_m2r(var,reg)         mmx_m2r (paddsb, var, reg)
  148.75 +#define         paddsb_r2r(regs,regd)       mmx_r2r (paddsb, regs, regd)
  148.76 +#define         paddsw_m2r(var,reg)         mmx_m2r (paddsw, var, reg)
  148.77 +#define         paddsw_r2r(regs,regd)       mmx_r2r (paddsw, regs, regd)
  148.78 +
  148.79 +#define         paddusb_m2r(var,reg)        mmx_m2r (paddusb, var, reg)
  148.80 +#define         paddusb_r2r(regs,regd)      mmx_r2r (paddusb, regs, regd)
  148.81 +#define         paddusw_m2r(var,reg)        mmx_m2r (paddusw, var, reg)
  148.82 +#define         paddusw_r2r(regs,regd)      mmx_r2r (paddusw, regs, regd)
  148.83 +
  148.84 +#define         pand_m2r(var,reg)           mmx_m2r (pand, var, reg)
  148.85 +#define         pand_r2r(regs,regd)         mmx_r2r (pand, regs, regd)
  148.86 +
  148.87 +#define         pandn_m2r(var,reg)          mmx_m2r (pandn, var, reg)
  148.88 +#define         pandn_r2r(regs,regd)        mmx_r2r (pandn, regs, regd)
  148.89 +
  148.90 +#define         pcmpeqb_m2r(var,reg)        mmx_m2r (pcmpeqb, var, reg)
  148.91 +#define         pcmpeqb_r2r(regs,regd)      mmx_r2r (pcmpeqb, regs, regd)
  148.92 +#define         pcmpeqd_m2r(var,reg)        mmx_m2r (pcmpeqd, var, reg)
  148.93 +#define         pcmpeqd_r2r(regs,regd)      mmx_r2r (pcmpeqd, regs, regd)
  148.94 +#define         pcmpeqw_m2r(var,reg)        mmx_m2r (pcmpeqw, var, reg)
  148.95 +#define         pcmpeqw_r2r(regs,regd)      mmx_r2r (pcmpeqw, regs, regd)
  148.96 +
  148.97 +#define         pcmpgtb_m2r(var,reg)        mmx_m2r (pcmpgtb, var, reg)
  148.98 +#define         pcmpgtb_r2r(regs,regd)      mmx_r2r (pcmpgtb, regs, regd)
  148.99 +#define         pcmpgtd_m2r(var,reg)        mmx_m2r (pcmpgtd, var, reg)
 148.100 +#define         pcmpgtd_r2r(regs,regd)      mmx_r2r (pcmpgtd, regs, regd)
 148.101 +#define         pcmpgtw_m2r(var,reg)        mmx_m2r (pcmpgtw, var, reg)
 148.102 +#define         pcmpgtw_r2r(regs,regd)      mmx_r2r (pcmpgtw, regs, regd)
 148.103 +
 148.104 +#define         pmaddwd_m2r(var,reg)        mmx_m2r (pmaddwd, var, reg)
 148.105 +#define         pmaddwd_r2r(regs,regd)      mmx_r2r (pmaddwd, regs, regd)
 148.106 +
 148.107 +#define         pmulhw_m2r(var,reg)         mmx_m2r (pmulhw, var, reg)
 148.108 +#define         pmulhw_r2r(regs,regd)       mmx_r2r (pmulhw, regs, regd)
 148.109 +
 148.110 +#define         pmullw_m2r(var,reg)         mmx_m2r (pmullw, var, reg)
 148.111 +#define         pmullw_r2r(regs,regd)       mmx_r2r (pmullw, regs, regd)
 148.112 +
 148.113 +#define         por_m2r(var,reg)            mmx_m2r (por, var, reg)
 148.114 +#define         por_r2r(regs,regd)          mmx_r2r (por, regs, regd)
 148.115 +
 148.116 +#define         pslld_i2r(imm,reg)          mmx_i2r (pslld, imm, reg)
 148.117 +#define         pslld_m2r(var,reg)          mmx_m2r (pslld, var, reg)
 148.118 +#define         pslld_r2r(regs,regd)        mmx_r2r (pslld, regs, regd)
 148.119 +#define         psllq_i2r(imm,reg)          mmx_i2r (psllq, imm, reg)
 148.120 +#define         psllq_m2r(var,reg)          mmx_m2r (psllq, var, reg)
 148.121 +#define         psllq_r2r(regs,regd)        mmx_r2r (psllq, regs, regd)
 148.122 +#define         psllw_i2r(imm,reg)          mmx_i2r (psllw, imm, reg)
 148.123 +#define         psllw_m2r(var,reg)          mmx_m2r (psllw, var, reg)
 148.124 +#define         psllw_r2r(regs,regd)        mmx_r2r (psllw, regs, regd)
 148.125 +
 148.126 +#define         psrad_i2r(imm,reg)          mmx_i2r (psrad, imm, reg)
 148.127 +#define         psrad_m2r(var,reg)          mmx_m2r (psrad, var, reg)
 148.128 +#define         psrad_r2r(regs,regd)        mmx_r2r (psrad, regs, regd)
 148.129 +#define         psraw_i2r(imm,reg)          mmx_i2r (psraw, imm, reg)
 148.130 +#define         psraw_m2r(var,reg)          mmx_m2r (psraw, var, reg)
 148.131 +#define         psraw_r2r(regs,regd)        mmx_r2r (psraw, regs, regd)
 148.132 +
 148.133 +#define         psrld_i2r(imm,reg)          mmx_i2r (psrld, imm, reg)
 148.134 +#define         psrld_m2r(var,reg)          mmx_m2r (psrld, var, reg)
 148.135 +#define         psrld_r2r(regs,regd)        mmx_r2r (psrld, regs, regd)
 148.136 +#define         psrlq_i2r(imm,reg)          mmx_i2r (psrlq, imm, reg)
 148.137 +#define         psrlq_m2r(var,reg)          mmx_m2r (psrlq, var, reg)
 148.138 +#define         psrlq_r2r(regs,regd)        mmx_r2r (psrlq, regs, regd)
 148.139 +#define         psrlw_i2r(imm,reg)          mmx_i2r (psrlw, imm, reg)
 148.140 +#define         psrlw_m2r(var,reg)          mmx_m2r (psrlw, var, reg)
 148.141 +#define         psrlw_r2r(regs,regd)        mmx_r2r (psrlw, regs, regd)
 148.142 +
 148.143 +#define         psubb_m2r(var,reg)          mmx_m2r (psubb, var, reg)
 148.144 +#define         psubb_r2r(regs,regd)        mmx_r2r (psubb, regs, regd)
 148.145 +#define         psubd_m2r(var,reg)          mmx_m2r (psubd, var, reg)
 148.146 +#define         psubd_r2r(regs,regd)        mmx_r2r (psubd, regs, regd)
 148.147 +#define         psubw_m2r(var,reg)          mmx_m2r (psubw, var, reg)
 148.148 +#define         psubw_r2r(regs,regd)        mmx_r2r (psubw, regs, regd)
 148.149 +
 148.150 +#define         psubsb_m2r(var,reg)         mmx_m2r (psubsb, var, reg)
 148.151 +#define         psubsb_r2r(regs,regd)       mmx_r2r (psubsb, regs, regd)
 148.152 +#define         psubsw_m2r(var,reg)         mmx_m2r (psubsw, var, reg)
 148.153 +#define         psubsw_r2r(regs,regd)       mmx_r2r (psubsw, regs, regd)
 148.154 +
 148.155 +#define         psubusb_m2r(var,reg)        mmx_m2r (psubusb, var, reg)
 148.156 +#define         psubusb_r2r(regs,regd)      mmx_r2r (psubusb, regs, regd)
 148.157 +#define         psubusw_m2r(var,reg)        mmx_m2r (psubusw, var, reg)
 148.158 +#define         psubusw_r2r(regs,regd)      mmx_r2r (psubusw, regs, regd)
 148.159 +
 148.160 +#define         punpckhbw_m2r(var,reg)      mmx_m2r (punpckhbw, var, reg)
 148.161 +#define         punpckhbw_r2r(regs,regd)    mmx_r2r (punpckhbw, regs, regd)
 148.162 +#define         punpckhdq_m2r(var,reg)      mmx_m2r (punpckhdq, var, reg)
 148.163 +#define         punpckhdq_r2r(regs,regd)    mmx_r2r (punpckhdq, regs, regd)
 148.164 +#define         punpckhwd_m2r(var,reg)      mmx_m2r (punpckhwd, var, reg)
 148.165 +#define         punpckhwd_r2r(regs,regd)    mmx_r2r (punpckhwd, regs, regd)
 148.166 +
 148.167 +#define         punpcklbw_m2r(var,reg)      mmx_m2r (punpcklbw, var, reg)
 148.168 +#define         punpcklbw_r2r(regs,regd)    mmx_r2r (punpcklbw, regs, regd)
 148.169 +#define         punpckldq_m2r(var,reg)      mmx_m2r (punpckldq, var, reg)
 148.170 +#define         punpckldq_r2r(regs,regd)    mmx_r2r (punpckldq, regs, regd)
 148.171 +#define         punpcklwd_m2r(var,reg)      mmx_m2r (punpcklwd, var, reg)
 148.172 +#define         punpcklwd_r2r(regs,regd)    mmx_r2r (punpcklwd, regs, regd)
 148.173 +
 148.174 +#define         pxor_m2r(var,reg)           mmx_m2r (pxor, var, reg)
 148.175 +#define         pxor_r2r(regs,regd)         mmx_r2r (pxor, regs, regd)
 148.176 +
 148.177 +
 148.178 +/* 3DNOW extensions */
 148.179 +
 148.180 +#define         pavgusb_m2r(var,reg)        mmx_m2r (pavgusb, var, reg)
 148.181 +#define         pavgusb_r2r(regs,regd)      mmx_r2r (pavgusb, regs, regd)
 148.182 +
 148.183 +
 148.184 +/* AMD MMX extensions - also available in intel SSE */
 148.185 +
 148.186 +/* mmx_m2ri: emits "op imm, mem, %reg"; operand 0 is the memory input and operand 1 the immediate, hence the %1, %0 order in the template */
 148.187 +#define         mmx_m2ri(op,mem,reg,imm) \
 148.188 +        __asm__ volatile (#op " %1, %0, %%" #reg \
 148.189 +                              : /* nothing */ \
 148.190 +                              : "m" (mem), "i" (imm))
 148.191 +#define         mmx_r2ri(op,regs,regd,imm) /* emits "op imm, %regs, %regd"; takes four arguments */ \
 148.192 +        __asm__ volatile (#op " %0, %%" #regs ", %%" #regd \
 148.193 +                              : /* nothing */ \
 148.194 +                              : "i" (imm) )
 148.195 +/* mmx_fetch: emits "prefetch<hint> mem", with hint pasted directly onto the mnemonic */
 148.196 +#define         mmx_fetch(mem,hint) \
 148.197 +        __asm__ volatile ("prefetch" #hint " %0" \
 148.198 +                              : /* nothing */ \
 148.199 +                              : "m" (mem))
 148.200 +
 148.201 +/* maskmovq: byte-masked store of regs under the mask in maskreg. The instruction has no immediate, so it goes through the two-register helper mmx_r2r; AT&T operand order is mask first, data second. The store address is implicit in the instruction. */
 148.202 +#define         maskmovq(regs,maskreg)      mmx_r2r (maskmovq, maskreg, regs)
 148.203 +
 148.204 +#define         movntq_r2m(mmreg,var)       mmx_r2m (movntq, mmreg, var)
 148.205 +
 148.206 +#define         pavgb_m2r(var,reg)          mmx_m2r (pavgb, var, reg)
 148.207 +#define         pavgb_r2r(regs,regd)        mmx_r2r (pavgb, regs, regd)
 148.208 +#define         pavgw_m2r(var,reg)          mmx_m2r (pavgw, var, reg)
 148.209 +#define         pavgw_r2r(regs,regd)        mmx_r2r (pavgw, regs, regd)
 148.210 +
 148.211 +#define         pextrw_r2r(mmreg,reg,imm)   mmx_r2ri (pextrw, mmreg, reg, imm)
 148.212 +
 148.213 +#define         pinsrw_r2r(reg,mmreg,imm)   mmx_r2ri (pinsrw, reg, mmreg, imm)
 148.214 +
 148.215 +#define         pmaxsw_m2r(var,reg)         mmx_m2r (pmaxsw, var, reg)
 148.216 +#define         pmaxsw_r2r(regs,regd)       mmx_r2r (pmaxsw, regs, regd)
 148.217 +
 148.218 +#define         pmaxub_m2r(var,reg)         mmx_m2r (pmaxub, var, reg)
 148.219 +#define         pmaxub_r2r(regs,regd)       mmx_r2r (pmaxub, regs, regd)
 148.220 +
 148.221 +#define         pminsw_m2r(var,reg)         mmx_m2r (pminsw, var, reg)
 148.222 +#define         pminsw_r2r(regs,regd)       mmx_r2r (pminsw, regs, regd)
 148.223 +
 148.224 +#define         pminub_m2r(var,reg)         mmx_m2r (pminub, var, reg)
 148.225 +#define         pminub_r2r(regs,regd)       mmx_r2r (pminub, regs, regd)
 148.226 +/* pmovmskb: emits "pmovmskb %mmreg, %reg" to copy the byte sign-bit mask of an MMX register into a general register. Basic asm with no declared operands, so register names take a single %. */
 148.227 +#define         pmovmskb(mmreg,reg) \
 148.228 +        __asm__ volatile ("pmovmskb %" #mmreg ", %" #reg)
 148.229 +
 148.230 +#define         pmulhuw_m2r(var,reg)        mmx_m2r (pmulhuw, var, reg)
 148.231 +#define         pmulhuw_r2r(regs,regd)      mmx_r2r (pmulhuw, regs, regd)
 148.232 +
 148.233 +#define         prefetcht0(mem)             mmx_fetch (mem, t0)
 148.234 +#define         prefetcht1(mem)             mmx_fetch (mem, t1)
 148.235 +#define         prefetcht2(mem)             mmx_fetch (mem, t2)
 148.236 +#define         prefetchnta(mem)            mmx_fetch (mem, nta)
 148.237 +
 148.238 +#define         psadbw_m2r(var,reg)         mmx_m2r (psadbw, var, reg)
 148.239 +#define         psadbw_r2r(regs,regd)       mmx_r2r (psadbw, regs, regd)
 148.240 +
 148.241 +#define         pshufw_m2r(var,reg,imm)     mmx_m2ri(pshufw, var, reg, imm)
 148.242 +#define         pshufw_r2r(regs,regd,imm)   mmx_r2ri(pshufw, regs, regd, imm)
 148.243 +
 148.244 +#define         sfence() __asm__ volatile ("sfence\n\t")
 148.245 +
 148.246 +/* SSE2 -- argument order follows the suffix: _m2r (var, reg), _r2m (reg, var), _r2r (regs, regd), _i2r (imm, reg) */
 148.247 +#define         pshufhw_m2r(var,reg,imm)    mmx_m2ri(pshufhw, var, reg, imm)
 148.248 +#define         pshufhw_r2r(regs,regd,imm)  mmx_r2ri(pshufhw, regs, regd, imm)
 148.249 +#define         pshuflw_m2r(var,reg,imm)    mmx_m2ri(pshuflw, var, reg, imm)
 148.250 +#define         pshuflw_r2r(regs,regd,imm)  mmx_r2ri(pshuflw, regs, regd, imm)
 148.251 +
 148.252 +#define         pshufd_r2r(regs,regd,imm)   mmx_r2ri(pshufd, regs, regd, imm)
 148.253 +
 148.254 +#define         movdqa_m2r(var,reg)         mmx_m2r (movdqa, var, reg)
 148.255 +#define         movdqa_r2m(reg,var)         mmx_r2m (movdqa, reg, var)
 148.256 +#define         movdqa_r2r(regs,regd)       mmx_r2r (movdqa, regs, regd)
 148.257 +#define         movdqu_m2r(var,reg)         mmx_m2r (movdqu, var, reg)
 148.258 +#define         movdqu_r2m(reg,var)         mmx_r2m (movdqu, reg, var)
 148.259 +#define         movdqu_r2r(regs,regd)       mmx_r2r (movdqu, regs, regd)
 148.260 +
 148.261 +#define         pmullw_r2m(reg,var)         mmx_r2m (pmullw, reg, var)
 148.262 +
 148.263 +#define         pslldq_i2r(imm,reg)         mmx_i2r (pslldq, imm, reg)
 148.264 +#define         psrldq_i2r(imm,reg)         mmx_i2r (psrldq, imm, reg)
 148.265 +
 148.266 +#define         punpcklqdq_r2r(regs,regd)   mmx_r2r (punpcklqdq, regs, regd)
 148.267 +#define         punpckhqdq_r2r(regs,regd)   mmx_r2r (punpckhqdq, regs, regd)
 148.268 +
 148.269 +
 148.270 +#endif /* AVCODEC_X86_MMX_H */

   149.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   149.2 +++ b/ffmpeg_smp/h264dec/libavutil/arm/bswap.h	Mon Aug 27 12:09:56 2012 +0200
   149.3 @@ -0,0 +1,72 @@
   149.4 +/*
   149.5 + * This file is part of FFmpeg.
   149.6 + *
   149.7 + * FFmpeg is free software; you can redistribute it and/or
   149.8 + * modify it under the terms of the GNU Lesser General Public
   149.9 + * License as published by the Free Software Foundation; either
  149.10 + * version 2.1 of the License, or (at your option) any later version.
  149.11 + *
  149.12 + * FFmpeg is distributed in the hope that it will be useful,
  149.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  149.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  149.15 + * Lesser General Public License for more details.
  149.16 + *
  149.17 + * You should have received a copy of the GNU Lesser General Public
  149.18 + * License along with FFmpeg; if not, write to the Free Software
  149.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  149.20 + */
  149.21 +
  149.22 +#ifndef AVUTIL_ARM_BSWAP_H
  149.23 +#define AVUTIL_ARM_BSWAP_H
  149.24 +
  149.25 +#include <stdint.h>
  149.26 +#include "config.h"
  149.27 +#include "libavutil/attributes.h"
  149.28 +
  149.29 +#ifdef __ARMCC_VERSION
  149.30 +
  149.31 +#if HAVE_ARMV6
  149.32 +#define bswap_16 bswap_16 /* self-define so that "#ifndef bswap_16" fallbacks are skipped */
  149.33 +static av_always_inline av_const unsigned bswap_16(unsigned x)
  149.34 +{
  149.35 +    __asm { rev16 x, x } /* armcc asm-block syntax (this branch is under __ARMCC_VERSION); rev16 acts on x in place */
  149.36 +    return x;
  149.37 +}
  149.38 +
  149.39 +#define bswap_32 bswap_32 /* self-define so that "#ifndef bswap_32" fallbacks are skipped */
  149.40 +static av_always_inline av_const uint32_t bswap_32(uint32_t x)
  149.41 +{
  149.42 +    return __rev(x); /* __rev is not declared in this header -- presumably an armcc intrinsic; confirm */
  149.43 +}
  149.44 +#endif /* HAVE_ARMV6 */
  149.45 +
  149.46 +#elif HAVE_INLINE_ASM
  149.47 +
  149.48 +#if HAVE_ARMV6
  149.49 +#define bswap_16 bswap_16 /* self-define so that "#ifndef bswap_16" fallbacks are skipped */
  149.50 +static av_always_inline av_const unsigned bswap_16(unsigned x)
  149.51 +{
  149.52 +    __asm__("rev16 %0, %0" : "+r"(x)); /* "+r": x is both input and output of the single rev16 */
  149.53 +    return x;
  149.54 +}
  149.55 +#endif
  149.56 +
  149.57 +#define bswap_32 bswap_32 /* self-define so that "#ifndef bswap_32" fallbacks are skipped */
  149.58 +static av_always_inline av_const uint32_t bswap_32(uint32_t x)
  149.59 +{
  149.60 +#if HAVE_ARMV6
  149.61 +    __asm__("rev %0, %0" : "+r"(x)); /* single instruction, x updated in place */
  149.62 +#else
  149.63 +    uint32_t t; /* scratch register for the four-instruction sequence below */
  149.64 +    __asm__ ("eor %1, %0, %0, ror #16 \n\t" /* t  = x ^ ror(x, 16)      */
  149.65 +             "bic %1, %1, #0xFF0000   \n\t" /* t &= ~0x00FF0000         */
  149.66 +             "mov %0, %0, ror #8      \n\t" /* x  = ror(x, 8)           */
  149.67 +             "eor %0, %0, %1, lsr #8  \n\t" /* x ^= t >> 8              */
  149.68 +             : "+r"(x), "=&r"(t)); /* '&': t is written before x is fully consumed, so it needs its own register */
  149.69 +#endif /* HAVE_ARMV6 */
  149.70 +    return x;
  149.71 +}
  149.72 +
  149.73 +#endif /* __ARMCC_VERSION */
  149.74 +
  149.75 +#endif /* AVUTIL_ARM_BSWAP_H */

   150.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   150.2 +++ b/ffmpeg_smp/h264dec/libavutil/arm/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
   150.3 @@ -0,0 +1,78 @@
   150.4 +/*
   150.5 + * This file is part of FFmpeg.
   150.6 + *
   150.7 + * FFmpeg is free software; you can redistribute it and/or
   150.8 + * modify it under the terms of the GNU Lesser General Public
   150.9 + * License as published by the Free Software Foundation; either
  150.10 + * version 2.1 of the License, or (at your option) any later version.
  150.11 + *
  150.12 + * FFmpeg is distributed in the hope that it will be useful,
  150.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  150.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  150.15 + * Lesser General Public License for more details.
  150.16 + *
  150.17 + * You should have received a copy of the GNU Lesser General Public
  150.18 + * License along with FFmpeg; if not, write to the Free Software
  150.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  150.20 + */
  150.21 +
  150.22 +#ifndef AVUTIL_ARM_INTREADWRITE_H
  150.23 +#define AVUTIL_ARM_INTREADWRITE_H
  150.24 +
  150.25 +#include <stdint.h>
  150.26 +#include "config.h"
  150.27 +
  150.28 +#if HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM
  150.29 +
  150.30 +#define AV_RN16 AV_RN16 /* self-define: marks AV_RN16 as provided by this header */
  150.31 +static av_always_inline uint16_t AV_RN16(const void *p)
  150.32 +{
  150.33 +    uint16_t v;
  150.34 +    __asm__ ("ldrh %0, %1" : "=r"(v) : "m"(*(const uint16_t *)p)); /* one ldrh from *p; the "m" operand lets the compiler pick the addressing mode */
  150.35 +    return v;
  150.36 +}
  150.37 +
  150.38 +#define AV_WN16 AV_WN16 /* self-define: marks AV_WN16 as provided by this header */
  150.39 +static av_always_inline void AV_WN16(void *p, uint16_t v)
  150.40 +{
  150.41 +    __asm__ ("strh %1, %0" : "=m"(*(uint16_t *)p) : "r"(v)); /* one strh of v to *p; "=m" declares the store to the compiler */
  150.42 +}
  150.43 +
  150.44 +#define AV_RN32 AV_RN32 /* self-define: marks AV_RN32 as provided by this header */
  150.45 +static av_always_inline uint32_t AV_RN32(const void *p)
  150.46 +{
  150.47 +    uint32_t v;
  150.48 +    __asm__ ("ldr  %0, %1" : "=r"(v) : "m"(*(const uint32_t *)p)); /* one ldr from *p, whatever its alignment (block is under HAVE_FAST_UNALIGNED) */
  150.49 +    return v;
  150.50 +}
  150.51 +
  150.52 +#define AV_WN32 AV_WN32 /* self-define: marks AV_WN32 as provided by this header */
  150.53 +static av_always_inline void AV_WN32(void *p, uint32_t v)
  150.54 +{
  150.55 +    __asm__ ("str  %1, %0" : "=m"(*(uint32_t *)p) : "r"(v)); /* one str of v to *p; "=m" declares the store to the compiler */
  150.56 +}
  150.57 +
  150.58 +#define AV_RN64 AV_RN64 /* self-define: marks AV_RN64 as provided by this header */
  150.59 +static av_always_inline uint64_t AV_RN64(const void *p)
  150.60 +{
  150.61 +    union { uint64_t v; uint32_t hl[2]; } v; /* the two words are loaded separately and returned as one 64-bit value */
  150.62 +    __asm__ ("ldr   %0, %2  \n\t" /* hl[0] <- word at p     */
  150.63 +             "ldr   %1, %3  \n\t" /* hl[1] <- word at p + 4 */
  150.64 +             : "=&r"(v.hl[0]), "=r"(v.hl[1]) /* '&': hl[0] is written before the second address is read */
  150.65 +             : "m"(*(const uint32_t*)p), "m"(*((const uint32_t*)p+1)));
  150.66 +    return v.v;
  150.67 +}
  150.68 +
  150.69 +#define AV_WN64 AV_WN64 /* self-define: marks AV_WN64 as provided by this header */
  150.70 +static av_always_inline void AV_WN64(void *p, uint64_t v)
  150.71 +{
  150.72 +    union { uint64_t v; uint32_t hl[2]; } vv = { v }; /* view the 64-bit value as two 32-bit words */
  150.73 +    __asm__ ("str  %2, %0  \n\t" /* word at p     <- hl[0] */
  150.74 +             "str  %3, %1  \n\t" /* word at p + 4 <- hl[1] */
  150.75 +             : "=m"(*(uint32_t*)p), "=m"(*((uint32_t*)p+1))
  150.76 +             : "r"(vv.hl[0]), "r"(vv.hl[1]));
  150.77 +}
  150.78 +
  150.79 +#endif /* HAVE_FAST_UNALIGNED && HAVE_INLINE_ASM */
  150.80 +
  150.81 +#endif /* AVUTIL_ARM_INTREADWRITE_H */

   151.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   151.2 +++ b/ffmpeg_smp/h264dec/libavutil/arm/timer.h	Mon Aug 27 12:09:56 2012 +0200
   151.3 @@ -0,0 +1,40 @@
   151.4 +/*
   151.5 + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
   151.6 + *
   151.7 + * This file is part of FFmpeg.
   151.8 + *
   151.9 + * FFmpeg is free software; you can redistribute it and/or
  151.10 + * modify it under the terms of the GNU Lesser General Public
  151.11 + * License as published by the Free Software Foundation; either
  151.12 + * version 2.1 of the License, or (at your option) any later version.
  151.13 + *
  151.14 + * FFmpeg is distributed in the hope that it will be useful,
  151.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  151.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  151.17 + * Lesser General Public License for more details.
  151.18 + *
  151.19 + * You should have received a copy of the GNU Lesser General Public
  151.20 + * License along with FFmpeg; if not, write to the Free Software
  151.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  151.22 + */
  151.23 +
  151.24 +#ifndef AVUTIL_ARM_TIMER_H
  151.25 +#define AVUTIL_ARM_TIMER_H
  151.26 +
  151.27 +#include <stdint.h>
  151.28 +#include "config.h"
  151.29 +
  151.30 +#if HAVE_INLINE_ASM && defined(__ARM_ARCH_7A__)
  151.31 +
  151.32 +#define AV_READ_TIME read_time /* selects read_time() below as the AV_READ_TIME implementation */
  151.33 +
  151.34 +static inline uint64_t read_time(void)
  151.35 +{
  151.36 +    unsigned cc;
  151.37 +    __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r"(cc)); /* reads CP15 c9/c13/0 -- presumably the ARMv7-A cycle counter; confirm against the architecture manual */
  151.38 +    return cc; /* the unsigned value is widened to uint64_t, so the upper bits are zero */
  151.39 +}
  151.40 +
  151.41 +#endif /* HAVE_INLINE_ASM && __ARM_ARCH_7A__ */
  151.42 +
  151.43 +#endif /* AVUTIL_ARM_TIMER_H */

   152.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   152.2 +++ b/ffmpeg_smp/h264dec/libavutil/attributes.h	Mon Aug 27 12:09:56 2012 +0200
   152.3 @@ -0,0 +1,113 @@
   152.4 +/*
   152.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   152.6 + *
   152.7 + * This file is part of FFmpeg.
   152.8 + *
   152.9 + * FFmpeg is free software; you can redistribute it and/or
  152.10 + * modify it under the terms of the GNU Lesser General Public
  152.11 + * License as published by the Free Software Foundation; either
  152.12 + * version 2.1 of the License, or (at your option) any later version.
  152.13 + *
  152.14 + * FFmpeg is distributed in the hope that it will be useful,
  152.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  152.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  152.17 + * Lesser General Public License for more details.
  152.18 + *
  152.19 + * You should have received a copy of the GNU Lesser General Public
  152.20 + * License along with FFmpeg; if not, write to the Free Software
  152.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  152.22 + */
  152.23 +
  152.24 +/**
  152.25 + * @file
  152.26 + * Macro definitions for various function/variable attributes
  152.27 + */
  152.28 +
  152.29 +#ifndef AVUTIL_ATTRIBUTES_H
  152.30 +#define AVUTIL_ATTRIBUTES_H
  152.31 +
  152.32 +#ifdef __GNUC__ /* AV_GCC_VERSION_AT_LEAST(x,y): nonzero when the compiler reports GCC version >= x.y */
  152.33 +#    define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
  152.34 +#else /* __GNUC__ not defined: every version check is false */
  152.35 +#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
  152.36 +#endif
  152.37 +
  152.38 +#ifndef av_always_inline /* keep any definition supplied before this header */
  152.39 +#if AV_GCC_VERSION_AT_LEAST(3,1)
  152.40 +#    define av_always_inline __attribute__((always_inline)) inline
  152.41 +#else /* older GCC or a compiler without __GNUC__: plain inline only */
  152.42 +#    define av_always_inline inline
  152.43 +#endif
  152.44 +#endif
  152.45 +
  152.46 +#ifndef av_noinline /* keep any definition supplied before this header */
  152.47 +#if AV_GCC_VERSION_AT_LEAST(3,1)
  152.48 +#    define av_noinline __attribute__((noinline))
  152.49 +#else /* otherwise the macro expands to nothing */
  152.50 +#    define av_noinline
  152.51 +#endif
  152.52 +#endif
  152.53 +
  152.54 +#ifndef av_pure /* keep any definition supplied before this header */
  152.55 +#if AV_GCC_VERSION_AT_LEAST(3,1)
  152.56 +#    define av_pure __attribute__((pure))
  152.57 +#else /* otherwise the macro expands to nothing */
  152.58 +#    define av_pure
  152.59 +#endif
  152.60 +#endif
  152.61 +
  152.62 +#ifndef av_const /* keep any definition supplied before this header */
  152.63 +#if AV_GCC_VERSION_AT_LEAST(2,6)
  152.64 +#    define av_const __attribute__((const))
  152.65 +#else /* otherwise the macro expands to nothing */
  152.66 +#    define av_const
  152.67 +#endif
  152.68 +#endif
  152.69 +
  152.70 +#ifndef av_cold /* keep any definition supplied before this header */
  152.71 +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3) /* not used when __ICC is defined and <= 1110 */
  152.72 +#    define av_cold __attribute__((cold))
  152.73 +#else /* otherwise the macro expands to nothing */
  152.74 +#    define av_cold
  152.75 +#endif
  152.76 +#endif
  152.77 +
  152.78 +#ifndef av_flatten /* keep any definition supplied before this header */
  152.79 +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1) /* not used when __ICC is defined and <= 1110 */
  152.80 +#    define av_flatten __attribute__((flatten))
  152.81 +#else /* otherwise the macro expands to nothing */
  152.82 +#    define av_flatten
  152.83 +#endif
  152.84 +#endif
  152.85 +
  152.86 +#ifndef attribute_deprecated /* keep any definition supplied before this header */
  152.87 +#if AV_GCC_VERSION_AT_LEAST(3,1)
  152.88 +#    define attribute_deprecated __attribute__((deprecated))
  152.89 +#else /* otherwise the macro expands to nothing */
  152.90 +#    define attribute_deprecated
  152.91 +#endif
  152.92 +#endif
  152.93 +
  152.94 +#ifndef av_unused /* keep any definition supplied before this header */
  152.95 +#if defined(__GNUC__) /* no version gate here, unlike the attributes above */
  152.96 +#    define av_unused __attribute__((unused))
  152.97 +#else /* otherwise the macro expands to nothing */
  152.98 +#    define av_unused
  152.99 +#endif
 152.100 +#endif
 152.101 +
 152.102 +#ifndef av_uninit /* keep any definition supplied before this header */
 152.103 +#if defined(__GNUC__) && !defined(__ICC)
 152.104 +#    define av_uninit(x) x=x /* declares x initialized from itself -- presumably to silence "may be used uninitialized" warnings; confirm */
 152.105 +#else /* elsewhere the macro is just the bare name */
 152.106 +#    define av_uninit(x) x
 152.107 +#endif
 152.108 +#endif
 152.109 +
 152.110 +#ifdef __GNUC__ /* forward to the compiler builtin when __GNUC__ is defined */
 152.111 +#    define av_builtin_constant_p __builtin_constant_p
 152.112 +#else /* otherwise always answer "not a constant" */
 152.113 +#    define av_builtin_constant_p(x) 0
 152.114 +#endif
 152.115 +
 152.116 +#endif /* AVUTIL_ATTRIBUTES_H */

   153.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   153.2 +++ b/ffmpeg_smp/h264dec/libavutil/bswap.h	Mon Aug 27 12:09:56 2012 +0200
   153.3 @@ -0,0 +1,95 @@
   153.4 +/*
   153.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   153.6 + *
   153.7 + * This file is part of FFmpeg.
   153.8 + *
   153.9 + * FFmpeg is free software; you can redistribute it and/or
  153.10 + * modify it under the terms of the GNU Lesser General Public
  153.11 + * License as published by the Free Software Foundation; either
  153.12 + * version 2.1 of the License, or (at your option) any later version.
  153.13 + *
  153.14 + * FFmpeg is distributed in the hope that it will be useful,
  153.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  153.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  153.17 + * Lesser General Public License for more details.
  153.18 + *
  153.19 + * You should have received a copy of the GNU Lesser General Public
  153.20 + * License along with FFmpeg; if not, write to the Free Software
  153.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  153.22 + */
  153.23 +
  153.24 +/**
  153.25 + * @file
  153.26 + * byte swapping routines
  153.27 + */
  153.28 +
  153.29 +#ifndef AVUTIL_BSWAP_H
  153.30 +#define AVUTIL_BSWAP_H
  153.31 +
  153.32 +#include <stdint.h>
  153.33 +#include "config.h"
  153.34 +#include "attributes.h"
  153.35 +
  153.36 +#if   ARCH_ARM
  153.37 +#   include "arm/bswap.h"
  153.38 +#elif ARCH_X86
  153.39 +#   include "x86/bswap.h"
  153.40 +#endif
  153.41 +
  153.42 +#ifndef bswap_16 /* generic version, used when no arch header defined bswap_16 */
  153.43 +static av_always_inline av_const uint16_t bswap_16(uint16_t x)
  153.44 +{
  153.45 +    const unsigned high_byte = x >> 8, low_byte = x & 0xFFu;
  153.46 +    return (uint16_t)((low_byte << 8) | high_byte); /* exchange the two bytes */
  153.47 +}
  153.48 +#endif
  153.49 +
  153.50 +#ifndef bswap_32 /* generic version, used when no arch header defined bswap_32 */
  153.51 +static av_always_inline av_const uint32_t bswap_32(uint32_t x)
  153.52 +{
  153.53 +    const uint32_t moved_up   = (x & 0x00FF00FF) << 8; /* bytes 0 and 2, one position higher */
  153.54 +    const uint32_t moved_down = (x >> 8) & 0x00FF00FF; /* bytes 1 and 3, one position lower  */
  153.55 +    return ((moved_up | moved_down) << 16) | ((moved_up | moved_down) >> 16); /* then exchange the halfwords */
  153.56 +}
  153.57 +#endif
  153.58 +
  153.59 +#ifndef bswap_64
  153.60 +static inline uint64_t av_const bswap_64(uint64_t x)
  153.61 +{
  153.62 +#if 0
  153.63 +    x= ((x<< 8)&0xFF00FF00FF00FF00ULL) | ((x>> 8)&0x00FF00FF00FF00FFULL);
  153.64 +    x= ((x<<16)&0xFFFF0000FFFF0000ULL) | ((x>>16)&0x0000FFFF0000FFFFULL);
  153.65 +    return (x>>32) | (x<<32);
  153.66 +#else
  153.67 +    /* byte-swap each 32-bit half with bswap_32 and store it in the opposite slot */
  153.68 +    union {
  153.69 +        uint64_t whole;
  153.70 +        uint32_t half[2];
  153.71 +    } src = { x }, dst;
  153.72 +    dst.half[1] = bswap_32(src.half[0]);
  153.73 +    dst.half[0] = bswap_32(src.half[1]);
  153.74 +    return dst.whole;
  153.75 +#endif
  153.76 +}
  153.77 +#endif
  153.78 +
  153.79 +// be2me ... big-endian to machine-endian
  153.80 +// le2me ... little-endian to machine-endian
  153.81 +
  153.82 +#if HAVE_BIGENDIAN /* big-endian host: be2me_* are no-ops, le2me_* byte-swap */
  153.83 +#define be2me_16(x) (x)
  153.84 +#define be2me_32(x) (x)
  153.85 +#define be2me_64(x) (x)
  153.86 +#define le2me_16(x) bswap_16(x)
  153.87 +#define le2me_32(x) bswap_32(x)
  153.88 +#define le2me_64(x) bswap_64(x)
  153.89 +#else /* otherwise the roles are reversed: be2me_* byte-swap, le2me_* are no-ops */
  153.90 +#define be2me_16(x) bswap_16(x)
  153.91 +#define be2me_32(x) bswap_32(x)
  153.92 +#define be2me_64(x) bswap_64(x)
  153.93 +#define le2me_16(x) (x)
  153.94 +#define le2me_32(x) (x)
  153.95 +#define le2me_64(x) (x)
  153.96 +#endif
  153.97 +
  153.98 +#endif /* AVUTIL_BSWAP_H */

   154.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   154.2 +++ b/ffmpeg_smp/h264dec/libavutil/common.h	Mon Aug 27 12:09:56 2012 +0200
   154.3 @@ -0,0 +1,298 @@
   154.4 +/*
   154.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   154.6 + *
   154.7 + * This file is part of FFmpeg.
   154.8 + *
   154.9 + * FFmpeg is free software; you can redistribute it and/or
  154.10 + * modify it under the terms of the GNU Lesser General Public
  154.11 + * License as published by the Free Software Foundation; either
  154.12 + * version 2.1 of the License, or (at your option) any later version.
  154.13 + *
  154.14 + * FFmpeg is distributed in the hope that it will be useful,
  154.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  154.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  154.17 + * Lesser General Public License for more details.
  154.18 + *
  154.19 + * You should have received a copy of the GNU Lesser General Public
  154.20 + * License along with FFmpeg; if not, write to the Free Software
  154.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  154.22 + */
  154.23 +
  154.24 +/**
  154.25 + * @file
  154.26 + * common internal and external API header
  154.27 + */
  154.28 +
  154.29 +#ifndef AVUTIL_COMMON_H
  154.30 +#define AVUTIL_COMMON_H
  154.31 +
  154.32 +#include <ctype.h>
  154.33 +#include <errno.h>
  154.34 +#include <inttypes.h>
  154.35 +#include <limits.h>
  154.36 +#include <math.h>
  154.37 +#include <stdio.h>
  154.38 +#include <stdlib.h>
  154.39 +#include <string.h>
  154.40 +#include "attributes.h"
  154.41 +
  154.42 +//rounded division & shift (these macros may evaluate their arguments more than once: avoid side effects)
  154.43 +#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
  154.44 +/* assume b>0 */
  154.45 +#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
  154.46 +#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
  154.47 +#define FFSIGN(a) ((a) > 0 ? 1 : -1) /* note: yields -1 for zero as well as for negative values */
  154.48 +
  154.49 +#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
  154.50 +#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
  154.51 +#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
  154.52 +#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)
  154.53 +
  154.54 +#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0) /* a and b must be lvalues of the given type */
  154.55 +#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0])) /* element count; meaningful for true arrays only, not pointers */
  154.56 +#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1)) /* round x up to a multiple of a; the mask form needs a to be a power of two */
  154.57 +
  154.58 +/* misc math functions */
  154.59 +extern const uint8_t ff_log2_tab[256];
  154.60 +
  154.61 +static inline av_const int av_log2_c(unsigned int v)
  154.62 +{
  154.63 +    int bits = 0; /* shift amount accumulated so far */
  154.64 +    if ((v & 0xffff0000) != 0) { /* something set in the upper halfword: continue there */
  154.65 +        bits = 16;
  154.66 +        v  >>= 16;
  154.67 +    }
  154.68 +    if ((v & 0xff00) != 0) { /* something set in the upper byte of what remains */
  154.69 +        bits += 8;
  154.70 +        v   >>= 8;
  154.71 +    }
  154.72 +    /* the remaining value indexes ff_log2_tab for the final step */
  154.73 +
  154.74 +    return bits + ff_log2_tab[v];
  154.75 +}
  154.76 +
  154.77 +static inline av_const int av_log2_16bit_c(unsigned int v)
  154.78 +{
  154.79 +    /* 16-bit variant: only the 0xff00 byte is tested before the table lookup */
  154.80 +    int base = 0;
  154.81 +    if ((v & 0xff00) != 0) {
  154.82 +        base = 8;
  154.83 +        v  >>= 8;
  154.84 +    }
  154.85 +
  154.86 +    return base + ff_log2_tab[v];
  154.87 +}
  154.88 +
  154.89 +#ifdef HAVE_AV_CONFIG_H
  154.90 +#   include "config.h"
  154.91 +#endif
  154.92 +
  154.93 +/**
  154.94 + * Limits a signed integer to the range amin..amax.
  154.95 + * @param a    value to limit
  154.96 + * @param amin lower bound (compared first)
  154.97 + * @param amax upper bound
  154.98 + * @return amin if a < amin, otherwise amax if a > amax, otherwise a
  154.99 + */
 154.100 +static inline av_const int av_clip(int a, int amin, int amax)
 154.101 +{
 154.102 +    if (a < amin)
 154.103 +        return amin;
 154.104 +    return a > amax ? amax : a;
 154.105 +}
 154.106 +
 154.107 +/**
 154.108 + * Clips a signed integer value into the 0-255 range.
 154.109 + * @param a value to clip (any int, INT_MIN included)
 154.110 + * @return 0 if a is negative, 255 if a is above 255, otherwise a
 154.111 + */
 154.112 +static inline av_const uint8_t av_clip_uint8(int a)
 154.113 +{
 154.114 +    if (a&(~0xFF)) return a < 0 ? 0 : 0xFF; /* decide by sign; no -a, which overflows for INT_MIN */
 154.115 +    else           return a;
 154.116 +}
 154.117 +
 154.118 +/**
 154.119 + * Clips a signed integer value into the 0-65535 range.
 154.120 + * @param a value to clip (any int, INT_MIN included)
 154.121 + * @return 0 if a is negative, 65535 if a is above 65535, otherwise a
 154.122 + */
 154.123 +static inline av_const uint16_t av_clip_uint16(int a)
 154.124 +{
 154.125 +    if (a&(~0xFFFF)) return a < 0 ? 0 : 0xFFFF; /* decide by sign; no -a, which overflows for INT_MIN */
 154.126 +    else             return a;
 154.127 +}
 154.128 +
 154.129 +/**
 154.130 + * Clips a signed integer value into the -32768,32767 range.
 154.131 + * @param a value to clip (any int, values near INT_MAX included)
 154.132 + * @return -32768 if a is below the range, 32767 if above, otherwise a
 154.133 + */
 154.134 +static inline av_const int16_t av_clip_int16(int a)
 154.135 +{
 154.136 +    if ((a+0x8000U) & ~0xFFFF) return a < 0 ? -0x8000 : 0x7FFF; /* unsigned add: no signed overflow near INT_MAX */
 154.137 +    else                       return a;
 154.138 +}
 154.139 +
 154.140 +/**
 154.141 + * Clips a signed 64-bit integer value into the -2147483648,2147483647 range.
 154.142 + * @param a value to clip (any int64_t, values near INT64_MAX included)
 154.143 + * @return INT32_MIN if a is below the range, INT32_MAX if above, otherwise a
 154.144 + */
 154.145 +static inline av_const int32_t av_clipl_int32(int64_t a)
 154.146 +{
 154.147 +    if ((a+UINT64_C(0x80000000)) & ~UINT64_C(0xFFFFFFFF)) return a < 0 ? INT32_MIN : INT32_MAX; /* 64-bit unsigned add cannot overflow */
 154.148 +    else                                                  return a;
 154.149 +}
 154.150 +
 154.151 +/**
 154.152 + * Limits a float to the range amin..amax.
 154.153 + * @param a    value to limit
 154.154 + * @param amin lower bound (compared first)
 154.155 + * @param amax upper bound
 154.156 + * @return amin if a < amin, otherwise amax if a > amax, otherwise a
 154.157 + */
 154.158 +static inline av_const float av_clipf(float a, float amin, float amax)
 154.159 +{
 154.160 +    if (a < amin)
 154.161 +        return amin;
 154.162 +    return a > amax ? amax : a;
 154.163 +}
 154.164 +
 154.165 +/** Computes ceil(log2(x)).
 154.166 + * @param x value used to compute ceil(log2(x))
 154.167 + * @return computed ceiling of log2(x)
 154.168 + */
 154.169 +static inline av_const int av_ceil_log2(int x)
 154.170 +{
 154.171 +    return av_log2_c((x - 1U) << 1); /* unsigned arithmetic: the shift is well defined for every x, including x <= 0 */
 154.172 +}
 154.173 +
 154.174 +#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((unsigned)(d) << 24)) /* a is the lowest byte; top byte shifted as unsigned */
 154.175 +#define MKBETAG(a,b,c,d) ((d) | ((c) << 8) | ((b) << 16) | ((unsigned)(a) << 24)) /* a is the highest byte; arguments parenthesized */
 154.176 +
 154.177 +/*!
 154.178 + * \def GET_UTF8(val, GET_BYTE, ERROR)
 154.179 + * Converts a UTF-8 character (up to 4 bytes long) to its 32-bit UCS-4 encoded form
 154.180 + * \param val is the output and should be of type uint32_t. It holds the converted
 154.181 + * UCS-4 character and should be a left value.
 154.182 + * \param GET_BYTE gets UTF-8 encoded bytes from any proper source. It can be
 154.183 + * a function or a statement whose return value or evaluated value is of type
 154.184 + * uint8_t. It will be executed up to 4 times for values in the valid UTF-8 range,
 154.185 + * and up to 7 times in the general case.
 154.186 + * \param ERROR action that should be taken when an invalid UTF-8 byte is returned
 154.187 + * from GET_BYTE. It should be a statement that jumps out of the macro,
 154.188 + * like exit(), goto, return, break, or continue.
 154.189 + */
 154.190 +#define GET_UTF8(val, GET_BYTE, ERROR)\
 154.191 +    val= GET_BYTE;\
 154.192 +    {\
 154.193 +        int ones= 7 - av_log2(val ^ 255); /* derived from the highest clear bit of the lead byte */\
 154.194 +        if(ones==1) /* lead byte of the form 10xxxxxx is rejected */\
 154.195 +            ERROR\
 154.196 +        val&= 127>>ones; /* keep only the payload bits of the lead byte */\
 154.197 +        while(--ones > 0){\
 154.198 +            int tmp= GET_BYTE - 128; /* a following byte in 0x80..0xBF becomes 0..63 */\
 154.199 +            if(tmp>>6) /* anything outside 0..63 is rejected */\
 154.200 +                ERROR\
 154.201 +            val= (val<<6) + tmp; /* append six payload bits */\
 154.202 +        }\
 154.203 +    }
 154.204 +
 154.205 +/*!
 154.206 + * \def GET_UTF16(val, GET_16BIT, ERROR)
 154.207 + * Converts a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form
 154.208 + * \param val is the output and should be of type uint32_t. It holds the converted
 154.209 + * UCS-4 character and should be a left value.
 154.210 + * \param GET_16BIT gets two bytes of UTF-16 encoded data converted to native endianness.
 154.211 + * It can be a function or a statement whose return value or evaluated value is of type
 154.212 + * uint16_t. It will be executed up to 2 times.
 154.213 + * \param ERROR action that should be taken when an invalid UTF-16 surrogate is
 154.214 + * returned from GET_16BIT. It should be a statement that jumps out of the macro,
 154.215 + * like exit(), goto, return, break, or continue.
 154.216 + */
 154.217 +#define GET_UTF16(val, GET_16BIT, ERROR)\
 154.218 +    val = GET_16BIT;\
 154.219 +    {\
 154.220 +        unsigned int hi = val - 0xD800; /* offset of the first unit from 0xD800 */\
 154.221 +        if (hi < 0x800) { /* first unit lies in 0xD800..0xDFFF: a second unit is read */\
 154.222 +            val = GET_16BIT - 0xDC00; /* offset of the second unit from 0xDC00 */\
 154.223 +            if (val > 0x3FFU || hi > 0x3FFU) /* either offset outside 0..0x3FF is rejected */\
 154.224 +                ERROR\
 154.225 +            val += (hi<<10) + 0x10000; /* combine the two 10-bit offsets */\
 154.226 +        }\
 154.227 +    }\
 154.228 +
 154.229 +/*!
 154.230 + * \def PUT_UTF8(val, tmp, PUT_BYTE)
 154.231 + * Converts a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
 154.232 + * \param val is an input-only argument and should be of type uint32_t. It holds
 154.233 + * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If
 154.234 + * val is given as a function it is executed only once.
 154.235 + * \param tmp is a temporary variable and should be of type uint8_t. It
 154.236 + * represents an intermediate value during conversion that is to be
 154.237 + * output by PUT_BYTE.
 154.238 + * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination.
 154.239 + * It could be a function or a statement, and uses tmp as the input byte.
 154.240 + * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be
 154.241 + * executed up to 4 times for values in the valid UTF-8 range and up to
 154.242 + * 7 times in the general case, depending on the length of the converted
 154.243 + * Unicode character.
 154.244 + */
 154.245 +#define PUT_UTF8(val, tmp, PUT_BYTE)\
 154.246 +    {\
 154.247 +        int bytes, shift;\
 154.248 +        uint32_t in = val; /* val is evaluated once, here */\
 154.249 +        if (in < 0x80) { /* values below 0x80 are emitted as a single byte */\
 154.250 +            tmp = in;\
 154.251 +            PUT_BYTE\
 154.252 +        } else {\
 154.253 +            bytes = (av_log2(in) + 4) / 5; /* length of the encoded sequence in bytes */\
 154.254 +            shift = (bytes - 1) * 6; /* each byte after the first carries six bits */\
 154.255 +            tmp = (256 - (256 >> bytes)) | (in >> shift); /* first byte: length marker plus the top bits */\
 154.256 +            PUT_BYTE\
 154.257 +            while (shift >= 6) {\
 154.258 +                shift -= 6;\
 154.259 +                tmp = 0x80 | ((in >> shift) & 0x3f); /* following byte: 0x80 plus the next six bits */\
 154.260 +                PUT_BYTE\
 154.261 +            }\
 154.262 +        }\
 154.263 +    }
 154.264 +
 154.265 +/*!
 154.266 + * \def PUT_UTF16(val, tmp, PUT_16BIT)
 154.267 + * Converts a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes).
 154.268 + * \param val is an input-only argument and should be of type uint32_t. It holds
 154.269 + * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If
 154.270 + * val is given as a function it is executed only once.
 154.271 + * \param tmp is a temporary variable and should be of type uint16_t. It
 154.272 + * represents an intermediate value during conversion that is to be
 154.273 + * output by PUT_16BIT.
 154.274 + * \param PUT_16BIT writes the converted UTF-16 data to any proper destination
 154.275 + * in desired endianness. It could be a function or a statement, and uses tmp
 154.276 + * as the input value.  For example, PUT_16BIT could be "*output++ = tmp;"
 154.277 + * PUT_16BIT will be executed 1 or 2 times depending on input character.
 154.278 + */
 154.279 +#define PUT_UTF16(val, tmp, PUT_16BIT)\
 154.280 +    {\
 154.281 +        uint32_t in = val; /* val is evaluated once, here */\
 154.282 +        if (in < 0x10000) { /* values below 0x10000 are emitted as one unit */\
 154.283 +            tmp = in;\
 154.284 +            PUT_16BIT\
 154.285 +        } else {\
 154.286 +            tmp = 0xD800 | ((in - 0x10000) >> 10); /* first unit: 0xD800 plus the upper bits */\
 154.287 +            PUT_16BIT\
 154.288 +            tmp = 0xDC00 | ((in - 0x10000) & 0x3FF); /* second unit: 0xDC00 plus the low ten bits */\
 154.289 +            PUT_16BIT\
 154.290 +        }\
 154.291 +    }\
 154.292 +
 154.293 +
 154.294 +
 154.295 +#include "mem.h"
 154.296 +
 154.297 +#ifdef HAVE_AV_CONFIG_H
 154.298 +#    include "internal.h"
 154.299 +#endif /* HAVE_AV_CONFIG_H */
 154.300 +
 154.301 +#endif /* AVUTIL_COMMON_H */

   155.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   155.2 +++ b/ffmpeg_smp/h264dec/libavutil/error.h	Mon Aug 27 12:09:56 2012 +0200
   155.3 @@ -0,0 +1,53 @@
   155.4 +/*
   155.5 + * This file is part of FFmpeg.
   155.6 + *
   155.7 + * FFmpeg is free software; you can redistribute it and/or
   155.8 + * modify it under the terms of the GNU Lesser General Public
   155.9 + * License as published by the Free Software Foundation; either
  155.10 + * version 2.1 of the License, or (at your option) any later version.
  155.11 + *
  155.12 + * FFmpeg is distributed in the hope that it will be useful,
  155.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  155.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  155.15 + * Lesser General Public License for more details.
  155.16 + *
  155.17 + * You should have received a copy of the GNU Lesser General Public
  155.18 + * License along with FFmpeg; if not, write to the Free Software
  155.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  155.20 + */
  155.21 +
  155.22 +/**
  155.23 + * @file
  155.24 + * error code definitions
  155.25 + */
  155.26 +
  155.27 +#ifndef AVUTIL_ERROR_H
  155.28 +#define AVUTIL_ERROR_H
  155.29 +
  155.30 +#include <errno.h>
  155.31 +#include "common.h"
  155.32 +
  155.33 +/* error handling: the sign of EDOM decides whether errno-style codes are negated */
  155.34 +#if EDOM > 0
  155.35 +#define AVERROR(e) (-(e))   ///< Returns a negative error code from a POSIX error code, to return from library functions.
  155.36 +#define AVUNERROR(e) (-(e)) ///< Returns a POSIX error code from a library function error return value.
  155.37 +#else
  155.38 +/* Some platforms have E* and errno already negated. */
  155.39 +#define AVERROR(e) (e)
  155.40 +#define AVUNERROR(e) (e)
  155.41 +#endif
  155.42 +
  155.43 +#define AVERROR_EOF         AVERROR(EPIPE)   ///< End of file
  155.44 +
  155.45 +
  155.46 +/**
  155.47 + * Puts a description of the AVERROR code errnum in errbuf.
  155.48 + * In case of failure the global variable errno is set to indicate the
  155.49 + * error.
  155.50 + *
  155.51 + * @param errbuf_size the size in bytes of errbuf
  155.52 + * @return 0 on success, a negative value otherwise
  155.53 + */
  155.54 +int av_strerror(int errnum, char *errbuf, size_t errbuf_size);
  155.55 +
  155.56 +#endif /* AVUTIL_ERROR_H */

   156.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   156.2 +++ b/ffmpeg_smp/h264dec/libavutil/internal.h	Mon Aug 27 12:09:56 2012 +0200
   156.3 @@ -0,0 +1,168 @@
   156.4 +/*
   156.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   156.6 + *
   156.7 + * This file is part of FFmpeg.
   156.8 + *
   156.9 + * FFmpeg is free software; you can redistribute it and/or
  156.10 + * modify it under the terms of the GNU Lesser General Public
  156.11 + * License as published by the Free Software Foundation; either
  156.12 + * version 2.1 of the License, or (at your option) any later version.
  156.13 + *
  156.14 + * FFmpeg is distributed in the hope that it will be useful,
  156.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  156.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  156.17 + * Lesser General Public License for more details.
  156.18 + *
  156.19 + * You should have received a copy of the GNU Lesser General Public
  156.20 + * License along with FFmpeg; if not, write to the Free Software
  156.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  156.22 + */
  156.23 +
  156.24 +/**
  156.25 + * @file
  156.26 + * common internal API header
  156.27 + */
  156.28 +
  156.29 +#ifndef AVUTIL_INTERNAL_H
  156.30 +#define AVUTIL_INTERNAL_H
  156.31 +
  156.32 +#if !defined(DEBUG) && !defined(NDEBUG) /* default to NDEBUG unless DEBUG is set; <assert.h> is included below */
  156.33 +#    define NDEBUG
  156.34 +#endif
  156.35 +
  156.36 +#include <limits.h>
  156.37 +#include <stdint.h>
  156.38 +#include <stddef.h>
  156.39 +#include <assert.h>
  156.40 +#include "config.h"
  156.41 +#include "attributes.h"
  156.42 +#include "timer.h"
  156.43 +
  156.44 +
  156.45 +
  156.46 +#ifndef INT16_MIN
  156.47 +#define INT16_MIN       (-0x7fff - 1)
  156.48 +#endif
  156.49 +
  156.50 +#ifndef INT16_MAX
  156.51 +#define INT16_MAX       0x7fff
  156.52 +#endif
  156.53 +
  156.54 +#ifndef INT32_MIN
  156.55 +#define INT32_MIN       (-0x7fffffff - 1)
  156.56 +#endif
  156.57 +
  156.58 +#ifndef INT32_MAX
  156.59 +#define INT32_MAX       0x7fffffff
  156.60 +#endif
  156.61 +
  156.62 +#ifndef UINT32_MAX
  156.63 +#define UINT32_MAX      0xffffffff
  156.64 +#endif
  156.65 +
  156.66 +#ifndef INT64_MIN
  156.67 +#define INT64_MIN       (-0x7fffffffffffffffLL - 1)
  156.68 +#endif
  156.69 +
  156.70 +#ifndef INT64_MAX
  156.71 +#define INT64_MAX INT64_C(9223372036854775807)
  156.72 +#endif
  156.73 +
  156.74 +#ifndef UINT64_MAX
  156.75 +#define UINT64_MAX UINT64_C(0xFFFFFFFFFFFFFFFF)
  156.76 +#endif
  156.77 +
  156.78 +#ifndef INT_BIT
  156.79 +#    define INT_BIT (CHAR_BIT * sizeof(int))
  156.80 +#endif
  156.81 +
  156.82 +#ifndef offsetof
  156.83 +#    define offsetof(T, F) ((unsigned int)((char *)&((T *)0)->F))
  156.84 +#endif
  156.85 +
  156.86 +/* Use to export labels from asm. */
  156.87 +#define LABEL_MANGLE(a) #a
  156.88 +#define LOCAL_MANGLE(a) #a
  156.89 +#define MANGLE(a) #a
  156.90 +
  156.91 +// Use rip-relative addressing if compiling PIC code on x86-64.
  156.92 +// #if ARCH_X86_64 && defined(PIC)
  156.93 +// #    define LOCAL_MANGLE(a) #a "(%%rip)"
  156.94 +// #else
  156.95 +// #    define LOCAL_MANGLE(a) #a
  156.96 +// #endif
  156.97 +// 
  156.98 +// #define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
  156.99 +
 156.100 +/* debug stuff */
 156.101 +
 156.102 +/* dprintf: debug-level logging, compiled out unless DEBUG; pctx is accepted but unused because av_log() here takes no context */
 156.103 +#ifdef DEBUG
 156.104 +#    define dprintf(pctx, ...) av_log(AV_LOG_DEBUG, __VA_ARGS__)
 156.105 +#else
 156.106 +#    define dprintf(pctx, ...)
 156.107 +#endif
 156.108 +/* Log the source location at error level, then terminate via abort(). */
 156.109 +#define av_abort()      do { av_log(AV_LOG_ERROR, "Abort at %s:%d\n", __FILE__, __LINE__); abort(); } while (0)
 156.110 +
 156.111 +/* math */
 156.112 +
 156.113 +
 156.114 +/* avoid usage of dangerous/inappropriate system functions */
 156.115 +// #undef  malloc
 156.116 +// #define malloc please_use_av_malloc
 156.117 +// #undef  free
 156.118 +// #define free please_use_av_free
 156.119 +#undef  realloc
 156.120 +#define realloc please_use_av_realloc
 156.121 +#undef  time
 156.122 +#define time time_is_forbidden_due_to_security_issues
 156.123 +#undef  rand
 156.124 +#define rand rand_is_forbidden_due_to_state_trashing_use_av_lfg_get
 156.125 +#undef  srand
 156.126 +#define srand srand_is_forbidden_due_to_state_trashing_use_av_lfg_init
 156.127 +#undef  random
 156.128 +#define random random_is_forbidden_due_to_state_trashing_use_av_lfg_get
 156.129 +#undef  sprintf
 156.130 +#define sprintf sprintf_is_forbidden_due_to_security_issues_use_snprintf
 156.131 +//#undef  exit
 156.132 +//#define exit exit_is_forbidden
 156.133 +#ifndef LIBAVFORMAT_BUILD
 156.134 +
 156.135 +#undef  puts
 156.136 +#define puts please_use_av_log_instead_of_puts
 156.137 +#undef  perror
 156.138 +#define perror please_use_av_log_instead_of_perror
 156.139 +#endif
 156.140 +/* p = av_malloc(size); if that yields NULL for a non-zero size, log an error and goto label. Expands to a bare { } block (not do/while(0)) and evaluates p and size more than once. */
 156.141 +#define FF_ALLOC_OR_GOTO(p, size, label)\
 156.142 +{\
 156.143 +    p = av_malloc(size);\
 156.144 +    if (p == NULL && (size) != 0) {\
 156.145 +        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
 156.146 +        goto label;\
 156.147 +    }\
 156.148 +}
 156.149 +/* Same as FF_ALLOC_OR_GOTO but allocates with av_mallocz(); on NULL for a non-zero size, log an error and goto label. Also a bare { } block that evaluates p and size more than once. */
 156.150 +#define FF_ALLOCZ_OR_GOTO(p, size, label)\
 156.151 +{\
 156.152 +    p = av_mallocz(size);\
 156.153 +    if (p == NULL && (size) != 0) {\
 156.154 +        av_log(AV_LOG_ERROR, "Cannot allocate memory.\n");\
 156.155 +        goto label;\
 156.156 +    }\
 156.157 +}
 156.158 +
 156.159 +
 156.160 +/**
 156.161 + * Returns NULL if CONFIG_SMALL is true, otherwise the argument
 156.162 + * without modification. Used to disable the definition of strings
 156.163 + * (for example AVCodec long_names).
 156.164 + */
 156.165 +#if CONFIG_SMALL
 156.166 +#   define NULL_IF_CONFIG_SMALL(x) NULL
 156.167 +#else
 156.168 +#   define NULL_IF_CONFIG_SMALL(x) x
 156.169 +#endif
 156.170 +
 156.171 +#endif /* AVUTIL_INTERNAL_H */

   157.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   157.2 +++ b/ffmpeg_smp/h264dec/libavutil/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
   157.3 @@ -0,0 +1,498 @@
   157.4 +/*
   157.5 + * This file is part of FFmpeg.
   157.6 + *
   157.7 + * FFmpeg is free software; you can redistribute it and/or
   157.8 + * modify it under the terms of the GNU Lesser General Public
   157.9 + * License as published by the Free Software Foundation; either
  157.10 + * version 2.1 of the License, or (at your option) any later version.
  157.11 + *
  157.12 + * FFmpeg is distributed in the hope that it will be useful,
  157.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  157.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  157.15 + * Lesser General Public License for more details.
  157.16 + *
  157.17 + * You should have received a copy of the GNU Lesser General Public
  157.18 + * License along with FFmpeg; if not, write to the Free Software
  157.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  157.20 + */
  157.21 +
  157.22 +#ifndef AVUTIL_INTREADWRITE_H
  157.23 +#define AVUTIL_INTREADWRITE_H
  157.24 +
  157.25 +#include <stdint.h>
  157.26 +#include "config.h"
  157.27 +#include "bswap.h"
  157.28 +#include "common.h"
  157.29 +
  157.30 +typedef union {
  157.31 +    uint64_t u64;
  157.32 +    uint32_t u32[2];
  157.33 +    uint16_t u16[4];
  157.34 +    uint8_t  u8 [8];
  157.35 +    double   f64;
  157.36 +    float    f32[2];
  157.37 +} __attribute__((__may_alias__)) av_alias64;
  157.38 +
  157.39 +typedef union {
  157.40 +    uint32_t u32;
  157.41 +    uint16_t u16[2];
  157.42 +    uint8_t  u8 [4];
  157.43 +    float    f32;
  157.44 +} __attribute__((__may_alias__)) av_alias32;
  157.45 +
  157.46 +typedef union {
  157.47 +    uint16_t u16;
  157.48 +    uint8_t  u8 [2];
  157.49 +} __attribute__((__may_alias__)) av_alias16  ;
  157.50 +
  157.51 +/*
  157.52 + * Arch-specific headers can provide any combination of
  157.53 + * AV_[RW][BLN](16|24|32|64) and AV_(COPY|SWAP|ZERO)(64|128) macros.
  157.54 + * Preprocessor symbols must be defined, even if these are implemented
  157.55 + * as inline functions.
  157.56 + */
  157.57 +
  157.58 +#if   ARCH_ARM
  157.59 +#   include "arm/intreadwrite.h"
  157.60 +#elif ARCH_PPC
  157.61 +#   include "ppc/intreadwrite.h"
  157.62 +#elif ARCH_X86
  157.63 +#   include "x86/intreadwrite.h"
  157.64 +#endif
  157.65 +
  157.66 +/*
  157.67 + * Map AV_RNXX <-> AV_R[BL]XX for all variants provided by per-arch headers.
  157.68 + */
  157.69 +
  157.70 +#if HAVE_BIGENDIAN
  157.71 +
  157.72 +#   if    defined(AV_RN16) && !defined(AV_RB16)
  157.73 +#       define AV_RB16(p) AV_RN16(p)
  157.74 +#   elif !defined(AV_RN16) &&  defined(AV_RB16)
  157.75 +#       define AV_RN16(p) AV_RB16(p)
  157.76 +#   endif
  157.77 +
  157.78 +#   if    defined(AV_WN16) && !defined(AV_WB16)
  157.79 +#       define AV_WB16(p, v) AV_WN16(p, v)
  157.80 +#   elif !defined(AV_WN16) &&  defined(AV_WB16)
  157.81 +#       define AV_WN16(p, v) AV_WB16(p, v)
  157.82 +#   endif
  157.83 +
  157.84 +#   if    defined(AV_RN24) && !defined(AV_RB24)
  157.85 +#       define AV_RB24(p) AV_RN24(p)
  157.86 +#   elif !defined(AV_RN24) &&  defined(AV_RB24)
  157.87 +#       define AV_RN24(p) AV_RB24(p)
  157.88 +#   endif
  157.89 +
  157.90 +#   if    defined(AV_WN24) && !defined(AV_WB24)
  157.91 +#       define AV_WB24(p, v) AV_WN24(p, v)
  157.92 +#   elif !defined(AV_WN24) &&  defined(AV_WB24)
  157.93 +#       define AV_WN24(p, v) AV_WB24(p, v)
  157.94 +#   endif
  157.95 +
  157.96 +#   if    defined(AV_RN32) && !defined(AV_RB32)
  157.97 +#       define AV_RB32(p) AV_RN32(p)
  157.98 +#   elif !defined(AV_RN32) &&  defined(AV_RB32)
  157.99 +#       define AV_RN32(p) AV_RB32(p)
 157.100 +#   endif
 157.101 +
 157.102 +#   if    defined(AV_WN32) && !defined(AV_WB32)
 157.103 +#       define AV_WB32(p, v) AV_WN32(p, v)
 157.104 +#   elif !defined(AV_WN32) &&  defined(AV_WB32)
 157.105 +#       define AV_WN32(p, v) AV_WB32(p, v)
 157.106 +#   endif
 157.107 +
 157.108 +#   if    defined(AV_RN64) && !defined(AV_RB64)
 157.109 +#       define AV_RB64(p) AV_RN64(p)
 157.110 +#   elif !defined(AV_RN64) &&  defined(AV_RB64)
 157.111 +#       define AV_RN64(p) AV_RB64(p)
 157.112 +#   endif
 157.113 +
 157.114 +#   if    defined(AV_WN64) && !defined(AV_WB64)
 157.115 +#       define AV_WB64(p, v) AV_WN64(p, v)
 157.116 +#   elif !defined(AV_WN64) &&  defined(AV_WB64)
 157.117 +#       define AV_WN64(p, v) AV_WB64(p, v)
 157.118 +#   endif
 157.119 +
 157.120 +#else /* HAVE_BIGENDIAN */
 157.121 +
 157.122 +#   if    defined(AV_RN16) && !defined(AV_RL16)
 157.123 +#       define AV_RL16(p) AV_RN16(p)
 157.124 +#   elif !defined(AV_RN16) &&  defined(AV_RL16)
 157.125 +#       define AV_RN16(p) AV_RL16(p)
 157.126 +#   endif
 157.127 +
 157.128 +#   if    defined(AV_WN16) && !defined(AV_WL16)
 157.129 +#       define AV_WL16(p, v) AV_WN16(p, v)
 157.130 +#   elif !defined(AV_WN16) &&  defined(AV_WL16)
 157.131 +#       define AV_WN16(p, v) AV_WL16(p, v)
 157.132 +#   endif
 157.133 +
 157.134 +#   if    defined(AV_RN24) && !defined(AV_RL24)
 157.135 +#       define AV_RL24(p) AV_RN24(p)
 157.136 +#   elif !defined(AV_RN24) &&  defined(AV_RL24)
 157.137 +#       define AV_RN24(p) AV_RL24(p)
 157.138 +#   endif
 157.139 +
 157.140 +#   if    defined(AV_WN24) && !defined(AV_WL24)
 157.141 +#       define AV_WL24(p, v) AV_WN24(p, v)
 157.142 +#   elif !defined(AV_WN24) &&  defined(AV_WL24)
 157.143 +#       define AV_WN24(p, v) AV_WL24(p, v)
 157.144 +#   endif
 157.145 +
 157.146 +#   if    defined(AV_RN32) && !defined(AV_RL32)
 157.147 +#       define AV_RL32(p) AV_RN32(p)
 157.148 +#   elif !defined(AV_RN32) &&  defined(AV_RL32)
 157.149 +#       define AV_RN32(p) AV_RL32(p)
 157.150 +#   endif
 157.151 +
 157.152 +#   if    defined(AV_WN32) && !defined(AV_WL32)
 157.153 +#       define AV_WL32(p, v) AV_WN32(p, v)
 157.154 +#   elif !defined(AV_WN32) &&  defined(AV_WL32)
 157.155 +#       define AV_WN32(p, v) AV_WL32(p, v)
 157.156 +#   endif
 157.157 +
 157.158 +#   if    defined(AV_RN64) && !defined(AV_RL64)
 157.159 +#       define AV_RL64(p) AV_RN64(p)
 157.160 +#   elif !defined(AV_RN64) &&  defined(AV_RL64)
 157.161 +#       define AV_RN64(p) AV_RL64(p)
 157.162 +#   endif
 157.163 +
 157.164 +#   if    defined(AV_WN64) && !defined(AV_WL64)
 157.165 +#       define AV_WL64(p, v) AV_WN64(p, v)
 157.166 +#   elif !defined(AV_WN64) &&  defined(AV_WL64)
 157.167 +#       define AV_WN64(p, v) AV_WL64(p, v)
 157.168 +#   endif
 157.169 +
 157.170 +#endif /* !HAVE_BIGENDIAN */
 157.171 +
 157.172 +/*
 157.173 + * Define AV_[RW]N helper macros to simplify definitions not provided
 157.174 + * by per-arch headers.
 157.175 + */
 157.176 +
 157.177 +
 157.178 +
 157.179 +#if defined(__DECC)
 157.180 +
 157.181 +#   define AV_RN(s, p) (*((const __unaligned uint##s##_t*)(p)))
 157.182 +#   define AV_WN(s, p, v) (*((__unaligned uint##s##_t*)(p)) = (v))
 157.183 +
 157.184 +#else
 157.185 +
 157.186 +#ifndef AV_RB16
 157.187 +#   define AV_RB16(x)                           \
 157.188 +    ((((const uint8_t*)(x))[0] << 8) |          \
 157.189 +      ((const uint8_t*)(x))[1])
 157.190 +#endif
 157.191 +#ifndef AV_WB16
 157.192 +#   define AV_WB16(p, d) do {                   \
 157.193 +        ((uint8_t*)(p))[1] = (d);               \
 157.194 +        ((uint8_t*)(p))[0] = (d)>>8;            \
 157.195 +    } while(0)
 157.196 +#endif
 157.197 +
 157.198 +#ifndef AV_RL16
 157.199 +#   define AV_RL16(x)                           \
 157.200 +    ((((const uint8_t*)(x))[1] << 8) |          \
 157.201 +      ((const uint8_t*)(x))[0])
 157.202 +#endif
 157.203 +#ifndef AV_WL16
 157.204 +#   define AV_WL16(p, d) do {                   \
 157.205 +        ((uint8_t*)(p))[0] = (d);               \
 157.206 +        ((uint8_t*)(p))[1] = (d)>>8;            \
 157.207 +    } while(0)
 157.208 +#endif
 157.209 +/* Big-endian 32-bit read, byte by byte; the top byte is widened to uint32_t so that << 24 cannot overflow int. */
 157.210 +#ifndef AV_RB32
 157.211 +#   define AV_RB32(x)                           \
 157.212 +    (((uint32_t)((const uint8_t*)(x))[0] << 24) | \
 157.213 +     (((const uint8_t*)(x))[1] << 16) |         \
 157.214 +     (((const uint8_t*)(x))[2] <<  8) |         \
 157.215 +      ((const uint8_t*)(x))[3])
 157.216 +#endif
 157.217 +#ifndef AV_WB32
 157.218 +#   define AV_WB32(p, d) do {                   \
 157.219 +        ((uint8_t*)(p))[3] = (d);               \
 157.220 +        ((uint8_t*)(p))[2] = (d)>>8;            \
 157.221 +        ((uint8_t*)(p))[1] = (d)>>16;           \
 157.222 +        ((uint8_t*)(p))[0] = (d)>>24;           \
 157.223 +    } while(0)
 157.224 +#endif
 157.225 +/* Little-endian 32-bit read, byte by byte; the top byte is widened to uint32_t so that << 24 cannot overflow int. */
 157.226 +#ifndef AV_RL32
 157.227 +#   define AV_RL32(x)                           \
 157.228 +    (((uint32_t)((const uint8_t*)(x))[3] << 24) | \
 157.229 +     (((const uint8_t*)(x))[2] << 16) |         \
 157.230 +     (((const uint8_t*)(x))[1] <<  8) |         \
 157.231 +      ((const uint8_t*)(x))[0])
 157.232 +#endif
 157.233 +#ifndef AV_WL32
 157.234 +#   define AV_WL32(p, d) do {                   \
 157.235 +        ((uint8_t*)(p))[0] = (d);               \
 157.236 +        ((uint8_t*)(p))[1] = (d)>>8;            \
 157.237 +        ((uint8_t*)(p))[2] = (d)>>16;           \
 157.238 +        ((uint8_t*)(p))[3] = (d)>>24;           \
 157.239 +    } while(0)
 157.240 +#endif
 157.241 +
 157.242 +#ifndef AV_RB64
 157.243 +#   define AV_RB64(x)                                   \
 157.244 +    (((uint64_t)((const uint8_t*)(x))[0] << 56) |       \
 157.245 +     ((uint64_t)((const uint8_t*)(x))[1] << 48) |       \
 157.246 +     ((uint64_t)((const uint8_t*)(x))[2] << 40) |       \
 157.247 +     ((uint64_t)((const uint8_t*)(x))[3] << 32) |       \
 157.248 +     ((uint64_t)((const uint8_t*)(x))[4] << 24) |       \
 157.249 +     ((uint64_t)((const uint8_t*)(x))[5] << 16) |       \
 157.250 +     ((uint64_t)((const uint8_t*)(x))[6] <<  8) |       \
 157.251 +      (uint64_t)((const uint8_t*)(x))[7])
 157.252 +#endif
 157.253 +#ifndef AV_WB64
 157.254 +#   define AV_WB64(p, d) do {                   \
 157.255 +        ((uint8_t*)(p))[7] = (d);               \
 157.256 +        ((uint8_t*)(p))[6] = (d)>>8;            \
 157.257 +        ((uint8_t*)(p))[5] = (d)>>16;           \
 157.258 +        ((uint8_t*)(p))[4] = (d)>>24;           \
 157.259 +        ((uint8_t*)(p))[3] = (d)>>32;           \
 157.260 +        ((uint8_t*)(p))[2] = (d)>>40;           \
 157.261 +        ((uint8_t*)(p))[1] = (d)>>48;           \
 157.262 +        ((uint8_t*)(p))[0] = (d)>>56;           \
 157.263 +    } while(0)
 157.264 +#endif
 157.265 +
 157.266 +#ifndef AV_RL64
 157.267 +#   define AV_RL64(x)                                   \
 157.268 +    (((uint64_t)((const uint8_t*)(x))[7] << 56) |       \
 157.269 +     ((uint64_t)((const uint8_t*)(x))[6] << 48) |       \
 157.270 +     ((uint64_t)((const uint8_t*)(x))[5] << 40) |       \
 157.271 +     ((uint64_t)((const uint8_t*)(x))[4] << 32) |       \
 157.272 +     ((uint64_t)((const uint8_t*)(x))[3] << 24) |       \
 157.273 +     ((uint64_t)((const uint8_t*)(x))[2] << 16) |       \
 157.274 +     ((uint64_t)((const uint8_t*)(x))[1] <<  8) |       \
 157.275 +      (uint64_t)((const uint8_t*)(x))[0])
 157.276 +#endif
 157.277 +#ifndef AV_WL64
 157.278 +#   define AV_WL64(p, d) do {                   \
 157.279 +        ((uint8_t*)(p))[0] = (d);               \
 157.280 +        ((uint8_t*)(p))[1] = (d)>>8;            \
 157.281 +        ((uint8_t*)(p))[2] = (d)>>16;           \
 157.282 +        ((uint8_t*)(p))[3] = (d)>>24;           \
 157.283 +        ((uint8_t*)(p))[4] = (d)>>32;           \
 157.284 +        ((uint8_t*)(p))[5] = (d)>>40;           \
 157.285 +        ((uint8_t*)(p))[6] = (d)>>48;           \
 157.286 +        ((uint8_t*)(p))[7] = (d)>>56;           \
 157.287 +    } while(0)
 157.288 +#endif
 157.289 +
 157.290 +#if HAVE_BIGENDIAN
 157.291 +#   define AV_RN(s, p)    AV_RB##s(p)
 157.292 +#   define AV_WN(s, p, v) AV_WB##s(p, v)
 157.293 +#else
 157.294 +#   define AV_RN(s, p)    AV_RL##s(p)
 157.295 +#   define AV_WN(s, p, v) AV_WL##s(p, v)
 157.296 +#endif
 157.297 +
 157.298 +#endif /* !defined(__DECC) */
 157.299 +
 157.300 +#ifndef AV_RN16
 157.301 +#   define AV_RN16(p) AV_RN(16, p)
 157.302 +#endif
 157.303 +
 157.304 +#ifndef AV_RN32
 157.305 +#   define AV_RN32(p) AV_RN(32, p)
 157.306 +#endif
 157.307 +
 157.308 +#ifndef AV_RN64
 157.309 +#   define AV_RN64(p) AV_RN(64, p)
 157.310 +#endif
 157.311 +
 157.312 +#ifndef AV_WN16
 157.313 +#   define AV_WN16(p, v) AV_WN(16, p, v)
 157.314 +#endif
 157.315 +
 157.316 +#ifndef AV_WN32
 157.317 +#   define AV_WN32(p, v) AV_WN(32, p, v)
 157.318 +#endif
 157.319 +
 157.320 +#ifndef AV_WN64
 157.321 +#   define AV_WN64(p, v) AV_WN(64, p, v)
 157.322 +#endif
 157.323 +
 157.324 +#if HAVE_BIGENDIAN
 157.325 +#   define AV_RB(s, p)    AV_RN##s(p)
 157.326 +#   define AV_WB(s, p, v) AV_WN##s(p, v)
 157.327 +#   define AV_RL(s, p)    bswap_##s(AV_RN##s(p))
 157.328 +#   define AV_WL(s, p, v) AV_WN##s(p, bswap_##s(v))
 157.329 +#else
 157.330 +#   define AV_RB(s, p)    bswap_##s(AV_RN##s(p))
 157.331 +#   define AV_WB(s, p, v) AV_WN##s(p, bswap_##s(v))
 157.332 +#   define AV_RL(s, p)    AV_RN##s(p)
 157.333 +#   define AV_WL(s, p, v) AV_WN##s(p, v)
 157.334 +#endif
 157.335 +
 157.336 +#define AV_RB8(x)     (((const uint8_t*)(x))[0])
 157.337 +#define AV_WB8(p, d)  do { ((uint8_t*)(p))[0] = (d); } while(0)
 157.338 +
 157.339 +#define AV_RL8(x)     AV_RB8(x)
 157.340 +#define AV_WL8(p, d)  AV_WB8(p, d)
 157.341 +
 157.342 +#ifndef AV_RB16
 157.343 +#   define AV_RB16(p)    AV_RB(16, p)
 157.344 +#endif
 157.345 +#ifndef AV_WB16
 157.346 +#   define AV_WB16(p, v) AV_WB(16, p, v)
 157.347 +#endif
 157.348 +
 157.349 +#ifndef AV_RL16
 157.350 +#   define AV_RL16(p)    AV_RL(16, p)
 157.351 +#endif
 157.352 +#ifndef AV_WL16
 157.353 +#   define AV_WL16(p, v) AV_WL(16, p, v)
 157.354 +#endif
 157.355 +
 157.356 +#ifndef AV_RB32
 157.357 +#   define AV_RB32(p)    AV_RB(32, p)
 157.358 +#endif
 157.359 +#ifndef AV_WB32
 157.360 +#   define AV_WB32(p, v) AV_WB(32, p, v)
 157.361 +#endif
 157.362 +
 157.363 +#ifndef AV_RL32
 157.364 +#   define AV_RL32(p)    AV_RL(32, p)
 157.365 +#endif
 157.366 +#ifndef AV_WL32
 157.367 +#   define AV_WL32(p, v) AV_WL(32, p, v)
 157.368 +#endif
 157.369 +
 157.370 +#ifndef AV_RB64
 157.371 +#   define AV_RB64(p)    AV_RB(64, p)
 157.372 +#endif
 157.373 +#ifndef AV_WB64
 157.374 +#   define AV_WB64(p, v) AV_WB(64, p, v)
 157.375 +#endif
 157.376 +
 157.377 +#ifndef AV_RL64
 157.378 +#   define AV_RL64(p)    AV_RL(64, p)
 157.379 +#endif
 157.380 +#ifndef AV_WL64
 157.381 +#   define AV_WL64(p, v) AV_WL(64, p, v)
 157.382 +#endif
 157.383 +
 157.384 +#ifndef AV_RB24
 157.385 +#   define AV_RB24(x)                           \
 157.386 +    ((((const uint8_t*)(x))[0] << 16) |         \
 157.387 +     (((const uint8_t*)(x))[1] <<  8) |         \
 157.388 +      ((const uint8_t*)(x))[2])
 157.389 +#endif
 157.390 +#ifndef AV_WB24
 157.391 +#   define AV_WB24(p, d) do {                   \
 157.392 +        ((uint8_t*)(p))[2] = (d);               \
 157.393 +        ((uint8_t*)(p))[1] = (d)>>8;            \
 157.394 +        ((uint8_t*)(p))[0] = (d)>>16;           \
 157.395 +    } while(0)
 157.396 +#endif
 157.397 +
 157.398 +#ifndef AV_RL24
 157.399 +#   define AV_RL24(x)                           \
 157.400 +    ((((const uint8_t*)(x))[2] << 16) |         \
 157.401 +     (((const uint8_t*)(x))[1] <<  8) |         \
 157.402 +      ((const uint8_t*)(x))[0])
 157.403 +#endif
 157.404 +#ifndef AV_WL24
 157.405 +#   define AV_WL24(p, d) do {                   \
 157.406 +        ((uint8_t*)(p))[0] = (d);               \
 157.407 +        ((uint8_t*)(p))[1] = (d)>>8;            \
 157.408 +        ((uint8_t*)(p))[2] = (d)>>16;           \
 157.409 +    } while(0)
 157.410 +#endif
 157.411 +
 157.412 +/*
 157.413 + * The AV_[RW]NA macros access naturally aligned data
 157.414 + * in a type-safe way.
 157.415 + */
 157.416 +
 157.417 +#define AV_RNA(s, p)    (((const av_alias##s*)(p))->u##s)
 157.418 +#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v))
 157.419 +
 157.420 +#ifndef AV_RN16A
 157.421 +#   define AV_RN16A(p) AV_RNA(16, p)
 157.422 +#endif
 157.423 +
 157.424 +#ifndef AV_RN32A
 157.425 +#   define AV_RN32A(p) AV_RNA(32, p)
 157.426 +#endif
 157.427 +
 157.428 +#ifndef AV_RN64A
 157.429 +#   define AV_RN64A(p) AV_RNA(64, p)
 157.430 +#endif
 157.431 +
 157.432 +#ifndef AV_WN16A
 157.433 +#   define AV_WN16A(p, v) AV_WNA(16, p, v)
 157.434 +#endif
 157.435 +
 157.436 +#ifndef AV_WN32A
 157.437 +#   define AV_WN32A(p, v) AV_WNA(32, p, v)
 157.438 +#endif
 157.439 +
 157.440 +#ifndef AV_WN64A
 157.441 +#   define AV_WN64A(p, v) AV_WNA(64, p, v)
 157.442 +#endif
 157.443 +
 157.444 +/* Parameters for AV_COPY*, AV_SWAP*, AV_ZERO* must be
 157.445 + * naturally aligned. They may be implemented using MMX,
 157.446 + * so emms_c() must be called before using any float code
 157.447 + * afterwards.
 157.448 + */
 157.449 +
 157.450 +#define AV_COPY(n, d, s) \
 157.451 +    (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
 157.452 +
 157.453 +#ifndef AV_COPY16
 157.454 +#   define AV_COPY16(d, s) AV_COPY(16, d, s)
 157.455 +#endif
 157.456 +
 157.457 +#ifndef AV_COPY32
 157.458 +#   define AV_COPY32(d, s) AV_COPY(32, d, s)
 157.459 +#endif
 157.460 +
 157.461 +#ifndef AV_COPY64
 157.462 +#   define AV_COPY64(d, s) AV_COPY(64, d, s)
 157.463 +#endif
 157.464 +
 157.465 +#ifndef AV_COPY128
 157.466 +#   define AV_COPY128(d, s)                    \
 157.467 +    do {                                       \
 157.468 +        AV_COPY64(d, s);                       \
 157.469 +        AV_COPY64((char*)(d)+8, (char*)(s)+8); \
 157.470 +    } while(0)
 157.471 +#endif
 157.472 +
 157.473 +#define AV_SWAP(n, a, b) FFSWAP(av_alias##n, *(av_alias##n*)(a), *(av_alias##n*)(b))
 157.474 +
 157.475 +#ifndef AV_SWAP64
 157.476 +#   define AV_SWAP64(a, b) AV_SWAP(64, a, b)
 157.477 +#endif
 157.478 +
 157.479 +#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0)
 157.480 +
 157.481 +#ifndef AV_ZERO16
 157.482 +#   define AV_ZERO16(d) AV_ZERO(16, d)
 157.483 +#endif
 157.484 +
 157.485 +#ifndef AV_ZERO32
 157.486 +#   define AV_ZERO32(d) AV_ZERO(32, d)
 157.487 +#endif
 157.488 +
 157.489 +#ifndef AV_ZERO64
 157.490 +#   define AV_ZERO64(d) AV_ZERO(64, d)
 157.491 +#endif
 157.492 +
 157.493 +#ifndef AV_ZERO128
 157.494 +#   define AV_ZERO128(d)         \
 157.495 +    do {                         \
 157.496 +        AV_ZERO64(d);            \
 157.497 +        AV_ZERO64((char*)(d)+8); \
 157.498 +    } while(0)
 157.499 +#endif
 157.500 +
 157.501 +#endif /* AVUTIL_INTREADWRITE_H */

   158.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   158.2 +++ b/ffmpeg_smp/h264dec/libavutil/log.c	Mon Aug 27 12:09:56 2012 +0200
   158.3 @@ -0,0 +1,111 @@
   158.4 +/*
   158.5 + * log functions
   158.6 + * Copyright (c) 2003 Michel Bardiaux
   158.7 + *
   158.8 + * This file is part of FFmpeg.
   158.9 + *
  158.10 + * FFmpeg is free software; you can redistribute it and/or
  158.11 + * modify it under the terms of the GNU Lesser General Public
  158.12 + * License as published by the Free Software Foundation; either
  158.13 + * version 2.1 of the License, or (at your option) any later version.
  158.14 + *
  158.15 + * FFmpeg is distributed in the hope that it will be useful,
  158.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  158.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  158.18 + * Lesser General Public License for more details.
  158.19 + *
  158.20 + * You should have received a copy of the GNU Lesser General Public
  158.21 + * License along with FFmpeg; if not, write to the Free Software
  158.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  158.23 + */
  158.24 +
  158.25 +/**
  158.26 + * @file
  158.27 + * logging functions
  158.28 + */
  158.29 +#include "error.h"
  158.30 +#include <unistd.h>
  158.31 +#include <stdlib.h>
  158.32 +#include "log.h"
  158.33 +
  158.34 +
  158.35 +static int av_log_level = AV_LOG_INFO;
  158.36 +
  158.37 +static int use_ansi_color=-1;
  158.38 +/* Write str to stderr, wrapped in ANSI color escapes when enabled; the enable decision is made on first call and cached in use_ansi_color. */
  158.39 +#undef fprintf
  158.40 +static void colored_fputs(int color, const char *str){
  158.41 +    if(use_ansi_color<0){
  158.42 +#if HAVE_ISATTY && !defined(_WIN32)
  158.43 +        use_ansi_color= getenv("TERM") && !getenv("NO_COLOR") && isatty(2);
  158.44 +#else
  158.45 +        use_ansi_color= 0;
  158.46 +#endif
  158.47 +    }
  158.48 +    /* color>>4 fills the first escape parameter, color&15 the digit following '3' */
  158.49 +    if(use_ansi_color){
  158.50 +        fprintf(stderr, "\033[%d;3%dm", color>>4, color&15);
  158.51 +    }
  158.52 +    fputs(str, stderr);
  158.53 +    if(use_ansi_color){
  158.54 +        fprintf(stderr, "\033[0m");
  158.55 +    }
  158.56 +}
  158.57 +/* Default log sink: drops messages above av_log_level, collapses consecutive identical newline-terminated messages, writes to stderr. */
  158.58 +void av_log_default_callback(int level, const char* fmt, va_list vl)
  158.59 +{
  158.60 +    static int print_prefix=1;
  158.61 +    static int count;
  158.62 +    static char line[1024], prev[1024];
  158.63 +    static const uint8_t color[]={0x41,0x41,0x11,0x03,9,9,9};
  158.64 +
  158.65 +    if(level>av_log_level)
  158.66 +        return;
  158.67 +#undef fprintf
  158.68 +
  158.69 +    line[0]=0;
  158.70 +
  158.71 +    vsnprintf(line + strlen(line), sizeof(line) - strlen(line), fmt, vl);
  158.72 +    /* check for an empty message first: strlen(line)-1 would wrap and index out of bounds */
  158.73 +    print_prefix= line[0] && line[strlen(line)-1] == '\n';
  158.74 +    if(print_prefix && !strcmp(line, prev)){
  158.75 +        count++;
  158.76 +        return;
  158.77 +    }
  158.78 +    if(count>0){
  158.79 +        fprintf(stderr, "    Last message repeated %d times\n", count);
  158.80 +        count=0;
  158.81 +    }
  158.82 +    colored_fputs(color[av_clip(level>>3, 0, 6)], line);
  158.83 +    strcpy(prev, line);
  158.84 +}
  158.85 +
  158.86 +static void (*av_log_callback)(int, const char*, va_list) = av_log_default_callback;
  158.87 +
  158.88 +void av_log(int level, const char *fmt, ...)
  158.89 +{
  158.90 +    va_list vl;
  158.91 +    va_start(vl, fmt);
  158.92 +    av_vlog(level, fmt, vl);
  158.93 +    va_end(vl);
  158.94 +}
  158.95 +
  158.96 +void av_vlog(int level, const char *fmt, va_list vl)
  158.97 +{
  158.98 +    av_log_callback(level, fmt, vl);
  158.99 +}
 158.100 +
 158.101 +int av_log_get_level(void)
 158.102 +{
 158.103 +    return av_log_level;
 158.104 +}
 158.105 +
 158.106 +void av_log_set_level(int level)
 158.107 +{
 158.108 +    av_log_level = level;
 158.109 +}
 158.110 +
 158.111 +void av_log_set_callback(void (*callback)(int, const char*, va_list))
 158.112 +{
 158.113 +    av_log_callback = callback;
 158.114 +}

   159.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   159.2 +++ b/ffmpeg_smp/h264dec/libavutil/log.h	Mon Aug 27 12:09:56 2012 +0200
   159.3 @@ -0,0 +1,120 @@
   159.4 +/*
   159.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   159.6 + *
   159.7 + * This file is part of FFmpeg.
   159.8 + *
   159.9 + * FFmpeg is free software; you can redistribute it and/or
  159.10 + * modify it under the terms of the GNU Lesser General Public
  159.11 + * License as published by the Free Software Foundation; either
  159.12 + * version 2.1 of the License, or (at your option) any later version.
  159.13 + *
  159.14 + * FFmpeg is distributed in the hope that it will be useful,
  159.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  159.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  159.17 + * Lesser General Public License for more details.
  159.18 + *
  159.19 + * You should have received a copy of the GNU Lesser General Public
  159.20 + * License along with FFmpeg; if not, write to the Free Software
  159.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  159.22 + */
  159.23 +
  159.24 +#ifndef AVUTIL_LOG_H
  159.25 +#define AVUTIL_LOG_H
  159.26 +
  159.27 +#include <stdarg.h>
  159.28 +//#include "avutil.h"
  159.29 +
  159.30 +/**
  159.31 + * Describes the class of an AVClass context structure. That is an
  159.32 + * arbitrary struct of which the first field is a pointer to an
  159.33 + * AVClass struct (e.g. AVCodecContext, AVFormatContext etc.).
  159.34 + */
  159.35 +typedef struct {
  159.36 +    /**
  159.37 +     * The name of the class; usually it is the same name as the
  159.38 +     * context structure type to which the AVClass is associated.
  159.39 +     */
  159.40 +    const char* class_name;
  159.41 +
  159.42 +    /**
  159.43 +     * A pointer to a function which returns the name of a context
  159.44 +     * instance ctx associated with the class.
  159.45 +     */
  159.46 +    const char* (*item_name)(void* ctx);
  159.47 +
  159.48 +    /**
  159.49 +     * a pointer to the first option specified in the class if any or NULL
  159.50 +     *
  159.51 +     * @see av_set_default_options()
  159.52 +     */
  159.53 +    const struct AVOption *option;
  159.54 +
  159.55 +    /**
  159.56 +     * LIBAVUTIL_VERSION with which this structure was created.
  159.57 +     * This is used to allow fields to be added without requiring major
  159.58 +     * version bumps everywhere.
  159.59 +     */
  159.60 +
  159.61 +    int version;
  159.62 +} AVClass;
  159.63 +
  159.64 +/* av_log API */
  159.65 +
  159.66 +#define AV_LOG_QUIET    -8
  159.67 +
  159.68 +/**
  159.69 + * Something went really wrong and we will crash now.
  159.70 + */
  159.71 +#define AV_LOG_PANIC     0
  159.72 +
  159.73 +/**
  159.74 + * Something went wrong and recovery is not possible.
  159.75 + * For example, no header was found for a format which depends
  159.76 + * on headers or an illegal combination of parameters is used.
  159.77 + */
  159.78 +#define AV_LOG_FATAL     8
  159.79 +
  159.80 +/**
  159.81 + * Something went wrong and cannot losslessly be recovered.
  159.82 + * However, not all future data is affected.
  159.83 + */
  159.84 +#define AV_LOG_ERROR    16
  159.85 +
  159.86 +/**
  159.87 + * Something somehow does not look correct. This may or may not
  159.88 + * lead to problems. An example would be the use of '-vstrict -2'.
  159.89 + */
  159.90 +#define AV_LOG_WARNING  24
  159.91 +
  159.92 +#define AV_LOG_INFO     32
  159.93 +#define AV_LOG_VERBOSE  40
  159.94 +
  159.95 +/**
  159.96 + * Stuff which is only useful for libav* developers.
  159.97 + */
  159.98 +#define AV_LOG_DEBUG    48
  159.99 +
 159.100 +/**
 159.101 + * Sends the specified message to the log if the level is less than or equal
 159.102 + * to the current av_log_level. By default, all logging messages are sent to
 159.103 + * stderr. This behavior can be altered by setting a different av_vlog callback
 159.104 + * function.
 159.105 + *
 159.106 + * @note This variant of av_log() takes no context (avcl) argument; the
 159.107 + * level is the first parameter.
 159.108 + * @param level The importance level of the message, lower values signifying
 159.109 + * higher importance.
 159.110 + * @param fmt The format string (printf-compatible) that specifies how
 159.111 + * subsequent arguments are converted to output.
 159.112 + * @see av_vlog
 159.113 + */
 159.114 +
 159.115 +void av_log(int level, const char *fmt, ...);
 159.116 +
 159.117 +void av_vlog(int level, const char *fmt, va_list);
 159.118 +int av_log_get_level(void);
 159.119 +void av_log_set_level(int);
 159.120 +void av_log_set_callback(void (*)(int, const char*, va_list));
 159.121 +void av_log_default_callback(int level, const char* fmt, va_list vl);
 159.122 +
 159.123 +#endif /* AVUTIL_LOG_H */

   160.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   160.2 +++ b/ffmpeg_smp/h264dec/libavutil/mem.c	Mon Aug 27 12:09:56 2012 +0200
   160.3 @@ -0,0 +1,127 @@
   160.4 +/*
   160.5 + * default memory allocator for libavutil
   160.6 + * Copyright (c) 2002 Fabrice Bellard
   160.7 + *
   160.8 + * This file is part of FFmpeg.
   160.9 + *
  160.10 + * FFmpeg is free software; you can redistribute it and/or
  160.11 + * modify it under the terms of the GNU Lesser General Public
  160.12 + * License as published by the Free Software Foundation; either
  160.13 + * version 2.1 of the License, or (at your option) any later version.
  160.14 + *
  160.15 + * FFmpeg is distributed in the hope that it will be useful,
  160.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  160.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  160.18 + * Lesser General Public License for more details.
  160.19 + *
  160.20 + * You should have received a copy of the GNU Lesser General Public
  160.21 + * License along with FFmpeg; if not, write to the Free Software
  160.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  160.23 + */
  160.24 +
  160.25 +/**
  160.26 + * @file
  160.27 + * default memory allocator for libavutil
  160.28 + */
  160.29 +
  160.30 +#include "config.h"
  160.31 +
  160.32 +#include <limits.h>
  160.33 +#include <stdlib.h>
  160.34 +#include <stdint.h>
  160.35 +#include <string.h>
  160.36 +#if HAVE_MALLOC_H
  160.37 +#include <malloc.h>
  160.38 +#endif
  160.39 +
  160.40 +#include "mem.h"
  160.41 +
  160.42 +/* here we can use OS-dependent allocation functions */
  160.43 +#undef free
  160.44 +#undef malloc
  160.45 +#undef realloc
  160.46 +
  160.47 +#ifdef MALLOC_PREFIX
  160.48 +
  160.49 +#define malloc         AV_JOIN(MALLOC_PREFIX, malloc)
  160.50 +#define memalign       AV_JOIN(MALLOC_PREFIX, memalign)
  160.51 +#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign)
  160.52 +#define realloc        AV_JOIN(MALLOC_PREFIX, realloc)
  160.53 +#define free           AV_JOIN(MALLOC_PREFIX, free)
  160.54 +
  160.55 +void *malloc(size_t size);
  160.56 +void *memalign(size_t align, size_t size);
  160.57 +int   posix_memalign(void **ptr, size_t align, size_t size);
  160.58 +void *realloc(void *ptr, size_t size);
  160.59 +void  free(void *ptr);
  160.60 +
  160.61 +#endif /* MALLOC_PREFIX */
  160.62 +
  160.63 +
  160.64 +/* You can redefine av_malloc and av_free in your project to use your
  160.65 +   memory allocator. You do not need to suppress this file because the
  160.66 +   linker will do it automatically. */
  160.67 +
  160.68 +void *av_malloc(unsigned int size)
  160.69 +{
  160.70 +    void *ptr = NULL;
  160.71 +    /* let's disallow possible ambiguous cases */
  160.72 +    if(size > (INT_MAX-16) )
  160.73 +        return NULL;
  160.74 +
   160.75 +//FIXME: when no aligned malloc is available, vector code should be disabled.
  160.76 +#if HAVE_POSIX_MEMALIGN
  160.77 +    if (posix_memalign(&ptr,16,size))
  160.78 +        ptr = NULL;
  160.79 +#elif HAVE_MEMALIGN
  160.80 +    ptr = memalign(16,size);
  160.81 +#else
  160.82 +    ptr = malloc(size);
  160.83 +#endif
  160.84 +    return ptr;
  160.85 +}
  160.86 +
  160.87 +void *av_realloc(void *ptr, unsigned int size)
  160.88 +{
  160.89 +    /* let's disallow possible ambiguous cases */
  160.90 +    if(size > (INT_MAX-16) )
  160.91 +        return NULL;
  160.92 +
  160.93 +    return realloc(ptr, size);
  160.94 +
  160.95 +}
  160.96 +
  160.97 +void av_free(void *ptr)
  160.98 +{
  160.99 +    /* XXX: this test should not be needed on most libcs */
 160.100 +    if (ptr)
 160.101 +        free(ptr);
 160.102 +
 160.103 +}
 160.104 +
 160.105 +void av_freep(void *arg)
 160.106 +{
 160.107 +    void **ptr= (void**)arg;
 160.108 +    av_free(*ptr);
 160.109 +    *ptr = NULL;
 160.110 +}
 160.111 +
 160.112 +void *av_mallocz(unsigned int size)
 160.113 +{
 160.114 +    void *ptr = av_malloc(size);
 160.115 +    if (ptr)
 160.116 +        memset(ptr, 0, size);
 160.117 +    return ptr;
 160.118 +}
 160.119 +
 160.120 +char *av_strdup(const char *s)
 160.121 +{
 160.122 +    char *ptr= NULL;
 160.123 +    if(s){
 160.124 +        int len = strlen(s) + 1;
 160.125 +        ptr = av_malloc(len);
 160.126 +        if (ptr)
 160.127 +            memcpy(ptr, s, len);
 160.128 +    }
 160.129 +    return ptr;
 160.130 +}

   161.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   161.2 +++ b/ffmpeg_smp/h264dec/libavutil/mem.h	Mon Aug 27 12:09:56 2012 +0200
   161.3 @@ -0,0 +1,143 @@
   161.4 +/*
   161.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   161.6 + *
   161.7 + * This file is part of FFmpeg.
   161.8 + *
   161.9 + * FFmpeg is free software; you can redistribute it and/or
  161.10 + * modify it under the terms of the GNU Lesser General Public
  161.11 + * License as published by the Free Software Foundation; either
  161.12 + * version 2.1 of the License, or (at your option) any later version.
  161.13 + *
  161.14 + * FFmpeg is distributed in the hope that it will be useful,
  161.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  161.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  161.17 + * Lesser General Public License for more details.
  161.18 + *
  161.19 + * You should have received a copy of the GNU Lesser General Public
  161.20 + * License along with FFmpeg; if not, write to the Free Software
  161.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  161.22 + */
  161.23 +
  161.24 +/**
  161.25 + * @file
  161.26 + * memory handling functions
  161.27 + */
  161.28 +
  161.29 +#ifndef AVUTIL_MEM_H
  161.30 +#define AVUTIL_MEM_H
  161.31 +
  161.32 +#include "attributes.h"
  161.33 +#include "config.h"
  161.34 +
  161.35 +#define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
  161.36 +#define DECLARE_ALIGNED_16(t,v)      t __attribute__ ((aligned (16))) v
  161.37 +#define DECLARE_ASM_CONST(n,t,v)    static const t __attribute__((used)) __attribute__ ((aligned (n))) v
  161.38 +
  161.39 +#if AV_GCC_VERSION_AT_LEAST(3,1)
  161.40 +    #define av_malloc_attrib __attribute__((__malloc__))
  161.41 +#else
  161.42 +    #define av_malloc_attrib
  161.43 +#endif
  161.44 +
  161.45 +/**
  161.46 + * Allocates a block of size bytes with alignment suitable for all
  161.47 + * memory accesses (including vectors if available on the CPU).
  161.48 + * @param size Size in bytes for the memory block to be allocated.
  161.49 + * @return Pointer to the allocated block, NULL if the block cannot
  161.50 + * be allocated.
  161.51 + * @see av_mallocz()
  161.52 + */
  161.53 +void *av_malloc(unsigned int size) av_malloc_attrib;
  161.54 +
  161.55 +/**
  161.56 + * Allocates or reallocates a block of memory.
  161.57 + * If ptr is NULL and size > 0, allocates a new block. If
  161.58 + * size is zero, frees the memory block pointed to by ptr.
  161.59 + * @param size Size in bytes for the memory block to be allocated or
  161.60 + * reallocated.
  161.61 + * @param ptr Pointer to a memory block already allocated with
  161.62 + * av_malloc(z)() or av_realloc() or NULL.
  161.63 + * @return Pointer to a newly reallocated block or NULL if the block
  161.64 + * cannot be reallocated or the function is used to free the memory block.
  161.65 + * @see av_fast_realloc()
  161.66 + */
  161.67 +void *av_realloc(void *ptr, unsigned int size);
  161.68 +
  161.69 +/**
  161.70 + * Reallocates the given block if it is not large enough, otherwise it
  161.71 + * does nothing.
  161.72 + *
  161.73 + * @see av_realloc
  161.74 + */
  161.75 +void *av_fast_realloc(void *ptr, unsigned int *size, unsigned int min_size);
  161.76 +
  161.77 +/**
  161.78 + * Allocates a buffer, reusing the given one if large enough.
  161.79 + *
  161.80 + * Contrary to av_fast_realloc the current buffer contents might not be
  161.81 + * preserved and on error the old buffer is freed, thus no special
  161.82 + * handling to avoid memleaks is necessary.
  161.83 + *
  161.84 + * @param ptr pointer to pointer to already allocated buffer, overwritten with pointer to new buffer
  161.85 + * @param size size of the buffer *ptr points to
  161.86 + * @param min_size minimum size of *ptr buffer after returning, *ptr will be NULL and
  161.87 + *                 *size 0 if an error occurred.
  161.88 + */
  161.89 +void av_fast_malloc(void *ptr, unsigned int *size, unsigned int min_size);
  161.90 +
  161.91 +/**
  161.92 + * Frees a memory block which has been allocated with av_malloc(z)() or
  161.93 + * av_realloc().
  161.94 + * @param ptr Pointer to the memory block which should be freed.
  161.95 + * @note ptr = NULL is explicitly allowed.
  161.96 + * @note It is recommended that you use av_freep() instead.
  161.97 + * @see av_freep()
  161.98 + */
  161.99 +
 161.100 +void av_free(void *ptr);
 161.101 +
 161.102 +/**
 161.103 + * Allocates a block of size bytes with alignment suitable for all
 161.104 + * memory accesses (including vectors if available on the CPU) and
 161.105 + * zeroes all the bytes of the block.
 161.106 + * @param size Size in bytes for the memory block to be allocated.
 161.107 + * @return Pointer to the allocated block, NULL if it cannot be allocated.
 161.108 + * @see av_malloc()
 161.109 + */
 161.110 +void *av_mallocz(unsigned int size) av_malloc_attrib;
 161.111 +
 161.112 +/**
 161.113 + * Duplicates the string s.
 161.114 + * @param s string to be duplicated
 161.115 + * @return Pointer to a newly allocated string containing a
 161.116 + * copy of s or NULL if the string cannot be allocated.
 161.117 + */
 161.118 +char *av_strdup(const char *s) av_malloc_attrib;
 161.119 +
 161.120 +/**
 161.121 + * Frees a memory block which has been allocated with av_malloc(z)() or
  161.122 + * av_realloc() and sets the pointer pointing to it to NULL.
 161.123 + * @param ptr Pointer to the pointer to the memory block which should
 161.124 + * be freed.
 161.125 + * @see av_free()
 161.126 + */
 161.127 +void av_freep(void *ptr);
 161.128 +
 161.129 +
 161.130 +static av_always_inline uint32_t pack16to32(int a, int b){
 161.131 +#if HAVE_BIGENDIAN
 161.132 +   return (b&0xFFFF) + (a<<16);
 161.133 +#else
 161.134 +   return (a&0xFFFF) + (b<<16);
 161.135 +#endif
 161.136 +}
 161.137 +
 161.138 +static av_always_inline uint16_t pack8to16(int a, int b){
 161.139 +#if HAVE_BIGENDIAN
 161.140 +   return (b&0xFF) + (a<<8);
 161.141 +#else
 161.142 +   return (a&0xFF) + (b<<8);
 161.143 +#endif
 161.144 +}
 161.145 +
 161.146 +#endif /* AVUTIL_MEM_H */

   162.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   162.2 +++ b/ffmpeg_smp/h264dec/libavutil/pixfmt.h	Mon Aug 27 12:09:56 2012 +0200
   162.3 @@ -0,0 +1,161 @@
   162.4 +/*
   162.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   162.6 + *
   162.7 + * This file is part of FFmpeg.
   162.8 + *
   162.9 + * FFmpeg is free software; you can redistribute it and/or
  162.10 + * modify it under the terms of the GNU Lesser General Public
  162.11 + * License as published by the Free Software Foundation; either
  162.12 + * version 2.1 of the License, or (at your option) any later version.
  162.13 + *
  162.14 + * FFmpeg is distributed in the hope that it will be useful,
  162.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  162.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  162.17 + * Lesser General Public License for more details.
  162.18 + *
  162.19 + * You should have received a copy of the GNU Lesser General Public
  162.20 + * License along with FFmpeg; if not, write to the Free Software
  162.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  162.22 + */
  162.23 +
  162.24 +#ifndef AVUTIL_PIXFMT_H
  162.25 +#define AVUTIL_PIXFMT_H
  162.26 +
  162.27 +/**
  162.28 + * @file
  162.29 + * pixel format definitions
  162.30 + *
  162.31 + * @warning This file has to be considered an internal but installed
  162.32 + * header, so it should not be directly included in your projects.
  162.33 + */
  162.34 +
  162.35 +/**
  162.36 + * Pixel format. Notes:
  162.37 + *
  162.38 + * PIX_FMT_RGB32 is handled in an endian-specific manner. An RGBA
  162.39 + * color is put together as:
  162.40 + *  (A << 24) | (R << 16) | (G << 8) | B
  162.41 + * This is stored as BGRA on little-endian CPU architectures and ARGB on
  162.42 + * big-endian CPUs.
  162.43 + *
  162.44 + * When the pixel format is palettized RGB (PIX_FMT_PAL8), the palettized
  162.45 + * image data is stored in AVFrame.data[0]. The palette is transported in
  162.46 + * AVFrame.data[1], is 1024 bytes long (256 4-byte entries) and is
  162.47 + * formatted the same as in PIX_FMT_RGB32 described above (i.e., it is
  162.48 + * also endian-specific). Note also that the individual RGB palette
  162.49 + * components stored in AVFrame.data[1] should be in the range 0..255.
  162.50 + * This is important as many custom PAL8 video codecs that were designed
  162.51 + * to run on the IBM VGA graphics adapter use 6-bit palette components.
  162.52 + *
  162.53 + * For all the 8bit per pixel formats, an RGB32 palette is in data[1] like
  162.54 + * for pal8. This palette is filled in automatically by the function
  162.55 + * allocating the picture.
  162.56 + *
   162.57 + * Note: make sure that all newly added big-endian formats have pix_fmt&1==1
   162.58 + *       and that all newly added little-endian formats have pix_fmt&1==0;
   162.59 + *       this allows simpler detection of big vs. little endian.
  162.60 + */
  162.61 +enum PixelFormat {
  162.62 +    PIX_FMT_NONE= -1,
  162.63 +    PIX_FMT_YUV420P,   ///< planar YUV 4:2:0, 12bpp, (1 Cr & Cb sample per 2x2 Y samples)
  162.64 +    PIX_FMT_YUYV422,   ///< packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
  162.65 +    PIX_FMT_RGB24,     ///< packed RGB 8:8:8, 24bpp, RGBRGB...
  162.66 +    PIX_FMT_BGR24,     ///< packed RGB 8:8:8, 24bpp, BGRBGR...
  162.67 +    PIX_FMT_YUV422P,   ///< planar YUV 4:2:2, 16bpp, (1 Cr & Cb sample per 2x1 Y samples)
  162.68 +    PIX_FMT_YUV444P,   ///< planar YUV 4:4:4, 24bpp, (1 Cr & Cb sample per 1x1 Y samples)
  162.69 +    PIX_FMT_YUV410P,   ///< planar YUV 4:1:0,  9bpp, (1 Cr & Cb sample per 4x4 Y samples)
  162.70 +    PIX_FMT_YUV411P,   ///< planar YUV 4:1:1, 12bpp, (1 Cr & Cb sample per 4x1 Y samples)
  162.71 +    PIX_FMT_GRAY8,     ///<        Y        ,  8bpp
  162.72 +    PIX_FMT_MONOWHITE, ///<        Y        ,  1bpp, 0 is white, 1 is black
  162.73 +    PIX_FMT_MONOBLACK, ///<        Y        ,  1bpp, 0 is black, 1 is white
  162.74 +    PIX_FMT_PAL8,      ///< 8 bit with PIX_FMT_RGB32 palette
  162.75 +    PIX_FMT_YUVJ420P,  ///< planar YUV 4:2:0, 12bpp, full scale (JPEG)
  162.76 +    PIX_FMT_YUVJ422P,  ///< planar YUV 4:2:2, 16bpp, full scale (JPEG)
  162.77 +    PIX_FMT_YUVJ444P,  ///< planar YUV 4:4:4, 24bpp, full scale (JPEG)
  162.78 +    PIX_FMT_XVMC_MPEG2_MC,///< XVideo Motion Acceleration via common packet passing
  162.79 +    PIX_FMT_XVMC_MPEG2_IDCT,
  162.80 +    PIX_FMT_UYVY422,   ///< packed YUV 4:2:2, 16bpp, Cb Y0 Cr Y1
  162.81 +    PIX_FMT_UYYVYY411, ///< packed YUV 4:1:1, 12bpp, Cb Y0 Y1 Cr Y2 Y3
  162.82 +    PIX_FMT_BGR8,      ///< packed RGB 3:3:2,  8bpp, (msb)2B 3G 3R(lsb)
  162.83 +    PIX_FMT_BGR4,      ///< packed RGB 1:2:1,  4bpp, (msb)1B 2G 1R(lsb)
  162.84 +    PIX_FMT_BGR4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1B 2G 1R(lsb)
  162.85 +    PIX_FMT_RGB8,      ///< packed RGB 3:3:2,  8bpp, (msb)2R 3G 3B(lsb)
  162.86 +    PIX_FMT_RGB4,      ///< packed RGB 1:2:1,  4bpp, (msb)1R 2G 1B(lsb)
  162.87 +    PIX_FMT_RGB4_BYTE, ///< packed RGB 1:2:1,  8bpp, (msb)1R 2G 1B(lsb)
  162.88 +    PIX_FMT_NV12,      ///< planar YUV 4:2:0, 12bpp, 1 plane for Y and 1 for UV
  162.89 +    PIX_FMT_NV21,      ///< as above, but U and V bytes are swapped
  162.90 +
  162.91 +    PIX_FMT_ARGB,      ///< packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
  162.92 +    PIX_FMT_RGBA,      ///< packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
  162.93 +    PIX_FMT_ABGR,      ///< packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
  162.94 +    PIX_FMT_BGRA,      ///< packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
  162.95 +
  162.96 +    PIX_FMT_GRAY16BE,  ///<        Y        , 16bpp, big-endian
  162.97 +    PIX_FMT_GRAY16LE,  ///<        Y        , 16bpp, little-endian
  162.98 +    PIX_FMT_YUV440P,   ///< planar YUV 4:4:0 (1 Cr & Cb sample per 1x2 Y samples)
  162.99 +    PIX_FMT_YUVJ440P,  ///< planar YUV 4:4:0 full scale (JPEG)
 162.100 +    PIX_FMT_YUVA420P,  ///< planar YUV 4:2:0, 20bpp, (1 Cr & Cb sample per 2x2 Y & A samples)
 162.101 +    PIX_FMT_VDPAU_H264,///< H.264 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.102 +    PIX_FMT_VDPAU_MPEG1,///< MPEG-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.103 +    PIX_FMT_VDPAU_MPEG2,///< MPEG-2 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.104 +    PIX_FMT_VDPAU_WMV3,///< WMV3 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.105 +    PIX_FMT_VDPAU_VC1, ///< VC-1 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.106 +    PIX_FMT_RGB48BE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, big-endian
 162.107 +    PIX_FMT_RGB48LE,   ///< packed RGB 16:16:16, 48bpp, 16R, 16G, 16B, little-endian
 162.108 +
 162.109 +    PIX_FMT_RGB565BE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), big-endian
 162.110 +    PIX_FMT_RGB565LE,  ///< packed RGB 5:6:5, 16bpp, (msb)   5R 6G 5B(lsb), little-endian
 162.111 +    PIX_FMT_RGB555BE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), big-endian, most significant bit to 0
 162.112 +    PIX_FMT_RGB555LE,  ///< packed RGB 5:5:5, 16bpp, (msb)1A 5R 5G 5B(lsb), little-endian, most significant bit to 0
 162.113 +
 162.114 +    PIX_FMT_BGR565BE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), big-endian
 162.115 +    PIX_FMT_BGR565LE,  ///< packed BGR 5:6:5, 16bpp, (msb)   5B 6G 5R(lsb), little-endian
 162.116 +    PIX_FMT_BGR555BE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), big-endian, most significant bit to 1
 162.117 +    PIX_FMT_BGR555LE,  ///< packed BGR 5:5:5, 16bpp, (msb)1A 5B 5G 5R(lsb), little-endian, most significant bit to 1
 162.118 +
 162.119 +    PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
 162.120 +    PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
 162.121 +    PIX_FMT_VAAPI_VLD,  ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.122 +
 162.123 +    PIX_FMT_YUV420P16LE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
 162.124 +    PIX_FMT_YUV420P16BE,  ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
 162.125 +    PIX_FMT_YUV422P16LE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), little-endian
 162.126 +    PIX_FMT_YUV422P16BE,  ///< planar YUV 4:2:2, 32bpp, (1 Cr & Cb sample per 2x1 Y samples), big-endian
 162.127 +    PIX_FMT_YUV444P16LE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), little-endian
 162.128 +    PIX_FMT_YUV444P16BE,  ///< planar YUV 4:4:4, 48bpp, (1 Cr & Cb sample per 1x1 Y samples), big-endian
 162.129 +    PIX_FMT_VDPAU_MPEG4,  ///< MPEG4 HW decoding with VDPAU, data[0] contains a vdpau_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
 162.130 +    PIX_FMT_DXVA2_VLD,    ///< HW decoding through DXVA2, Picture.data[3] contains a LPDIRECT3DSURFACE9 pointer
 162.131 +
 162.132 +    PIX_FMT_RGB444BE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), big-endian, most significant bits to 0
 162.133 +    PIX_FMT_RGB444LE,  ///< packed RGB 4:4:4, 16bpp, (msb)4A 4R 4G 4B(lsb), little-endian, most significant bits to 0
 162.134 +    PIX_FMT_BGR444BE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), big-endian, most significant bits to 1
 162.135 +    PIX_FMT_BGR444LE,  ///< packed BGR 4:4:4, 16bpp, (msb)4A 4B 4G 4R(lsb), little-endian, most significant bits to 1
 162.136 +    PIX_FMT_Y400A,     ///< 8bit gray, 8bit alpha
 162.137 +    PIX_FMT_NB,        ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
 162.138 +};
 162.139 +
 162.140 +#if HAVE_BIGENDIAN
 162.141 +#   define PIX_FMT_NE(be, le) PIX_FMT_##be
 162.142 +#else
 162.143 +#   define PIX_FMT_NE(be, le) PIX_FMT_##le
 162.144 +#endif
 162.145 +
 162.146 +#define PIX_FMT_RGB32   PIX_FMT_NE(ARGB, BGRA)
 162.147 +#define PIX_FMT_RGB32_1 PIX_FMT_NE(RGBA, ABGR)
 162.148 +#define PIX_FMT_BGR32   PIX_FMT_NE(ABGR, RGBA)
 162.149 +#define PIX_FMT_BGR32_1 PIX_FMT_NE(BGRA, ARGB)
 162.150 +
 162.151 +#define PIX_FMT_GRAY16 PIX_FMT_NE(GRAY16BE, GRAY16LE)
 162.152 +#define PIX_FMT_RGB48  PIX_FMT_NE(RGB48BE,  RGB48LE)
 162.153 +#define PIX_FMT_RGB565 PIX_FMT_NE(RGB565BE, RGB565LE)
 162.154 +#define PIX_FMT_RGB555 PIX_FMT_NE(RGB555BE, RGB555LE)
 162.155 +#define PIX_FMT_RGB444 PIX_FMT_NE(RGB444BE, RGB444LE)
 162.156 +#define PIX_FMT_BGR565 PIX_FMT_NE(BGR565BE, BGR565LE)
 162.157 +#define PIX_FMT_BGR555 PIX_FMT_NE(BGR555BE, BGR555LE)
 162.158 +#define PIX_FMT_BGR444 PIX_FMT_NE(BGR444BE, BGR444LE)
 162.159 +
 162.160 +#define PIX_FMT_YUV420P16 PIX_FMT_NE(YUV420P16BE, YUV420P16LE)
 162.161 +#define PIX_FMT_YUV422P16 PIX_FMT_NE(YUV422P16BE, YUV422P16LE)
 162.162 +#define PIX_FMT_YUV444P16 PIX_FMT_NE(YUV444P16BE, YUV444P16LE)
 162.163 +
 162.164 +#endif /* AVUTIL_PIXFMT_H */

   163.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   163.2 +++ b/ffmpeg_smp/h264dec/libavutil/ppc/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
   163.3 @@ -0,0 +1,108 @@
   163.4 +/*
   163.5 + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
   163.6 + *
   163.7 + * This file is part of FFmpeg.
   163.8 + *
   163.9 + * FFmpeg is free software; you can redistribute it and/or
  163.10 + * modify it under the terms of the GNU Lesser General Public
  163.11 + * License as published by the Free Software Foundation; either
  163.12 + * version 2.1 of the License, or (at your option) any later version.
  163.13 + *
  163.14 + * FFmpeg is distributed in the hope that it will be useful,
  163.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  163.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  163.17 + * Lesser General Public License for more details.
  163.18 + *
  163.19 + * You should have received a copy of the GNU Lesser General Public
  163.20 + * License along with FFmpeg; if not, write to the Free Software
  163.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  163.22 + */
  163.23 +
  163.24 +#ifndef AVUTIL_PPC_INTREADWRITE_H
  163.25 +#define AVUTIL_PPC_INTREADWRITE_H
  163.26 +
  163.27 +#include <stdint.h>
  163.28 +#include "config.h"
  163.29 +
  163.30 +#if HAVE_XFORM_ASM
  163.31 +
  163.32 +#define AV_RL16 AV_RL16
  163.33 +static av_always_inline uint16_t AV_RL16(const void *p)
  163.34 +{
  163.35 +    uint16_t v;
  163.36 +    __asm__ ("lhbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint16_t*)p));
  163.37 +    return v;
  163.38 +}
  163.39 +
  163.40 +#define AV_WL16 AV_WL16
  163.41 +static av_always_inline void AV_WL16(void *p, uint16_t v)
  163.42 +{
  163.43 +    __asm__ ("sthbrx  %1, %y0" : "=Z"(*(uint16_t*)p) : "r"(v));
  163.44 +}
  163.45 +
  163.46 +#define AV_RL32 AV_RL32
  163.47 +static av_always_inline uint32_t AV_RL32(const void *p)
  163.48 +{
  163.49 +    uint32_t v;
  163.50 +    __asm__ ("lwbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint32_t*)p));
  163.51 +    return v;
  163.52 +}
  163.53 +
  163.54 +#define AV_WL32 AV_WL32
  163.55 +static av_always_inline void AV_WL32(void *p, uint32_t v)
  163.56 +{
  163.57 +    __asm__ ("stwbrx  %1, %y0" : "=Z"(*(uint32_t*)p) : "r"(v));
  163.58 +}
  163.59 +
  163.60 +#if HAVE_LDBRX
  163.61 +
  163.62 +#define AV_RL64 AV_RL64
  163.63 +static av_always_inline uint64_t AV_RL64(const void *p)
  163.64 +{
  163.65 +    uint64_t v;
  163.66 +    __asm__ ("ldbrx   %0, %y1" : "=r"(v) : "Z"(*(const uint64_t*)p));
  163.67 +    return v;
  163.68 +}
  163.69 +
  163.70 +#define AV_WL64 AV_WL64
  163.71 +static av_always_inline void AV_WL64(void *p, uint64_t v)
  163.72 +{
  163.73 +    __asm__ ("stdbrx  %1, %y0" : "=Z"(*(uint64_t*)p) : "r"(v));
  163.74 +}
  163.75 +
  163.76 +#else
  163.77 +
  163.78 +#define AV_RL64 AV_RL64
  163.79 +static av_always_inline uint64_t AV_RL64(const void *p)
  163.80 +{
  163.81 +    union { uint64_t v; uint32_t hl[2]; } v;
  163.82 +    __asm__ ("lwbrx   %0, %y2  \n\t"
  163.83 +             "lwbrx   %1, %y3  \n\t"
  163.84 +             : "=&r"(v.hl[1]), "=r"(v.hl[0])
  163.85 +             : "Z"(*(const uint32_t*)p), "Z"(*((const uint32_t*)p+1)));
  163.86 +    return v.v;
  163.87 +}
  163.88 +
  163.89 +#define AV_WL64 AV_WL64
  163.90 +static av_always_inline void AV_WL64(void *p, uint64_t v)
  163.91 +{
  163.92 +    union { uint64_t v; uint32_t hl[2]; } vv = { v };
  163.93 +    __asm__ ("stwbrx  %2, %y0  \n\t"
  163.94 +             "stwbrx  %3, %y1  \n\t"
  163.95 +             : "=Z"(*(uint32_t*)p), "=Z"(*((uint32_t*)p+1))
  163.96 +             : "r"(vv.hl[1]), "r"(vv.hl[0]));
  163.97 +}
  163.98 +
  163.99 +#endif /* HAVE_LDBRX */
 163.100 +
 163.101 +#endif /* HAVE_XFORM_ASM */
 163.102 +
 163.103 +/*
 163.104 + * GCC fails miserably on the packed struct version which is used by
 163.105 + * default, so we override it here.
 163.106 + */
 163.107 +
 163.108 +#define AV_RB64(p) (*(const uint64_t *)(p))
 163.109 +#define AV_WB64(p, v) (*(uint64_t *)(p) = (v))
 163.110 +
 163.111 +#endif /* AVUTIL_PPC_INTREADWRITE_H */

   164.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   164.2 +++ b/ffmpeg_smp/h264dec/libavutil/ppc/timer.h	Mon Aug 27 12:09:56 2012 +0200
   164.3 @@ -0,0 +1,47 @@
   164.4 +/*
   164.5 + * Copyright (c) 2005 Luca Barbato <lu_zero@gentoo.org>
   164.6 + *
   164.7 + * This file is part of FFmpeg.
   164.8 + *
   164.9 + * FFmpeg is free software; you can redistribute it and/or
  164.10 + * modify it under the terms of the GNU Lesser General Public
  164.11 + * License as published by the Free Software Foundation; either
  164.12 + * version 2.1 of the License, or (at your option) any later version.
  164.13 + *
  164.14 + * FFmpeg is distributed in the hope that it will be useful,
  164.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  164.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  164.17 + * Lesser General Public License for more details.
  164.18 + *
  164.19 + * You should have received a copy of the GNU Lesser General Public
  164.20 + * License along with FFmpeg; if not, write to the Free Software
  164.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  164.22 + */
  164.23 +
  164.24 +#ifndef AVUTIL_PPC_TIMER_H
  164.25 +#define AVUTIL_PPC_TIMER_H
  164.26 +
  164.27 +#include <stdint.h>
  164.28 +
  164.29 +#define AV_READ_TIME read_time
  164.30 +
  164.31 +static inline uint64_t read_time(void)
  164.32 +{
  164.33 +    uint32_t tbu, tbl, temp;
  164.34 +
  164.35 +     /* from section 2.2.1 of the 32-bit PowerPC PEM */
  164.36 +     __asm__ volatile(
  164.37 +         "1:\n"
  164.38 +         "mftbu  %2\n"
  164.39 +         "mftb   %0\n"
  164.40 +         "mftbu  %1\n"
  164.41 +         "cmpw   %2,%1\n"
  164.42 +         "bne    1b\n"
  164.43 +     : "=r"(tbl), "=r"(tbu), "=r"(temp)
  164.44 +     :
  164.45 +     : "cc");
  164.46 +
  164.47 +     return (((uint64_t)tbu)<<32) | (uint64_t)tbl;
  164.48 +}
  164.49 +
  164.50 +#endif /* AVUTIL_PPC_TIMER_H */

   165.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   165.2 +++ b/ffmpeg_smp/h264dec/libavutil/timer.h	Mon Aug 27 12:09:56 2012 +0200
   165.3 @@ -0,0 +1,69 @@
   165.4 +/**
   165.5 + * @file
   165.6 + * high precision timer, useful to profile code
   165.7 + *
   165.8 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   165.9 + *
  165.10 + * This file is part of FFmpeg.
  165.11 + *
  165.12 + * FFmpeg is free software; you can redistribute it and/or
  165.13 + * modify it under the terms of the GNU Lesser General Public
  165.14 + * License as published by the Free Software Foundation; either
  165.15 + * version 2.1 of the License, or (at your option) any later version.
  165.16 + *
  165.17 + * FFmpeg is distributed in the hope that it will be useful,
  165.18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  165.19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  165.20 + * Lesser General Public License for more details.
  165.21 + *
  165.22 + * You should have received a copy of the GNU Lesser General Public
  165.23 + * License along with FFmpeg; if not, write to the Free Software
  165.24 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  165.25 + */
  165.26 +
  165.27 +#ifndef AVUTIL_TIMER_H
  165.28 +#define AVUTIL_TIMER_H
  165.29 +
  165.30 +#include <stdlib.h>
  165.31 +#include <stdint.h>
  165.32 +#include "config.h"
  165.33 +
  165.34 +#if   ARCH_ARM
  165.35 +#   include "arm/timer.h"
  165.36 +#elif ARCH_PPC
  165.37 +#   include "ppc/timer.h"
  165.38 +#elif ARCH_X86
  165.39 +#   include "x86/timer.h"
  165.40 +#endif
  165.41 +
  165.42 +#if !defined(AV_READ_TIME) && HAVE_GETHRTIME
  165.43 +#   define AV_READ_TIME gethrtime
  165.44 +#endif
  165.45 +
  165.46 +#ifdef AV_READ_TIME
  165.47 +#define START_TIMER \
  165.48 +uint64_t tend;\
  165.49 +uint64_t tstart= AV_READ_TIME();\
  165.50 +
  165.51 +#define STOP_TIMER(id) \
  165.52 +tend= AV_READ_TIME();\
  165.53 +{\
  165.54 +    static uint64_t tsum=0;\
  165.55 +    static int tcount=0;\
  165.56 +    static int tskip_count=0;\
  165.57 +    if(tcount<2 || tend - tstart < 8*tsum/tcount || tend - tstart < 2000){\
  165.58 +        tsum+= tend - tstart;\
  165.59 +        tcount++;\
  165.60 +    }else\
  165.61 +        tskip_count++;\
  165.62 +    if(((tcount+tskip_count)&(tcount+tskip_count-1))==0){\
  165.63 +        av_log(NULL, AV_LOG_ERROR, "%"PRIu64" dezicycles in %s, %d runs, %d skips\n",\
  165.64 +               tsum*10/tcount, id, tcount, tskip_count);\
  165.65 +    }\
  165.66 +}
  165.67 +#else
  165.68 +#define START_TIMER
  165.69 +#define STOP_TIMER(id) {}
  165.70 +#endif
  165.71 +
  165.72 +#endif /* AVUTIL_TIMER_H */

   166.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   166.2 +++ b/ffmpeg_smp/h264dec/libavutil/x86/bswap.h	Mon Aug 27 12:09:56 2012 +0200
   166.3 @@ -0,0 +1,61 @@
   166.4 +/*
   166.5 + * This file is part of FFmpeg.
   166.6 + *
   166.7 + * FFmpeg is free software; you can redistribute it and/or
   166.8 + * modify it under the terms of the GNU Lesser General Public
   166.9 + * License as published by the Free Software Foundation; either
  166.10 + * version 2.1 of the License, or (at your option) any later version.
  166.11 + *
  166.12 + * FFmpeg is distributed in the hope that it will be useful,
  166.13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  166.14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  166.15 + * Lesser General Public License for more details.
  166.16 + *
  166.17 + * You should have received a copy of the GNU Lesser General Public
  166.18 + * License along with FFmpeg; if not, write to the Free Software
  166.19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  166.20 + */
  166.21 +
  166.22 +/**
  166.23 + * @file
  166.24 + * byte swapping routines
  166.25 + */
  166.26 +
  166.27 +#ifndef AVUTIL_X86_BSWAP_H
  166.28 +#define AVUTIL_X86_BSWAP_H
  166.29 +
  166.30 +#include <stdint.h>
  166.31 +#include "config.h"
  166.32 +#include "libavutil/attributes.h"
  166.33 +
  166.34 +#define bswap_16 bswap_16 /* self-referential define; presumably lets generic code detect this x86 version with #ifdef -- confirm in libavutil/bswap.h */
  166.35 +static av_always_inline av_const uint16_t bswap_16(uint16_t x) /* returns x with its two bytes exchanged */
  166.36 +{
  166.37 +    __asm__("rorw $8, %0" : "+r"(x)); /* rotating a 16-bit register by 8 bits swaps its high and low byte */
  166.38 +    return x;
  166.39 +}
  166.40 +
  166.41 +#define bswap_32 bswap_32 /* self-referential define; presumably lets generic code detect this x86 version with #ifdef -- confirm in libavutil/bswap.h */
  166.42 +static av_always_inline av_const uint32_t bswap_32(uint32_t x) /* returns x with its four bytes in reverse order */
  166.43 +{
  166.44 +// #if HAVE_BSWAP  (check disabled: the bswap instruction below is used unconditionally)
  166.45 +    __asm__("bswap   %0" : "+r" (x)); /* reverses the byte order of the 32-bit register in place */
  166.46 +// #else  (disabled fallback built from three rotates, kept for reference)
  166.47 +//     __asm__("rorw    $8,  %w0 \n\t"
  166.48 +//             "rorl    $16, %0  \n\t"
  166.49 +//             "rorw    $8,  %w0"
  166.50 +//             : "+r"(x));
  166.51 +// #endif
  166.52 +    return x;
  166.53 +}
  166.54 +
  166.55 +#if ARCH_X86_64 /* this version operates on a 64-bit register operand */
  166.56 +#define bswap_64 bswap_64
  166.57 +static inline uint64_t av_const bswap_64(uint64_t x) /* returns x with its eight bytes in reverse order */
  166.58 +{
  166.59 +    __asm__("bswap  %0" : "+r" (x)); /* x is both input and output: reversed in place, same constraint form as bswap_32 */
  166.60 +    return x;
  166.61 +}
  166.62 +#endif /* ARCH_X86_64 */
  166.63 +
  166.64 +#endif /* AVUTIL_X86_BSWAP_H */

   167.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   167.2 +++ b/ffmpeg_smp/h264dec/libavutil/x86/intreadwrite.h	Mon Aug 27 12:09:56 2012 +0200
   167.3 @@ -0,0 +1,97 @@
   167.4 +/*
   167.5 + * Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
   167.6 + *
   167.7 + * This file is part of FFmpeg.
   167.8 + *
   167.9 + * FFmpeg is free software; you can redistribute it and/or
  167.10 + * modify it under the terms of the GNU Lesser General Public
  167.11 + * License as published by the Free Software Foundation; either
  167.12 + * version 2.1 of the License, or (at your option) any later version.
  167.13 + *
  167.14 + * FFmpeg is distributed in the hope that it will be useful,
  167.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  167.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  167.17 + * Lesser General Public License for more details.
  167.18 + *
  167.19 + * You should have received a copy of the GNU Lesser General Public
  167.20 + * License along with FFmpeg; if not, write to the Free Software
  167.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  167.22 + */
  167.23 +
  167.24 +#ifndef AVUTIL_X86_INTREADWRITE_H
  167.25 +#define AVUTIL_X86_INTREADWRITE_H
  167.26 +
  167.27 +#include <stdint.h>
  167.28 +#include "config.h"
  167.29 +#include "libavutil/attributes.h"
  167.30 +
  167.31 +#if HAVE_MMX
  167.32 +
  167.33 +#if defined(__MMX__)
  167.34 +
  167.35 +#define AV_COPY64 AV_COPY64 /* self-referential define; presumably lets generic code detect this MMX version with #ifdef -- confirm in libavutil/intreadwrite.h */
  167.36 +static av_always_inline void AV_COPY64(void *d, const void *s) /* copies the 64 bits at s to d through MMX register mm0 */
  167.37 +{
  167.38 +    __asm__("movq   %1, %%mm0  \n\t" /* mm0 = *s */
  167.39 +            "movq   %%mm0, %0  \n\t" /* *d = mm0 */
  167.40 +            : "=m"(*(uint64_t*)d)
  167.41 +            : "m" (*(const uint64_t*)s)
  167.42 +            : "mm0"); /* mm0 is clobbered; no emms is issued here -- NOTE(review): callers presumably manage the MMX/x87 state, confirm */
  167.43 +}
  167.44 +
  167.45 +#define AV_SWAP64 AV_SWAP64 /* self-referential define; presumably lets generic code detect this MMX version with #ifdef -- confirm in libavutil/intreadwrite.h */
  167.46 +static av_always_inline void AV_SWAP64(void *a, void *b) /* exchanges the 64-bit values stored at a and b */
  167.47 +{
  167.48 +    __asm__("movq   %1, %%mm0  \n\t" /* mm0 = *b */
  167.49 +            "movq   %0, %%mm1  \n\t" /* mm1 = *a */
  167.50 +            "movq   %%mm0, %0  \n\t" /* *a = old *b */
  167.51 +            "movq   %%mm1, %1  \n\t" /* *b = old *a */
  167.52 +            : "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b) /* both locations are read and written */
  167.53 +            ::"mm0", "mm1"); /* mm0 and mm1 are clobbered; no emms is issued here */
  167.54 +}
  167.55 +
  167.56 +#define AV_ZERO64 AV_ZERO64 /* self-referential define; presumably lets generic code detect this MMX version with #ifdef -- confirm in libavutil/intreadwrite.h */
  167.57 +static av_always_inline void AV_ZERO64(void *d) /* stores 64 zero bits at d through MMX register mm0 */
  167.58 +{
  167.59 +    __asm__("pxor %%mm0, %%mm0  \n\t" /* mm0 = 0 (a register XORed with itself) */
  167.60 +            "movq %%mm0, %0     \n\t" /* *d = 0 */
  167.61 +            : "=m"(*(uint64_t*)d)
  167.62 +            :: "mm0"); /* mm0 is clobbered; no emms is issued here */
  167.63 +}
  167.64 +
  167.65 +#endif /* defined(__MMX__) */
  167.66 +
  167.67 +#ifdef __SSE__
  167.68 +
  167.69 +#define AV_COPY128 AV_COPY128 /* self-referential define; presumably lets generic code detect this SSE version with #ifdef -- confirm in libavutil/intreadwrite.h */
  167.70 +static av_always_inline void AV_COPY128(void *d, const void *s) /* copies the 128 bits at s to d through SSE register xmm0 */
  167.71 +{
  167.72 +    struct v {uint64_t v[2];}; /* 16-byte type so the memory operands describe all 128 bits that are accessed */
  167.73 +
  167.74 +    __asm__("movaps   %1, %%xmm0  \n\t" /* xmm0 = *s; movaps requires a 16-byte aligned memory operand */
  167.75 +            "movaps   %%xmm0, %0  \n\t" /* *d = xmm0; same alignment requirement for d */
  167.76 +            : "=m"(*(struct v*)d)
  167.77 +            : "m" (*(const struct v*)s)
  167.78 +            : "xmm0"); /* xmm0 is clobbered */
  167.79 +}
  167.80 +
  167.81 +#endif /* __SSE__ */
  167.82 +
  167.83 +#ifdef __SSE2__
  167.84 +
  167.85 +#define AV_ZERO128 AV_ZERO128 /* self-referential define; presumably lets generic code detect this SSE2 version with #ifdef -- confirm in libavutil/intreadwrite.h */
  167.86 +static av_always_inline void AV_ZERO128(void *d) /* stores 128 zero bits at d through SSE register xmm0 */
  167.87 +{
  167.88 +    struct v {uint64_t v[2];}; /* 16-byte type so the memory operand describes all 128 bits that are written */
  167.89 +
  167.90 +    __asm__("pxor %%xmm0, %%xmm0  \n\t" /* xmm0 = 0 (a register XORed with itself) */
  167.91 +            "movdqa   %%xmm0, %0  \n\t" /* *d = 0; movdqa requires a 16-byte aligned memory operand */
  167.92 +            : "=m"(*(struct v*)d)
  167.93 +            :: "xmm0"); /* xmm0 is clobbered */
  167.94 +}
  167.95 +
  167.96 +#endif /* __SSE2__ */
  167.97 +
  167.98 +#endif /* HAVE_MMX */
  167.99 +
 167.100 +#endif /* AVUTIL_X86_INTREADWRITE_H */

   168.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   168.2 +++ b/ffmpeg_smp/h264dec/libavutil/x86/timer.h	Mon Aug 27 12:09:56 2012 +0200
   168.3 @@ -0,0 +1,35 @@
   168.4 +/*
   168.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   168.6 + *
   168.7 + * This file is part of FFmpeg.
   168.8 + *
   168.9 + * FFmpeg is free software; you can redistribute it and/or
  168.10 + * modify it under the terms of the GNU Lesser General Public
  168.11 + * License as published by the Free Software Foundation; either
  168.12 + * version 2.1 of the License, or (at your option) any later version.
  168.13 + *
  168.14 + * FFmpeg is distributed in the hope that it will be useful,
  168.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  168.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  168.17 + * Lesser General Public License for more details.
  168.18 + *
  168.19 + * You should have received a copy of the GNU Lesser General Public
  168.20 + * License along with FFmpeg; if not, write to the Free Software
  168.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  168.22 + */
  168.23 +
  168.24 +#ifndef AVUTIL_X86_TIMER_H
  168.25 +#define AVUTIL_X86_TIMER_H
  168.26 +
  168.27 +#include <stdint.h>
  168.28 +
  168.29 +#define AV_READ_TIME read_time
  168.30 +
  168.31 +static inline uint64_t read_time(void) /* returns the value delivered by the rdtsc instruction as one 64-bit number */
  168.32 +{
  168.33 +    uint32_t lo, hi; /* rdtsc output: low half bound to EAX ("=a"), high half bound to EDX ("=d") */
  168.34 +    __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
  168.35 +    return ((uint64_t)hi << 32) | lo; /* the shifted high half has zero low bits, so OR joins the halves */
  168.36 +}
  168.37 +
  168.38 +#endif /* AVUTIL_X86_TIMER_H */

   169.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
   169.2 +++ b/ffmpeg_smp/h264dec/libavutil/x86_cpu.h	Mon Aug 27 12:09:56 2012 +0200
   169.3 @@ -0,0 +1,73 @@
   169.4 +/*
   169.5 + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
   169.6 + *
   169.7 + * This file is part of FFmpeg.
   169.8 + *
   169.9 + * FFmpeg is free software; you can redistribute it and/or
  169.10 + * modify it under the terms of the GNU Lesser General Public
  169.11 + * License as published by the Free Software Foundation; either
  169.12 + * version 2.1 of the License, or (at your option) any later version.
  169.13 + *
  169.14 + * FFmpeg is distributed in the hope that it will be useful,
  169.15 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  169.16 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  169.17 + * Lesser General Public License for more details.
  169.18 + *
  169.19 + * You should have received a copy of the GNU Lesser General Public
  169.20 + * License along with FFmpeg; if not, write to the Free Software
  169.21 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  169.22 + */
  169.23 +
  169.24 +#ifndef AVUTIL_X86_CPU_H
  169.25 +#define AVUTIL_X86_CPU_H
  169.26 +
  169.27 +#include <stdint.h>
  169.28 +#include "config.h"
  169.29 +
  169.30 +#if ARCH_X86_64 /* 64-bit build: 64-bit register names and a 64-bit x86_reg */
  169.31 +#    define REG_a "rax" /* REG_* are quoted register names; presumably pasted into inline-asm strings -- confirm at the use sites */
  169.32 +#    define REG_b "rbx"
  169.33 +#    define REG_c "rcx"
  169.34 +#    define REG_d "rdx"
  169.35 +#    define REG_D "rdi"
  169.36 +#    define REG_S "rsi"
  169.37 +#    define PTR_SIZE "8" /* pointer size in bytes, as a string literal */
  169.38 +typedef int64_t x86_reg; /* signed integer type matching the 64-bit registers named here */
  169.39 +
  169.40 +#    define REG_SP "rsp"
  169.41 +#    define REG_BP "rbp"
  169.42 +#    define REGBP   rbp /* REG* without underscore are the same register names as bare tokens */
  169.43 +#    define REGa    rax
  169.44 +#    define REGb    rbx
  169.45 +#    define REGc    rcx
  169.46 +#    define REGd    rdx
  169.47 +#    define REGSP   rsp
  169.48 +
  169.49 +#elif ARCH_X86_32 /* 32-bit build: same macro set with the 32-bit register names */
  169.50 +
  169.51 +#    define REG_a "eax"
  169.52 +#    define REG_b "ebx"
  169.53 +#    define REG_c "ecx"
  169.54 +#    define REG_d "edx"
  169.55 +#    define REG_D "edi"
  169.56 +#    define REG_S "esi"
  169.57 +#    define PTR_SIZE "4" /* pointer size in bytes, as a string literal */
  169.58 +typedef int32_t x86_reg; /* signed integer type matching the 32-bit registers named here */
  169.59 +
  169.60 +#    define REG_SP "esp"
  169.61 +#    define REG_BP "ebp"
  169.62 +#    define REGBP   ebp
  169.63 +#    define REGa    eax
  169.64 +#    define REGb    ebx
  169.65 +#    define REGc    ecx
  169.66 +#    define REGd    edx
  169.67 +#    define REGSP   esp
  169.68 +#else /* neither x86 flavour: only the x86_reg type is provided, no register-name macros */
  169.69 +typedef int x86_reg;
  169.70 +#endif /* ARCH_X86_64 / ARCH_X86_32 */
  169.72 +// #if ARCH_X86_64 && defined(PIC)
  169.73 +// #    define BROKEN_RELOCATIONS 1
  169.74 +// #endif
  169.75 +
  169.76 +#endif /* AVUTIL_X86_CPU_H */