diff libavcodec/h264_deblock.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/h264_deblock.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,507 @@
     1.4 +/*
     1.5 + * H.26L/H.264/AVC/JVT/14496-10/... loop filter
     1.6 + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
     1.7 + *
     1.8 + * This file is part of FFmpeg.
     1.9 + *
    1.10 + * FFmpeg is free software; you can redistribute it and/or
    1.11 + * modify it under the terms of the GNU Lesser General Public
    1.12 + * License as published by the Free Software Foundation; either
    1.13 + * version 2.1 of the License, or (at your option) any later version.
    1.14 + *
    1.15 + * FFmpeg is distributed in the hope that it will be useful,
    1.16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
    1.17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    1.18 + * Lesser General Public License for more details.
    1.19 + *
    1.20 + * You should have received a copy of the GNU Lesser General Public
    1.21 + * License along with FFmpeg; if not, write to the Free Software
    1.22 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    1.23 + */
    1.24 +
    1.25 +/**
    1.26 + * @file
    1.27 + * H.264 / AVC / MPEG4 part10 loop filter.
    1.28 + * @author Michael Niedermayer <michaelni@gmx.at>
    1.29 + */
    1.30 +
    1.31 +#include "dsputil.h"
    1.32 +#include "mathops.h"
    1.33 +#include "rectangle.h"
    1.34 +#include "h264_types.h"
    1.35 +#include "h264_misc.h"
    1.36 +#include "h264_data.h"
    1.37 +//#undef NDEBUG
    1.38 +#include <assert.h>
    1.39 +
    1.40 +/* Deblocking filter (p153): alpha threshold, laid out as 3 runs of 52 entries -- 52 zeros, the 52 varying values, then 52 copies of the final 255. NOTE(review): the padding presumably lets qp + slice_alpha_c0_offset index the table without clamping; confirm how that offset is biased. */
    1.41 +static const uint8_t alpha_table[52*3] = {
    1.42 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.43 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.44 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.45 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.46 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.47 +     0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
    1.48 +     7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
    1.49 +    25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
    1.50 +    80, 90,101,113,127,144,162,182,203,226,
    1.51 +   255,255,
    1.52 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
    1.53 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
    1.54 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
    1.55 +   255,255,255,255,255,255,255,255,255,255,255,255,255,
    1.56 +};
    1.57 +static const uint8_t beta_table[52*3] = { /* beta threshold, same 3x52 layout as alpha_table: 52 zeros, 52 varying values, 52 copies of the final 18; indexed with qp + slice_beta_offset */
    1.58 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.59 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.60 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.61 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.62 +     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    1.63 +     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
    1.64 +     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
    1.65 +     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
    1.66 +    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
    1.67 +    18, 18,
    1.68 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    1.69 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    1.70 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    1.71 +    18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
    1.72 +};
    1.73 +static const uint8_t tc0_table[52*3][4] = { /* tc0 clipping values, indexed [qp + slice_alpha_c0_offset][bS] with the same 3x52 layout as alpha_table; column 0 (bS == 0) is written as -1, which a uint8_t stores as 255 -- the edge filters narrow it back through int8_t */
    1.74 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.75 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.76 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.77 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.78 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.79 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.80 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.81 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.82 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.83 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.84 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
    1.85 +    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
    1.86 +    {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
    1.87 +    {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
    1.88 +    {-1, 1, 1, 2 }, {-1, 1, 2, 3 }, {-1, 1, 2, 3 }, {-1, 2, 2, 3 }, {-1, 2, 2, 4 }, {-1, 2, 3, 4 },
    1.89 +    {-1, 2, 3, 4 }, {-1, 3, 3, 5 }, {-1, 3, 4, 6 }, {-1, 3, 4, 6 }, {-1, 4, 5, 7 }, {-1, 4, 5, 8 },
    1.90 +    {-1, 4, 6, 9 }, {-1, 5, 7,10 }, {-1, 6, 8,11 }, {-1, 6, 8,13 }, {-1, 7,10,14 }, {-1, 8,11,16 },
    1.91 +    {-1, 9,12,18 }, {-1,10,13,20 }, {-1,11,15,23 }, {-1,13,17,25 },
    1.92 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.93 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.94 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.95 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.96 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.97 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.98 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
    1.99 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   1.100 +    {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
   1.101 +};
   1.102 +/* Filter one luma edge with the h264_h_loop_filter_luma{,_intra} DSP hooks (used by filter_mb_dir() for dir == 0). bS[0..3] give four strength values; qp plus the slice offsets in s select alpha, beta and tc0. Returns without filtering when alpha or beta is 0. */
   1.103 +av_always_inline static void filter_mb_edgev( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s) {
   1.104 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset; /* shared index into alpha_table and tc0_table; no bounds check is done here */
   1.105 +    const int alpha = alpha_table[index_a];
   1.106 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   1.107 +    if (alpha ==0 || beta == 0) return; /* skip the DSP call entirely when either threshold is 0 */
   1.108 +
   1.109 +    if( bS[0] < 4 ) { /* only bS[0] is tested to choose between the tc-based filter and the intra filter */
   1.110 +        int8_t tc[4]; /* the table's -1 entries (stored as 255 in uint8_t) are narrowed back to a signed value here */
   1.111 +        tc[0] = tc0_table[index_a][bS[0]];
   1.112 +        tc[1] = tc0_table[index_a][bS[1]];
   1.113 +        tc[2] = tc0_table[index_a][bS[2]];
   1.114 +        tc[3] = tc0_table[index_a][bS[3]];
   1.115 +        mrc->hdsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
   1.116 +    } else {
   1.117 +        mrc->hdsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
   1.118 +    }
   1.119 +}
   1.120 +/* Chroma counterpart of filter_mb_edgev(): same threshold lookup, but each tc value is tc0 + 1 and the h264_h_loop_filter_chroma{,_intra} DSP hooks are called. Returns without filtering when alpha or beta is 0. */
   1.121 +av_always_inline static void filter_mb_edgecv( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
   1.122 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset; /* shared index into alpha_table and tc0_table; no bounds check is done here */
   1.123 +    const int alpha = alpha_table[index_a];
   1.124 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   1.125 +    if (alpha ==0 || beta == 0) return; /* skip the DSP call entirely when either threshold is 0 */
   1.126 +
   1.127 +    if( bS[0] < 4 ) { /* only bS[0] is tested to choose between the tc-based filter and the intra filter */
   1.128 +        int8_t tc[4]; /* for bS == 0 the stored 255 becomes 256 after +1 and is then narrowed to int8_t -- NOTE(review): presumably 0 on the supported compilers, confirm */
   1.129 +        tc[0] = tc0_table[index_a][bS[0]]+1;
   1.130 +        tc[1] = tc0_table[index_a][bS[1]]+1;
   1.131 +        tc[2] = tc0_table[index_a][bS[2]]+1;
   1.132 +        tc[3] = tc0_table[index_a][bS[3]]+1;
   1.133 +        mrc->hdsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
   1.134 +    } else {
   1.135 +        mrc->hdsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
   1.136 +    }
   1.137 +}
   1.138 +
   1.139 +/* Filter one luma edge with the h264_v_loop_filter_luma{,_intra} DSP hooks (used by filter_mb_dir() for dir != 0). Threshold and tc handling are the same as in filter_mb_edgev(). Returns without filtering when alpha or beta is 0. */
   1.140 +av_always_inline static void filter_mb_edgeh( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
   1.141 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset; /* shared index into alpha_table and tc0_table; no bounds check is done here */
   1.142 +    const int alpha = alpha_table[index_a];
   1.143 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   1.144 +    if (alpha ==0 || beta == 0) return; /* skip the DSP call entirely when either threshold is 0 */
   1.145 +
   1.146 +    if( bS[0] < 4 ) { /* only bS[0] is tested to choose between the tc-based filter and the intra filter */
   1.147 +        int8_t tc[4]; /* the table's -1 entries (stored as 255 in uint8_t) are narrowed back to a signed value here */
   1.148 +        tc[0] = tc0_table[index_a][bS[0]];
   1.149 +        tc[1] = tc0_table[index_a][bS[1]];
   1.150 +        tc[2] = tc0_table[index_a][bS[2]];
   1.151 +        tc[3] = tc0_table[index_a][bS[3]];
   1.152 +        mrc->hdsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
   1.153 +    } else {
   1.154 +        mrc->hdsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
   1.155 +    }
   1.156 +}
   1.157 +/* Chroma counterpart of filter_mb_edgeh(): same threshold lookup, but each tc value is tc0 + 1 and the h264_v_loop_filter_chroma{,_intra} DSP hooks are called. Returns without filtering when alpha or beta is 0. */
   1.158 +av_always_inline static void filter_mb_edgech( uint8_t *pix, int stride, int16_t bS[4], unsigned int qp, MBRecContext *mrc, H264Slice *s ) {
   1.159 +    const unsigned int index_a = qp + s->slice_alpha_c0_offset; /* shared index into alpha_table and tc0_table; no bounds check is done here */
   1.160 +    const int alpha = alpha_table[index_a];
   1.161 +    const int beta  = beta_table[qp + s->slice_beta_offset];
   1.162 +    if (alpha ==0 || beta == 0) return; /* skip the DSP call entirely when either threshold is 0 */
   1.163 +
   1.164 +    if( bS[0] < 4 ) { /* only bS[0] is tested to choose between the tc-based filter and the intra filter */
   1.165 +        int8_t tc[4]; /* for bS == 0 the stored 255 becomes 256 after +1 and is then narrowed to int8_t -- NOTE(review): presumably 0 on the supported compilers, confirm */
   1.166 +        tc[0] = tc0_table[index_a][bS[0]]+1;
   1.167 +        tc[1] = tc0_table[index_a][bS[1]]+1;
   1.168 +        tc[2] = tc0_table[index_a][bS[2]]+1;
   1.169 +        tc[3] = tc0_table[index_a][bS[3]]+1;
   1.170 +        mrc->hdsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
   1.171 +    } else {
   1.172 +        mrc->hdsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
   1.173 +    }
   1.174 +}
   1.175 +/* Deblock every edge of one macroblock in one direction. dir 0 steps through edges 4 luma columns apart (edgev/edgecv), dir 1 through edges 4 luma rows apart (edgeh/edgech). The bS values and the edge count are read from mrs, as stored by calc_bS_values(). Edge 0 uses the average QP of this macroblock and its left/top neighbour; inner edges use this macroblock's QP. */
   1.176 +static av_always_inline void filter_mb_dir(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, int dir) {
   1.177 +    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type; /* neighbour across edge 0; a zero type skips that edge */
   1.178 +    const int qp_xy= m->qscale_mb_xy;
   1.179 +    const int qp_dir = dir == 0 ? m->qscale_left_mb_xy : m->qscale_top_mb_xy;
   1.180 +    const int linesize = mrc->linesize;
   1.181 +    const int uvlinesize = mrc->uvlinesize;
   1.182 +    const int mb_type = m->mb_type;
   1.183 +    int edge;
   1.184 +    const int edges = mrs->edges[dir];
   1.185 +
   1.186 +    if(mbm_type){
   1.187 +        int16_t* bS=mrs->bS[dir][0];
   1.188 +        /* Filter edge */
   1.189 +        // Do not use s->qscale as luma quantizer because it has not the same
   1.190 +        // value in IPCM macroblocks.
   1.191 +        if(bS[0]+bS[1]+bS[2]+bS[3]){
   1.192 +            int qp = ( qp_xy + qp_dir + 1 ) >> 1; /* rounded average of the two macroblocks' luma QP */
   1.193 +            if( dir == 0 ) {
   1.194 +                filter_mb_edgev( &img_y[0], linesize, bS, qp, mrc, s );
   1.195 +                {
   1.196 +                    int qp_cb= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
   1.197 +                    filter_mb_edgecv( &img_cb[0], uvlinesize, bS, qp_cb, mrc, s);
   1.198 +                    filter_mb_edgecv( &img_cr[0], uvlinesize, bS, ( get_chroma_qp(s, 1, qp_xy) + get_chroma_qp( s, 1, qp_dir) + 1 ) >> 1, mrc, s); /* Cr averages chroma QP table 1, matching the inner edges below */
   1.199 +                }
   1.200 +            } else {
   1.201 +                filter_mb_edgeh( &img_y[0], linesize, bS, qp, mrc, s );
   1.202 +                {
   1.203 +                    int qp_cb= ( get_chroma_qp(s, 0, qp_xy) + get_chroma_qp( s, 0, qp_dir) + 1 ) >> 1;
   1.204 +                    filter_mb_edgech( &img_cb[0], uvlinesize, bS, qp_cb, mrc, s);
   1.205 +                    filter_mb_edgech( &img_cr[0], uvlinesize, bS, ( get_chroma_qp(s, 1, qp_xy) + get_chroma_qp( s, 1, qp_dir) + 1 ) >> 1, mrc, s); /* Cr averages chroma QP table 1, matching the inner edges below */
   1.206 +                }
   1.207 +            }
   1.208 +        }
   1.209 +    }
   1.210 +
   1.211 +    for( edge = 1; edge < edges; edge++ ) {
   1.212 +        int16_t* bS=mrs->bS[dir][edge];
   1.213 +        int qp = qp_xy; /* inner edges lie inside this macroblock, so no averaging */
   1.214 +
   1.215 +        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
   1.216 +            continue;
   1.217 +
   1.218 +        if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
   1.219 +            continue;
   1.220 +
   1.221 +        /* Filter edge */
   1.222 +        // Do not use s->qscale as luma quantizer because it has not the same
   1.223 +        // value in IPCM macroblocks.
   1.224 +
   1.225 +        if( dir == 0 ) {
   1.226 +            filter_mb_edgev( &img_y[4*edge], linesize, bS, qp, mrc, s);
   1.227 +            if( (edge&1) == 0 ) { /* chroma is filtered on even luma edges only, at offset 2*edge */
   1.228 +                filter_mb_edgecv( &img_cb[2*edge], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
   1.229 +                filter_mb_edgecv( &img_cr[2*edge], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
   1.230 +            }
   1.231 +        } else {
   1.232 +            filter_mb_edgeh( &img_y[4*edge*linesize], linesize, bS, qp, mrc, s );
   1.233 +            if( (edge&1) == 0 ) { /* chroma is filtered on even luma edges only, at row 2*edge */
   1.234 +                filter_mb_edgech( &img_cb[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 0, qp_xy), mrc, s);
   1.235 +                filter_mb_edgech( &img_cr[2*edge*uvlinesize], uvlinesize, bS, get_chroma_qp(s, 1, qp_xy), mrc, s);
   1.236 +            }
   1.237 +        }
   1.238 +    }
   1.239 +}
   1.240 +/* Return non-zero when the motion stored at cache entries b_idx and bn_idx (indices into mrs->ref_cache / mrs->mv_cache) differs: different reference, |dx| >= 4, or |dy| >= mvy_limit. When s->list_count is 2 the second list and the crossed list pairing are examined as well. mrc is not used. */
   1.241 +static int check_mv(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, long b_idx, long bn_idx, int mvy_limit){
   1.242 +    int v;
   1.243 +    v= mrs->ref_cache[0][b_idx] != mrs->ref_cache[0][bn_idx];
   1.244 +    if(!v && mrs->ref_cache[0][b_idx]!=-1)
   1.245 +        // |dx| >= 4 (adding 3 and comparing unsigned against 7 covers both signs at once) | |dy| >= mvy_limit
   1.246 +        v= ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
   1.247 +        ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
   1.248 +
   1.249 +    if(s->list_count==2){
   1.250 +        if(!v) /* list 0 matched: apply the same reference and vector test to list 1 */
   1.251 +            v = (mrs->ref_cache[1][b_idx] != mrs->ref_cache[1][bn_idx]) |
   1.252 +            ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
   1.253 +            ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit);
   1.254 +
   1.255 +        if(v){ /* straight pairing differs: retry with the lists crossed (0 against 1, 1 against 0) */
   1.256 +            if((mrs->ref_cache[0][b_idx] != mrs->ref_cache[1][bn_idx]) |
   1.257 +                (mrs->ref_cache[1][b_idx] != mrs->ref_cache[0][bn_idx]))
   1.258 +                return 1;
   1.259 +            return
   1.260 +            ((unsigned) (mrs->mv_cache[0][b_idx][0] - mrs->mv_cache[1][bn_idx][0] + 3) >= 7U) |
   1.261 +            ((FFABS( mrs->mv_cache[0][b_idx][1] - mrs->mv_cache[1][bn_idx][1] )) >= mvy_limit) |
   1.262 +            ((unsigned) (mrs->mv_cache[1][b_idx][0] - mrs->mv_cache[0][bn_idx][0] + 3) >= 7U) |
   1.263 +            ((FFABS( mrs->mv_cache[1][b_idx][1] - mrs->mv_cache[0][bn_idx][1] )) >= mvy_limit);
   1.264 +        }
   1.265 +    }
   1.266 +
   1.267 +    return v;
   1.268 +}
   1.269 +/* Compute mrs->bS[dir][edge][0..3] and mrs->edges[dir] for one macroblock. Edge 0 (against the left/top neighbour) gets 4 when either side is intra; inner edges of an intra macroblock get 3. Otherwise an entry is 2 where either side has a non-zero coefficient count, else the check_mv() result, or 0 on inner edges that mask_edge exempts from the mv check. */
   1.270 +static void calc_bS_values(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mvy_limit, int dir) {
   1.271 +    int mb_type = m->mb_type;
   1.272 +    int edge;
   1.273 +    const int mbm_type = dir == 0 ? mrs->left_type : mrs->top_type;
   1.274 +
   1.275 +    // how often to recheck mv-based bS when iterating between edges
   1.276 +    static const uint8_t mask_edge_tab[2][8]={{0,3,3,3,1,1,1,1},
   1.277 +    {0,3,1,1,3,3,3,3}};
   1.278 +    const int mask_edge = mask_edge_tab[dir][(mb_type>>3)&7];
   1.279 +    const int edges = mask_edge== 3 && !(m->cbp&15) ? 1 : 4; /* only edge 0 is needed when no inner edge needs an mv check and the low four cbp bits are clear */
   1.280 +    // how often to recheck mv-based bS when iterating along each edge
   1.281 +    const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
   1.282 +
   1.283 +    mrs->edges[dir]= edges;
   1.284 +
   1.285 +    if(mbm_type){
   1.286 +        int16_t* bS=mrs->bS[dir][0];
   1.287 +        if( IS_INTRA(mb_type|mbm_type)) {
   1.288 +            AV_WN64A(bS, 0x0004000400040004ULL);
   1.289 +        } else {
   1.290 +            int i;
   1.291 +            int mv_done;
   1.292 +            if( mask_par0 && ((mbm_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
   1.293 +                int b_idx= 8 + 4;
   1.294 +                int bn_idx= b_idx - (dir ? 8:1);
   1.295 +
   1.296 +                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit); /* one mv check covers the whole edge */
   1.297 +                mv_done = 1;
   1.298 +            }
   1.299 +            else
   1.300 +                mv_done = 0;
   1.301 +
   1.302 +            for( i = 0; i < 4; i++ ) {
   1.303 +                int x = dir == 0 ? 0 : i;
   1.304 +                int y = dir == 0 ? i    : 0;
   1.305 +                int b_idx= 8 + 4 + x + 8*y;
   1.306 +                int bn_idx= b_idx - (dir ? 8:1);
   1.307 +
   1.308 +                if( mrs->non_zero_count_cache[b_idx] |
   1.309 +                    mrs->non_zero_count_cache[bn_idx] ) {
   1.310 +                    bS[i] = 2;
   1.311 +                }
   1.312 +                else if(!mv_done)
   1.313 +                {
   1.314 +                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
   1.315 +                }
   1.316 +            }
   1.317 +        }
   1.318 +    }
   1.319 +
   1.320 +    /* Calculate bS */
   1.321 +    for( edge = 1; edge < edges; edge++ ) {
   1.322 +        int16_t* bS=mrs->bS[dir][edge];
   1.323 +
   1.324 +        if( IS_8x8DCT(mb_type & (edge<<24)) ) // (edge&1) && IS_8x8DCT(mb_type)
   1.325 +            continue;
   1.326 +
   1.327 +        if( IS_INTRA(mb_type)) {
   1.328 +            AV_WN64A(bS, 0x0003000300030003ULL);
   1.329 +        } else {
   1.330 +            int i;
   1.331 +            int mv_done;
   1.332 +
   1.333 +            if( edge & mask_edge ) { /* this edge needs no mv check: start from 0 and let only the coefficient test raise it */
   1.334 +                AV_ZERO64(bS);
   1.335 +                mv_done = 1;
   1.336 +            }
   1.337 +            else if( mask_par0 ) {
   1.338 +                int b_idx= 8 + 4 + edge * (dir ? 8:1);
   1.339 +                int bn_idx= b_idx - (dir ? 8:1);
   1.340 +
   1.341 +                bS[0] = bS[1] = bS[2] = bS[3] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
   1.342 +                mv_done = 1;
   1.343 +            }
   1.344 +            else
   1.345 +                mv_done = 0;
   1.346 +
   1.347 +            for( i = 0; i < 4; i++ ) {
   1.348 +                int x = dir == 0 ? edge : i;
   1.349 +                int y = dir == 0 ? i    : edge;
   1.350 +                int b_idx= 8 + 4 + x + 8*y;
   1.351 +                int bn_idx= b_idx - (dir ? 8:1);
   1.352 +
   1.353 +                if( mrs->non_zero_count_cache[b_idx] |
   1.354 +                    mrs->non_zero_count_cache[bn_idx] ) {
   1.355 +                    bS[i] = 2;
   1.356 +                }
   1.357 +                else if(!mv_done)
   1.358 +                {
   1.359 +                    bS[i] = check_mv(mrc, mrs, s, b_idx, bn_idx, mvy_limit);
   1.360 +                }
   1.361 +            }
   1.362 +
   1.363 +            /* No early exit is needed for an all-zero bS[]: nothing follows in this
   1.364 +             * loop body, and filter_mb_dir() re-checks the sum before filtering. */
   1.365 +        }
   1.366 +
   1.367 +    }
   1.368 +}
   1.369 +
   1.370 +
   1.371 +/**
   1.372 +* Decide whether a macroblock needs deblocking and, for non-intra macroblocks, load the ref / mv / non-zero-count caches in mrs that calc_bS_values() reads.
   1.373 +* @return zero if the loop filter can be skipped, non-zero otherwise
   1.374 +*/
   1.375 +static int fill_filter_caches(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, int mb_type){
   1.376 +    H264Mb *m_top = m - mrc->mb_width;
   1.377 +    H264Mb *m_left = m - 1;
   1.378 +    const int mb_x = m->mb_x;
   1.379 +    const int mb_y = m->mb_y;
   1.380 +    int top_type, left_type;
   1.381 +    int qp, top_qp, left_qp;
   1.382 +    int qp_thresh = s->qp_thresh; //FIXME strictly we should store qp_thresh for each mb of a slice
   1.383 +
   1.384 +    qp = m->qscale_mb_xy ;
   1.385 +    left_qp = m->qscale_left_mb_xy ;
   1.386 +    top_qp  = m->qscale_top_mb_xy ;
   1.387 +
   1.388 +    //for sufficiently low qp, filtering wouldn't do anything
   1.389 +    //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
   1.390 +    if(qp <= qp_thresh
   1.391 +        && (!(mb_x+mb_y) || ((qp + left_qp + 1)>>1) <= qp_thresh)
   1.392 +        && ( mb_y==0 || ((qp + top_qp + 1)>>1) <= qp_thresh)){
   1.393 +        return 0;
   1.394 +    }
   1.395 +
   1.396 +    if(IS_INTRA(mb_type)){ /* calc_bS_values() takes its IS_INTRA branches for this macroblock and never reads the caches */
   1.397 +        return 1;
   1.398 +    }
   1.399 +
   1.400 +    {
   1.401 +        int list;
   1.402 +        for(list=0; list<s->list_count; list++){
   1.403 +            int8_t *ref;
   1.404 +
   1.405 +            if(!USES_LIST(mb_type, list)){
   1.406 +                fill_rectangle( mrs->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
   1.407 +                /* the mv cache is filled once above; now flag the reference entries as LIST_NOT_USED */
   1.408 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
   1.409 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
   1.410 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
   1.411 +                AV_WN32A(&mrs->ref_cache[list][scan8[10]], ((LIST_NOT_USED)&0xFF)*0x01010101u);
   1.412 +                continue;
   1.413 +            }
   1.414 +
   1.415 +            ref = &mrs->ref_index[list][4*mb_x];
   1.416 +            {
   1.417 +                int (*ref2frm)[64] =(void *) (s->ref2frm[0] +  2);
   1.418 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 0]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
   1.419 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 2]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
   1.420 +                ref += 2; /* advance to the second pair of reference indices */
   1.421 +
   1.422 +                AV_WN32A(&mrs->ref_cache[list][scan8[ 8]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
   1.423 +                AV_WN32A(&mrs->ref_cache[list][scan8[10]], (pack16to32(ref2frm[list][ref[0]],ref2frm[list][ref[1]])&0x00FF00FF)*0x0101);
   1.424 +            }
   1.425 +        }
   1.426 +    }
   1.427 +
   1.428 +    /*
   1.429 +    0 . T T. T T T T
   1.430 +    1 L . .L . . . .
   1.431 +    2 L . .L . . . .
   1.432 +    3 . T TL . . . .
   1.433 +    4 L . .L . . . .
   1.434 +    5 L . .. . . . .
   1.435 +    */
   1.436 +
   1.437 +    if (IS_SKIP(mb_type)){
   1.438 +        memset(mrs->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
   1.439 +    }
   1.440 +
   1.441 +    //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
   1.442 +    top_type  = mrs->top_type;
   1.443 +    left_type = mrs->left_type;
   1.444 +    if(top_type){ /* copy four counts, starting at entry 3*4, from the macroblock above */
   1.445 +        AV_COPY32(&mrs->non_zero_count_cache[4+8*0], &m_top->non_zero_count[3*4]);
   1.446 +    }
   1.447 +
   1.448 +    if(left_type){ /* copy entries 3, 7, 11 and 15 from the macroblock to the left */
   1.449 +        mrs->non_zero_count_cache[3+8*1]= m_left->non_zero_count[3+0*4];
   1.450 +        mrs->non_zero_count_cache[3+8*2]= m_left->non_zero_count[3+1*4];
   1.451 +        mrs->non_zero_count_cache[3+8*3]= m_left->non_zero_count[3+2*4];
   1.452 +        mrs->non_zero_count_cache[3+8*4]= m_left->non_zero_count[3+3*4];
   1.453 +    }
   1.454 +
   1.455 +    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
   1.456 +        int list;
   1.457 +        for(list=0; list<s->list_count; list++){
   1.458 +            if(USES_LIST(top_type, list)){ /* load the neighbour row above the block: four mvs and two reference indices */
   1.459 +                const int b_xy= 4*mb_x + 3*mrc->b_stride;
   1.460 +                const int b8_x= 4*mb_x + 2;
   1.461 +                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
   1.462 +                AV_COPY128(mrs->mv_cache[list][scan8[0] + 0 - 1*8], mrs->motion_val_top[list][b_xy + 0]);
   1.463 +
   1.464 +                mrs->ref_cache[list][scan8[0] + 0 - 1*8]=
   1.465 +                mrs->ref_cache[list][scan8[0] + 1 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 0]];
   1.466 +                mrs->ref_cache[list][scan8[0] + 2 - 1*8]=
   1.467 +                mrs->ref_cache[list][scan8[0] + 3 - 1*8]= ref2frm[list][mrs->ref_index_top[list][b8_x + 1]];
   1.468 +            }else{
   1.469 +                AV_ZERO128(mrs->mv_cache[list][scan8[0] + 0 - 1*8]);
   1.470 +                AV_WN32A(&mrs->ref_cache[list][scan8[0] + 0 - 1*8], ((LIST_NOT_USED)&0xFF)*0x01010101u);
   1.471 +            }
   1.472 +
   1.473 +            if(USES_LIST(left_type, list)){ /* load the neighbour column left of the block: four mvs and two reference indices */
   1.474 +                const int b_x = 4*(mb_x-1) + 3;
   1.475 +                const int b8_x= 4*(mb_x-1) + 1;
   1.476 +                int (*ref2frm)[64] = (void *) (s->ref2frm[0] +  2);
   1.477 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 0 ], mrs->motion_val[list][b_x + mrc->b_stride*0]);
   1.478 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 + 8 ], mrs->motion_val[list][b_x + mrc->b_stride*1]);
   1.479 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +16 ], mrs->motion_val[list][b_x + mrc->b_stride*2]);
   1.480 +                AV_COPY32(mrs->mv_cache[list][scan8[0] - 1 +24 ], mrs->motion_val[list][b_x + mrc->b_stride*3]);
   1.481 +
   1.482 +                mrs->ref_cache[list][scan8[0] - 1 + 0 ]=
   1.483 +                mrs->ref_cache[list][scan8[0] - 1 + 8 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*0]];
   1.484 +                mrs->ref_cache[list][scan8[0] - 1 +16 ]=
   1.485 +                mrs->ref_cache[list][scan8[0] - 1 +24 ]= ref2frm[list][mrs->ref_index[list][b8_x + 2*1]];
   1.486 +
   1.487 +            }else{
   1.488 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 0 ]);
   1.489 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 + 8 ]);
   1.490 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +16 ]);
   1.491 +                AV_ZERO32(mrs->mv_cache [list][scan8[0] - 1 +24 ]);
   1.492 +
   1.493 +                mrs->ref_cache[list][scan8[0] - 1 + 0  ]=
   1.494 +                mrs->ref_cache[list][scan8[0] - 1 + 8  ]=
   1.495 +                mrs->ref_cache[list][scan8[0] - 1 + 16 ]=
   1.496 +                mrs->ref_cache[list][scan8[0] - 1 + 24 ]= LIST_NOT_USED;
   1.497 +            }
   1.498 +        }
   1.499 +    }
   1.500 +    return 1;
   1.501 +}
   1.502 +/* Deblock one macroblock: load the filter caches, compute bS for both directions, then filter dir 0 followed by dir 1 on the three planes img_y / img_cb / img_cr. Nothing is filtered when fill_filter_caches() returns 0. */
   1.503 +void ff_h264_filter_mb(MBRecContext *mrc, MBRecState *mrs, H264Slice *s, H264Mb *m, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr) {
   1.504 +    if (fill_filter_caches(mrc, mrs, s, m, m->mb_type)){
   1.505 +        calc_bS_values(mrc, mrs, s, m, 4, 0); /* mvy_limit is fixed at 4 for both directions */
   1.506 +        calc_bS_values(mrc, mrs, s, m, 4, 1);
   1.507 +        filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 0);
   1.508 +        filter_mb_dir(mrc, mrs, s, m, img_y, img_cb, img_cr, 1);
   1.509 +    }
   1.510 +}