diff libavcodec/h264_cell.c @ 2:897f711a7157

rearrange to work with autoconf
author Nina Engelhardt <nengel@mailbox.tu-berlin.de>
date Tue, 25 Sep 2012 15:55:33 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/libavcodec/h264_cell.c	Tue Sep 25 15:55:33 2012 +0200
     1.3 @@ -0,0 +1,1242 @@
     1.4 +
     1.5 +#include "h264_types.h"
     1.6 +#include "h264_parser.h"
     1.7 +#include "h264_nal.h"
     1.8 +#include "h264_entropy.h"
     1.9 +#include "h264_rec.h"
    1.10 +#include "h264_misc.h"
    1.11 +#include "cell/h264_types_spu.h"
    1.12 +#include "h264_pthread.h"
    1.13 +
    1.14 +#include <pthread.h>
    1.15 +#include <assert.h>
    1.16 +#include <unistd.h>
    1.17 +
    1.18 +#include <libspe2.h>
    1.19 +#include <ppu_intrinsics.h>
    1.20 +#include <cbe_mfc.h>
    1.21 +#include <libsync.h>
    1.22 +
    1.23 +// spe global variables
    1.24 +unsigned rl_cnt_var, rl_mutex_var, rl_cond_var;
    1.25 +atomic_ea_t rl_cnt;
    1.26 +cond_ea_t rl_cond;
    1.27 +mutex_ea_t rl_lock;
    1.28 +
    1.29 +H264spe * spe_params;
    1.30 +unsigned mutex_var[16];
    1.31 +unsigned cond_var[16];
    1.32 +unsigned atomic_var[16];
    1.33 +
    1.34 +pthread_t * spe_tid;
    1.35 +spe_context_ptr_t *spe_context;
    1.36 +void** spe_control_area;
    1.37 +void** spe_ls_area;
    1.38 +H264slice **spe_slice_buf;
    1.39 +
    1.40 +H264spe * spe_ed_params;
    1.41 +unsigned mutex_ed_var[16];
    1.42 +unsigned cond_ed_var[16];
    1.43 +unsigned atomic_ed_var[16];
    1.44 +
    1.45 +pthread_t * spe_ed_tid;
    1.46 +spe_context_ptr_t *spe_ed_context;
    1.47 +void** spe_ed_control_area;
    1.48 +void** spe_ed_ls_area;
    1.49 +EDSlice_spu **spe_ed_slice_buf;
    1.50 +
    1.51 +//structs to propagate stop signal
    1.52 +MBSlice last_slice;
    1.53 +EDSlice last_ed_slice;
    1.54 +DecodedPicture last_pic;
    1.55 +RawFrame last_frm;
    1.56 +
    1.57 +static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){
    1.58 +    int i;
    1.59 +    int cnt = *poc_cnt;
    1.60 +    for(i=0; i<cnt; i++){
    1.61 +        if (poc_list[i]==s->ref_list[1][0]->poc){
    1.62 +            *poc_cnt=i+1;
    1.63 +            while(++i<cnt)
    1.64 +                poc_list[i]=0;
    1.65 +            return 1;
    1.66 +        }
    1.67 +    }
    1.68 +    return 0;
    1.69 +}
    1.70 +
    /*
     * Insert poc into poc_list, which holds *poc_cnt entries in descending order,
     * and increment *poc_cnt.
     *
     * The new value goes in front of the first entry that is not greater than it;
     * the following entries are shifted up by one. The caller must provide room
     * for at least *poc_cnt + 1 entries.
     *
     * The scan is bounded by the current count, so entries beyond *poc_cnt are
     * never inspected (the unbounded scan could run past the live entries, and
     * past the array, whenever all live entries were greater than poc).
     */
    static void update_IP_poc_list(int *poc_list, int *poc_cnt, int poc) {
        const int cnt = *poc_cnt;
        int pos = 0;

        while (pos < cnt && poc_list[pos] > poc)
            pos++;

        // open a gap at pos unless the value is appended at the end
        if (pos < cnt)
            memmove(&poc_list[pos+1], &poc_list[pos], (cnt-pos)*sizeof(int));

        poc_list[pos] = poc;
        (*poc_cnt)++;
    }
    1.82 +
    1.83 +static void *spe_ed_thread(void *arg){
    1.84 +    H264spe *params = (H264spe *)arg;
    1.85 +    unsigned int idx = params->idx;
    1.86 +    unsigned int runflags = 0;
    1.87 +    unsigned int entry = SPE_DEFAULT_ENTRY;
    1.88 +    // run SPE context
    1.89 +    spe_context_run(spe_ed_context[idx],  &entry, runflags, (void*) params, NULL, NULL);
    1.90 +    // done - now exit thread
    1.91 +    pthread_exit(NULL);
    1.92 +}
    1.93 +
    1.94 +static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) {
    1.95 +    int i;
    1.96 +    int num_threads = ip_threads+b_threads;
    1.97 +    spe_program_handle_t * spe_program = spe_image_open("spe_ed");
    1.98 +    // reserve memory for spe thread id, context and argument addresses
    1.99 +    spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t));
   1.100 +    spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
   1.101 +    spe_ed_params = av_malloc(num_threads * sizeof (H264spe));
   1.102 +    spe_ed_control_area = av_malloc(num_threads * sizeof (void*));
   1.103 +    spe_ed_ls_area = av_malloc(num_threads * sizeof (void*));
   1.104 +    spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*));
   1.105 +
   1.106 +    if (spe_program == NULL)
   1.107 +        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
   1.108 +
   1.109 +    for (i = 0; i < num_threads; i++) {
   1.110 +        // create context for spe program
   1.111 +        spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL);
   1.112 +        if (spe_ed_context[i] == NULL)
   1.113 +            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
   1.114 +        // load SPE program into main memory
   1.115 +        if ((spe_program_load(spe_ed_context[i], spe_program)) == -1)
   1.116 +            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
   1.117 +        //get the control_area for fast mailboxing
   1.118 +        if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL)
   1.119 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
   1.120 +        //get ls area for inter spe communication
   1.121 +        if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL)
   1.122 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
   1.123 +    }
   1.124 +
   1.125 +    for (i = 0; i < ip_threads; i++) {
   1.126 +        spe_ed_params[i].mb_width = h->mb_width;
   1.127 +        spe_ed_params[i].mb_stride = h->mb_stride;
   1.128 +        spe_ed_params[i].mb_height = h->mb_height;
   1.129 +        spe_ed_params[i].type = EDIP;
   1.130 +        spe_ed_params[i].spe_id = i;
   1.131 +        spe_ed_params[i].idx = i;
   1.132 +        //spe_ed_params[i].spe_total = ip_threads; //not used
   1.133 +        //spe_params[i].slice_params= &slice_params;
   1.134 +        spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
   1.135 +        spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads];
   1.136 +
   1.137 +        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
   1.138 +        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
   1.139 +        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
   1.140 +
   1.141 +        mutex_init(spe_ed_params[i].lock);
   1.142 +        cond_init(spe_ed_params[i].cond);
   1.143 +        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
   1.144 +            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
   1.145 +
   1.146 +        //slicebufaddr
   1.147 +        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
   1.148 +        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
   1.149 +    }
   1.150 +    for (int j = 0; j < b_threads; j++) {
   1.151 +        i = j+ip_threads;
   1.152 +        spe_ed_params[i].mb_width = h->mb_width;
   1.153 +        spe_ed_params[i].mb_stride = h->mb_stride;
   1.154 +        spe_ed_params[i].mb_height = h->mb_height;
   1.155 +        spe_ed_params[i].type = EDB;
   1.156 +        spe_ed_params[i].idx = i;
   1.157 +        spe_ed_params[i].spe_id = j;
   1.158 +        spe_ed_params[i].spe_total = b_threads;
   1.159 +        //spe_params[i].slice_params= &slice_params;
   1.160 +        //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads];
   1.161 +        spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads];
   1.162 +
   1.163 +        spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i];
   1.164 +        spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i];
   1.165 +        spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0);
   1.166 +
   1.167 +        mutex_init(spe_ed_params[i].lock);
   1.168 +        cond_init(spe_ed_params[i].cond);
   1.169 +        if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i]))
   1.170 +            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
   1.171 +
   1.172 +        //slicebufaddr
   1.173 +        spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]);
   1.174 +        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
   1.175 +    }
   1.176 +    spe_image_close(spe_program);
   1.177 +
   1.178 +}
   1.179 +
   1.180 +static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){
   1.181 +    dst->pps 	= src->pps;
   1.182 +    dst->mbs 	= src->mbs;
   1.183 +    dst->state 	= src->state;
   1.184 +    dst->qp_thresh = src->qp_thresh;
   1.185 +    dst->pic	= *src->current_picture;
   1.186 +
   1.187 +    dst->ref_count[0] = src->ref_count[0];
   1.188 +    dst->ref_count[1] = src->ref_count[1];
   1.189 +    dst->slice_type	  = src->slice_type;
   1.190 +    dst->slice_type_nos = src->slice_type_nos;
   1.191 +    dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag;
   1.192 +    dst->list_count = src->list_count;
   1.193 +    dst->coded_pic_num = src->coded_pic_num;
   1.194 +
   1.195 +    GetBitContext *gb = &src->gb;
   1.196 +    align_get_bits( gb);
   1.197 +    dst->bytestream_start = gb->buffer + get_bits_count(gb)/8;
   1.198 +    dst->byte_bufsize = (get_bits_left(gb) + 7)/8;
   1.199 +
   1.200 +    dst->transform_bypass = src->transform_bypass;
   1.201 +    dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred;
   1.202 +    memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int));
   1.203 +    memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int));
   1.204 +    dst->cabac_init_idc = src->cabac_init_idc;
   1.205 +    memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int));
   1.206 +    dst->chroma_qp[0]= src->chroma_qp[0];
   1.207 +    dst->chroma_qp[1]= src->chroma_qp[1];
   1.208 +    dst->qscale = src->qscale;
   1.209 +    dst->last_qscale_diff = src->last_qscale_diff;
   1.210 +
   1.211 +    if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0];
   1.212 +}
   1.213 +
   1.214 +static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){
   1.215 +    unsigned status;
   1.216 +
   1.217 +    spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0);
   1.218 +    spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, SPE_TAG_ALL, &status);
   1.219 +
   1.220 +
   1.221 +    _spe_in_mbox_write(spe_ed_control_area[id], 0);
   1.222 +
   1.223 +    while (!spe_out_mbox_status(spe_ed_context[id])){
   1.224 +        //pthread_yield();
   1.225 +        usleep(1000);
   1.226 +    }
   1.227 +    _spe_out_mbox_read(spe_ed_control_area[id]);
   1.228 +}
   1.229 +
   1.230 +static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){
   1.231 +    int i,j;
   1.232 +
   1.233 +    if( !s->pps.cabac ){
   1.234 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
   1.235 +        return -1;
   1.236 +    }
   1.237 +    DECLARE_ALIGNED(16, EDSlice_spu, slice);
   1.238 +    fill_EDSlice_spu(&slice, s);
   1.239 +
   1.240 +    send_slice_to_spe_and_wait(&slice, id);
   1.241 +
   1.242 +    return 0;
   1.243 +}
   1.244 +
   1.245 +static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){
   1.246 +    int i,j;
   1.247 +
   1.248 +    if( !s->pps.cabac ){
   1.249 +        av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n");
   1.250 +        return -1;
   1.251 +    }
   1.252 +    DECLARE_ALIGNED(16, EDSlice_spu, slice);
   1.253 +    fill_EDSlice_spu(&slice, s);
   1.254 +
   1.255 +    send_slice_to_spe_and_wait(&slice, 0);
   1.256 +    
   1.257 +    if (s->release_cnt>0) {
   1.258 +        for (int i=0; i<s->release_cnt; i++){
   1.259 +            release_pib_entry(h, s->release_ref[i], 2);
   1.260 +        }
   1.261 +        s->release_cnt=0;
   1.262 +    }
   1.263 +
   1.264 +    release_pib_entry(h, s->current_picture, 1);
   1.265 +    av_freep(&s->gb.raw);
   1.266 +    if (s->gb.rbsp)
   1.267 +        av_freep(&s->gb.rbsp);
   1.268 +
   1.269 +    return 0;
   1.270 +}
   1.271 +
   1.272 +static void *entr_IP_spe_thread(void *arg){
   1.273 +    EDThreadContext *eip = (EDThreadContext *) arg;
   1.274 +    H264Context *h = eip->h;
   1.275 +// 	printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid));
   1.276 +    for (int i=0; i<SLICE_BUFS; i++){
   1.277 +        eip->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb));
   1.278 +    }
   1.279 +
   1.280 +    EntropyContext *ec = get_entropy_context(h);
   1.281 +    EDSlice *s;
   1.282 +
   1.283 +    for(;;){
   1.284 +        {
   1.285 +            pthread_mutex_lock(&eip->ed_lock);
   1.286 +            while (eip->ed_cnt <= 0)
   1.287 +                pthread_cond_wait(&eip->ed_cond, &eip->ed_lock);
   1.288 +            s = &eip->ed_q[eip->ed_fo];
   1.289 +            eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT;
   1.290 +            pthread_mutex_unlock(&eip->ed_lock);
   1.291 +        }
   1.292 +
   1.293 +        if (s->state<0)
   1.294 +            break;
   1.295 +        {
   1.296 +            pthread_mutex_lock(&eip->mbs_lock);
   1.297 +            while (eip->mbs_cnt <= 0)
   1.298 +                pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock);
   1.299 +
   1.300 +            s->mbs = eip->mbs[eip->mbs_fo];
   1.301 +            s->ed = eip;
   1.302 +            eip->mbs_cnt--;
   1.303 +            eip->mbs_fo++; eip->mbs_fo%=SLICE_BUFS;
   1.304 +            pthread_mutex_unlock(&eip->mbs_lock);
   1.305 +        }
   1.306 +        if (eip->cell){
   1.307 +            decode_slice_entropy_cell(ec, s, eip->thread_num);
   1.308 +        }else{
   1.309 +            decode_slice_entropy(ec, s);
   1.310 +        }
   1.311 +
   1.312 +//         {
   1.313 +//             pthread_mutex_lock(&h->lock[ENTROPY2]);
   1.314 +//             h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc;
   1.315 +//             while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT)
   1.316 +//                 h->ed_poc_fo++;
   1.317 +//
   1.318 +//             pthread_cond_signal(&h->cond[ENTROPY2]);
   1.319 +//             pthread_mutex_unlock(&h->lock[ENTROPY2]);
   1.320 +//         }
   1.321 +
   1.322 +        {
   1.323 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
   1.324 +            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
   1.325 +                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
   1.326 +            h->ed_reorder_q[h->ed_reorder_fi] = *s;
   1.327 +            h->ed_reorder_cnt++;
   1.328 +            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
   1.329 +            pthread_cond_signal(&h->cond[ENTROPY4]);
   1.330 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
   1.331 +        }
   1.332 +
   1.333 +        {
   1.334 +            pthread_mutex_lock(&eip->ed_lock);
   1.335 +            eip->ed_cnt--;
   1.336 +            pthread_cond_signal(&eip->ed_cond);
   1.337 +            pthread_mutex_unlock(&eip->ed_lock);
   1.338 +        }
   1.339 +    }
   1.340 +
   1.341 +    free_entropy_context(ec);
   1.342 +
   1.343 +    pthread_exit(NULL);
   1.344 +    return NULL;
   1.345 +}
   1.346 +
   1.347 +static void *entr_B_spe_thread(void *arg){
   1.348 +    EDThreadContext *eb = (EDThreadContext *) arg;
   1.349 +    H264Context *h = eb->h;
   1.350 +// 	printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid));
   1.351 +    for (int i=0; i<SLICE_BUFS; i++){
   1.352 +        eb->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb));
   1.353 +    }
   1.354 +
   1.355 +    EntropyContext *ec = get_entropy_context(h);
   1.356 +    EDSlice *s;
   1.357 +
   1.358 +    for(;;){
   1.359 +        {
   1.360 +            pthread_mutex_lock(&eb->ed_lock);
   1.361 +            while (eb->ed_cnt <= 0)
   1.362 +                pthread_cond_wait(&eb->ed_cond, &eb->ed_lock);
   1.363 +            s = &eb->ed_q[eb->ed_fo];
   1.364 +            eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT;
   1.365 +            pthread_mutex_unlock(&eb->ed_lock);
   1.366 +        }
   1.367 +
   1.368 +        if (s->state<0)
   1.369 +            break;
   1.370 +        {
   1.371 +            pthread_mutex_lock(&eb->mbs_lock);
   1.372 +            while (eb->mbs_cnt <= 0)
   1.373 +                pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock);
   1.374 +            s->mbs = eb->mbs[eb->mbs_fo];
   1.375 +            s->ed = eb;
   1.376 +            eb->mbs_cnt--;
   1.377 +            eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS;
   1.378 +            pthread_mutex_unlock(&eb->mbs_lock);
   1.379 +        }
   1.380 +        //decode_B_slice_entropy(&hcabac, &cabac, s, eb, eb->prev_ed);
   1.381 +        decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads);
   1.382 +
   1.383 +        {
   1.384 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
   1.385 +            while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
   1.386 +                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
   1.387 +            h->ed_reorder_q[h->ed_reorder_fi] = *s;
   1.388 +            h->ed_reorder_cnt++;
   1.389 +            h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
   1.390 +            pthread_cond_signal(&h->cond[ENTROPY4]);
   1.391 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
   1.392 +
   1.393 +        }
   1.394 +
   1.395 +        {
   1.396 +            pthread_mutex_lock(&eb->ed_lock);
   1.397 +            eb->ed_cnt--;
   1.398 +            pthread_cond_signal(&eb->ed_cond);
   1.399 +            pthread_mutex_unlock(&eb->ed_lock);
   1.400 +        }
   1.401 +    }
   1.402 +    eb->lines_cnt++;
   1.403 +
   1.404 +    free_entropy_context(ec);
   1.405 +
   1.406 +    pthread_exit(NULL);
   1.407 +    return NULL;
   1.408 +}
   1.409 +
   1.410 +static void *entr_B_distribute(void *arg){
   1.411 +    H264Context *h = (H264Context *) arg;
   1.412 +    EDSlice *s;
   1.413 +
   1.414 +    int i, n=0, poc;
   1.415 +
   1.416 +// 	printf("eb dist, pid %d\n", syscall(SYS_gettid));
   1.417 +
   1.418 +    for(i=0; i<h->edb_threads; i++){
   1.419 +        h->b[i].h =h;
   1.420 +        h->b[i].thread_num =i;
   1.421 +        h->b[i].thread_total =h->edb_threads;
   1.422 +        pthread_mutex_init(&h->b[i].mbs_lock, NULL);
   1.423 +        pthread_cond_init(&h->b[i].mbs_cond, NULL);
   1.424 +        h->b[i].mbs_fo = 0;
   1.425 +        h->b[i].mbs_cnt = SLICE_BUFS;
   1.426 +        h->b[i].ed_fi =0;
   1.427 +        h->b[i].ed_fo =0;
   1.428 +        h->b[i].ed_cnt =0;
   1.429 +        h->b[i].lines_cnt =0;
   1.430 +        h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads];
   1.431 +        pthread_mutex_init(&h->b[i].ed_lock, NULL);
   1.432 +        pthread_cond_init(&h->b[i].ed_cond, NULL);
   1.433 +        pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]);
   1.434 +    }
   1.435 +
   1.436 +    for(;;){
   1.437 +        {
   1.438 +            pthread_mutex_lock(&h->lock[ENTROPY3B]);
   1.439 +            while (h->ed_B_cnt<=0)
   1.440 +                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
   1.441 +            s= &h->ed_B_q[h->ed_B_fo];
   1.442 +            h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT;
   1.443 +            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
   1.444 +
   1.445 +        }
   1.446 +        if (s->state<0)
   1.447 +            break;
   1.448 +
   1.449 +        if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){
   1.450 +            while (poc < s->ref_list[1][0]->poc){
   1.451 +                pthread_mutex_lock(&h->lock[ENTROPY2]);
   1.452 +                while (poc == h->ed_poc)
   1.453 +                    pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]);
   1.454 +                poc = h->ed_poc;
   1.455 +                pthread_mutex_unlock(&h->lock[ENTROPY2]);
   1.456 +            }
   1.457 +        }
   1.458 +        {
   1.459 +            pthread_mutex_lock(&h->b[n].ed_lock);
   1.460 +            while (h->b[n].ed_cnt >= MAX_SLICE_COUNT)
   1.461 +                pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock);
   1.462 +            h->b[n].ed_q[ h->b[n].ed_fi] = *s;
   1.463 +            h->b[n].ed_cnt++;
   1.464 +            h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT;
   1.465 +            pthread_cond_signal(&h->b[n].ed_cond);
   1.466 +            pthread_mutex_unlock(&h->b[n].ed_lock);
   1.467 +
   1.468 +            n++; n%=h->edb_threads;
   1.469 +        }
   1.470 +        {
   1.471 +            pthread_mutex_lock(&h->lock[ENTROPY3B]);
   1.472 +            h->ed_B_cnt--;
   1.473 +            pthread_cond_signal(&h->cond[ENTROPY3B]);
   1.474 +            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
   1.475 +
   1.476 +        }
   1.477 +
   1.478 +    }
   1.479 +
   1.480 +    for (i=0; i<h->edb_threads; i++){
   1.481 +        pthread_mutex_lock(&h->b[i].ed_lock);
   1.482 +        while (h->b[i].ed_cnt >= MAX_SLICE_COUNT)
   1.483 +            pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock);
   1.484 +        h->b[i].ed_q[ h->b[i].ed_fi] = *s;
   1.485 +        h->b[i].ed_cnt++;
   1.486 +        h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT;
   1.487 +        pthread_cond_signal(&h->b[i].ed_cond);
   1.488 +        pthread_mutex_unlock(&h->b[i].ed_lock);
   1.489 +
   1.490 +    }
   1.491 +    for(int i=0; i<h->edb_threads; i++){
   1.492 +        pthread_join(h->ed_B_thr[i], NULL);
   1.493 +    }
   1.494 +    pthread_exit(NULL);
   1.495 +    return NULL;
   1.496 +}
   1.497 +
   1.498 +
   1.499 +static void *entr_IPB_distribute(void *arg){
   1.500 +    H264Context *h = (H264Context *) arg;
   1.501 +    EDSlice *s;
   1.502 +    int i,n=0;
   1.503 +
   1.504 +    create_spe_ED_threads(h, h->edip_threads, h->edb_threads);
   1.505 +    pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h);
   1.506 +    for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
   1.507 +        h->ip[i].h =h;
   1.508 +        h->ip[i].cell = (i >= h->edip_ppe_threads);
   1.509 +        pthread_mutex_init(&h->ip[i].mbs_lock, NULL);
   1.510 +        pthread_cond_init(&h->ip[i].mbs_cond, NULL);
   1.511 +        h->ip[i].thread_num = i - h->edip_ppe_threads;
   1.512 +        h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads;
   1.513 +        h->ip[i].mbs_fo = 0;
   1.514 +        h->ip[i].mbs_cnt = SLICE_BUFS;
   1.515 +        h->ip[i].ed_fi =0;
   1.516 +        h->ip[i].ed_fo =0;
   1.517 +        pthread_mutex_init(&h->ip[i].ed_lock, NULL);
   1.518 +        pthread_cond_init(&h->ip[i].ed_cond, NULL);
   1.519 +        pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]);
   1.520 +    }
   1.521 +
   1.522 +    for(;;){
   1.523 +        {
   1.524 +            pthread_mutex_lock(&h->lock[ENTROPY]);
   1.525 +            while (h->ed_cnt<=0)
   1.526 +                pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]);
   1.527 +            s= &h->ed_q[h->ed_fo];
   1.528 +
   1.529 +            pthread_mutex_unlock(&h->lock[ENTROPY]);
   1.530 +            h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT;
   1.531 +        }
   1.532 +        if (s->state<0)
   1.533 +            break;
   1.534 +
   1.535 +        assert(s->current_picture);
   1.536 +        if (s->slice_type_nos == FF_B_TYPE )
   1.537 +        {
   1.538 +            pthread_mutex_lock(&h->lock[ENTROPY3B]);
   1.539 +            while (h->ed_B_cnt>=MAX_SLICE_COUNT)
   1.540 +                pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
   1.541 +            h->ed_B_q[h->ed_B_fi] = *s;
   1.542 +            h->ed_B_cnt++;
   1.543 +            h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
   1.544 +            pthread_cond_signal(&h->cond[ENTROPY3B]);
   1.545 +            pthread_mutex_unlock(&h->lock[ENTROPY3B]);
   1.546 +        }else
   1.547 +        {
   1.548 +            ///round robin now, change to based on rawframes size.
   1.549 +            pthread_mutex_lock(&h->ip[n].ed_lock);
   1.550 +            while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT)
   1.551 +                pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock);
   1.552 +            h->ip[n].ed_q[ h->ip[n].ed_fi] = *s;
   1.553 +            h->ip[n].ed_cnt++;
   1.554 +            h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT;
   1.555 +            pthread_cond_signal(&h->ip[n].ed_cond);
   1.556 +            pthread_mutex_unlock(&h->ip[n].ed_lock);
   1.557 +
   1.558 +            n++; n %=(h->edip_threads+h->edip_ppe_threads);
   1.559 +        }
   1.560 +        {
   1.561 +            pthread_mutex_lock(&h->lock[ENTROPY]);
   1.562 +            h->ed_cnt--;
   1.563 +            pthread_cond_signal(&h->cond[ENTROPY]);
   1.564 +            pthread_mutex_unlock(&h->lock[ENTROPY]);
   1.565 +
   1.566 +        }
   1.567 +    }
   1.568 +
   1.569 +    {
   1.570 +        pthread_mutex_lock(&h->lock[ENTROPY3B]);
   1.571 +        while (h->ed_B_cnt>=MAX_SLICE_COUNT)
   1.572 +            pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]);
   1.573 +        h->ed_B_q[h->ed_B_fi] = *s;
   1.574 +        h->ed_B_cnt++;
   1.575 +        h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT;
   1.576 +        pthread_cond_signal(&h->cond[ENTROPY3B]);
   1.577 +        pthread_mutex_unlock(&h->lock[ENTROPY3B]);
   1.578 +    }
   1.579 +    {
   1.580 +        for (i=0; i<h->edip_threads + h->edip_ppe_threads; i++){
   1.581 +            pthread_mutex_lock(&h->ip[i].ed_lock);
   1.582 +            while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT)
   1.583 +                pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock);
   1.584 +            h->ip[i].ed_q[ h->ip[i].ed_fi] = *s;
   1.585 +            h->ip[i].ed_cnt++;
   1.586 +            h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT;
   1.587 +            pthread_cond_signal(&h->ip[i].ed_cond);
   1.588 +            pthread_mutex_unlock(&h->ip[i].ed_lock);
   1.589 +        }
   1.590 +    }
   1.591 +    {
   1.592 +        pthread_mutex_lock(&h->lock[ENTROPY4]);
   1.593 +        while (h->ed_reorder_cnt>=MAX_SLICE_COUNT)
   1.594 +            pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
   1.595 +        h->ed_reorder_q[h->ed_reorder_fi] = *s;
   1.596 +        h->ed_reorder_cnt++;
   1.597 +        h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT;
   1.598 +        pthread_cond_signal(&h->cond[ENTROPY4]);
   1.599 +        pthread_mutex_unlock(&h->lock[ENTROPY4]);
   1.600 +
   1.601 +    }
   1.602 +    pthread_join(h->ed_B_dist, NULL);
   1.603 +    for(i=0; i<h->edip_threads; i++){
   1.604 +        pthread_join(h->ed_IP_thr[i], NULL);
   1.605 +    }
   1.606 +    pthread_exit(NULL);
   1.607 +    return NULL;
   1.608 +}
   1.609 +
   1.610 +static pthread_t ed_IPB_dist;
   1.611 +static void *entropy_IPB_cell_thread(void *arg){
   1.612 +    H264Context *h = (H264Context *) arg;
   1.613 +    int i;
   1.614 +    EDSlice reorder[MAX_SLICE_COUNT];
   1.615 +    int ip_poc[MAX_SLICE_COUNT][2]={0,};
   1.616 +    int next_ip_id=0;
   1.617 +    int ip_poc_cnt=0;
   1.618 +    EDSlice *s;
   1.619 +    int reorder_cnt=0;
   1.620 +    unsigned next_pic_num=0;
   1.621 +
   1.622 +    pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h);
   1.623 +    int count =0;
   1.624 +    for(;;){
   1.625 +        //signals received from the entropy decoders
   1.626 +        {
   1.627 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
   1.628 +            while (h->ed_reorder_cnt<=0)
   1.629 +                pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]);
   1.630 +            s= &h->ed_reorder_q[h->ed_reorder_fo];
   1.631 +            h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT;
   1.632 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
   1.633 +        }
   1.634 +
   1.635 +        if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){
   1.636 +            for (i=0; i<ip_poc_cnt; i++){
   1.637 +                if (s->ip_id < ip_poc[i][0]){
   1.638 +                    memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int));
   1.639 +                    break;
   1.640 +                }
   1.641 +            }
   1.642 +            ip_poc[i][0]= s->ip_id;
   1.643 +            ip_poc[i][1]= s->current_picture->poc;
   1.644 +            ip_poc_cnt++;
   1.645 +
   1.646 +            while (next_ip_id == ip_poc[0][0]){
   1.647 +                pthread_mutex_lock(&h->lock[ENTROPY2]);
   1.648 +                h->ed_poc = ip_poc[0][1];
   1.649 +
   1.650 +                pthread_cond_signal(&h->cond[ENTROPY2]);
   1.651 +                pthread_mutex_unlock(&h->lock[ENTROPY2]);
   1.652 +                memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int));
   1.653 +                ip_poc_cnt--;
   1.654 +                next_ip_id++;
   1.655 +            }
   1.656 +        }
   1.657 +
   1.658 +        for(i=reorder_cnt; i>0; i--){
   1.659 +            if (s->coded_pic_num < reorder[i-1].coded_pic_num)
   1.660 +                break;
   1.661 +            reorder[i]=reorder[i-1];
   1.662 +        }
   1.663 +        reorder[i]=*s;
   1.664 +
   1.665 +        while(reorder_cnt>=0){
   1.666 +            if (next_pic_num!=reorder[reorder_cnt].coded_pic_num){
   1.667 +                break;
   1.668 +            }
   1.669 +            EDSlice *es = &reorder[reorder_cnt];
   1.670 +
   1.671 +            {
   1.672 +                pthread_mutex_lock(&h->lock[MBDEC]);
   1.673 +                while (h->mbdec_cnt >= MAX_SLICE_COUNT)
   1.674 +                    pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
   1.675 +                copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es);
   1.676 +
   1.677 +                h->mbdec_cnt++;
   1.678 +                h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT;
   1.679 +                pthread_cond_signal(&h->cond[MBDEC]);
   1.680 +                pthread_mutex_unlock(&h->lock[MBDEC]);
   1.681 +
   1.682 +            }
   1.683 +
   1.684 +            if (es->state<0)
   1.685 +                goto end;
   1.686 +
   1.687 +            assert(es->current_picture);
   1.688 +            for (int i=0; i<es->release_cnt; i++){
   1.689 +                release_pib_entry(h, es->release_ref[i], 2);
   1.690 +            }
   1.691 +            release_pib_entry(h, es->current_picture, 1);
   1.692 +            av_freep(&es->gb.raw);
   1.693 +            if (es->gb.rbsp)
   1.694 +                av_freep(&es->gb.rbsp);
   1.695 +
   1.696 +            next_pic_num++;
   1.697 +            reorder_cnt--;
   1.698 +        }
   1.699 +        reorder_cnt++;
   1.700 +
   1.701 +        {
   1.702 +            pthread_mutex_lock(&h->lock[ENTROPY4]);
   1.703 +            h->ed_reorder_cnt--;
   1.704 +            pthread_cond_signal(&h->cond[ENTROPY4]);
   1.705 +            pthread_mutex_unlock(&h->lock[ENTROPY4]);
   1.706 +        }
   1.707 +    }
   1.708 +
   1.709 +end:
   1.710 +    pthread_join(ed_IPB_dist, NULL);
   1.711 +    pthread_exit(NULL);
   1.712 +    return NULL;
   1.713 +}
   1.714 +
   1.715 +
   1.716 +static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){
   1.717 +    dst->deblocking_filter =1;
   1.718 +    dst->linesize = src->current_picture->linesize[0];
   1.719 +    dst->uvlinesize = src->current_picture->linesize[1];
   1.720 +    dst->mb_width = h->mb_width;
   1.721 +    dst->mb_height = h->mb_height;
   1.722 +    dst->use_weight = src->use_weight;
   1.723 +    dst->use_weight_chroma = src->use_weight_chroma;
   1.724 +    dst->luma_log2_weight_denom = src->luma_log2_weight_denom;
   1.725 +    dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom;
   1.726 +
   1.727 +    //weights later
   1.728 +    memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t));
   1.729 +    memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t));
   1.730 +    memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t));
   1.731 +
   1.732 +    for(int list=0; list<2; list++){
   1.733 +        for (int i=0; i<src->ref_count[list]; i++){
   1.734 +            Picture_spu *p_dst = &dst->ref_list[list][i];
   1.735 +            DecodedPicture *p_src = src->ref_list[list][i];
   1.736 +            if (p_src){
   1.737 +                p_dst->data[0] = p_src->data[0];
   1.738 +                p_dst->data[1] = p_src->data[1];
   1.739 +                p_dst->data[2] = p_src->data[2];
   1.740 +            }
   1.741 +        }
   1.742 +    }
   1.743 +    dst->state = src->state;
   1.744 +
   1.745 +    dst->emu_edge_width  =32;
   1.746 +    dst->emu_edge_height =32;
   1.747 +    dst->slice_type = src->slice_type;
   1.748 +    dst->slice_type_nos = src->slice_type_nos;
   1.749 +    dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset;
   1.750 +    dst->slice_beta_offset = src->slice_beta_offset;
   1.751 +
   1.752 +    memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64);
   1.753 +
   1.754 +    dst->blocks = src->mbs;
   1.755 +    dst->dst_y = src->current_picture->data[0];
   1.756 +    dst->dst_cb = src->current_picture->data[1];
   1.757 +    dst->dst_cr = src->current_picture->data[2];
   1.758 +}
   1.759 +
   1.760 +static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){
   1.761 +    static int rl_fi=0;
   1.762 +
   1.763 +    DECLARE_ALIGNED(16, H264slice, spe_slice);
   1.764 +    H264spe *p=&spe_params[0];
   1.765 +    unsigned status;
   1.766 +    uint8_t *dst_y, *dst_cb, *dst_cr;
   1.767 +
   1.768 +    DecodedPicture *dp;
   1.769 +
   1.770 +    for (int i=0; i<2; i++){
   1.771 +        for(int j=0; j< s->ref_count[i]; j++){
   1.772 +            if (s->ref_list_cpn[i][j] ==-1)
   1.773 +                continue;
   1.774 +            int k;
   1.775 +            for (k=0; k<DPB_SIZE; k++){
   1.776 +                if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
   1.777 +                    s->ref_list[i][j] = &h->dpb[k];
   1.778 +                    break;
   1.779 +                }
   1.780 +            }
   1.781 +        }
   1.782 +    }
   1.783 +
   1.784 +    dp = get_dpb_entry(h);
   1.785 +    init_dpb_entry(dp, s, d->width, d->height);
   1.786 +
   1.787 +    if (h->no_mbd)
   1.788 +        return;
   1.789 +
   1.790 +
   1.791 +    fill_spe_slice(&spe_slice, s, h);
   1.792 +    spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0);
   1.793 +    spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status);
   1.794 +    rl_fi++; rl_fi %= 2;
   1.795 +
   1.796 +    _spe_in_mbox_write(spe_control_area[0], 0);
   1.797 +    while (atomic_read(rl_cnt)<=0){
   1.798 +        //pthread_yield();
   1.799 +        usleep(1000);
   1.800 +    }
   1.801 +    atomic_dec(rl_cnt);
   1.802 +
   1.803 +
   1.804 +/** This is error free, no visual artifacts, however, md5sum fails.... (WTF) **/
   1.805 +// 	memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16);
   1.806 +// 	memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8);
   1.807 +// 	memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8);
   1.808 +//
   1.809 +// 	memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16);
   1.810 +// 	memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8);
   1.811 +// 	memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8);
   1.812 +//
   1.813 +// 	decode_slice_mb_seq(d, s);
   1.814 +//
   1.815 +// 	for (int i=0; i<h->mb_height*16; i++){
   1.816 +// 		for (int j=0; j<h->width; j++){
   1.817 +// 			if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){
   1.818 +// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]);
   1.819 +// 				return;
   1.820 +// 			}
   1.821 +// 		}
   1.822 +// 	}
   1.823 +//
   1.824 +// 	for (int i=0; i<h->mb_height*8; i++){
   1.825 +// 		for (int j=0; j<h->width/2; j++){
   1.826 +// 			if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){
   1.827 +// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]);
   1.828 +// 				return;
   1.829 +// 			}
   1.830 +// 		}
   1.831 +// 	}
   1.832 +//
   1.833 +// 	for (int i=0; i<h->mb_height*8; i++){
   1.834 +// 		for (int j=0; j<h->width/2; j++){
   1.835 +// 			if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){
   1.836 +// 				printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]);
   1.837 +// 				return;
   1.838 +// 			}
   1.839 +// 		}
   1.840 +// 	}
   1.841 +
   1.842 +
   1.843 +    //printf("dst_y %p\n", dst_y);
   1.844 +
   1.845 +
   1.846 +     for (int i=0; i<s->release_cnt; i++){
   1.847 +        for(int j=0; j<DPB_SIZE; j++){
   1.848 +            if(h->dpb[j].cpn== s->release_ref_cpn[i]){
   1.849 +                release_dpb_entry(h, &h->dpb[j], 2);
   1.850 +                break;
   1.851 +            }
   1.852 +        }
   1.853 +    }
   1.854 +    s->release_cnt=0;
   1.855 +
   1.856 +}
   1.857 +
   1.858 +static void *h264_spe_thread(void * thread_args ) {
   1.859 +    H264spe *params = (H264spe *)thread_args;
   1.860 +    unsigned int spe_id = params->spe_id;
   1.861 +    unsigned int runflags = 0;
   1.862 +    unsigned int entry = SPE_DEFAULT_ENTRY;
   1.863 +    // run SPE context
   1.864 +    spe_context_run(spe_context[spe_id],  &entry, runflags, (void*) params, NULL, NULL);
   1.865 +    // done - now exit thread
   1.866 +    pthread_exit(NULL);
   1.867 +}
   1.868 +
   1.869 +static int create_spe_MBR_threads(H264Context *h, int num_threads) {
   1.870 +    int i;
   1.871 +
   1.872 +    // reserve memory for spe thread id, context and argument addresses
   1.873 +    spe_tid = av_malloc(num_threads * sizeof (pthread_t));
   1.874 +    spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t));
   1.875 +    spe_params = av_malloc(num_threads * sizeof (H264spe));
   1.876 +    spe_control_area = av_malloc(num_threads * sizeof (void*));
   1.877 +    spe_ls_area = av_malloc(num_threads * sizeof (void*));
   1.878 +    spe_slice_buf = av_malloc(num_threads * sizeof (void*));
   1.879 +
   1.880 +    spe_program_handle_t *spe_program = spe_image_open("spe_mbd");
   1.881 +
   1.882 +    if (spe_program == NULL)
   1.883 +        av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno));
   1.884 +
   1.885 +    for (i = 0; i < num_threads; i++) {
   1.886 +        // create context for spe program
   1.887 +        spe_context[i] = spe_context_create(SPE_MAP_PS, NULL);
   1.888 +        if (spe_context[i] == NULL)
   1.889 +            av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno));
   1.890 +        // load SPE program into main memory
   1.891 +        if ((spe_program_load(spe_context[i], spe_program)) == -1)
   1.892 +            av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno));
   1.893 +        //get the control_area for fast mailboxing
   1.894 +        if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL)
   1.895 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno));
   1.896 +        //get ls area for inter spe communication
   1.897 +        if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL)
   1.898 +            av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno));
   1.899 +    }
   1.900 +
   1.901 +    for (i = 0; i < num_threads; i++) {
   1.902 +        spe_params[i].mb_width = h->mb_width;
   1.903 +        spe_params[i].mb_height = h->mb_height;
   1.904 +        spe_params[i].mb_stride = h->mb_stride;
   1.905 +        spe_params[i].spe_id = i;
   1.906 +        spe_params[i].spe_total = num_threads;
   1.907 +        //spe_params[i].slice_params= &slice_params;
   1.908 +        spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads];
   1.909 +        spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads];
   1.910 +
   1.911 +        spe_params[i].rl_lock = rl_lock;
   1.912 +        spe_params[i].rl_cond = rl_cond;
   1.913 +        spe_params[i].rl_cnt = rl_cnt;
   1.914 +        spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i];
   1.915 +        spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i];
   1.916 +        spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0);
   1.917 +
   1.918 +        mutex_init(spe_params[i].lock);
   1.919 +        cond_init(spe_params[i].cond);
   1.920 +        if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i]))
   1.921 +            av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i);
   1.922 +
   1.923 +        //slicebufaddr
   1.924 +        spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]);
   1.925 +
   1.926 +        av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i);
   1.927 +    }
   1.928 +    spe_image_close(spe_program);
   1.929 +    return 0;
   1.930 +}
   1.931 +
   1.932 +//_spe_out_mbox_read(spe_control_area[i]);
   1.933 +/**
   1.934 +* joins all the spe worker threads.
   1.935 +*/
   1.936 +static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) {
   1.937 +    int i;
   1.938 +    ///just to keep coding consistency.
   1.939 +    {
   1.940 +        for (i=0; i<num_threads; i++){
   1.941 +            H264spe *p=&spe_params[i];
   1.942 +            unsigned status;
   1.943 +
   1.944 +            while (atomic_read(p->cnt)>=2) {//double buffered
   1.945 +                usleep(1000);//cond_wait(p->cond, p->lock);
   1.946 +            }
   1.947 +
   1.948 +            spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0);
   1.949 +            spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
   1.950 +            //mutex_unlock(p->lock);
   1.951 +            _spe_in_mbox_write(spe_control_area[i], 0);
   1.952 +        }
   1.953 +    }
   1.954 +
   1.955 +    for (i=0; i<num_threads; i++){
   1.956 +        pthread_join(spe_tid[i], NULL);
   1.957 +    }
   1.958 +
   1.959 +    for (i=0; i<num_threads; i++){
   1.960 +        spe_context_destroy(spe_context[i]);
   1.961 +    }
   1.962 +    atomic_inc(rl_cnt);
   1.963 +
   1.964 +    // destroy memory reserved for spe thread id, context and argument addresses
   1.965 +    av_freep(&spe_tid);
   1.966 +    av_freep(&spe_context);
   1.967 +    av_freep(&spe_params);
   1.968 +    av_freep(&spe_control_area);
   1.969 +    av_freep(&spe_slice_buf);
   1.970 +}
   1.971 +
   1.972 +
   1.973 +static void *rl_dist_thread(void *arg){
   1.974 +    int i;
   1.975 +    H264Context *h = (H264Context *) arg;
   1.976 +    MBSlice *s;
   1.977 +    DecodedPicture *dp;
   1.978 +    int rl_fi[16]={0,};
   1.979 +    DECLARE_ALIGNED(16, H264slice, spe_slice);
   1.980 +
   1.981 +    create_spe_MBR_threads(h, h->rl_threads);
   1.982 +    for(;;){
   1.983 +        {
   1.984 +            pthread_mutex_lock(&h->lock[MBDEC]);
   1.985 +            while (h->mbdec_cnt<=0)
   1.986 +                pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]);
   1.987 +            s= &h->mbdec_q[h->mbdec_fo];
   1.988 +            h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT;
   1.989 +            pthread_mutex_unlock(&h->lock[MBDEC]);
   1.990 +        }
   1.991 +
   1.992 +        if (s->state<0){
   1.993 +            break;
   1.994 +        }
   1.995 +        for (int i=0; i<2; i++){
   1.996 +            for(int j=0; j< s->ref_count[i]; j++){
   1.997 +                if (s->ref_list_cpn[i][j] ==-1)
   1.998 +                    continue;
   1.999 +                int k;
  1.1000 +                for (k=0; k<DPB_SIZE; k++){
  1.1001 +                    if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){
  1.1002 +                        s->ref_list[i][j] = &h->dpb[k];
  1.1003 +                        break;
  1.1004 +                    }
  1.1005 +                }
  1.1006 +
  1.1007 +            }
  1.1008 +        }
  1.1009 +        dp = get_dpb_entry(h);
  1.1010 +        init_dpb_entry(dp, s, h->width, h->height);
  1.1011 +        assert(s->current_picture);
  1.1012 +        {
  1.1013 +            while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
  1.1014 +                usleep(1000);
  1.1015 +            }
  1.1016 +            h->mbrel_q[h->mbrel_fi] = *s;
  1.1017 +
  1.1018 +            h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
  1.1019 +        }
  1.1020 +        {
  1.1021 +            if(h->no_mbd){
  1.1022 +                atomic_inc(rl_cnt);
  1.1023 +            }else {
  1.1024 +                fill_spe_slice(&spe_slice, s, h);
  1.1025 +                for (i=0; i<h->rl_threads; i++){
  1.1026 +                    H264spe *p=&spe_params[i];
  1.1027 +                    unsigned status;
  1.1028 +                    while (atomic_read(p->cnt)>=2){ //double buffered
  1.1029 +                        usleep(1000);
  1.1030 +                        //cond_wait(p->cond, p->lock);
  1.1031 +                    }
  1.1032 +                    spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0);
  1.1033 +                    spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status);
  1.1034 +                    rl_fi[i]++; rl_fi[i] %= 2;
  1.1035 +                    atomic_inc(p->cnt);
  1.1036 +
  1.1037 +                    _spe_in_mbox_write(spe_control_area[i], 0);
  1.1038 +                }
  1.1039 +            }
  1.1040 +        }
  1.1041 +
  1.1042 +        {
  1.1043 +            pthread_mutex_lock(&h->lock[MBDEC]);
  1.1044 +            h->mbdec_cnt--;
  1.1045 +            pthread_cond_signal(&h->cond[MBDEC]);
  1.1046 +            pthread_mutex_unlock(&h->lock[MBDEC]);
  1.1047 +        }
  1.1048 +
  1.1049 +    }
  1.1050 +
  1.1051 +    {
  1.1052 +        while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){
  1.1053 +            usleep(1000);
  1.1054 +        }
  1.1055 +        h->mbrel_q[h->mbrel_fi] = *s;
  1.1056 +
  1.1057 +        h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT;
  1.1058 +    }
  1.1059 +    spe_slice.state=-1;
  1.1060 +    join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi);
  1.1061 +    pthread_exit(NULL);
  1.1062 +    return NULL;
  1.1063 +}
  1.1064 +
  1.1065 +static void *mbdec_cell_thread(void *arg){
  1.1066 +    H264Context *h = (H264Context *) arg;
  1.1067 +
  1.1068 +    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
  1.1069 +    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
  1.1070 +    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
  1.1071 +    atomic_set(rl_cnt, 0);
  1.1072 +    mutex_init(rl_lock);
  1.1073 +    cond_init(rl_cond);
  1.1074 +// 	printf("mbdec, pid %d\n", syscall(SYS_gettid));
  1.1075 +    pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h);
  1.1076 +
  1.1077 +    for(;;){
  1.1078 +        MBSlice *s=NULL;
  1.1079 +        {
  1.1080 +            while (atomic_read(rl_cnt)<=0){
  1.1081 +                usleep(1000);
  1.1082 +            }
  1.1083 +            s= &h->mbrel_q[h->mbrel_fo];
  1.1084 +            h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT;
  1.1085 +        }
  1.1086 +
  1.1087 +        if (s->state<0)
  1.1088 +            break;
  1.1089 +
  1.1090 +        for (int i=0; i<s->release_cnt; i++){
  1.1091 +            for(int j=0; j<DPB_SIZE; j++){
  1.1092 +                if(h->dpb[j].cpn== s->release_ref_cpn[i]){
  1.1093 +                    release_dpb_entry(h, &h->dpb[j], 2);
  1.1094 +                    break;
  1.1095 +                }
  1.1096 +            }
  1.1097 +        }
  1.1098 +
  1.1099 +        {
  1.1100 +            EDThreadContext *ed = s->ed;
  1.1101 +            pthread_mutex_lock(&ed->mbs_lock);
  1.1102 +            ed->mbs_cnt++;
  1.1103 +            pthread_cond_signal(&ed->mbs_cond);
  1.1104 +            pthread_mutex_unlock(&ed->mbs_lock);
  1.1105 +        }
  1.1106 +
  1.1107 +        {
  1.1108 +            pthread_mutex_lock(&h->lock[WRITE]);
  1.1109 +            while (h->write_cnt>= DPB_SIZE)
  1.1110 +                pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
  1.1111 +            assert(s);
  1.1112 +            assert(s->current_picture);
  1.1113 +            h->write_q[h->write_fi]= s->current_picture;
  1.1114 +            h->write_cnt++;
  1.1115 +            h->write_fi++; h->write_fi %= DPB_SIZE;
  1.1116 +            pthread_cond_signal(&h->cond[WRITE]);
  1.1117 +            pthread_mutex_unlock(&h->lock[WRITE]);
  1.1118 +
  1.1119 +        }
  1.1120 +        {
  1.1121 +            atomic_dec(rl_cnt);
  1.1122 +        }
  1.1123 +
  1.1124 +    }
  1.1125 +
  1.1126 +    {//propagate exit
  1.1127 +        pthread_mutex_lock(&h->lock[WRITE]);
  1.1128 +        while (h->write_cnt>= DPB_SIZE)
  1.1129 +            pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]);
  1.1130 +        last_pic.reference = -1;
  1.1131 +        h->write_q[h->write_fi] = &last_pic;
  1.1132 +        h->write_cnt++;
  1.1133 +        h->write_fi++; h->write_fi %= DPB_SIZE;
  1.1134 +        pthread_cond_signal(&h->cond[WRITE]);
  1.1135 +        pthread_mutex_unlock(&h->lock[WRITE]);
  1.1136 +
  1.1137 +    }
  1.1138 +    pthread_join(h->rl_dist_thr, NULL);
  1.1139 +    pthread_exit(NULL);
  1.1140 +    return NULL;
  1.1141 +}
  1.1142 +
  1.1143 +/*
  1.1144 +* The following code is the main loop of the file converter
  1.1145 +*/
  1.1146 +int h264_decode_cell(H264Context *h) {
  1.1147 +
  1.1148 +    pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr;   
  1.1149 +
  1.1150 +    start_timer();
  1.1151 +
  1.1152 +    pthread_create(&read_thr, NULL, read_thread, h);
  1.1153 +    pthread_create(&parsenal_thr, NULL, parsenal_thread, h);
  1.1154 +    pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h);
  1.1155 +    pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h);
  1.1156 +    pthread_create(&write_thr, NULL, write_thread, h);
  1.1157 +
  1.1158 +    pthread_join(read_thr, NULL);
  1.1159 +    pthread_join(parsenal_thr, NULL);
  1.1160 +    pthread_join(entropy_thr, NULL);
  1.1161 +    pthread_join(mbdec_thr, NULL);
  1.1162 +    pthread_join(write_thr, NULL);
  1.1163 +
  1.1164 +    return 0;
  1.1165 +}
  1.1166 +
  1.1167 +/*
  1.1168 +* The following code is the main loop of the file converter
  1.1169 +*/
  1.1170 +int h264_decode_cell_seq(H264Context *h) {
  1.1171 +ParserContext *pc;
  1.1172 +    NalContext *nc;
  1.1173 +    EntropyContext *ec;
  1.1174 +    MBRecContext *rc;
  1.1175 +    OutputContext *oc;
  1.1176 +
  1.1177 +    RawFrame frm;
  1.1178 +    EDSlice slice, *s=&slice;
  1.1179 +    MBSlice mbslice, *s2=&mbslice;
  1.1180 +    PictureInfo *pic=NULL;
  1.1181 +    DecodedPicture *out;
  1.1182 +    int size;
  1.1183 +    int frames=0;
  1.1184 +    
  1.1185 +    pc = get_parse_context(h->ifile);
  1.1186 +    nc = get_nal_context(h->width, h->height);
  1.1187 +    ec = get_entropy_context( h );
  1.1188 +    rc = get_mbrec_context(h);
  1.1189 +    oc = get_output_context( h );
  1.1190 +
  1.1191 +    rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var;
  1.1192 +    rl_cond = (cond_ea_t) (unsigned) &rl_cond_var;
  1.1193 +    rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var;
  1.1194 +    atomic_set(rl_cnt, 0);
  1.1195 +    mutex_init(rl_lock);
  1.1196 +    cond_init(rl_cond);
  1.1197 +
  1.1198 +    memset(s, 0, sizeof(EDSlice));
  1.1199 +    ff_init_slice(nc, s);
  1.1200 +    s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb));
  1.1201 +
  1.1202 +    DecodedPicture tmp;
  1.1203 +    tmp.base[0]=0;
  1.1204 +    ///fix this when want to debug the Cell errors
  1.1205 +    //init_dpb_entry(&tmp, h->width, h->height);
  1.1206 +
  1.1207 +    create_spe_ED_threads(h, 1, 0);
  1.1208 +    create_spe_MBR_threads(h, 1);
  1.1209 +    
  1.1210 +    start_timer();
  1.1211 +
  1.1212 +    while(!pc->final_frame && frames++ < h->num_frames){
  1.1213 +
  1.1214 +        av_read_frame_internal(pc, &frm);
  1.1215 +        
  1.1216 +        PictureInfo *pic=get_pib_entry(h);
  1.1217 +        ff_alloc_picture_info(nc, s, pic);
  1.1218 +        decode_nal_units(nc, s, &frm);
  1.1219 +
  1.1220 +        copyEDtoMBSlice(s2, s);
  1.1221 +        decode_slice_entropy_cell_seq(h, ec, s);
  1.1222 +        
  1.1223 +        decode_slice_mb_seq_cell(h, rc, s2, &tmp);
  1.1224 +
  1.1225 +        out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height);
  1.1226 +        
  1.1227 +        if (out){
  1.1228 +            release_dpb_entry(h, out, 1);
  1.1229 +        }
  1.1230 +        print_report(oc->frame_number, oc->video_size, 0, h->verbose);
  1.1231 +    }
  1.1232 +    while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ;
  1.1233 +
  1.1234 +    print_report(oc->frame_number, oc->video_size, 1, h->verbose);
  1.1235 +
  1.1236 +    /* finished ! */
  1.1237 +    av_freep(&s->mbs);
  1.1238 +
  1.1239 +    free_parse_context(pc);
  1.1240 +    free_nal_context  (nc);
  1.1241 +    free_entropy_context(ec);
  1.1242 +    free_mbrec_context(rc);
  1.1243 +    free_output_context(oc);                
  1.1244 +    return 0;
  1.1245 +}