Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__H264__App
diff libavcodec/h264_cell.c @ 2:897f711a7157
rearrange to work with autoconf
| author | Nina Engelhardt <nengel@mailbox.tu-berlin.de> |
|---|---|
| date | Tue, 25 Sep 2012 15:55:33 +0200 |
| parents | |
| children |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/libavcodec/h264_cell.c Tue Sep 25 15:55:33 2012 +0200 1.3 @@ -0,0 +1,1242 @@ 1.4 + 1.5 +#include "h264_types.h" 1.6 +#include "h264_parser.h" 1.7 +#include "h264_nal.h" 1.8 +#include "h264_entropy.h" 1.9 +#include "h264_rec.h" 1.10 +#include "h264_misc.h" 1.11 +#include "cell/h264_types_spu.h" 1.12 +#include "h264_pthread.h" 1.13 + 1.14 +#include <pthread.h> 1.15 +#include <assert.h> 1.16 +#include <unistd.h> 1.17 + 1.18 +#include <libspe2.h> 1.19 +#include <ppu_intrinsics.h> 1.20 +#include <cbe_mfc.h> 1.21 +#include <libsync.h> 1.22 + 1.23 +// spe global variables 1.24 +unsigned rl_cnt_var, rl_mutex_var, rl_cond_var; 1.25 +atomic_ea_t rl_cnt; 1.26 +cond_ea_t rl_cond; 1.27 +mutex_ea_t rl_lock; 1.28 + 1.29 +H264spe * spe_params; 1.30 +unsigned mutex_var[16]; 1.31 +unsigned cond_var[16]; 1.32 +unsigned atomic_var[16]; 1.33 + 1.34 +pthread_t * spe_tid; 1.35 +spe_context_ptr_t *spe_context; 1.36 +void** spe_control_area; 1.37 +void** spe_ls_area; 1.38 +H264slice **spe_slice_buf; 1.39 + 1.40 +H264spe * spe_ed_params; 1.41 +unsigned mutex_ed_var[16]; 1.42 +unsigned cond_ed_var[16]; 1.43 +unsigned atomic_ed_var[16]; 1.44 + 1.45 +pthread_t * spe_ed_tid; 1.46 +spe_context_ptr_t *spe_ed_context; 1.47 +void** spe_ed_control_area; 1.48 +void** spe_ed_ls_area; 1.49 +EDSlice_spu **spe_ed_slice_buf; 1.50 + 1.51 +//structs to propagate stop signal 1.52 +MBSlice last_slice; 1.53 +EDSlice last_ed_slice; 1.54 +DecodedPicture last_pic; 1.55 +RawFrame last_frm; 1.56 + 1.57 +static int direct_B_resolved(EDSlice *s, int *poc_list, int *poc_cnt){ 1.58 + int i; 1.59 + int cnt = *poc_cnt; 1.60 + for(i=0; i<cnt; i++){ 1.61 + if (poc_list[i]==s->ref_list[1][0]->poc){ 1.62 + *poc_cnt=i+1; 1.63 + while(++i<cnt) 1.64 + poc_list[i]=0; 1.65 + return 1; 1.66 + } 1.67 + } 1.68 + return 0; 1.69 +} 1.70 + 1.71 +static void update_IP_poc_list(int *poc_list, int *poc_cnt, int poc) { 1.72 + int i=0; 1.73 + int cnt = *poc_cnt; 1.74 + 1.75 + while (poc_list[i] > poc) { i++;} 1.76 + if ( i< cnt) 1.77 + memmove(&poc_list[i+1], &poc_list[i], (cnt-i)*sizeof(int)); 1.78 + 1.79 + poc_list[i]=poc; 1.80 + (*poc_cnt)++; 1.81 +} 1.82 + 1.83 +static void *spe_ed_thread(void *arg){ 1.84 + H264spe *params = (H264spe *)arg; 1.85 + unsigned int idx = params->idx; 1.86 + unsigned int runflags = 0; 1.87 + unsigned int entry = SPE_DEFAULT_ENTRY; 1.88 + // run SPE context 1.89 + spe_context_run(spe_ed_context[idx], &entry, runflags, (void*) params, NULL, NULL); 1.90 + // done - now exit thread 1.91 + pthread_exit(NULL); 1.92 +} 1.93 + 1.94 +static void create_spe_ED_threads(H264Context *h, int ip_threads, int b_threads) { 1.95 + int i; 1.96 + int num_threads = ip_threads+b_threads; 1.97 + spe_program_handle_t * spe_program = spe_image_open("spe_ed"); 1.98 + // reserve memory for spe thread id, context and argument addresses 1.99 + spe_ed_tid = av_malloc(num_threads * sizeof (pthread_t)); 1.100 + spe_ed_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); 1.101 + spe_ed_params = av_malloc(num_threads * sizeof (H264spe)); 1.102 + spe_ed_control_area = av_malloc(num_threads * sizeof (void*)); 1.103 + spe_ed_ls_area = av_malloc(num_threads * sizeof (void*)); 1.104 + spe_ed_slice_buf = av_malloc(num_threads * sizeof (void*)); 1.105 + 1.106 + if (spe_program == NULL) 1.107 + av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); 1.108 + 1.109 + for (i = 0; i < num_threads; i++) { 1.110 + // create context for spe program 1.111 + spe_ed_context[i] = spe_context_create(SPE_MAP_PS, NULL); 1.112 + if (spe_ed_context[i] == NULL) 1.113 + av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); 1.114 + // load SPE program into main memory 1.115 + if ((spe_program_load(spe_ed_context[i], spe_program)) == -1) 1.116 + av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); 1.117 + //get the control_area for fast mailboxing 1.118 + if ((spe_ed_control_area[i] = spe_ps_area_get(spe_ed_context[i], SPE_CONTROL_AREA)) == NULL) 1.119 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); 1.120 + //get ls area for inter spe communication 1.121 + if ((spe_ed_ls_area[i] = spe_ls_area_get(spe_ed_context[i])) == NULL) 1.122 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno)); 1.123 + } 1.124 + 1.125 + for (i = 0; i < ip_threads; i++) { 1.126 + spe_ed_params[i].mb_width = h->mb_width; 1.127 + spe_ed_params[i].mb_stride = h->mb_stride; 1.128 + spe_ed_params[i].mb_height = h->mb_height; 1.129 + spe_ed_params[i].type = EDIP; 1.130 + spe_ed_params[i].spe_id = i; 1.131 + spe_ed_params[i].idx = i; 1.132 + //spe_ed_params[i].spe_total = ip_threads; //not used 1.133 + //spe_params[i].slice_params= &slice_params; 1.134 + spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; 1.135 + spe_ed_params[i].tgt_spe = spe_ed_ls_area[(i+1)%num_threads]; 1.136 + 1.137 + spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; 1.138 + spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; 1.139 + spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); 1.140 + 1.141 + mutex_init(spe_ed_params[i].lock); 1.142 + cond_init(spe_ed_params[i].cond); 1.143 + if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) 1.144 + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); 1.145 + 1.146 + //slicebufaddr 1.147 + spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); 1.148 + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); 1.149 + } 1.150 + for (int j = 0; j < b_threads; j++) { 1.151 + i = j+ip_threads; 1.152 + spe_ed_params[i].mb_width = h->mb_width; 1.153 + spe_ed_params[i].mb_stride = h->mb_stride; 1.154 + spe_ed_params[i].mb_height = h->mb_height; 1.155 + spe_ed_params[i].type = EDB; 1.156 + spe_ed_params[i].idx = i; 1.157 + spe_ed_params[i].spe_id = j; 1.158 + spe_ed_params[i].spe_total = b_threads; 1.159 + //spe_params[i].slice_params= &slice_params; 1.160 + //spe_ed_params[i].src_spe = spe_ed_ls_area[(i-1+num_threads)%num_threads]; 1.161 + spe_ed_params[i].tgt_spe = spe_ed_ls_area[((j+1)%b_threads) + ip_threads]; 1.162 + 1.163 + spe_ed_params[i].lock = (mutex_ea_t) (unsigned) &mutex_ed_var[i]; 1.164 + spe_ed_params[i].cond = (cond_ea_t) (unsigned) &cond_ed_var[i]; 1.165 + spe_ed_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_ed_var[i]; atomic_set(spe_ed_params[i].cnt, 0); 1.166 + 1.167 + mutex_init(spe_ed_params[i].lock); 1.168 + cond_init(spe_ed_params[i].cond); 1.169 + if (pthread_create(&spe_ed_tid[i], NULL, spe_ed_thread, (void *) &spe_ed_params[i])) 1.170 + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); 1.171 + 1.172 + //slicebufaddr 1.173 + spe_ed_slice_buf[i] = (EDSlice_spu *) _spe_out_mbox_read(spe_ed_control_area[i]); 1.174 + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); 1.175 + } 1.176 + spe_image_close(spe_program); 1.177 + 1.178 +} 1.179 + 1.180 +static void fill_EDSlice_spu(EDSlice_spu *dst, EDSlice *src){ 1.181 + dst->pps = src->pps; 1.182 + dst->mbs = src->mbs; 1.183 + dst->state = src->state; 1.184 + dst->qp_thresh = src->qp_thresh; 1.185 + dst->pic = *src->current_picture; 1.186 + 1.187 + dst->ref_count[0] = src->ref_count[0]; 1.188 + dst->ref_count[1] = src->ref_count[1]; 1.189 + dst->slice_type = src->slice_type; 1.190 + dst->slice_type_nos = src->slice_type_nos; 1.191 + dst->direct_8x8_inference_flag = src->direct_8x8_inference_flag; 1.192 + dst->list_count = src->list_count; 1.193 + dst->coded_pic_num = src->coded_pic_num; 1.194 + 1.195 + GetBitContext *gb = &src->gb; 1.196 + align_get_bits( gb); 1.197 + dst->bytestream_start = gb->buffer + get_bits_count(gb)/8; 1.198 + dst->byte_bufsize = (get_bits_left(gb) + 7)/8; 1.199 + 1.200 + dst->transform_bypass = src->transform_bypass; 1.201 + dst->direct_spatial_mv_pred = src->direct_spatial_mv_pred; 1.202 + memcpy(dst->map_col_to_list0, src->map_col_to_list0, 2*16*sizeof(int)); 1.203 + memcpy(dst->dist_scale_factor, src->dist_scale_factor, 16*sizeof(int)); 1.204 + dst->cabac_init_idc = src->cabac_init_idc; 1.205 + memcpy(dst->ref2frm, src->ref2frm, 2*64*sizeof(int)); 1.206 + dst->chroma_qp[0]= src->chroma_qp[0]; 1.207 + dst->chroma_qp[1]= src->chroma_qp[1]; 1.208 + dst->qscale = src->qscale; 1.209 + dst->last_qscale_diff = src->last_qscale_diff; 1.210 + 1.211 + if (src->slice_type_nos == FF_B_TYPE) dst->list1 = *src->ref_list[1][0]; 1.212 +} 1.213 + 1.214 +static void send_slice_to_spe_and_wait(EDSlice_spu *s, int id){ 1.215 + unsigned status; 1.216 + 1.217 + spe_mfcio_get(spe_ed_context[id], (unsigned) spe_ed_slice_buf[id], s, sizeof(EDSlice_spu), 14, 0, 0); 1.218 + spe_mfcio_tag_status_read(spe_ed_context[id], 1<<14, SPE_TAG_ALL, &status); 1.219 + 1.220 + 1.221 + _spe_in_mbox_write(spe_ed_control_area[id], 0); 1.222 + 1.223 + while (!spe_out_mbox_status(spe_ed_context[id])){ 1.224 + //pthread_yield(); 1.225 + usleep(1000); 1.226 + } 1.227 + _spe_out_mbox_read(spe_ed_control_area[id]); 1.228 +} 1.229 + 1.230 +static int decode_slice_entropy_cell(EntropyContext *ec, EDSlice *s, int id){ 1.231 + int i,j; 1.232 + 1.233 + if( !s->pps.cabac ){ 1.234 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 1.235 + return -1; 1.236 + } 1.237 + DECLARE_ALIGNED(16, EDSlice_spu, slice); 1.238 + fill_EDSlice_spu(&slice, s); 1.239 + 1.240 + send_slice_to_spe_and_wait(&slice, id); 1.241 + 1.242 + return 0; 1.243 +} 1.244 + 1.245 +static int decode_slice_entropy_cell_seq(H264Context *h, EntropyContext *ec, EDSlice *s){ 1.246 + int i,j; 1.247 + 1.248 + if( !s->pps.cabac ){ 1.249 + av_log(AV_LOG_ERROR, "Only cabac encoded streams are supported\n"); 1.250 + return -1; 1.251 + } 1.252 + DECLARE_ALIGNED(16, EDSlice_spu, slice); 1.253 + fill_EDSlice_spu(&slice, s); 1.254 + 1.255 + send_slice_to_spe_and_wait(&slice, 0); 1.256 + 1.257 + if (s->release_cnt>0) { 1.258 + for (int i=0; i<s->release_cnt; i++){ 1.259 + release_pib_entry(h, s->release_ref[i], 2); 1.260 + } 1.261 + s->release_cnt=0; 1.262 + } 1.263 + 1.264 + release_pib_entry(h, s->current_picture, 1); 1.265 + av_freep(&s->gb.raw); 1.266 + if (s->gb.rbsp) 1.267 + av_freep(&s->gb.rbsp); 1.268 + 1.269 + return 0; 1.270 +} 1.271 + 1.272 +static void *entr_IP_spe_thread(void *arg){ 1.273 + EDThreadContext *eip = (EDThreadContext *) arg; 1.274 + H264Context *h = eip->h; 1.275 +// printf("eip %d, pid %d\n", eip->thread_num, syscall(SYS_gettid)); 1.276 + for (int i=0; i<SLICE_BUFS; i++){ 1.277 + eip->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); 1.278 + } 1.279 + 1.280 + EntropyContext *ec = get_entropy_context(h); 1.281 + EDSlice *s; 1.282 + 1.283 + for(;;){ 1.284 + { 1.285 + pthread_mutex_lock(&eip->ed_lock); 1.286 + while (eip->ed_cnt <= 0) 1.287 + pthread_cond_wait(&eip->ed_cond, &eip->ed_lock); 1.288 + s = &eip->ed_q[eip->ed_fo]; 1.289 + eip->ed_fo++; eip->ed_fo %= MAX_SLICE_COUNT; 1.290 + pthread_mutex_unlock(&eip->ed_lock); 1.291 + } 1.292 + 1.293 + if (s->state<0) 1.294 + break; 1.295 + { 1.296 + pthread_mutex_lock(&eip->mbs_lock); 1.297 + while (eip->mbs_cnt <= 0) 1.298 + pthread_cond_wait(&eip->mbs_cond, &eip->mbs_lock); 1.299 + 1.300 + s->mbs = eip->mbs[eip->mbs_fo]; 1.301 + s->ed = eip; 1.302 + eip->mbs_cnt--; 1.303 + eip->mbs_fo++; eip->mbs_fo%=SLICE_BUFS; 1.304 + pthread_mutex_unlock(&eip->mbs_lock); 1.305 + } 1.306 + if (eip->cell){ 1.307 + decode_slice_entropy_cell(ec, s, eip->thread_num); 1.308 + }else{ 1.309 + decode_slice_entropy(ec, s); 1.310 + } 1.311 + 1.312 +// { 1.313 +// pthread_mutex_lock(&h->lock[ENTROPY2]); 1.314 +// h->ed_poc[h->ed_poc_fi++ % MAX_SLICE_COUNT] = s->current_picture->poc; 1.315 +// while (h->ed_poc_fi > h->ed_poc_fo + MAX_SLICE_COUNT) 1.316 +// h->ed_poc_fo++; 1.317 +// 1.318 +// pthread_cond_signal(&h->cond[ENTROPY2]); 1.319 +// pthread_mutex_unlock(&h->lock[ENTROPY2]); 1.320 +// } 1.321 + 1.322 + { 1.323 + pthread_mutex_lock(&h->lock[ENTROPY4]); 1.324 + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) 1.325 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 1.326 + h->ed_reorder_q[h->ed_reorder_fi] = *s; 1.327 + h->ed_reorder_cnt++; 1.328 + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; 1.329 + pthread_cond_signal(&h->cond[ENTROPY4]); 1.330 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 1.331 + } 1.332 + 1.333 + { 1.334 + pthread_mutex_lock(&eip->ed_lock); 1.335 + eip->ed_cnt--; 1.336 + pthread_cond_signal(&eip->ed_cond); 1.337 + pthread_mutex_unlock(&eip->ed_lock); 1.338 + } 1.339 + } 1.340 + 1.341 + free_entropy_context(ec); 1.342 + 1.343 + pthread_exit(NULL); 1.344 + return NULL; 1.345 +} 1.346 + 1.347 +static void *entr_B_spe_thread(void *arg){ 1.348 + EDThreadContext *eb = (EDThreadContext *) arg; 1.349 + H264Context *h = eb->h; 1.350 +// printf("eb %d, pid %d\n", eb->thread_num, syscall(SYS_gettid)); 1.351 + for (int i=0; i<SLICE_BUFS; i++){ 1.352 + eb->mbs[i] = av_malloc(h->mb_height*h->mb_width*sizeof(H264Mb)); 1.353 + } 1.354 + 1.355 + EntropyContext *ec = get_entropy_context(h); 1.356 + EDSlice *s; 1.357 + 1.358 + for(;;){ 1.359 + { 1.360 + pthread_mutex_lock(&eb->ed_lock); 1.361 + while (eb->ed_cnt <= 0) 1.362 + pthread_cond_wait(&eb->ed_cond, &eb->ed_lock); 1.363 + s = &eb->ed_q[eb->ed_fo]; 1.364 + eb->ed_fo++; eb->ed_fo %= MAX_SLICE_COUNT; 1.365 + pthread_mutex_unlock(&eb->ed_lock); 1.366 + } 1.367 + 1.368 + if (s->state<0) 1.369 + break; 1.370 + { 1.371 + pthread_mutex_lock(&eb->mbs_lock); 1.372 + while (eb->mbs_cnt <= 0) 1.373 + pthread_cond_wait(&eb->mbs_cond, &eb->mbs_lock); 1.374 + s->mbs = eb->mbs[eb->mbs_fo]; 1.375 + s->ed = eb; 1.376 + eb->mbs_cnt--; 1.377 + eb->mbs_fo++; eb->mbs_fo%=SLICE_BUFS; 1.378 + pthread_mutex_unlock(&eb->mbs_lock); 1.379 + } 1.380 + //decode_B_slice_entropy(&hcabac, &cabac, s, eb, eb->prev_ed); 1.381 + decode_slice_entropy_cell(ec, s, eb->thread_num + h->edip_threads); 1.382 + 1.383 + { 1.384 + pthread_mutex_lock(&h->lock[ENTROPY4]); 1.385 + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) 1.386 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 1.387 + h->ed_reorder_q[h->ed_reorder_fi] = *s; 1.388 + h->ed_reorder_cnt++; 1.389 + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; 1.390 + pthread_cond_signal(&h->cond[ENTROPY4]); 1.391 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 1.392 + 1.393 + } 1.394 + 1.395 + { 1.396 + pthread_mutex_lock(&eb->ed_lock); 1.397 + eb->ed_cnt--; 1.398 + pthread_cond_signal(&eb->ed_cond); 1.399 + pthread_mutex_unlock(&eb->ed_lock); 1.400 + } 1.401 + } 1.402 + eb->lines_cnt++; 1.403 + 1.404 + free_entropy_context(ec); 1.405 + 1.406 + pthread_exit(NULL); 1.407 + return NULL; 1.408 +} 1.409 + 1.410 +static void *entr_B_distribute(void *arg){ 1.411 + H264Context *h = (H264Context *) arg; 1.412 + EDSlice *s; 1.413 + 1.414 + int i, n=0, poc; 1.415 + 1.416 +// printf("eb dist, pid %d\n", syscall(SYS_gettid)); 1.417 + 1.418 + for(i=0; i<h->edb_threads; i++){ 1.419 + h->b[i].h =h; 1.420 + h->b[i].thread_num =i; 1.421 + h->b[i].thread_total =h->edb_threads; 1.422 + pthread_mutex_init(&h->b[i].mbs_lock, NULL); 1.423 + pthread_cond_init(&h->b[i].mbs_cond, NULL); 1.424 + h->b[i].mbs_fo = 0; 1.425 + h->b[i].mbs_cnt = SLICE_BUFS; 1.426 + h->b[i].ed_fi =0; 1.427 + h->b[i].ed_fo =0; 1.428 + h->b[i].ed_cnt =0; 1.429 + h->b[i].lines_cnt =0; 1.430 + h->b[i].prev_ed = &h->b[(i-1 +h->edb_threads) % h->edb_threads]; 1.431 + pthread_mutex_init(&h->b[i].ed_lock, NULL); 1.432 + pthread_cond_init(&h->b[i].ed_cond, NULL); 1.433 + pthread_create(&h->ed_B_thr[i], NULL, entr_B_spe_thread, &h->b[i]); 1.434 + } 1.435 + 1.436 + for(;;){ 1.437 + { 1.438 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 1.439 + while (h->ed_B_cnt<=0) 1.440 + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); 1.441 + s= &h->ed_B_q[h->ed_B_fo]; 1.442 + h->ed_B_fo++; h->ed_B_fo %= MAX_SLICE_COUNT; 1.443 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 1.444 + 1.445 + } 1.446 + if (s->state<0) 1.447 + break; 1.448 + 1.449 + if (s->ref_list[1][0]->slice_type_nos != FF_B_TYPE){ 1.450 + while (poc < s->ref_list[1][0]->poc){ 1.451 + pthread_mutex_lock(&h->lock[ENTROPY2]); 1.452 + while (poc == h->ed_poc) 1.453 + pthread_cond_wait(&h->cond[ENTROPY2], &h->lock[ENTROPY2]); 1.454 + poc = h->ed_poc; 1.455 + pthread_mutex_unlock(&h->lock[ENTROPY2]); 1.456 + } 1.457 + } 1.458 + { 1.459 + pthread_mutex_lock(&h->b[n].ed_lock); 1.460 + while (h->b[n].ed_cnt >= MAX_SLICE_COUNT) 1.461 + pthread_cond_wait(&h->b[n].ed_cond, &h->b[n].ed_lock); 1.462 + h->b[n].ed_q[ h->b[n].ed_fi] = *s; 1.463 + h->b[n].ed_cnt++; 1.464 + h->b[n].ed_fi++; h->b[n].ed_fi %= MAX_SLICE_COUNT; 1.465 + pthread_cond_signal(&h->b[n].ed_cond); 1.466 + pthread_mutex_unlock(&h->b[n].ed_lock); 1.467 + 1.468 + n++; n%=h->edb_threads; 1.469 + } 1.470 + { 1.471 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 1.472 + h->ed_B_cnt--; 1.473 + pthread_cond_signal(&h->cond[ENTROPY3B]); 1.474 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 1.475 + 1.476 + } 1.477 + 1.478 + } 1.479 + 1.480 + for (i=0; i<h->edb_threads; i++){ 1.481 + pthread_mutex_lock(&h->b[i].ed_lock); 1.482 + while (h->b[i].ed_cnt >= MAX_SLICE_COUNT) 1.483 + pthread_cond_wait(&h->b[i].ed_cond, &h->b[i].ed_lock); 1.484 + h->b[i].ed_q[ h->b[i].ed_fi] = *s; 1.485 + h->b[i].ed_cnt++; 1.486 + h->b[i].ed_fi++; h->b[i].ed_fi %= MAX_SLICE_COUNT; 1.487 + pthread_cond_signal(&h->b[i].ed_cond); 1.488 + pthread_mutex_unlock(&h->b[i].ed_lock); 1.489 + 1.490 + } 1.491 + for(int i=0; i<h->edb_threads; i++){ 1.492 + pthread_join(h->ed_B_thr[i], NULL); 1.493 + } 1.494 + pthread_exit(NULL); 1.495 + return NULL; 1.496 +} 1.497 + 1.498 + 1.499 +static void *entr_IPB_distribute(void *arg){ 1.500 + H264Context *h = (H264Context *) arg; 1.501 + EDSlice *s; 1.502 + int i,n=0; 1.503 + 1.504 + create_spe_ED_threads(h, h->edip_threads, h->edb_threads); 1.505 + pthread_create(&h->ed_B_dist, NULL, entr_B_distribute, h); 1.506 + for(i=0; i<h->edip_threads + h->edip_ppe_threads; i++){ 1.507 + h->ip[i].h =h; 1.508 + h->ip[i].cell = (i >= h->edip_ppe_threads); 1.509 + pthread_mutex_init(&h->ip[i].mbs_lock, NULL); 1.510 + pthread_cond_init(&h->ip[i].mbs_cond, NULL); 1.511 + h->ip[i].thread_num = i - h->edip_ppe_threads; 1.512 + h->ip[i].thread_total=h->edip_threads+ h->edip_ppe_threads; 1.513 + h->ip[i].mbs_fo = 0; 1.514 + h->ip[i].mbs_cnt = SLICE_BUFS; 1.515 + h->ip[i].ed_fi =0; 1.516 + h->ip[i].ed_fo =0; 1.517 + pthread_mutex_init(&h->ip[i].ed_lock, NULL); 1.518 + pthread_cond_init(&h->ip[i].ed_cond, NULL); 1.519 + pthread_create(&h->ed_IP_thr[i], NULL, entr_IP_spe_thread, &h->ip[i]); 1.520 + } 1.521 + 1.522 + for(;;){ 1.523 + { 1.524 + pthread_mutex_lock(&h->lock[ENTROPY]); 1.525 + while (h->ed_cnt<=0) 1.526 + pthread_cond_wait(&h->cond[ENTROPY], &h->lock[ENTROPY]); 1.527 + s= &h->ed_q[h->ed_fo]; 1.528 + 1.529 + pthread_mutex_unlock(&h->lock[ENTROPY]); 1.530 + h->ed_fo++; h->ed_fo %= MAX_SLICE_COUNT; 1.531 + } 1.532 + if (s->state<0) 1.533 + break; 1.534 + 1.535 + assert(s->current_picture); 1.536 + if (s->slice_type_nos == FF_B_TYPE ) 1.537 + { 1.538 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 1.539 + while (h->ed_B_cnt>=MAX_SLICE_COUNT) 1.540 + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); 1.541 + h->ed_B_q[h->ed_B_fi] = *s; 1.542 + h->ed_B_cnt++; 1.543 + h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; 1.544 + pthread_cond_signal(&h->cond[ENTROPY3B]); 1.545 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 1.546 + }else 1.547 + { 1.548 + ///round robin now, change to based on rawframes size. 1.549 + pthread_mutex_lock(&h->ip[n].ed_lock); 1.550 + while (h->ip[n].ed_cnt >= MAX_SLICE_COUNT) 1.551 + pthread_cond_wait(&h->ip[n].ed_cond, &h->ip[n].ed_lock); 1.552 + h->ip[n].ed_q[ h->ip[n].ed_fi] = *s; 1.553 + h->ip[n].ed_cnt++; 1.554 + h->ip[n].ed_fi++; h->ip[n].ed_fi %= MAX_SLICE_COUNT; 1.555 + pthread_cond_signal(&h->ip[n].ed_cond); 1.556 + pthread_mutex_unlock(&h->ip[n].ed_lock); 1.557 + 1.558 + n++; n %=(h->edip_threads+h->edip_ppe_threads); 1.559 + } 1.560 + { 1.561 + pthread_mutex_lock(&h->lock[ENTROPY]); 1.562 + h->ed_cnt--; 1.563 + pthread_cond_signal(&h->cond[ENTROPY]); 1.564 + pthread_mutex_unlock(&h->lock[ENTROPY]); 1.565 + 1.566 + } 1.567 + } 1.568 + 1.569 + { 1.570 + pthread_mutex_lock(&h->lock[ENTROPY3B]); 1.571 + while (h->ed_B_cnt>=MAX_SLICE_COUNT) 1.572 + pthread_cond_wait(&h->cond[ENTROPY3B], &h->lock[ENTROPY3B]); 1.573 + h->ed_B_q[h->ed_B_fi] = *s; 1.574 + h->ed_B_cnt++; 1.575 + h->ed_B_fi++; h->ed_B_fi %= MAX_SLICE_COUNT; 1.576 + pthread_cond_signal(&h->cond[ENTROPY3B]); 1.577 + pthread_mutex_unlock(&h->lock[ENTROPY3B]); 1.578 + } 1.579 + { 1.580 + for (i=0; i<h->edip_threads + h->edip_ppe_threads; i++){ 1.581 + pthread_mutex_lock(&h->ip[i].ed_lock); 1.582 + while (h->ip[i].ed_cnt >= MAX_SLICE_COUNT) 1.583 + pthread_cond_wait(&h->ip[i].ed_cond, &h->ip[i].ed_lock); 1.584 + h->ip[i].ed_q[ h->ip[i].ed_fi] = *s; 1.585 + h->ip[i].ed_cnt++; 1.586 + h->ip[i].ed_fi++; h->ip[i].ed_fi %= MAX_SLICE_COUNT; 1.587 + pthread_cond_signal(&h->ip[i].ed_cond); 1.588 + pthread_mutex_unlock(&h->ip[i].ed_lock); 1.589 + } 1.590 + } 1.591 + { 1.592 + pthread_mutex_lock(&h->lock[ENTROPY4]); 1.593 + while (h->ed_reorder_cnt>=MAX_SLICE_COUNT) 1.594 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 1.595 + h->ed_reorder_q[h->ed_reorder_fi] = *s; 1.596 + h->ed_reorder_cnt++; 1.597 + h->ed_reorder_fi++; h->ed_reorder_fi %= MAX_SLICE_COUNT; 1.598 + pthread_cond_signal(&h->cond[ENTROPY4]); 1.599 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 1.600 + 1.601 + } 1.602 + pthread_join(h->ed_B_dist, NULL); 1.603 + for(i=0; i<h->edip_threads; i++){ 1.604 + pthread_join(h->ed_IP_thr[i], NULL); 1.605 + } 1.606 + pthread_exit(NULL); 1.607 + return NULL; 1.608 +} 1.609 + 1.610 +static pthread_t ed_IPB_dist; 1.611 +static void *entropy_IPB_cell_thread(void *arg){ 1.612 + H264Context *h = (H264Context *) arg; 1.613 + int i; 1.614 + EDSlice reorder[MAX_SLICE_COUNT]; 1.615 + int ip_poc[MAX_SLICE_COUNT][2]={0,}; 1.616 + int next_ip_id=0; 1.617 + int ip_poc_cnt=0; 1.618 + EDSlice *s; 1.619 + int reorder_cnt=0; 1.620 + unsigned next_pic_num=0; 1.621 + 1.622 + pthread_create(&ed_IPB_dist, NULL, entr_IPB_distribute, h); 1.623 + int count =0; 1.624 + for(;;){ 1.625 + //signals received from the entropy decoders 1.626 + { 1.627 + pthread_mutex_lock(&h->lock[ENTROPY4]); 1.628 + while (h->ed_reorder_cnt<=0) 1.629 + pthread_cond_wait(&h->cond[ENTROPY4], &h->lock[ENTROPY4]); 1.630 + s= &h->ed_reorder_q[h->ed_reorder_fo]; 1.631 + h->ed_reorder_fo++; h->ed_reorder_fo %=MAX_SLICE_COUNT; 1.632 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 1.633 + } 1.634 + 1.635 + if (s->state >=0 && s->slice_type_nos != FF_B_TYPE){ 1.636 + for (i=0; i<ip_poc_cnt; i++){ 1.637 + if (s->ip_id < ip_poc[i][0]){ 1.638 + memmove(ip_poc[i+1], ip_poc[i], 2*(ip_poc_cnt-i)*sizeof(int)); 1.639 + break; 1.640 + } 1.641 + } 1.642 + ip_poc[i][0]= s->ip_id; 1.643 + ip_poc[i][1]= s->current_picture->poc; 1.644 + ip_poc_cnt++; 1.645 + 1.646 + while (next_ip_id == ip_poc[0][0]){ 1.647 + pthread_mutex_lock(&h->lock[ENTROPY2]); 1.648 + h->ed_poc = ip_poc[0][1]; 1.649 + 1.650 + pthread_cond_signal(&h->cond[ENTROPY2]); 1.651 + pthread_mutex_unlock(&h->lock[ENTROPY2]); 1.652 + memmove(ip_poc[0], ip_poc[1], 2*(ip_poc_cnt-1)*sizeof(int)); 1.653 + ip_poc_cnt--; 1.654 + next_ip_id++; 1.655 + } 1.656 + } 1.657 + 1.658 + for(i=reorder_cnt; i>0; i--){ 1.659 + if (s->coded_pic_num < reorder[i-1].coded_pic_num) 1.660 + break; 1.661 + reorder[i]=reorder[i-1]; 1.662 + } 1.663 + reorder[i]=*s; 1.664 + 1.665 + while(reorder_cnt>=0){ 1.666 + if (next_pic_num!=reorder[reorder_cnt].coded_pic_num){ 1.667 + break; 1.668 + } 1.669 + EDSlice *es = &reorder[reorder_cnt]; 1.670 + 1.671 + { 1.672 + pthread_mutex_lock(&h->lock[MBDEC]); 1.673 + while (h->mbdec_cnt >= MAX_SLICE_COUNT) 1.674 + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); 1.675 + copyEDtoMBSlice(&h->mbdec_q[h->mbdec_fi], es); 1.676 + 1.677 + h->mbdec_cnt++; 1.678 + h->mbdec_fi++; h->mbdec_fi %= MAX_SLICE_COUNT; 1.679 + pthread_cond_signal(&h->cond[MBDEC]); 1.680 + pthread_mutex_unlock(&h->lock[MBDEC]); 1.681 + 1.682 + } 1.683 + 1.684 + if (es->state<0) 1.685 + goto end; 1.686 + 1.687 + assert(es->current_picture); 1.688 + for (int i=0; i<es->release_cnt; i++){ 1.689 + release_pib_entry(h, es->release_ref[i], 2); 1.690 + } 1.691 + release_pib_entry(h, es->current_picture, 1); 1.692 + av_freep(&es->gb.raw); 1.693 + if (es->gb.rbsp) 1.694 + av_freep(&es->gb.rbsp); 1.695 + 1.696 + next_pic_num++; 1.697 + reorder_cnt--; 1.698 + } 1.699 + reorder_cnt++; 1.700 + 1.701 + { 1.702 + pthread_mutex_lock(&h->lock[ENTROPY4]); 1.703 + h->ed_reorder_cnt--; 1.704 + pthread_cond_signal(&h->cond[ENTROPY4]); 1.705 + pthread_mutex_unlock(&h->lock[ENTROPY4]); 1.706 + } 1.707 + } 1.708 + 1.709 +end: 1.710 + pthread_join(ed_IPB_dist, NULL); 1.711 + pthread_exit(NULL); 1.712 + return NULL; 1.713 +} 1.714 + 1.715 + 1.716 +static void fill_spe_slice(H264slice *dst, const MBSlice *src, H264Context *h){ 1.717 + dst->deblocking_filter =1; 1.718 + dst->linesize = src->current_picture->linesize[0]; 1.719 + dst->uvlinesize = src->current_picture->linesize[1]; 1.720 + dst->mb_width = h->mb_width; 1.721 + dst->mb_height = h->mb_height; 1.722 + dst->use_weight = src->use_weight; 1.723 + dst->use_weight_chroma = src->use_weight_chroma; 1.724 + dst->luma_log2_weight_denom = src->luma_log2_weight_denom; 1.725 + dst->chroma_log2_weight_denom = src->chroma_log2_weight_denom; 1.726 + 1.727 + //weights later 1.728 + memcpy(dst->luma_weight, src->luma_weight, 16*2*2*sizeof(int16_t)); 1.729 + memcpy(dst->chroma_weight, src->chroma_weight, 16*2*2*2*sizeof(int16_t)); 1.730 + memcpy(dst->implicit_weight, src->implicit_weight, 16*16*2*sizeof(int16_t)); 1.731 + 1.732 + for(int list=0; list<2; list++){ 1.733 + for (int i=0; i<src->ref_count[list]; i++){ 1.734 + Picture_spu *p_dst = &dst->ref_list[list][i]; 1.735 + DecodedPicture *p_src = src->ref_list[list][i]; 1.736 + if (p_src){ 1.737 + p_dst->data[0] = p_src->data[0]; 1.738 + p_dst->data[1] = p_src->data[1]; 1.739 + p_dst->data[2] = p_src->data[2]; 1.740 + } 1.741 + } 1.742 + } 1.743 + dst->state = src->state; 1.744 + 1.745 + dst->emu_edge_width =32; 1.746 + dst->emu_edge_height =32; 1.747 + dst->slice_type = src->slice_type; 1.748 + dst->slice_type_nos = src->slice_type_nos; 1.749 + dst->slice_alpha_c0_offset = src->slice_alpha_c0_offset; 1.750 + dst->slice_beta_offset = src->slice_beta_offset; 1.751 + 1.752 + memcpy(dst->chroma_qp_table, src->pps.chroma_qp_table, 2*64); 1.753 + 1.754 + dst->blocks = src->mbs; 1.755 + dst->dst_y = src->current_picture->data[0]; 1.756 + dst->dst_cb = src->current_picture->data[1]; 1.757 + dst->dst_cr = src->current_picture->data[2]; 1.758 +} 1.759 + 1.760 +static void decode_slice_mb_seq_cell(H264Context *h, MBRecContext *d, MBSlice *s, DecodedPicture *tmp){ 1.761 + static int rl_fi=0; 1.762 + 1.763 + DECLARE_ALIGNED(16, H264slice, spe_slice); 1.764 + H264spe *p=&spe_params[0]; 1.765 + unsigned status; 1.766 + uint8_t *dst_y, *dst_cb, *dst_cr; 1.767 + 1.768 + DecodedPicture *dp; 1.769 + 1.770 + for (int i=0; i<2; i++){ 1.771 + for(int j=0; j< s->ref_count[i]; j++){ 1.772 + if (s->ref_list_cpn[i][j] ==-1) 1.773 + continue; 1.774 + int k; 1.775 + for (k=0; k<DPB_SIZE; k++){ 1.776 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ 1.777 + s->ref_list[i][j] = &h->dpb[k]; 1.778 + break; 1.779 + } 1.780 + } 1.781 + } 1.782 + } 1.783 + 1.784 + dp = get_dpb_entry(h); 1.785 + init_dpb_entry(dp, s, d->width, d->height); 1.786 + 1.787 + if (h->no_mbd) 1.788 + return; 1.789 + 1.790 + 1.791 + fill_spe_slice(&spe_slice, s, h); 1.792 + spe_mfcio_get(spe_context[0], (unsigned) (spe_slice_buf[0] + rl_fi), &spe_slice, sizeof(H264slice), 15, 0, 0); 1.793 + spe_mfcio_tag_status_read(spe_context[0], 1<<15, SPE_TAG_ALL, &status); 1.794 + rl_fi++; rl_fi %= 2; 1.795 + 1.796 + _spe_in_mbox_write(spe_control_area[0], 0); 1.797 + while (atomic_read(rl_cnt)<=0){ 1.798 + //pthread_yield(); 1.799 + usleep(1000); 1.800 + } 1.801 + atomic_dec(rl_cnt); 1.802 + 1.803 + 1.804 +/** This is error free, no visual artifacts, however, md5sum fails.... (WTF) **/ 1.805 +// memcpy(tmp->data[0], s->current_picture->data[0], tmp->linesize[0]*h->mb_height*16); 1.806 +// memcpy(tmp->data[1], s->current_picture->data[1], tmp->linesize[1]*h->mb_height*8); 1.807 +// memcpy(tmp->data[2], s->current_picture->data[2], tmp->linesize[1]*h->mb_height*8); 1.808 +// 1.809 +// memset(s->current_picture->data[0], 0, tmp->linesize[0]*h->mb_height*16); 1.810 +// memset(s->current_picture->data[1], 0, tmp->linesize[1]*h->mb_height*8); 1.811 +// memset(s->current_picture->data[2], 0, tmp->linesize[1]*h->mb_height*8); 1.812 +// 1.813 +// decode_slice_mb_seq(d, s); 1.814 +// 1.815 +// for (int i=0; i<h->mb_height*16; i++){ 1.816 +// for (int j=0; j<h->width; j++){ 1.817 +// if (tmp->data[0][j + i*tmp->linesize[0]] != s->current_picture->data[0][j + i*tmp->linesize[0]]){ 1.818 +// printf("%d, %d, %d, %d\n", j, i, tmp->data[0][j + i*tmp->linesize[0]], s->current_picture->data[0][j + i*tmp->linesize[0]]); 1.819 +// return; 1.820 +// } 1.821 +// } 1.822 +// } 1.823 +// 1.824 +// for (int i=0; i<h->mb_height*8; i++){ 1.825 +// for (int j=0; j<h->width/2; j++){ 1.826 +// if (tmp->data[1][j + i*tmp->linesize[1]] != s->current_picture->data[1][j + i*tmp->linesize[1]]){ 1.827 +// printf("%d, %d, %d, %d\n", j, i, tmp->data[1][j + i*tmp->linesize[1]], s->current_picture->data[1][j + i*tmp->linesize[1]]); 1.828 +// return; 1.829 +// } 1.830 +// } 1.831 +// } 1.832 +// 1.833 +// for (int i=0; i<h->mb_height*8; i++){ 1.834 +// for (int j=0; j<h->width/2; j++){ 1.835 +// if (tmp->data[2][j + i*tmp->linesize[1]] != s->current_picture->data[2][j + i*tmp->linesize[1]]){ 1.836 +// printf("%d, %d, %d, %d\n", j, i, tmp->data[2][j + i*tmp->linesize[1]], s->current_picture->data[2][j + i*tmp->linesize[1]]); 1.837 +// return; 1.838 +// } 1.839 +// } 1.840 +// } 1.841 + 1.842 + 1.843 + //printf("dst_y %p\n", dst_y); 1.844 + 1.845 + 1.846 + for (int i=0; i<s->release_cnt; i++){ 1.847 + for(int j=0; j<DPB_SIZE; j++){ 1.848 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 1.849 + release_dpb_entry(h, &h->dpb[j], 2); 1.850 + break; 1.851 + } 1.852 + } 1.853 + } 1.854 + s->release_cnt=0; 1.855 + 1.856 +} 1.857 + 1.858 +static void *h264_spe_thread(void * thread_args ) { 1.859 + H264spe *params = (H264spe *)thread_args; 1.860 + unsigned int spe_id = params->spe_id; 1.861 + unsigned int runflags = 0; 1.862 + unsigned int entry = SPE_DEFAULT_ENTRY; 1.863 + // run SPE context 1.864 + spe_context_run(spe_context[spe_id], &entry, runflags, (void*) params, NULL, NULL); 1.865 + // done - now exit thread 1.866 + pthread_exit(NULL); 1.867 +} 1.868 + 1.869 +static int create_spe_MBR_threads(H264Context *h, int num_threads) { 1.870 + int i; 1.871 + 1.872 + // reserve memory for spe thread id, context and argument addresses 1.873 + spe_tid = av_malloc(num_threads * sizeof (pthread_t)); 1.874 + spe_context = av_malloc(num_threads * sizeof (spe_context_ptr_t)); 1.875 + spe_params = av_malloc(num_threads * sizeof (H264spe)); 1.876 + spe_control_area = av_malloc(num_threads * sizeof (void*)); 1.877 + spe_ls_area = av_malloc(num_threads * sizeof (void*)); 1.878 + spe_slice_buf = av_malloc(num_threads * sizeof (void*)); 1.879 + 1.880 + spe_program_handle_t *spe_program = spe_image_open("spe_mbd"); 1.881 + 1.882 + if (spe_program == NULL) 1.883 + av_log(AV_LOG_ERROR, "PPE: error opening SPE object image:%d. error=%s \n", errno, strerror(errno)); 1.884 + 1.885 + for (i = 0; i < num_threads; i++) { 1.886 + // create context for spe program 1.887 + spe_context[i] = spe_context_create(SPE_MAP_PS, NULL); 1.888 + if (spe_context[i] == NULL) 1.889 + av_log(AV_LOG_ERROR, "PPE: error creating SPE context:%d. error=%s \n", errno, strerror(errno)); 1.890 + // load SPE program into main memory 1.891 + if ((spe_program_load(spe_context[i], spe_program)) == -1) 1.892 + av_log(AV_LOG_ERROR, "PPE: error loading SPE context:%d. error=%s \n", errno, strerror(errno)); 1.893 + //get the control_area for fast mailboxing 1.894 + if ((spe_control_area[i] = spe_ps_area_get(spe_context[i], SPE_CONTROL_AREA)) == NULL) 1.895 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE control area:%d. error=%s \n", errno, strerror(errno)); 1.896 + //get ls area for inter spe communication 1.897 + if ((spe_ls_area[i] = spe_ls_area_get(spe_context[i])) == NULL) 1.898 + av_log(AV_LOG_ERROR, "PPE: error retrieving SPE ls area:%d. error=%s \n", errno, strerror(errno)); 1.899 + } 1.900 + 1.901 + for (i = 0; i < num_threads; i++) { 1.902 + spe_params[i].mb_width = h->mb_width; 1.903 + spe_params[i].mb_height = h->mb_height; 1.904 + spe_params[i].mb_stride = h->mb_stride; 1.905 + spe_params[i].spe_id = i; 1.906 + spe_params[i].spe_total = num_threads; 1.907 + //spe_params[i].slice_params= &slice_params; 1.908 + spe_params[i].src_spe = spe_ls_area[(i-1+num_threads)%num_threads]; 1.909 + spe_params[i].tgt_spe = spe_ls_area[(i+1)%num_threads]; 1.910 + 1.911 + spe_params[i].rl_lock = rl_lock; 1.912 + spe_params[i].rl_cond = rl_cond; 1.913 + spe_params[i].rl_cnt = rl_cnt; 1.914 + spe_params[i].lock = (mutex_ea_t) (unsigned) &mutex_var[i]; 1.915 + spe_params[i].cond = (cond_ea_t) (unsigned) &cond_var[i]; 1.916 + spe_params[i].cnt = (atomic_ea_t)(unsigned) &atomic_var[i]; atomic_set(spe_params[i].cnt, 0); 1.917 + 1.918 + mutex_init(spe_params[i].lock); 1.919 + cond_init(spe_params[i].cond); 1.920 + if (pthread_create(&spe_tid[i], NULL, h264_spe_thread, (void *) &spe_params[i])) 1.921 + av_log(AV_LOG_ERROR, "create_workers: pthread create for spe failed %d\n", i); 1.922 + 1.923 + //slicebufaddr 1.924 + spe_slice_buf[i] = (H264slice *) _spe_out_mbox_read(spe_control_area[i]); 1.925 + 1.926 + av_log(AV_LOG_DEBUG, "create_workers: created spe thread %d\n", i); 1.927 + } 1.928 + spe_image_close(spe_program); 1.929 + return 0; 1.930 +} 1.931 + 1.932 +//_spe_out_mbox_read(spe_control_area[i]); 1.933 +/** 1.934 +* joins all the spe worker threads. 1.935 +*/ 1.936 +static void join_spe_worker_threads(H264slice *s, int num_threads, int *rl_fi) { 1.937 + int i; 1.938 + ///just to keep coding consistency. 1.939 + { 1.940 + for (i=0; i<num_threads; i++){ 1.941 + H264spe *p=&spe_params[i]; 1.942 + unsigned status; 1.943 + 1.944 + while (atomic_read(p->cnt)>=2) {//double buffered 1.945 + usleep(1000);//cond_wait(p->cond, p->lock); 1.946 + } 1.947 + 1.948 + spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), s, sizeof(H264slice), 15, 0, 0); 1.949 + spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); 1.950 + //mutex_unlock(p->lock); 1.951 + _spe_in_mbox_write(spe_control_area[i], 0); 1.952 + } 1.953 + } 1.954 + 1.955 + for (i=0; i<num_threads; i++){ 1.956 + pthread_join(spe_tid[i], NULL); 1.957 + } 1.958 + 1.959 + for (i=0; i<num_threads; i++){ 1.960 + spe_context_destroy(spe_context[i]); 1.961 + } 1.962 + atomic_inc(rl_cnt); 1.963 + 1.964 + // destroy memory reserved for spe thread id, context and argument addresses 1.965 + av_freep(&spe_tid); 1.966 + av_freep(&spe_context); 1.967 + av_freep(&spe_params); 1.968 + av_freep(&spe_control_area); 1.969 + av_freep(&spe_slice_buf); 1.970 +} 1.971 + 1.972 + 1.973 +static void *rl_dist_thread(void *arg){ 1.974 + int i; 1.975 + H264Context *h = (H264Context *) arg; 1.976 + MBSlice *s; 1.977 + DecodedPicture *dp; 1.978 + int rl_fi[16]={0,}; 1.979 + DECLARE_ALIGNED(16, H264slice, spe_slice); 1.980 + 1.981 + create_spe_MBR_threads(h, h->rl_threads); 1.982 + for(;;){ 1.983 + { 1.984 + pthread_mutex_lock(&h->lock[MBDEC]); 1.985 + while (h->mbdec_cnt<=0) 1.986 + pthread_cond_wait(&h->cond[MBDEC], &h->lock[MBDEC]); 1.987 + s= &h->mbdec_q[h->mbdec_fo]; 1.988 + h->mbdec_fo++; h->mbdec_fo %= MAX_SLICE_COUNT; 1.989 + pthread_mutex_unlock(&h->lock[MBDEC]); 1.990 + } 1.991 + 1.992 + if (s->state<0){ 1.993 + break; 1.994 + } 1.995 + for (int i=0; i<2; i++){ 1.996 + for(int j=0; j< s->ref_count[i]; j++){ 1.997 + if (s->ref_list_cpn[i][j] ==-1) 1.998 + continue; 1.999 + int k; 1.1000 + for (k=0; k<DPB_SIZE; k++){ 1.1001 + if(h->dpb[k].reference >= 2 && h->dpb[k].cpn == s->ref_list_cpn[i][j]){ 1.1002 + s->ref_list[i][j] = &h->dpb[k]; 1.1003 + break; 1.1004 + } 1.1005 + } 1.1006 + 1.1007 + } 1.1008 + } 1.1009 + dp = get_dpb_entry(h); 1.1010 + init_dpb_entry(dp, s, h->width, h->height); 1.1011 + assert(s->current_picture); 1.1012 + { 1.1013 + while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ 1.1014 + usleep(1000); 1.1015 + } 1.1016 + h->mbrel_q[h->mbrel_fi] = *s; 1.1017 + 1.1018 + h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; 1.1019 + } 1.1020 + { 1.1021 + if(h->no_mbd){ 1.1022 + atomic_inc(rl_cnt); 1.1023 + }else { 1.1024 + fill_spe_slice(&spe_slice, s, h); 1.1025 + for (i=0; i<h->rl_threads; i++){ 1.1026 + H264spe *p=&spe_params[i]; 1.1027 + unsigned status; 1.1028 + while (atomic_read(p->cnt)>=2){ //double buffered 1.1029 + usleep(1000); 1.1030 + //cond_wait(p->cond, p->lock); 1.1031 + } 1.1032 + spe_mfcio_get(spe_context[i], (unsigned) (spe_slice_buf[i] + rl_fi[i]), &spe_slice, sizeof(H264slice), 15, 0, 0); 1.1033 + spe_mfcio_tag_status_read(spe_context[i], 1<<15, SPE_TAG_ALL, &status); 1.1034 + rl_fi[i]++; rl_fi[i] %= 2; 1.1035 + atomic_inc(p->cnt); 1.1036 + 1.1037 + _spe_in_mbox_write(spe_control_area[i], 0); 1.1038 + } 1.1039 + } 1.1040 + } 1.1041 + 1.1042 + { 1.1043 + pthread_mutex_lock(&h->lock[MBDEC]); 1.1044 + h->mbdec_cnt--; 1.1045 + pthread_cond_signal(&h->cond[MBDEC]); 1.1046 + pthread_mutex_unlock(&h->lock[MBDEC]); 1.1047 + } 1.1048 + 1.1049 + } 1.1050 + 1.1051 + { 1.1052 + while (atomic_read(rl_cnt) >=MAX_SLICE_COUNT){ 1.1053 + usleep(1000); 1.1054 + } 1.1055 + h->mbrel_q[h->mbrel_fi] = *s; 1.1056 + 1.1057 + h->mbrel_fi++; h->mbrel_fi %= MAX_SLICE_COUNT; 1.1058 + } 1.1059 + spe_slice.state=-1; 1.1060 + join_spe_worker_threads(&spe_slice, h->rl_threads, rl_fi); 1.1061 + pthread_exit(NULL); 1.1062 + return NULL; 1.1063 +} 1.1064 + 1.1065 +static void *mbdec_cell_thread(void *arg){ 1.1066 + H264Context *h = (H264Context *) arg; 1.1067 + 1.1068 + rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; 1.1069 + rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; 1.1070 + rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; 1.1071 + atomic_set(rl_cnt, 0); 1.1072 + mutex_init(rl_lock); 1.1073 + cond_init(rl_cond); 1.1074 +// printf("mbdec, pid %d\n", syscall(SYS_gettid)); 1.1075 + pthread_create(&h->rl_dist_thr, NULL, rl_dist_thread, h); 1.1076 + 1.1077 + for(;;){ 1.1078 + MBSlice *s=NULL; 1.1079 + { 1.1080 + while (atomic_read(rl_cnt)<=0){ 1.1081 + usleep(1000); 1.1082 + } 1.1083 + s= &h->mbrel_q[h->mbrel_fo]; 1.1084 + h->mbrel_fo++; h->mbrel_fo %= MAX_SLICE_COUNT; 1.1085 + } 1.1086 + 1.1087 + if (s->state<0) 1.1088 + break; 1.1089 + 1.1090 + for (int i=0; i<s->release_cnt; i++){ 1.1091 + for(int j=0; j<DPB_SIZE; j++){ 1.1092 + if(h->dpb[j].cpn== s->release_ref_cpn[i]){ 1.1093 + release_dpb_entry(h, &h->dpb[j], 2); 1.1094 + break; 1.1095 + } 1.1096 + } 1.1097 + } 1.1098 + 1.1099 + { 1.1100 + EDThreadContext *ed = s->ed; 1.1101 + pthread_mutex_lock(&ed->mbs_lock); 1.1102 + ed->mbs_cnt++; 1.1103 + pthread_cond_signal(&ed->mbs_cond); 1.1104 + pthread_mutex_unlock(&ed->mbs_lock); 1.1105 + } 1.1106 + 1.1107 + { 1.1108 + pthread_mutex_lock(&h->lock[WRITE]); 1.1109 + while (h->write_cnt>= DPB_SIZE) 1.1110 + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); 1.1111 + assert(s); 1.1112 + assert(s->current_picture); 1.1113 + h->write_q[h->write_fi]= s->current_picture; 1.1114 + h->write_cnt++; 1.1115 + h->write_fi++; h->write_fi %= DPB_SIZE; 1.1116 + pthread_cond_signal(&h->cond[WRITE]); 1.1117 + pthread_mutex_unlock(&h->lock[WRITE]); 1.1118 + 1.1119 + } 1.1120 + { 1.1121 + atomic_dec(rl_cnt); 1.1122 + } 1.1123 + 1.1124 + } 1.1125 + 1.1126 + {//propagate exit 1.1127 + pthread_mutex_lock(&h->lock[WRITE]); 1.1128 + while (h->write_cnt>= DPB_SIZE) 1.1129 + pthread_cond_wait(&h->cond[WRITE], &h->lock[WRITE]); 1.1130 + last_pic.reference = -1; 1.1131 + h->write_q[h->write_fi] = &last_pic; 1.1132 + h->write_cnt++; 1.1133 + h->write_fi++; h->write_fi %= DPB_SIZE; 1.1134 + pthread_cond_signal(&h->cond[WRITE]); 1.1135 + pthread_mutex_unlock(&h->lock[WRITE]); 1.1136 + 1.1137 + } 1.1138 + pthread_join(h->rl_dist_thr, NULL); 1.1139 + pthread_exit(NULL); 1.1140 + return NULL; 1.1141 +} 1.1142 + 1.1143 +/* 1.1144 +* The following code is the main loop of the file converter 1.1145 +*/ 1.1146 +int h264_decode_cell(H264Context *h) { 1.1147 + 1.1148 + pthread_t read_thr, parsenal_thr, entropy_thr, mbdec_thr, write_thr; 1.1149 + 1.1150 + start_timer(); 1.1151 + 1.1152 + pthread_create(&read_thr, NULL, read_thread, h); 1.1153 + pthread_create(&parsenal_thr, NULL, parsenal_thread, h); 1.1154 + pthread_create(&entropy_thr, NULL, entropy_IPB_cell_thread, h); 1.1155 + pthread_create(&mbdec_thr, NULL, mbdec_cell_thread, h); 1.1156 + pthread_create(&write_thr, NULL, write_thread, h); 1.1157 + 1.1158 + pthread_join(read_thr, NULL); 1.1159 + pthread_join(parsenal_thr, NULL); 1.1160 + pthread_join(entropy_thr, NULL); 1.1161 + pthread_join(mbdec_thr, NULL); 1.1162 + pthread_join(write_thr, NULL); 1.1163 + 1.1164 + return 0; 1.1165 +} 1.1166 + 1.1167 +/* 1.1168 +* The following code is the main loop of the file converter 1.1169 +*/ 1.1170 +int h264_decode_cell_seq(H264Context *h) { 1.1171 +ParserContext *pc; 1.1172 + NalContext *nc; 1.1173 + EntropyContext *ec; 1.1174 + MBRecContext *rc; 1.1175 + OutputContext *oc; 1.1176 + 1.1177 + RawFrame frm; 1.1178 + EDSlice slice, *s=&slice; 1.1179 + MBSlice mbslice, *s2=&mbslice; 1.1180 + PictureInfo *pic=NULL; 1.1181 + DecodedPicture *out; 1.1182 + int size; 1.1183 + int frames=0; 1.1184 + 1.1185 + pc = get_parse_context(h->ifile); 1.1186 + nc = get_nal_context(h->width, h->height); 1.1187 + ec = get_entropy_context( h ); 1.1188 + rc = get_mbrec_context(h); 1.1189 + oc = get_output_context( h ); 1.1190 + 1.1191 + rl_lock = (mutex_ea_t) (unsigned) &rl_mutex_var; 1.1192 + rl_cond = (cond_ea_t) (unsigned) &rl_cond_var; 1.1193 + rl_cnt = (atomic_ea_t) (unsigned) &rl_cnt_var; 1.1194 + atomic_set(rl_cnt, 0); 1.1195 + mutex_init(rl_lock); 1.1196 + cond_init(rl_cond); 1.1197 + 1.1198 + memset(s, 0, sizeof(EDSlice)); 1.1199 + ff_init_slice(nc, s); 1.1200 + s->mbs = av_malloc( h->mb_height * h->mb_width * sizeof(H264Mb)); 1.1201 + 1.1202 + DecodedPicture tmp; 1.1203 + tmp.base[0]=0; 1.1204 + ///fix this when want to debug the Cell errors 1.1205 + //init_dpb_entry(&tmp, h->width, h->height); 1.1206 + 1.1207 + create_spe_ED_threads(h, 1, 0); 1.1208 + create_spe_MBR_threads(h, 1); 1.1209 + 1.1210 + start_timer(); 1.1211 + 1.1212 + while(!pc->final_frame && frames++ < h->num_frames){ 1.1213 + 1.1214 + av_read_frame_internal(pc, &frm); 1.1215 + 1.1216 + PictureInfo *pic=get_pib_entry(h); 1.1217 + ff_alloc_picture_info(nc, s, pic); 1.1218 + decode_nal_units(nc, s, &frm); 1.1219 + 1.1220 + copyEDtoMBSlice(s2, s); 1.1221 + decode_slice_entropy_cell_seq(h, ec, s); 1.1222 + 1.1223 + decode_slice_mb_seq_cell(h, rc, s2, &tmp); 1.1224 + 1.1225 + out =output_frame(h, oc, s2->current_picture, h->ofile, h->frame_width, h->frame_height); 1.1226 + 1.1227 + if (out){ 1.1228 + release_dpb_entry(h, out, 1); 1.1229 + } 1.1230 + print_report(oc->frame_number, oc->video_size, 0, h->verbose); 1.1231 + } 1.1232 + while ((out=output_frame(h, oc, NULL, h->ofile, h->frame_width, h->frame_height))) ; 1.1233 + 1.1234 + print_report(oc->frame_number, oc->video_size, 1, h->verbose); 1.1235 + 1.1236 + /* finished ! */ 1.1237 + av_freep(&s->mbs); 1.1238 + 1.1239 + free_parse_context(pc); 1.1240 + free_nal_context (nc); 1.1241 + free_entropy_context(ec); 1.1242 + free_mbrec_context(rc); 1.1243 + free_output_context(oc); 1.1244 + return 0; 1.1245 +}
