annotate main.c @ 20:29b273cf3b1f

Benchmark that tests msg loss
author Merten Sach <msach@mailbox.tu-berlin.de>
date Tue, 13 Mar 2012 12:25:48 +0100
parents fdc2f264f3d6
children 08b37152b48d
rev   line source
Me@4 1 /*
Me@4 2 *
Me@4 3 */
Me@4 4 #include <stdio.h>
Me@4 5 #include <stdlib.h>
Me@4 6 #include <string.h>
Me@4 7 #include <math.h>
Me@4 8 #include <ctype.h>
Me@4 9 #include <errno.h>
Me@4 10 #include <pthread.h>
msach@6 11 #include <unistd.h>
msach@19 12 #include "VMS_Implementations/Vthread_impl/VPThread.h"
msach@19 13 #include "C_Libraries/Queue_impl/PrivateQueue.h"
msach@20 14 #include "C_Libraries/DynArray/DynArray.h"
msach@20 15 #include "C_Libraries/BestEffortMessaging/LossyCom.h"
Me@4 16
msach@6 17 #include <linux/perf_event.h>
msach@6 18 #include <linux/prctl.h>
msach@6 19 #include <sys/syscall.h>
msach@6 20
Me@4 21 #undef DEBUG
Me@4 22 //#define DEBUG
Me@4 23
msach@20 24 //#define MEASURE_PERF
msach@15 25
Me@4 26 #if !defined(unix) && !defined(__unix__)
Me@4 27 #ifdef __MACH__
Me@4 28 #define unix 1
Me@4 29 #define __unix__ 1
Me@4 30 #endif /* __MACH__ */
Me@4 31 #endif /* unix */
Me@4 32
Me@4 33 /* find the appropriate way to define explicitly sized types */
Me@4 34 /* for C99 or GNU libc (also mach's libc) we can use stdint.h */
Me@4 35 #if (__STDC_VERSION__ >= 199900) || defined(__GLIBC__) || defined(__MACH__)
Me@4 36 #include <stdint.h>
Me@4 37 #elif defined(unix) || defined(__unix__) /* some UNIX systems have them in sys/types.h */
Me@4 38 #include <sys/types.h>
Me@4 39 #elif defined(__WIN32__) || defined(WIN32) /* the nameless one */
Me@4 40 typedef unsigned __int8 uint8_t;
Me@4 41 typedef unsigned __int32 uint32_t;
Me@4 42 #endif /* sized type detection */
Me@4 43
Me@4 44 /* provide a millisecond-resolution timer for each system */
Me@4 45 #if defined(unix) || defined(__unix__)
Me@4 46 #include <time.h>
Me@4 47 #include <sys/time.h>
/*
 * Milliseconds elapsed since the first call to get_msec().
 *
 * The first call records the current wall-clock time as the reference point
 * and returns 0; later calls return the distance to that reference.
 * Both time values live in static storage, so the function is not reentrant.
 */
unsigned long get_msec(void) {
    static struct timeval now, origin;

    gettimeofday(&now, 0);

    /* origin.tv_sec is still zero only before the reference was captured */
    if (origin.tv_sec != 0) {
        return (now.tv_sec - origin.tv_sec) * 1000
             + (now.tv_usec - origin.tv_usec) / 1000;
    }

    origin = now;
    return 0;
}
Me@4 58 #elif defined(__WIN32__) || defined(WIN32)
Me@4 59 #include <windows.h>
/* Windows variant: hands back the value reported by GetTickCount(). */
unsigned long get_msec(void) {
    unsigned long ticks = GetTickCount();

    return ticks;
}
Me@4 63 #else
Me@4 64 //#error "I don't know how to measure time on your platform"
Me@4 65 #endif
Me@4 66
Me@4 67 //======================== Defines =========================
kshalle@8 68 typedef struct perfData measurement_t;
kshalle@8 69 struct perfData{
kshalle@8 70 uint64 cycles;
kshalle@8 71 uint64 instructions;
kshalle@8 72 };
Me@4 73
/* Help text printed for -h and after an unrecognized argument. */
const char *usage =
    "Usage: msg_passing_test [options]\n"
    " Starts threads equal to the number of cores and sends\n messages to random receivers\n\n"
    "Options:\n"
    " -n <num> This specifies the number of sends done by each thread.\n"
    " -h this help screen\n\n";
Me@4 82
msach@20 83 /***************************
msach@20 84 * Barrier Implementation
msach@20 85 ***************************/
msach@20 86
Me@4 87 struct barrier_t
Me@4 88 {
Me@4 89 int counter;
Me@4 90 int nthreads;
Me@4 91 int32 mutex;
Me@4 92 int32 cond;
kshalle@8 93 measurement_t endBarrierCycles;
kshalle@8 94
Me@4 95 };
Me@4 96 typedef struct barrier_t barrier;
Me@4 97
Me@4 98 void inline barrier_init(barrier *barr, int nthreads, VirtProcr *animatingPr)
Me@4 99 {
Me@4 100 barr->counter = 0;
Me@4 101 barr->nthreads = nthreads;
Me@4 102 barr->mutex = VPThread__make_mutex(animatingPr);
Me@4 103 barr->cond = VPThread__make_cond(barr->mutex, animatingPr);
Me@4 104 }
Me@4 105
kshalle@8 106 int cycles_counter_main_fd;
Me@4 107 void inline barrier_wait(barrier *barr, VirtProcr *animatingPr)
Me@4 108 { int i;
Me@4 109
Me@4 110 VPThread__mutex_lock(barr->mutex, animatingPr);
Me@4 111 barr->counter++;
Me@4 112 if(barr->counter == barr->nthreads)
kshalle@8 113 {
msach@15 114 #ifdef MEASURE_PERF
kshalle@8 115 read(cycles_counter_main_fd, &(barr->endBarrierCycles.cycles), \
kshalle@8 116 sizeof(barr->endBarrierCycles.cycles));
msach@15 117 #endif
kshalle@8 118
kshalle@8 119 barr->counter = 0;
Me@4 120 for(i=0; i < barr->nthreads; i++)
Me@4 121 VPThread__cond_signal(barr->cond, animatingPr);
Me@4 122 }
Me@4 123 else
Me@4 124 { VPThread__cond_wait(barr->cond, animatingPr);
Me@4 125 }
Me@4 126 VPThread__mutex_unlock(barr->mutex, animatingPr);
Me@4 127 }
Me@4 128
kshalle@8 129
msach@20 130 /**************************
msach@20 131 * Worker Parameters
msach@20 132 **************************/
msach@9 133 typedef struct
msach@9 134 { struct barrier_t* barrier;
msach@9 135 uint64_t totalWorkCycles;
msach@9 136 uint64_t totalBadCycles;
msach@9 137 uint64_t totalSyncCycles;
msach@9 138 uint64_t totalBadSyncCycles;
msach@9 139 uint64 numGoodSyncs;
msach@9 140 uint64 numGoodTasks;
msach@20 141 uint64_t coreID;
msach@20 142 lossyCom__endpoint_t* localEndpoint;
msach@20 143 lossyCom__exchange_t* centralMsgExchange;
msach@20 144 unsigned int receivedACKs;
msach@20 145 unsigned int broadcasterStatus;
msach@20 146 unsigned int terminate;
msach@9 147 }
msach@9 148 WorkerParams;
Me@4 149
kshalle@8 150 typedef struct
kshalle@8 151 { measurement_t *startExeCycles;
kshalle@8 152 measurement_t *endExeCycles;
kshalle@8 153 }
kshalle@8 154 BenchParams;
Me@4 155
msach@20 156 typedef struct
msach@20 157 {
msach@20 158 lossyCom__endpointID_t receiverID;
msach@20 159 lossyCom__msgBody_t msg;
msach@20 160 } savedMsg_t;
msach@20 161
Me@4 162 //======================== Globals =========================
Me@4 163 char __ProgrammName[] = "overhead_test";
Me@4 164 char __DataSet[255];
Me@4 165
msach@20 166 int num_msg_to_send;
Me@4 167 size_t chunk_size = 0;
Me@4 168
msach@6 169 int cycles_counter_fd[NUM_CORES];
msach@7 170 struct perf_event_attr* hw_event;
Me@4 171
kshalle@8 172 WorkerParams *workerParamsArray;
kshalle@8 173
msach@20 174 // init random number
msach@20 175 uint32_t seed1;
msach@20 176 uint32_t seed2;
msach@20 177
Me@4 178 //======================== App Code =========================
Me@4 179 /*
Me@4 180 * Workload
Me@4 181 */
msach@6 182
/*
 * Read the cycles counter of `core` into the lvalue `cycles`.
 *
 * core   - index into cycles_counter_fd[]
 * cycles - lvalue that receives the counter value; set to 0 when read() fails
 *
 * The do/while(0) wrapper gives the temporaries their own scope and makes the
 * macro usable as a single statement. Arguments are parenthesized, the
 * temporaries carry a trailing underscore so they cannot capture caller
 * variables, and the read() result is kept in ssize_t, its actual type.
 */
#define saveCyclesAndInstrs(core,cycles) do{ \
    int cycles_fd_ = cycles_counter_fd[(core)]; \
    ssize_t nread_ = read(cycles_fd_, &(cycles), sizeof(cycles)); \
    \
    if(nread_ < 0){ \
        perror("Error reading cycles counter"); \
        (cycles) = 0; \
    } \
} while (0)
msach@6 193
/* Random number generator defined elsewhere; advances both seeds through the pointers. */
extern inline uint32_t
randomNumber(uint32_t* seed1, uint32_t* seed2);

/*
 * Reserved message bodies, derived from BROADCAST_ID.
 * The derived values are parenthesized so they stay intact when used inside
 * larger expressions (e.g. `2 * BROADCAST_ACK` or `x - TERMINATE`).
 */
#define BROADCAST      (BROADCAST_ID)
#define BROADCAST_ACK  ((BROADCAST_ID)-1)
#define TERMINATE      ((BROADCAST_ID)-2)

/* Values of WorkerParams.broadcasterStatus */
#define RECEIVING_BROADCAST 0
#define BROADCASTING        1
#define RECEIVING_ACK       2
msach@20 204
msach@20 205 /*
msach@20 206 * Message Handler Function
msach@20 207 */
msach@20 208 void msgHandler(lossyCom__endpointID_t senderID, lossyCom__msgBody_t msg, void* data)
msach@20 209 {
msach@20 210 WorkerParams* threadData = (WorkerParams*)data;
msach@20 211 lossyCom__endpoint_t* comEndpoint = threadData->localEndpoint;
msach@20 212 lossyCom__endpointID_t receiverID;
msach@20 213
msach@20 214 if(msg == BROADCAST_ID) //answer broadcast message
msach@20 215 {
msach@20 216 lossyCom__sendMsg(comEndpoint, senderID, BROADCAST_ACK);
msach@20 217 return;
msach@20 218 }
msach@20 219 if(msg == (BROADCAST_ACK) && threadData->broadcasterStatus == RECEIVING_ACK)
msach@20 220 {
msach@20 221 threadData->receivedACKs++;
msach@20 222 if(threadData->receivedACKs == NUM_CORES/2)//chose next broadcaster
msach@20 223 {
msach@20 224 do{
msach@20 225 receiverID = randomNumber(&seed1, &seed2) % NUM_CORES;
msach@20 226 }while(receiverID == comEndpoint->endpointID);
msach@20 227
msach@20 228 //send the receiverID to the receiver to notify him that he is next
msach@20 229 lossyCom__sendMsg(comEndpoint, receiverID, receiverID);
msach@20 230 threadData->broadcasterStatus = RECEIVING_BROADCAST;
msach@20 231 }
msach@20 232 return;
msach@20 233 }
msach@20 234 if(msg == TERMINATE) //termination message
msach@20 235 {
msach@20 236 printf("endpoint %d received termination request\n", comEndpoint->endpointID);
msach@20 237 threadData->terminate = TRUE;
msach@20 238 return;
msach@20 239 }
msach@20 240 //
msach@20 241 threadData->broadcasterStatus = BROADCASTING;
msach@20 242 }
msach@20 243
msach@20 244 unsigned int global_broadcast_counter;
msach@7 245
msach@9 246 double
msach@9 247 worker_TLF(void* _params, VirtProcr* animatingPr)
Me@5 248 {
msach@20 249 unsigned int msgCounter;
msach@20 250 unsigned int broadcaster;
msach@20 251 uint32_t wait_iterations;
msach@9 252 WorkerParams* params = (WorkerParams*)_params;
msach@9 253 unsigned int totalWorkCycles = 0, totalBadCycles = 0;
msach@9 254 unsigned int totalSyncCycles = 0, totalBadSyncCycles = 0;
msach@9 255 unsigned int workspace1=0, numGoodSyncs = 0, numGoodTasks = 0;
kshalle@8 256 double workspace2=0.0;
msach@20 257
msach@20 258 //core 0 always starts
msach@20 259 params->broadcasterStatus = params->coreID==0?BROADCASTING:RECEIVING_BROADCAST;
msach@20 260
msach@20 261 /*
Me@5 262 int32 privateMutex = VPThread__make_mutex(animatingPr);
msach@6 263
msach@6 264 int cpuid = sched_getcpu();
msach@9 265
msach@11 266 measurement_t startWorkload, endWorkload, startWorkload2, endWorkload2;
msach@9 267 uint64 numCycles;
msach@20 268 */
msach@15 269 #ifdef MEASURE_PERF
msach@10 270 saveCyclesAndInstrs(cpuid,startWorkload.cycles);
msach@15 271 #endif
msach@20 272
msach@20 273 //initialize endpoint for communication
msach@20 274 lossyCom__endpoint_t comEndpoint;
msach@20 275 params->localEndpoint = &comEndpoint;
msach@20 276 lossyCom__initialize_endpoint(&comEndpoint,
msach@20 277 params->centralMsgExchange,
msach@20 278 params->coreID,
msach@20 279 msgHandler,
msach@20 280 params);
msach@20 281
msach@20 282 lossyCom__endpointID_t receiverID;
msach@20 283 msgCounter = 0;
msach@20 284 while(msgCounter <= num_msg_to_send)
msach@20 285 {
msach@20 286 int i;
msach@9 287
msach@20 288 if(params->broadcasterStatus == BROADCASTING)
Me@5 289 {
msach@20 290 if(msgCounter == num_msg_to_send)//send termination msg
msach@20 291 {
msach@20 292 lossyCom__sendMsg(&comEndpoint,BROADCAST_ID, TERMINATE);
msach@20 293 break;
msach@20 294 }else{ //generate and send random message
msach@20 295 params->receivedACKs = 0;
msach@20 296 lossyCom__sendMsg(&comEndpoint, BROADCAST_ID, BROADCAST);
msach@20 297 global_broadcast_counter++;
msach@20 298 if(global_broadcast_counter % 1000 == 0){
msach@20 299 printf("broadcast count: %d\n", global_broadcast_counter);
msach@20 300 }
msach@20 301 params->broadcasterStatus = RECEIVING_ACK; //mark msg as send
msach@20 302 msgCounter++;
msach@20 303 }
Me@5 304 }
msach@20 305
msach@20 306 //check if the benchmark should terminate
msach@20 307 if(params->terminate)
msach@20 308 break;
msach@20 309
msach@20 310 //receive msg
msach@20 311 lossyCom__receiveMsg(&comEndpoint);
msach@20 312 }
msach@20 313
msach@15 314
msach@15 315 #ifdef MEASURE_PERF
msach@10 316 saveCyclesAndInstrs(cpuid,endWorkload.cycles);
msach@10 317 numCycles = endWorkload.cycles - startWorkload.cycles;
msach@9 318 //sanity check (400K is about 20K iters)
msach@9 319 if( numCycles < 400000 ) {totalWorkCycles += numCycles; numGoodTasks++;}
msach@9 320 else {totalBadCycles += numCycles; }
msach@15 321 #endif
msach@9 322
msach@20 323 barrier_wait(params->barrier, animatingPr);
Me@5 324
kshalle@8 325 params->totalWorkCycles = totalWorkCycles;
msach@9 326 params->totalBadCycles = totalBadCycles;
msach@9 327 params->numGoodTasks = numGoodTasks;
msach@9 328 params->totalSyncCycles = totalSyncCycles;
msach@9 329 params->totalBadSyncCycles = totalBadSyncCycles;
msach@9 330 params->numGoodSyncs = numGoodSyncs;
msach@9 331 /*
msach@9 332 params->totalSyncCycles = VMS__give_num_plugin_cycles();
msach@9 333 params->totalBadSyncCycles = 0;
msach@9 334 params->numGoodSyncs = VMS__give_num_plugin_animations();
msach@9 335 */
Me@5 336 //Shutdown worker
Me@5 337 VPThread__dissipate_thread(animatingPr);
msach@9 338
msach@9 339 //below return never reached --> there for gcc
msach@9 340 return (workspace1 + workspace2); //to prevent gcc from optimizing work out
Me@5 341 }
Me@4 342
kshalle@8 343
Me@4 344 /* this is run after the VMS is set up*/
kshalle@8 345 void benchmark(void *_params, VirtProcr *animatingPr)
Me@4 346 {
msach@20 347 int i, cpuID, idx;
msach@20 348 struct barrier_t barr;
kshalle@8 349 BenchParams *params;
kshalle@8 350
kshalle@8 351 params = (BenchParams *)_params;
msach@20 352
msach@20 353 barrier_init(&barr, NUM_CORES+1, animatingPr);
msach@20 354
msach@20 355 //Init central communication exchange
msach@20 356 lossyCom__exchange_t* centralMsgExchange = lossyCom__initialize(NUM_CORES);
msach@20 357
msach@6 358 //prepare input
msach@20 359 for(i=0; i<NUM_CORES; i++)
msach@6 360 {
kshalle@8 361 workerParamsArray[i].barrier = &barr;
msach@20 362 workerParamsArray[i].coreID = i;
msach@20 363 workerParamsArray[i].centralMsgExchange = centralMsgExchange;
msach@20 364 workerParamsArray[i].terminate = FALSE;
Me@4 365 }
msach@20 366 global_broadcast_counter = 0;
msach@20 367
msach@20 368 // init random number generator for wait and msg content
msach@20 369 seed1 = rand()%1000;
msach@20 370 seed2 = rand()%1000;
msach@7 371
kshalle@8 372 //save cycles before execution of threads, to get total exe cycles
kshalle@8 373 measurement_t *startExeCycles, *endExeCycles;
kshalle@8 374 startExeCycles = params->startExeCycles;
kshalle@8 375
msach@15 376 #ifdef MEASURE_PERF
kshalle@8 377 int nread = read(cycles_counter_main_fd, &(startExeCycles->cycles),
kshalle@8 378 sizeof(startExeCycles->cycles));
msach@9 379 if(nread<0) perror("Error reading cycles counter");
msach@15 380 #endif
msach@9 381
msach@9 382 //create (which starts running) all threads
msach@20 383 for(i=NUM_CORES-1; i>=0; i--)
msach@20 384 {
msach@20 385 VPThread__create_thread_with_affinity((VirtProcrFnPtr)worker_TLF,
msach@20 386 &(workerParamsArray[i]),
msach@20 387 animatingPr,
msach@20 388 i);//schedule to core i
kshalle@8 389 }
msach@6 390
msach@15 391 #ifdef MEASURE_PERF
msach@9 392 //endBarrierCycles read in barrier_wait()! Merten, email me if want to chg
kshalle@8 393 params->endExeCycles->cycles = barr.endBarrierCycles.cycles;
msach@15 394 #endif
kshalle@8 395
msach@20 396 barrier_wait(&barr, animatingPr);
msach@20 397 printf("Total broadcast count: %d\n", global_broadcast_counter);
msach@20 398
msach@20 399 //print send msgs
msach@20 400 /*
msach@20 401 printf("sendMsgs = []\n");
msach@20 402 for(i = 0; i<NUM_CORES; i++)
msach@20 403 {
msach@20 404 printf("sendMsgs.append([");
msach@20 405 for(idx = 0; idx< workerParamsArray[i].sendMsgs->numInArray; idx++)
msach@20 406 {
msach@20 407 printf("(%lu, %lu),",
msach@20 408 (uint64_t)(workerParamsArray[i].ptrToArrayOfSendMsgs[idx]) & 0xFFFFFFFF,
msach@20 409 ((uint64_t)(workerParamsArray[i].ptrToArrayOfSendMsgs[idx]) >> 32 ) & 0xFFFFFFFF);
msach@20 410 }
msach@20 411 printf("])\n");
msach@20 412 }
msach@20 413
msach@20 414
msach@20 415 //print received msgs
msach@20 416 printf("receivedMsgs = []\n");
msach@20 417 for(i = 0; i<NUM_CORES; i++)
msach@20 418 {
msach@20 419 printf("receivedMsgs.append([");
msach@20 420 for(idx = 0; idx< workerParamsArray[i].receivedMsgs->numInArray; idx++)
msach@20 421 {
msach@20 422 printf("(%lu, %lu),",
msach@20 423 (uint64_t)(workerParamsArray[i].ptrToArrayOfReceivedMsgs[idx]) & 0xFFFFFFFF,
msach@20 424 ((uint64_t)(workerParamsArray[i].ptrToArrayOfReceivedMsgs[idx]) >> 32 ) & 0xFFFFFFFF);
msach@20 425 }
msach@20 426 printf("])\n");
msach@20 427 }*/
Me@4 428
kshalle@8 429 /*
msach@6 430 uint64_t overallWorkCycles = 0;
msach@6 431 for(i=0; i<num_threads; i++){
msach@7 432 printf("WorkCycles: %lu\n",input[i].totalWorkCycles);
msach@6 433 overallWorkCycles += input[i].totalWorkCycles;
Me@4 434 }
msach@6 435
msach@6 436 printf("Sum across threads of work cycles: %lu\n", overallWorkCycles);
msach@6 437 printf("Total Execution: %lu\n", endBenchTime.cycles-startBenchTime.cycles);
kshalle@8 438 printf("Runtime/Workcycle Ratio %lu\n",
kshalle@8 439 ((endBenchTime.cycles-startBenchTime.cycles)*100)/overallWorkCycles);
kshalle@8 440 */
Me@4 441
Me@4 442 //======================================================
Me@4 443
Me@4 444 VPThread__dissipate_thread(animatingPr);
Me@4 445 }
Me@4 446
Me@4 447 int main(int argc, char **argv)
Me@4 448 {
Me@4 449 int i;
Me@4 450
Me@4 451 //set global static variables, based on cmd-line args
Me@4 452 for(i=1; i<argc; i++)
Me@4 453 {
Me@4 454 if(argv[i][0] == '-' && argv[i][2] == 0)
Me@4 455 {
Me@4 456 switch(argv[i][1])
Me@4 457 {
msach@20 458 case 'n':
Me@4 459 if(!isdigit(argv[++i][0]))
Me@4 460 {
msach@20 461 fprintf(stderr, "-t must be followed by the number messages to send per core\n");
Me@4 462 return EXIT_FAILURE;
Me@4 463 }
msach@20 464 num_msg_to_send = atoi(argv[i]);
msach@20 465 if(!num_msg_to_send)
Me@4 466 {
msach@20 467 fprintf(stderr, "invalid number of messages to send: %d\n", num_msg_to_send);
Me@4 468 return EXIT_FAILURE;
Me@4 469 }
Me@4 470 break;
Me@4 471 case 'h':
Me@4 472 fputs(usage, stdout);
msach@20 473 return 0;
Me@4 474 default:
Me@4 475 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
Me@4 476 fputs(usage, stderr);
Me@4 477 return EXIT_FAILURE;
Me@4 478 }//switch
Me@4 479 }//if arg
Me@4 480 else
Me@4 481 {
msach@20 482 fprintf(stderr, "unrecognized argument: %s\n", argv[i]);
msach@20 483 fputs(usage, stderr);
msach@20 484 return EXIT_FAILURE;
Me@4 485 }
Me@4 486 }//for
msach@7 487
kshalle@8 488
msach@15 489 #ifdef MEASURE_PERF
msach@7 490 //setup performance counters
msach@7 491 hw_event = malloc(sizeof(struct perf_event_attr));
msach@7 492 memset(hw_event,0,sizeof(struct perf_event_attr));
msach@7 493
msach@7 494 hw_event->type = PERF_TYPE_HARDWARE;
msach@7 495 hw_event->size = sizeof(hw_event);
msach@7 496 hw_event->disabled = 0;
msach@7 497 hw_event->freq = 0;
msach@7 498 hw_event->inherit = 1; /* children inherit it */
msach@7 499 hw_event->pinned = 1; /* says this virt counter must always be on HW */
msach@7 500 hw_event->exclusive = 0; /* only group on PMU */
msach@7 501 hw_event->exclude_user = 0; /* don't count user */
msach@7 502 hw_event->exclude_kernel = 1; /* don't count kernel */
msach@7 503 hw_event->exclude_hv = 1; /* ditto hypervisor */
msach@7 504 hw_event->exclude_idle = 1; /* don't count when idle */
msach@7 505 hw_event->mmap = 0; /* include mmap data */
msach@7 506 hw_event->comm = 0; /* include comm data */
msach@7 507
msach@7 508 hw_event->config = PERF_COUNT_HW_CPU_CYCLES; //cycles
msach@7 509
msach@7 510 int cpuID, retries;
msach@7 511
msach@7 512 for( cpuID = 0; cpuID < NUM_CORES; cpuID++ )
msach@7 513 { retries = 0;
msach@7 514 do
msach@7 515 { retries += 1;
msach@7 516 cycles_counter_fd[cpuID] =
msach@7 517 syscall(__NR_perf_event_open, hw_event,
msach@7 518 0,//pid_t: 0 is "pid of calling process"
msach@7 519 cpuID,//int: cpu, the value returned by "CPUID" instr(?)
msach@7 520 -1,//int: group_fd, -1 is "leader" or independent
msach@7 521 0//unsigned long: flags
msach@7 522 );
msach@7 523 }
msach@7 524 while(cycles_counter_fd[cpuID]<0 && retries < 100);
msach@7 525 if(retries >= 100)
msach@7 526 {
msach@7 527 fprintf(stderr,"On core %d: ",cpuID);
msach@7 528 perror("Failed to open cycles counter");
msach@7 529 }
msach@7 530 }
msach@7 531
msach@7 532 //Set up counter to accumulate total cycles to process, across all CPUs
msach@7 533
msach@7 534 retries = 0;
msach@7 535 do
msach@7 536 { retries += 1;
msach@7 537 cycles_counter_main_fd =
msach@7 538 syscall(__NR_perf_event_open, hw_event,
msach@7 539 0,//pid_t: 0 is "pid of calling process"
msach@7 540 -1,//int: cpu, -1 means accumulate from all cores
msach@7 541 -1,//int: group_fd, -1 is "leader" == independent
msach@7 542 0//unsigned long: flags
msach@7 543 );
msach@7 544 }
msach@7 545 while(cycles_counter_main_fd<0 && retries < 100);
msach@7 546 if(retries >= 100)
msach@7 547 {
msach@7 548 fprintf(stderr,"in main ");
msach@7 549 perror("Failed to open cycles counter");
msach@7 550 }
msach@15 551 #endif
kshalle@8 552
msach@9 553 measurement_t startExeCycles, endExeCycles;
msach@9 554 BenchParams *benchParams;
msach@9 555
msach@9 556 benchParams = malloc(sizeof(BenchParams));
msach@9 557
msach@9 558 benchParams->startExeCycles = &startExeCycles;
msach@9 559 benchParams->endExeCycles = &endExeCycles;
msach@9 560
msach@20 561 workerParamsArray = (WorkerParams *)malloc( (NUM_CORES) * sizeof(WorkerParams) );
kshalle@8 562 if(workerParamsArray == NULL ) printf("error mallocing worker params array\n");
kshalle@8 563
msach@9 564
kshalle@8 565 //This is the transition to the VMS runtime
kshalle@8 566 VPThread__create_seed_procr_and_do_work( &benchmark, benchParams );
kshalle@8 567
msach@15 568 #ifdef MEASURE_PERF
msach@9 569 uint64_t totalWorkCyclesAcrossCores = 0, totalBadCyclesAcrossCores = 0;
msach@9 570 uint64_t totalSyncCyclesAcrossCores = 0, totalBadSyncCyclesAcrossCores = 0;
kshalle@8 571 for(i=0; i<num_threads; i++){
kshalle@8 572 printf("WorkCycles: %lu\n",workerParamsArray[i].totalWorkCycles);
msach@9 573 // printf("Num Good Tasks: %lu\n",workerParamsArray[i].numGoodTasks);
msach@9 574 // printf("SyncCycles: %lu\n",workerParamsArray[i].totalSyncCycles);
msach@9 575 // printf("Num Good Syncs: %lu\n",workerParamsArray[i].numGoodSyncs);
kshalle@8 576 totalWorkCyclesAcrossCores += workerParamsArray[i].totalWorkCycles;
msach@9 577 totalBadCyclesAcrossCores += workerParamsArray[i].totalBadCycles;
msach@9 578 totalSyncCyclesAcrossCores += workerParamsArray[i].totalSyncCycles;
msach@9 579 totalBadSyncCyclesAcrossCores += workerParamsArray[i].totalBadSyncCycles;
kshalle@8 580 }
msach@7 581
kshalle@8 582 uint64_t totalExeCycles = endExeCycles.cycles - startExeCycles.cycles;
msach@9 583 totalExeCycles -= totalBadCyclesAcrossCores;
msach@10 584 uint64 totalOverhead = totalExeCycles - totalWorkCyclesAcrossCores;
msach@10 585 int32 numSyncs = outer_iters * num_threads * 2;
msach@10 586 printf("Total Execution Cycles: %lu\n", totalExeCycles);
kshalle@8 587 printf("Sum across threads of work cycles: %lu\n", totalWorkCyclesAcrossCores);
msach@10 588 printf("Sum across threads of bad work cycles: %lu\n", totalBadCyclesAcrossCores);
msach@10 589 // printf("Sum across threads of Bad Sync cycles: %lu\n", totalBadSyncCyclesAcrossCores);
msach@10 590 printf("Overhead per sync: %f\n", (double)totalOverhead / (double)numSyncs );
kshalle@8 591 printf("ExeCycles/WorkCycles Ratio %f\n",
kshalle@8 592 (double)totalExeCycles / (double)totalWorkCyclesAcrossCores);
msach@15 593 #else
msach@20 594 printf("#No measurement done!\n");
msach@15 595 #endif
Me@4 596 return 0;
msach@7 597 }