/* 
 * 
 */

#include "main.h"

//==========  Global Vars  ===========

/* Help text: written out for -h and alongside some argument errors. */
const char *usage =
   "Usage: k_tuple_async [options]\n"
   "  Creates a number of workers, and one consumer that packages productions    into a tuple.\n"
   "\n"
   "Options:\n"
   "  -p <num>   The number of producer threads to create.\n"
   "  -t <num>   the number of tuples to create\n"
   "  -h         this help screen\n"
   "\n";

/* Program name string.  Not referenced in the visible part of this file;
 * presumably read by code elsewhere -- TODO confirm. */
char __ProgrammName[] = "K-tuple_async";
/* 255-byte buffer; neither written nor read in the visible part of this file. */
char __DataSet[255];

#ifdef MEASURE_PERF
/* One descriptor per core index for the CPU-cycles counters opened in
 * set_up_performance_counters (negative when the open failed). */
int cycles_counter_fd[NUM_CORES];
/* Per-core descriptors for instruction counters; nothing in the visible
 * part of this file opens them. */
int instrs_counter_fd[NUM_CORES];
/* Presumably the descriptor of the counter opened with cpu == -1 in
 * set_up_performance_counters -- confirm that function stores into this
 * global rather than into a local of the same name. */
int cycles_counter_main_fd;
#endif

/* Mutex/condition-variable pair named for an "all done" wait.  init_stuff in
 * this file does not initialize them -- NOTE(review): confirm where (and
 * whether) they are initialized and used. */
pthread_mutex_t  waitForAllDoneLock;
pthread_cond_t   waitForAllDoneCond;   


//===================================
/* Millisecond timer, one implementation per supported platform. */
#if defined(unix) || defined(__unix__)
#include <time.h>
#include <sys/time.h>
/* Returns the number of milliseconds elapsed since the first call.
 * The first call only records the reference time and returns 0. */
unsigned long get_msec(void) {
	static struct timeval start;	/* all zero until the first call fills it in */
	struct timeval now;

	gettimeofday(&now, 0);
	if(start.tv_sec != 0) {
		/* whole seconds scaled up, plus the (possibly negative) sub-second part */
		return (now.tv_sec - start.tv_sec) * 1000 + (now.tv_usec - start.tv_usec) / 1000;
	}

	/* first call: remember the reference point */
	start = now;
	return 0;
}
#elif defined(__WIN32__) || defined(WIN32)
#include <windows.h>
/* Returns the value of GetTickCount() unchanged. */
unsigned long get_msec(void) {
	return GetTickCount();
}
#else
#error "I don't know how to measure time on your platform"
#endif

/*Initializes the performance counters, and opens the file descriptors used
 * to read from the performance counters.
 *
 * Does nothing unless MEASURE_PERF is defined.  With MEASURE_PERF it opens
 *   - one CPU-cycles counter per core index 0 .. NUM_CORES-1, stored in
 *     cycles_counter_fd[], with kernel cycles excluded, and
 *   - one CPU-cycles counter not tied to a core (cpu == -1), stored in the
 *     global cycles_counter_main_fd, with kernel cycles included.
 * A failed open is reported on stderr; the negative result is left in the
 * corresponding descriptor so later code can detect it.
 */
void
set_up_performance_counters()
 {
 #ifdef MEASURE_PERF
    int i;

    //setup performance counters
    //field meanings below follow perf_event_open(2)
    struct perf_event_attr hw_event;
    memset(&hw_event,0,sizeof(hw_event));
        hw_event.type = PERF_TYPE_HARDWARE;
        hw_event.size = sizeof(hw_event);
        hw_event.disabled = 0;       /* counter starts out enabled          */
        hw_event.freq = 0;           /* no frequency-based sampling         */
        hw_event.inherit = 1;        /* children inherit it                 */
        hw_event.pinned = 1;         /* must always be on PMU               */
        hw_event.exclusive = 0;      /* need not be the only group on PMU   */
        hw_event.exclude_user = 0;   /* user-space events ARE counted       */
        hw_event.exclude_kernel = 1; /* kernel events are not counted       */
        hw_event.exclude_hv = 1;     /* hypervisor events are not counted   */
        hw_event.exclude_idle = 1;   /* don't count when idle               */
        hw_event.mmap = 0;           /* no mmap records requested           */
        hw_event.comm = 0;           /* no comm records requested           */


    for( i = 0; i < NUM_CORES; i++ )
    {
        hw_event.config = PERF_COUNT_HW_CPU_CYCLES; //cycles
        cycles_counter_fd[i] = syscall(__NR_perf_event_open, &hw_event,
                0,//pid_t pid, 
                i,//int cpu, 
                -1,//int group_fd,
                0//unsigned long flags
        );
        if (cycles_counter_fd[i]<0){
            fprintf(stderr,"On core %d: ",i);
            perror("Failed to open cycles counter");
        }
    }        
       
    //This assigns the file-scope cycles_counter_main_fd.  A local of the same
    // name must not be declared here: it would shadow the global, and the
    // descriptor would be lost when this function returns.
    hw_event.config = PERF_COUNT_HW_CPU_CYCLES; //cycles
    hw_event.exclude_kernel=0;   //this counter includes kernel cycles
    cycles_counter_main_fd = syscall(__NR_perf_event_open, &hw_event,
            0,//pid_t pid, 
            -1,//int cpu, 
            -1,//int group_fd,
            0//unsigned long flags
    );
    if (cycles_counter_main_fd<0){
        perror("Failed to open main cycles counter");
    }
    
 #endif
 }


/*Puts the tuple-iteration, production-ready and consumer-ack synchronization
 * objects into their starting state: every mutex and condition variable is
 * initialized with default attributes and every counter is set to zero.
 */
void
init_stuff()
 { 
   //counters all start from zero
   tupleIter                  = 0;
   currProductionNum          = 0;
   currConsumerReceivedACKNum = 0;

   //mutexes, default attributes
   pthread_mutex_init( &tupleIterLock,           NULL );
   pthread_mutex_init( &producerAccessMutex,     NULL );
   pthread_mutex_init( &productionReadyLock,     NULL );
   pthread_mutex_init( &consumerReceivedAckLock, NULL );

   //condition variables, default attributes
   pthread_cond_init( &tupleIterCond,           NULL );
   pthread_cond_init( &productionReadyCond,     NULL );
   pthread_cond_init( &consumerReceivedAckCond, NULL );
 }


/*Values taken from the command line by parse_arguments and handed to
 * benchmark.
 */
typedef struct
 {
   //number of producer threads to create (the -p option)
   int numProducers;
   //number of tuples to create (the -t option)
   int numTuplesToCreate;   
 }
ParsedArgs;

/*The benchmark Fn creates the producer threads and the consumer thread, then
 * waits for all of them to finish.  One measurement is taken just before the
 * first thread is created and one after the last thread has been joined; the
 * difference is printed on stdout.
 *
 * args->numProducers must be positive.  A non-positive count is reported on
 * stderr and the function returns without running anything.
 * If a thread cannot be created the process exits with EXIT_FAILURE: the
 * parameter structs live on this stack frame, so returning while already
 * started threads still use them is not an option.
 */
void 
benchmark( ParsedArgs *args )
 {
   int i, rc;

   //the arrays below are variable-length; a size <= 0 is undefined behaviour
   if( args == NULL || args->numProducers <= 0 )
    { fprintf(stderr, "benchmark needs at least one producer\n");
      return;
    }

   ProducerParams producerParams[args->numProducers];
   pthread_t producerThds[args->numProducers];
   pthread_t consumerThd;
   
   ConsumerParams consumerParams;
   
   //Set up the param structs for producers: ID, amount of work, and the
   // core the producer should pin its thread to
   for(i=0; i < args->numProducers; i++)
    {
      producerParams[i].producerID = i + 1; //no ID of 0, a fact used in handshake
      producerParams[i].numTuplesToCreate = args->numTuplesToCreate;
      producerParams[i].coreID = i % NUM_CORES;
    }

   consumerParams.numProducers = args->numProducers;
   consumerParams.numTuplesToCreate = args->numTuplesToCreate;
           
   //take measurement before creation of threads, to get total exetime
   MeasStruct benchStartMeas, benchEndMeas;
       
   takeAMeas(0, benchStartMeas);
   
   for(i=0; i < args->numProducers; i++) 
    { rc = pthread_create( &producerThds[i], NULL, &producer_birthFn, (void*)&producerParams[i]);
      if( rc != 0 )
       { //joining a thread that was never created is undefined, so stop here
         fprintf(stderr, "failed to create producer thread %d (error %d)\n", i + 1, rc);
         exit(EXIT_FAILURE);
       }
    }
   
   rc = pthread_create( &consumerThd, NULL, &consumer_birthFn, (void*)&consumerParams );
   if( rc != 0 )
    { fprintf(stderr, "failed to create consumer thread (error %d)\n", rc);
      exit(EXIT_FAILURE);
    }
   
   for(i=0; i<args->numProducers; i++)
    { pthread_join( producerThds[i], NULL );
    }
   pthread_join( consumerThd, NULL );
   
   //work is all done, so take a measurement snapshot at end
   takeAMeas(0, benchEndMeas);
   
   
   //uint64_t is not "unsigned long" on every platform, so convert explicitly
   // to a type that has a fixed printf conversion
#ifdef MEASURE_PERF
   uint64_t totalExeCycles = ( benchEndMeas.cycles - benchStartMeas.cycles);
   printf("Total Execution: %llu\n", (unsigned long long)totalExeCycles);
#else
   uint64_t totalExeCycles = ( benchEndMeas.total - benchStartMeas.total);
   printf("Total Cycles of Execution: %llu\n", (unsigned long long)totalExeCycles);   
#endif

    //======================================================
 }


/*Reads a count written with decimal digits only (no sign, no blanks).
 * Returns 1 and stores the value in *valueOut on success; returns 0 when
 * text is NULL, empty, contains a non-digit, or has more than 9 digits
 * (the limit keeps the value within a 32-bit int).
 */
static int
parse_count_arg( const char *text, int *valueOut )
 { int value = 0;
   int numDigits = 0;

   if( text == NULL ) return 0;
   for( ; text[numDigits] != 0; numDigits++ )
    { //isdigit needs an unsigned char value; plain char may be negative
      if( numDigits >= 9 || !isdigit( (unsigned char)text[numDigits] ) )
         return 0;
      value = value * 10 + ( text[numDigits] - '0' );
    }
   if( numDigits == 0 ) return 0;

   *valueOut = value;
   return 1;
 }

/*This parses the command line arguments and returns the values in a struct.
 * Command line args should be a '-' followed by a single letter, then a value
 *
 * Returns a heap-allocated ParsedArgs that the caller releases with free(),
 * or NULL when the program should not go on: help was requested (-h), the
 * arguments were invalid, -p was not given, or memory ran out.  A message
 * has already been printed in every NULL case.
 */
ParsedArgs *
parse_arguments( int argc, char **argv )
 { ParsedArgs *parsedArgs;
   int i;
   
   if(argc < 2)
    { fprintf(stdout, "must give arguments\n");
      fputs(usage, stdout);
      return NULL;
    }

   parsedArgs = malloc(sizeof *parsedArgs);
   if( parsedArgs == NULL )
    { fprintf(stderr, "out of memory while parsing arguments\n");
      return NULL;
    }
   //start from known values so an omitted option can be detected
   parsedArgs->numProducers = 0;
   parsedArgs->numTuplesToCreate = 0;

   for( i=1; i < argc; i++ ) 
    { const char *opt = argv[i];
      const char *value;

      //an option is exactly two characters; testing opt[1] first keeps the
      // opt[2] read inside the string for a lone "-"
      if( opt[0] != '-' || opt[1] == 0 || opt[2] != 0 )
       { fprintf(stdout, "unrecognized argument: %s\n", opt);
         fputs(usage, stdout);
         goto fail;
       }

      switch( opt[1] ) 
       { case 'p':
          { //the value is the next argument, if there is one
            value = (i + 1 < argc) ? argv[++i] : NULL;
            if( !parse_count_arg( value, &parsedArgs->numProducers ) ) 
             { fprintf(stderr, "-p must be followed by the number of producer threads to spawn\n");
               goto fail;
             }
            if( parsedArgs->numProducers == 0 ) 
             { fprintf(stderr, "invalid number of producers specified: %d\n", parsedArgs->numProducers);
               goto fail;
             }
            else
             { DEBUG__printf1("num producers: %d\n", parsedArgs->numProducers );
             }
          }
         break;
         case 't':
          { value = (i + 1 < argc) ? argv[++i] : NULL;
            if( !parse_count_arg( value, &parsedArgs->numTuplesToCreate ) ) 
             { fputs("-t must be followed by a number\n", stderr);
               goto fail;
             }
            DEBUG__printf1("num tuples to produce: %d\n", parsedArgs->numTuplesToCreate );
          }
         break;
         case 'h':
          { fputs(usage, stdout);
            goto fail;   //nothing to run after showing help
          }
         default:
          { fprintf(stderr, "unrecognized argument: %s\n", opt);
            fputs(usage, stderr);
            goto fail;
          }
       }
    }//for

   //an explicit "-p 0" was rejected above, so zero here means -p was absent
   if( parsedArgs->numProducers == 0 )
    { fprintf(stderr, "the number of producers must be given with -p\n");
      fputs(usage, stderr);
      goto fail;
    }
   return parsedArgs;

 fail:
   free(parsedArgs);
   return NULL;
 }

/*Program entry point: sets up the performance counters and the shared
 * synchronization state, parses the command line, and runs the benchmark.
 *
 * Returns 0 after a completed benchmark run and EXIT_FAILURE when
 * parse_arguments did not hand back a usable argument struct.
 */
int main(int argc, char **argv)
 { ParsedArgs *args;
        
   set_up_performance_counters();
 
   init_stuff();
   
   args = parse_arguments( argc, argv);
   
   //NULL, or any tiny value that cannot be a real struct address, means
   // "do not run"; compare as an integer and never dereference such a value
   if( (uintptr_t)args < 10 ) return EXIT_FAILURE; //non-zero exit when parsing went wrong
   
   benchmark( args );

   free( args );
   return 0;
 }

