Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__Hello_World__LangDev

changeset 0:9cf4c84a3091
Initial add of copied code -- nonsense code still
author: Some Random Person <seanhalle@yahoo.com>
date: Wed, 23 May 2012 12:39:19 -0700
children: 9ad1a6186956
files: .hgeol .hgignore VSs__Hello_World/EntryPoint.c VSs__Hello_World/SeedVP.c VSs__Hello_World/VSs__Hello_World.h __brch__default main.c
diffstat: 7 files changed, 812 insertions(+), 0 deletions(-) [+]
[-]

.hgeol 14

.hgignore 12

VSs__Hello_World/EntryPoint.c 62

VSs__Hello_World/SeedVP.c 594

VSs__Hello_World/VSs__Hello_World.h 94

__brch__default 1

main.c 35 .hgeol 14 .hgignore 12 VSs__Hello_World/EntryPoint.c 62 VSs__Hello_World/SeedVP.c 594 VSs__Hello_World/VSs__Hello_World.h 94 __brch__default 1 main.c 35
.hgeol 14
.hgignore 12
VSs__Hello_World/EntryPoint.c 62
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.hgeol	Wed May 23 12:39:19 2012 -0700
     1.3 @@ -0,0 +1,14 @@
     1.4 +
     1.5 +[patterns]
     1.6 +**.py = native
     1.7 +**.txt = native
     1.8 +**.c = native
     1.9 +**.h = native
    1.10 +**.cpp = native
    1.11 +**.java = native
    1.12 +**.class = bin
    1.13 +**.jar = bin
    1.14 +**.sh = native
    1.15 +**.pl = native
    1.16 +**.jpg = bin
    1.17 +**.gif = bin

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/.hgignore	Wed May 23 12:39:19 2012 -0700
     2.3 @@ -0,0 +1,12 @@
     2.4 +nbproject
     2.5 +Makefile
     2.6 +build
     2.7 +dist
     2.8 +src/Default
     2.9 +src/.settings
    2.10 +src/.cproject
    2.11 +src/.project
    2.12 +.dep.inc
    2.13 +glob:.cproject
    2.14 +glob:.project
    2.15 +glob:Debug

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/VSs__Hello_World/EntryPoint.c	Wed May 23 12:39:19 2012 -0700
     3.3 @@ -0,0 +1,62 @@
     3.4 +/*
     3.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     3.6 + *  Licensed under GNU General Public License version 2
     3.7 + *
     3.8 + * Author: seanhalle@yahoo.com
     3.9 + *
    3.10 + */
    3.11 +
    3.12 +#include <math.h>
    3.13 +
    3.14 +#include "SSR_Matrix_Mult.h"
    3.15 +
    3.16 +
    3.17 +
    3.18 +/*Every SSR system has an "entry point" function that creates the first
    3.19 + * processor, which starts the chain of creating more processors..
    3.20 + * eventually all of the processors will dissipate themselves, and
    3.21 + * return.
    3.22 + *
    3.23 + *This entry-point function follows the same pattern as all entry-point
    3.24 + * functions do:
    3.25 + *1) it creates the params for the seed processor, from the
    3.26 + *    parameters passed into the entry-point function
    3.27 + *2) it calls SSR__create_seed_procr_and_do_work
    3.28 + *3) it gets the return value from the params struc, frees the params struc,
    3.29 + *    and returns the value from the function
    3.30 + *
    3.31 + */
    3.32 +Matrix *
    3.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
    3.34 + { Matrix          *resMatrix;
    3.35 +   DividerParams   *dividerParams;
    3.36 +   int32            numResRows, numResCols;
    3.37 +      //Returns the product in a malloc'd Matrix that the caller owns, or
    3.38 +      // NULL when an allocation fails (partial allocations are released)
    3.39 +   numResRows  = leftMatrix->numRows;
    3.40 +   numResCols  = rightMatrix->numCols;
    3.41 +
    3.42 +      //VMS has its own separate internal malloc, so to get results out,
    3.43 +      // have to pass in empty array for it to fill up
    3.44 +      //The alternative is internally telling SSR make external space to use
    3.45 +   dividerParams = malloc( sizeof( DividerParams ) );
    3.46 +   resMatrix     = malloc( sizeof(Matrix) );
    3.47 +   if( resMatrix != NULL )
    3.48 +      resMatrix->array = malloc( numResRows * numResCols * sizeof(float32));
    3.49 +   if( dividerParams == NULL || resMatrix == NULL || resMatrix->array == NULL )
    3.50 +    { if( resMatrix != NULL ) free( resMatrix->array );
    3.51 +      free( resMatrix );  free( dividerParams );
    3.52 +      return NULL;
    3.53 +    }
    3.54 +   resMatrix->numCols          = rightMatrix->numCols;
    3.55 +   resMatrix->numRows          = leftMatrix->numRows;
    3.56 +   dividerParams->leftMatrix   = leftMatrix;
    3.57 +   dividerParams->rightMatrix  = rightMatrix;
    3.58 +   dividerParams->resultMatrix = resMatrix;
    3.59 +      //create divider processor, start doing the work, and wait till done
    3.60 +      //This function is the "border crossing" between normal code and SSR
    3.61 +   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    3.62 +                                       dividerParams );
    3.63 +      //params were only needed to hand the matrices across the border
    3.64 +   free( dividerParams );
    3.65 +   return resMatrix;
    3.66 + }

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/VSs__Hello_World/SeedVP.c	Wed May 23 12:39:19 2012 -0700
     4.3 @@ -0,0 +1,594 @@
     4.4 +/*
     4.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     4.6 + *  Licensed under GNU General Public License version 2
     4.7 + *
     4.8 + * Author: seanhalle@yahoo.com
     4.9 + *
    4.10 + */
    4.11 +
    4.12 +
    4.13 +#include <math.h>
    4.14 +#include <string.h>
    4.15 +#include "SSR_Matrix_Mult.h"
    4.16 +
    4.17 +   //The time to compute this many result values should equal the time to
    4.18 +   // perform this division on a matrix whose size gives that many result calcs
    4.19 +   //IE, size this so that sequential time to calc equals divide time.
    4.20 +   //Find the value by experimenting -- but divide time and calc time scale
    4.21 +   // the same way, so this value might remain the same across hardware
    4.22 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
    4.23 +
    4.24 +
    4.25 +//===========================================================================
    4.26 +int inline                //times a 5x5 multiply loop, returns the cycle count
    4.27 +measureMatrixMultPrimitive( SlaveVP *animPr );
    4.28 +
    4.29 +SlicingStrucCarrier *     //picks a target side length, slices all 3 dimensions
    4.30 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    4.31 +                                 SlaveVP *animPr );
    4.32 +
    4.33 +SlicingStruc *            //cuts startVal..endVal into slices near the ideal size
    4.34 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    4.35 +                  SlaveVP *animPr );
    4.36 +
    4.37 +void                      //frees a SlicingStruc together with its startVals
    4.38 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr );
    4.39 +
    4.40 +SubMatrix **              //makes one SubMatrix per (row slice, col slice) pair
    4.41 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.42 +                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr );
    4.43 +
    4.44 +void                      //frees every SubMatrix, then the pointer array
    4.45 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    4.46 +                 SubMatrix **subMatrices, SlaveVP *animPr );
    4.47 +
    4.48 +void                      //creates one processor per left/right sub-matrix pair
    4.49 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    4.50 +                                    SubMatrix **rightSubMatrices,
    4.51 +                                    int32 numRowIdxs, int32 numColIdxs,
    4.52 +                                    int32 numVecIdxs,
    4.53 +                                    SlaveVP *resultPr,
    4.54 +                                    SlaveVP *animatingPr );
    4.55 +
    4.56 +void                      //builds both sub-matrix sets, then pairs them up
    4.57 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
    4.58 +            SlicingStrucCarrier *slicingStrucCarrier,
    4.59 +            SlaveVP *resultPr, SlaveVP *animatingPr );
    4.60 +
    4.61 +
    4.62 +
    4.63 +/*Divider creates one processor for every sub-matrix
    4.64 + * It hands them:
    4.65 + *  the name of the result processor that they should send their results to,
    4.66 + *  the left and right matrices, and the rows and cols they should multiply
    4.67 + * It first creates the result processor, then all the sub-matrixPair
    4.68 + *  processors,
    4.69 + *  then does a receive of a message from the result processor that gives
    4.70 + *  the divider ownership of the result matrix.
    4.71 + * Finally, the divider returns the result matrix out of the SSR system.
    4.72 + *
    4.73 + * Divider chooses the size of sub-matrices via an algorithm that tries to
    4.74 + *  keep the minimum work above a threshold.  The threshold is machine-
    4.75 + *  dependent, so ask SSR for min work-unit time to get a
    4.76 + *  given overhead
    4.77 + *
    4.78 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
    4.79 + *  product -- gives the number of products need to have in min size
    4.80 + *  matrix.
    4.81 + *
    4.82 + * So then, take cubed root of this to get the size of a side of min sub-
    4.83 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    4.84 + *  up the two input matrices into ones as close as possible to that size,
    4.85 + *  and create the pairs of sub-matrices.
    4.86 + *
    4.87 + *========================  STRATEGIC OVERVIEW  =======================
    4.88 + *
    4.89 + *This division is a bit tricky, because have to create things in advance
    4.90 + * that it's not at first obvious need to be created..
    4.91 + *
    4.92 + *First slice up each dimension -- three of them..  this is because will have
    4.93 + * to create the sub-matrix's data-structures before pairing the sub-matrices
    4.94 + * with each other -- so, have three dimensions to slice up before can
    4.95 + * create the sub-matrix data-strucs -- also, have to be certain that the
    4.96 + * cols of the left input have the exact same slicing as the rows of the
    4.97 + * right matrix, so just to be sure, do the slicing calc once, then use it
    4.98 + * for both.
    4.99 + *
   4.100 + *So, goes like this:
   4.101 + *1) calculate the start & end values of each dimension in each matrix.
   4.102 + *2) use those values to create sub-matrix structures
   4.103 + *3) combine sub-matrices into pairs, as the tasks to perform.
   4.104 + *
   4.105 + *Have to calculate separately from creating the sub-matrices because of the
   4.106 + * nature of the nesting -- would either end up creating the same sub-matrix
   4.107 + * multiple times, or else would have to put in detection of whether had
   4.108 + * made a particular one already if tried to combine steps 1 and 2.
   4.109 + *
   4.110 + *Step 3 has to be separate because of the nesting, as well -- same reason,
   4.111 + * would either create same sub-matrix multiple times, or else have to
   4.112 + * add detection of whether was already created.
   4.113 + *
   4.114 + *Another way to look at it: there's one level of loop to divide dimensions,
   4.115 + * two levels of nesting to create sub-matrices, and three levels to pair
   4.116 + * up the sub-matrices.
   4.117 + */
   4.118 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   4.119 +                                        SlaveVP *animPr )
   4.120 + { SlaveVP       *resultPr;
   4.121 +   DividerParams   *dividerParams;
   4.122 +   ResultsParams   *resultsParams;
   4.123 +   Matrix          *leftMatrix, *rightMatrix;
   4.124 +   void            *msg;
   4.125 +   SlicingStrucCarrier *slicingStrucCarrier;
   4.126 +   float32         *resultArray; //points to array inside result matrix
   4.127 +      //Seed processor body: multiply directly when small, else divide work
   4.128 +         DEBUG__printf( dbgAppFlow, "start divide")
   4.129 +
   4.130 +         int32
   4.131 +         divideProbe = VMS_App__create_single_interval_probe( "divideProbe",
   4.132 +                                                          animPr );
   4.133 +         VMS_App__record_sched_choice_into_probe( divideProbe, animPr );
   4.134 +         VMS_App__record_interval_start_in_probe( divideProbe );
   4.135 +
   4.136 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   4.137 +   int32 numResRows, numResCols, vectLength;
   4.138 +
   4.139 +   dividerParams   = (DividerParams *)_dividerParams;
   4.140 +
   4.141 +   leftMatrix      = dividerParams->leftMatrix;
   4.142 +   rightMatrix     = dividerParams->rightMatrix;
   4.143 +
   4.144 +   vectLength = leftMatrix->numCols;
   4.145 +   numResRows = leftMatrix->numRows;
   4.146 +   numResCols = rightMatrix->numCols;
   4.147 +   resultArray     = dividerParams->resultMatrix->array;
   4.148 +
   4.149 +      //zero the result array
   4.150 +   memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
   4.151 +
   4.152 +   //==============  Do either sequential mult or do division ==============
   4.153 +
   4.154 +      //Check if input matrices too small -- if yes, just do sequential
   4.155 +      //Cutoff is determined by overhead of this divider -- relatively
   4.156 +      // machine-independent
   4.157 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
   4.158 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
   4.159 +    {
   4.160 +      //====== Do sequential multiply on a single core
   4.161 +            DEBUG__printf( dbgAppFlow, "doing sequential")
   4.162 +
   4.163 +         //transpose the right matrix
   4.164 +      float32 *
   4.165 +      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
   4.166 +                                         rightMatrix->numCols * sizeof(float32),
   4.167 +                                         animPr );
   4.168 +
   4.169 +         //copy values from orig matrix to local
   4.170 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   4.171 +                     0, 0, rightMatrix->numRows,
   4.172 +                     transRightArray, rightMatrix->array );
   4.173 +
   4.174 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   4.175 +                     leftMatrix->array, transRightArray, resultArray );
   4.176 +      SSR__free( transRightArray, animPr ); //scratch copy, used only above
   4.177 +    }
   4.178 +   else
   4.179 +    {
   4.180 +      //====== Do parallel multiply across cores
   4.181 +
   4.182 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   4.183 +         // the two matrices.
   4.184 +         //The ideal size is the one takes the number of cycles to calculate
   4.185 +         // such that calc time is equal or greater than min work-unit size
   4.186 +      slicingStrucCarrier =
   4.187 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
   4.188 +
   4.189 +         //Make the results processor, now that know how many to wait for
   4.190 +      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
   4.191 +      resultsParams->numSubMatrixPairs  =
   4.192 +         slicingStrucCarrier->leftRowSlices->numVals *
   4.193 +         slicingStrucCarrier->rightColSlices->numVals *
   4.194 +         slicingStrucCarrier->vecSlices->numVals;
   4.195 +      resultsParams->dividerPr   = animPr;
   4.196 +      resultsParams->numCols     = rightMatrix->numCols;
   4.197 +      resultsParams->numRows     = leftMatrix->numRows;
   4.198 +      resultsParams->resultArray = resultArray;
   4.199 +
   4.200 +            DEBUG__printf(dbgAppFlow,"**create result Pr**")
   4.201 +      resultPr =
   4.202 +         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
   4.203 +
   4.204 +         //Make the sub-matrices, and pair them up, and make processor to
   4.205 +         // calc product of each pair.
   4.206 +      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   4.207 +                                    slicingStrucCarrier,
   4.208 +                                    resultPr, animPr);
   4.209 + 
   4.210 +         //result array is allocated externally, so no message from resultPr
   4.211 +         // however, do have to wait before printing out stats, so wait
   4.212 +         // for an empty handshake message
   4.213 +      msg = SSR__receive_from_to( resultPr, animPr );
   4.214 +   }
   4.215 +
   4.216 +
   4.217 +   //===============  Work done -- send results back =================
   4.218 +
   4.219 +
   4.220 +         DEBUG__printf( dbgAppFlow, "end divide")
   4.221 +
   4.222 +         VMS_App__record_interval_end_in_probe( divideProbe );
   4.223 +         VMS_App__print_stats_of_all_probes();
   4.224 +
   4.225 +      //nothing left to do so dissipate, SSR will wait to shutdown and hence
   4.226 +      // make results available to outside until all the processors have
   4.227 +      // dissipated -- so no need to wait for results processor
   4.228 +
   4.229 +   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
   4.230 +      //when all of the processors have dissipated, the "create seed and do
   4.231 +      // work" call in the entry point function returns
   4.232 + }
   4.233 +
   4.234 +
   4.235 +SlicingStrucCarrier *
   4.236 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   4.237 +                                 SlaveVP *animPr )
   4.238 + {    //Picks a target sub-matrix side, then slices rows, vec and cols by it
   4.239 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   4.240 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.241 +   SlicingStrucCarrier *slicingStrucCarrier =
   4.242 +                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
   4.243 +
   4.244 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   4.245 +   float64 numPrimitiveOpsInMinWorkUnit;
   4.246 +      //Returns the carrier allocated above, filled with the three slicings
   4.247 +
   4.248 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   4.249 +
   4.250 +      //ask SSR for the number of cycles of the minimum work unit, at given
   4.251 +      // percent overhead then add a guess at overhead from this divider
   4.252 +   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   4.253 +
   4.254 +      //ask SSR for number of cycles of the "primitive" op of matrix mult
   4.255 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
   4.256 +   if( primitiveCycles < 1 ) primitiveCycles = 1; //never divide by zero
   4.257 +   numPrimitiveOpsInMinWorkUnit =
   4.258 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   4.259 +
   4.260 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   4.261 +      // then multiply by 5 because the primitive is 5x5
   4.262 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   4.263 +
   4.264 +   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   4.265 +   idealSizeOfSide2 = leftMatrix->numRows /     //divisor kept >= 1 so a
   4.266 +                      fmax( 1.0, rint(cbrt( idealNumWorkUnits )) ); // 0 is ok
   4.267 +   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   4.268 +
   4.269 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   4.270 +      idealSizeOfSide = idealSizeOfSide1;
   4.271 +   else
   4.272 +      idealSizeOfSide = idealSizeOfSide2;
   4.273 +
   4.274 +      //The multiply inner loop blocks the array to fit into L1 cache
   4.275 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   4.276 +
   4.277 +   //============  Slice up dimensions, now that know target size ===========
   4.278 +
   4.279 +      //Tell the slicer the target size of a side (floating pt), the start
   4.280 +      // value to start slicing at, and the end value to stop slicing at
   4.281 +      //It returns an array of start value of each chunk, plus number of them
   4.282 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   4.283 +   startLeftRow  = 0;
   4.284 +   endLeftRow    = leftMatrix->numRows -1;
   4.285 +   startVec      = 0;
   4.286 +   endVec        = leftMatrix->numCols -1;
   4.287 +   startRightCol = 0;
   4.288 +   endRightCol   = rightMatrix->numCols -1;
   4.289 +
   4.290 +   leftRowSlices =
   4.291 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   4.292 +
   4.293 +   vecSlices =
   4.294 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   4.295 +
   4.296 +   rightColSlices =
   4.297 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   4.298 +
   4.299 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   4.300 +   slicingStrucCarrier->vecSlices      = vecSlices;
   4.301 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   4.302 +
   4.303 +   return slicingStrucCarrier;
   4.304 + }
   4.305 +
   4.306 +
   4.307 +void
   4.308 +makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   4.309 +            SlicingStrucCarrier *slicingStrucCarrier,
   4.310 +            SlaveVP *resultPr,   SlaveVP *animPr )
   4.311 + {
   4.312 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   4.313 +      //Consumes the carrier and its three slicings -- all are freed in here
   4.314 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   4.315 +   vecSlices      = slicingStrucCarrier->vecSlices;
   4.316 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   4.317 +   SSR__free( slicingStrucCarrier, animPr );
   4.318 +
   4.319 +   //================  Make sub-matrices, given the slicing  ================
   4.320 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   4.321 +   leftSubMatrices =
   4.322 +      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
   4.323 +                         leftMatrix, animPr );
   4.324 +   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
   4.325 +   rightSubMatrices =
   4.326 +      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
   4.327 +                         rightMatrix, animPr );
   4.328 +
   4.329 +
   4.330 +   //==============  pair the sub-matrices and make processors ==============
   4.331 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   4.332 +
   4.333 +   numRowIdxs = leftRowSlices->numVals;
   4.334 +   numColIdxs = rightColSlices->numVals;
   4.335 +   numVecIdxs = vecSlices->numVals;
   4.336 +
   4.337 +   freeSlicingStruc( leftRowSlices, animPr );
   4.338 +   freeSlicingStruc( vecSlices, animPr );
   4.339 +   freeSlicingStruc( rightColSlices, animPr );
   4.340 +   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
   4.341 +                                       rightSubMatrices,
   4.342 +                                       numRowIdxs, numColIdxs, numVecIdxs,
   4.343 +                                       resultPr, animPr );
   4.344 +      //Each pair's params now hold the sub-matrix pointers themselves, so
   4.345 +      // the two pointer arrays are finished with -- free them, don't leak
   4.346 +   SSR__free( leftSubMatrices, animPr );
   4.347 +   SSR__free( rightSubMatrices, animPr );
   4.348 + }
   4.349 +
   4.350 +
   4.351 +
   4.352 +
   4.353 +void
   4.354 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   4.355 +                                    SubMatrix **rightSubMatrices,
   4.356 +                                    int32 numRowIdxs, int32 numColIdxs,
   4.357 +                                    int32 numVecIdxs,
   4.358 +                                    SlaveVP *resultPr,
   4.359 +                                    SlaveVP *animatingPr )
   4.360 + {    //Creates one calcSubMatrixProduct processor per (row, col, vec) triple
   4.361 +   int32 resRowIdx, resColIdx, vecIdx;
   4.362 +   int32 numLeftColIdxs, numRightColIdxs;
   4.363 +   int32 leftRowIdxOffset;
   4.364 +   SMPairParams *subMatrixPairParams;
   4.365 +   float32 numToPutOntoEachCore, leftOverFraction, numVecOnCurrCore;
   4.366 +   int32 numCores, coreToAssignOnto;
   4.367 +      //Row strides of the row-major arrays that createSubMatrices built:
   4.368 +   numLeftColIdxs  = numVecIdxs;  //left  array is row-slices x vec-slices
   4.369 +   numRightColIdxs = numColIdxs;  //right array is vec-slices x col-slices
   4.370 +
   4.371 +   numCores = SSR__give_number_of_cores_to_schedule_onto();
   4.372 +      //NOTE(review): integer division, so the fraction is lost before store
   4.373 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   4.374 +   leftOverFraction = 0;
   4.375 +   numVecOnCurrCore = 0;
   4.376 +   coreToAssignOnto = 0;
   4.377 +
   4.378 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   4.379 +    {
   4.380 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   4.381 +
   4.382 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   4.383 +       {
   4.384 +
   4.385 +         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   4.386 +          {
   4.387 +               //Make the processor for the pair of sub-matrices
   4.388 +            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
   4.389 +                                                               animatingPr);
   4.390 +            subMatrixPairParams->leftSubMatrix  =
   4.391 +               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   4.392 +
   4.393 +            subMatrixPairParams->rightSubMatrix =
   4.394 +               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   4.395 +
   4.396 +            subMatrixPairParams->resultPr = resultPr;
   4.397 +
   4.398 +               //put all pairs from the same vector onto same core
   4.399 +            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
   4.400 +                                             subMatrixPairParams,
   4.401 +                                             animatingPr,
   4.402 +                                             coreToAssignOnto );
   4.403 +
   4.404 +               //Trying to distribute the subMatrix-vectors across the cores, so
   4.405 +               // that each core gets the same number of vectors, with a max
   4.406 +               // imbalance of 1 vector more on some cores than others
   4.407 +            numVecOnCurrCore += 1;                 //incr before checking, so
   4.408 +            if( numVecOnCurrCore > numToPutOntoEachCore ) //actual num 1 less
   4.409 +             {
   4.410 +                  //deal with fractional part, to ensure that imbalance is 1 max
   4.411 +                  // IE, core with most has only 1 more than core with least
   4.412 +               leftOverFraction = numToPutOntoEachCore - numVecOnCurrCore;
   4.413 +               if( leftOverFraction > 1 ) ERROR("division alg messed up\n");
   4.414 +               numVecOnCurrCore = leftOverFraction; //accumulates "extra"
   4.415 +
   4.416 +                  //Move to next core, max core-value to incr to is numCores -1
   4.417 +               coreToAssignOnto += 1;
   4.418 +               if( coreToAssignOnto >= numCores ) coreToAssignOnto = 0;
   4.419 +             } //if
   4.420 +          } //for( vecIdx
   4.421 +       } //for( resColIdx
   4.422 +    } //for( resRowIdx
   4.423 +
   4.424 + }
   4.425 +
   4.426 +
   4.427 +
   4.428 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   4.429 + */
   4.430 +SubMatrix **
   4.431 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.432 +                   int32 numUses, Matrix *origMatrix, SlaveVP *animPr )
   4.433 + {    //Builds a row-major array of pointers holding one newly allocated
   4.434 +      // SubMatrix per (row slice, col slice) combination of origMatrix;
   4.435 +      // each one records where in origMatrix it starts and how big it is
   4.436 +   int32       numRowSlices = rowSlices->numVals;
   4.437 +   int32       numColSlices = colSlices->numVals;
   4.438 +   int32      *rowStarts    = rowSlices->startVals;
   4.439 +   int32      *colStarts    = colSlices->startVals;
   4.440 +   int32       r, c;
   4.441 +   int32       firstRow, lastRow, firstCol, lastCol;
   4.442 +   SubMatrix **table, **tableRow, *sub;
   4.443 +
   4.444 +      //one pointer slot per sub-matrix
   4.445 +   table = SSR__malloc_to( numRowSlices * numColSlices * sizeof(SubMatrix*),
   4.446 +                           animPr );
   4.447 +
   4.448 +   for( r = 0; r < numRowSlices; r++ )
   4.449 +    {    //startVals holds an extra "fake" start one past the last slice,
   4.450 +         // so the next entry minus one is always the end of this slice
   4.451 +      firstRow = rowStarts[r];
   4.452 +      lastRow  = rowStarts[r + 1] - 1;
   4.453 +      tableRow = table + r * numColSlices;
   4.454 +
   4.455 +         //walk the column slices that cross this row slice
   4.456 +      for( c = 0; c < numColSlices; c++ )
   4.457 +       {
   4.458 +         firstCol = colStarts[c];
   4.459 +         lastCol  = colStarts[c + 1] - 1;
   4.460 +
   4.461 +         sub = SSR__malloc_to( sizeof(SubMatrix), animPr );
   4.462 +         sub->numRows            = lastRow - firstRow + 1;
   4.463 +         sub->numCols            = lastCol - firstCol + 1;
   4.464 +         sub->origMatrix         = origMatrix;
   4.465 +         sub->origStartRow       = firstRow;
   4.466 +         sub->origStartCol       = firstCol;
   4.467 +         sub->numUsesLeft        = numUses; //can free after this many
   4.468 +            //null both singleton slots so neither is read uninitialized
   4.469 +         sub->copySingleton      = NULL;
   4.470 +         sub->copyTransSingleton = NULL;
   4.471 +
   4.472 +         tableRow[c] = sub;
   4.473 +       }
   4.474 +    }
   4.475 +
   4.476 +   return table;
   4.477 + }
   4.478 +
   4.479 +
   4.480 +
   4.481 +void
   4.482 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   4.483 +                 SubMatrix **subMatrices, SlaveVP *animPr )
   4.484 + {    //Frees every SubMatrix in the pointer array, then the array itself
   4.485 +   int32       rowsLeft, colsLeft, numColSlices;
   4.486 +   SubMatrix **cursor, *entry;
   4.487 +
   4.488 +   rowsLeft     = rowSlices->numVals;
   4.489 +   numColSlices = colSlices->numVals;
   4.490 +   cursor       = subMatrices;
   4.491 +   for( ; rowsLeft > 0; rowsLeft-- )
   4.492 +    {
   4.493 +      for( colsLeft = numColSlices; colsLeft > 0; colsLeft-- )
   4.494 +       { entry = *cursor++;   //entries are visited in row-major order
   4.495 +            //the array field is released only when alreadyCopied is set
   4.496 +         if( entry->alreadyCopied )
   4.497 +            SSR__free( entry->array, animPr );
   4.498 +         SSR__free( entry, animPr );
   4.499 +       }
   4.500 +    }
   4.501 +
   4.502 +   SSR__free( subMatrices, animPr );
   4.503 + }
   4.504 +
   4.505 +
   4.506 +
   4.507 +SlicingStruc *
   4.508 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   4.509 +                  SlaveVP *animPr )
   4.510 + { float32 residualAcc = 0;
   4.511 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   4.512 +   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
   4.513 +      //Sanitize the target: below 1 (or NaN) would give zero-width slices,
   4.514 +      // and above the whole range just means "one slice" -- clamp both ways
   4.515 +   if( !(idealSizeOfSide >= 1) ) idealSizeOfSide = 1;
   4.516 +   if( endVal >= startVal  &&  idealSizeOfSide > endVal - startVal + 1 )
   4.517 +      idealSizeOfSide = (float32)(endVal - startVal + 1);
   4.518 +      //The loop can make one more slice than this truncated quotient, and
   4.519 +      // one "fake" start is stored past the last slice -- hence the + 2
   4.520 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   4.521 +   startVals = SSR__malloc_to( (numSlices + 2) * sizeof(int32), animPr );
   4.522 +
   4.523 +      //Stop slicing half-a-size short of the highest value: don't want any
   4.524 +      // piece smaller than half the ideal size, so a small remainder is
   4.525 +      // tacked onto the end of the last slice instead
   4.526 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   4.527 +   for( i = 0; startVal <= endVal; i++ )
   4.528 +    {
   4.529 +      startVals[i] = startVal;
   4.530 +      residualAcc += idealSizeOfSide;
   4.531 +      sizeOfSlice  = (int)residualAcc;
   4.532 +      residualAcc -= (float32)sizeOfSlice;
   4.533 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   4.534 +
   4.535 +      if( startVal > endCondition )
   4.536 +       { startVal = endVal + 1;
   4.537 +         startVals[ i + 1 ] = startVal;
   4.538 +       }
   4.539 +    }
   4.540 +
   4.541 +   slicingStruc->startVals = startVals;
   4.542 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   4.543 +                                 // which means is num sub-matrices in dim
   4.544 +                                 // also == idx of the fake start just above
   4.545 +   return slicingStruc;
   4.546 + }
   4.547 +
   4.548 +void
   4.549 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr )
   4.550 + { void *startValsToFree = slicingStruc->startVals; //owned array goes first
   4.551 +   SSR__free( startValsToFree, animPr );
   4.552 +   SSR__free( slicingStruc, animPr );
   4.553 + }
   4.554 +
   4.555 +
   4.556 +inline int
   4.557 +measureMatrixMultPrimitive( SlaveVP *animPr )
   4.558 + {    //Times one 5x5 matrix multiply; returns the cycle count SSR reports
   4.559 +   int r, c, v, numCycles;
   4.560 +   float32 *res, *left, *right, sum;
   4.561 +
   4.562 +      //setup inputs
   4.563 +   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.564 +   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.565 +   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   4.566 +
   4.567 +   for( r = 0; r < 5; r++ )
   4.568 +    {
   4.569 +      for( c = 0; c < 5; c++ )
   4.570 +       {
   4.571 +         left[  r * 5 + c ] = r;
   4.572 +         right[ r * 5 + c ] = c;
   4.573 +       }
   4.574 +    }
   4.575 +
   4.576 +      //do primitive
   4.577 +   SSR__start_primitive();  //for now, just takes time stamp
   4.578 +   for( r = 0; r < 5; r++ )
   4.579 +    {
   4.580 +      for( c = 0; c < 5; c++ )
   4.581 +       { sum = 0;   //accumulate the whole dot product, not just last term
   4.582 +         for( v = 0; v < 5; v++ )
   4.583 +          {
   4.584 +            sum += left[ r * 5 + v ] * right[ v * 5 + c ];
   4.585 +          }
   4.586 +         res[ r * 5 + c ] = sum;
   4.587 +       }
   4.588 +    }
   4.589 +   numCycles = SSR__end_primitive_and_give_cycles();
   4.590 +
   4.591 +   SSR__free( left, animPr );
   4.592 +   SSR__free( right, animPr );
   4.593 +   SSR__free( res, animPr );
   4.594 +
   4.595 +   return numCycles;
   4.596 + }
   4.597 +

     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/VSs__Hello_World/VSs__Hello_World.h	Wed May 23 12:39:19 2012 -0700
     5.3 @@ -0,0 +1,94 @@
     5.4 +/*
     5.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     5.6 + *  Licensed under GNU General Public License version 2
     5.7 + */
     5.8 +
     5.9 +#ifndef _SSR_MATRIX_MULT_H_
    5.10 +#define _SSR_MATRIX_MULT_H_
    5.11 +
    5.12 +#include <stdio.h>
    5.13 +
    5.14 +#include "SSR_impl/SSR.h"
    5.15 +#include "../Matrix_Mult.h"
    5.16 +
    5.17 +
    5.18 +//===============================  Defines  ==============================
    5.19 +#define ROWS_IN_BLOCK 32  //NOTE(review): no user of these three block
    5.20 +#define COLS_IN_BLOCK 32  // dimensions is visible in this header -- confirm
    5.21 +#define VEC_IN_BLOCK  32  // they are still needed
    5.22 +
    5.23 +#define copyMatrixSingleton 1     //presumably IDs matching the copySingleton and
    5.24 +#define copyTransposeSingleton 2  // copyTransSingleton fields of SubMatrix -- confirm
    5.25 +
    5.26 +//==============================  Structures  ==============================
    5.27 +typedef struct   //bundles the two operand matrices and the result matrix
    5.28 + {               // presumably the 'data' given to the divider -- confirm
    5.29 +   Matrix *leftMatrix;
    5.30 +   Matrix *rightMatrix;
    5.31 +   Matrix *resultMatrix;
    5.32 + }
    5.33 +DividerParams;
    5.34 +
    5.35 +typedef struct   //presumably the 'data' given to gatherResults -- confirm
    5.36 + {
    5.37 +   SlaveVP *dividerPr;      //a SlaveVP; by its name, the divider's -- confirm
    5.38 +   int numRows;
    5.39 +   int numCols;
    5.40 +   int numSubMatrixPairs;
    5.41 +   float32 *resultArray;    //NOTE(review): size not stated here; presumably
    5.42 + }                          // numRows * numCols floats -- confirm
    5.43 +ResultsParams;
    5.44 +
    5.45 +typedef struct   //describes one rectangular piece taken from origMatrix
    5.46 + { int32    numRows;
    5.47 +   int32    numCols;
    5.48 +   Matrix  *origMatrix;
    5.49 +   int32    origStartRow;  //presumably row in origMatrix where piece starts
    5.50 +   int32    origStartCol;  //presumably col in origMatrix where piece starts
    5.51 +   int32    alreadyCopied; //presumably a flag that array is filled -- confirm
    5.52 +   int32    numUsesLeft; //have update via message to avoid multiple writers
    5.53 +   SSRSingleton *copySingleton;
    5.54 +   SSRSingleton *copyTransSingleton;
    5.55 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    5.56 + }
    5.57 +SubMatrix;
    5.58 +
    5.59 +typedef struct   //one left/right SubMatrix pair; presumably the 'data' given
    5.60 + { SlaveVP   *resultPr;    // to calcSubMatrixProduct -- confirm
    5.61 +   SubMatrix *leftSubMatrix;
    5.62 +   SubMatrix *rightSubMatrix;
    5.63 +   float32   *partialResultArray;
    5.64 + }
    5.65 +SMPairParams;
    5.66 +
    5.67 +typedef struct   //how one matrix dimension is cut into slices
    5.68 + { int32    numVals;    //number of slices == idx of the fake start at the end
    5.69 +   int32   *startVals;  //start value of each slice; startVals[numVals] holds
    5.70 + }                      // the fake start (end value + 1) after the last slice
    5.71 +SlicingStruc;
    5.72 +
    5.73 +typedef struct   //carries three SlicingStrucs together as one unit
    5.74 + {
    5.75 +   SlicingStruc *leftRowSlices;   //by name: slicing of left matrix rows
    5.76 +   SlicingStruc *vecSlices;       //by name: slicing of the shared dimension
    5.77 +   SlicingStruc *rightColSlices;  //by name: slicing of right matrix cols
    5.78 + }
    5.79 +SlicingStrucCarrier;
    5.80 +
    5.81 +enum MMMsgType   //message type tags; only one is defined so far
    5.82 + {
    5.83 +   RESULTS_MSG = 1   //NOTE(review): sender and receiver not visible -- confirm
    5.84 + };
    5.85 +
    5.86 +//============================= Processor Functions =========================
    5.87 +void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr );
    5.88 +void calcSubMatrixProduct(        void *data, SlaveVP *animatingPr );
    5.89 +void gatherResults(     void *data, SlaveVP *animatingPr );
    5.90 +   //all three take an untyped data ptr plus a SlaveVP; data's real type: confirm
    5.91 +
    5.92 +//================================ Entry Point ==============================
    5.93 +Matrix *
    5.94 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix );
    5.95 +   //main() passes its two input matrices and prints the Matrix returned
    5.96 +
    5.97 +#endif /*_SSR_MATRIX_MULT_H_*/

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/__brch__default	Wed May 23 12:39:19 2012 -0700
     6.3 @@ -0,0 +1,1 @@
     6.4 +Applications normally have only the default branch -- they shouldn't be affected by any choices in VMS or language.

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/main.c	Wed May 23 12:39:19 2012 -0700
     7.3 @@ -0,0 +1,35 @@
     7.4 +/*
     7.5 + *  Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org
     7.6 + *  Licensed under GNU General Public License version 2
     7.7 + *
     7.8 + * author seanhalle@yahoo.com
     7.9 + */
    7.10 +
    7.11 +#include <malloc.h>
    7.12 +#include <stdlib.h>
    7.13 +
    7.14 +#include "Matrix_Mult.h"
    7.15 +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h"
    7.16 +
    7.17 +/**Reads the parameter file named by argv[1] into a ParamBag, has the two
    7.18 + * input matrices initialized from that bag, multiplies them, and prints the
    7.19 + * result.  Exits with status 1 and a usage line when argv[1] is missing. */
    7.20 +int main( int argc, char **argv )
    7.21 + { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
    7.22 +   ParamBag    *paramBag;
    7.23 +
    7.24 +   if( argc < 2 )  //without this, argv[1] below is a null pointer
    7.25 +    { fprintf( stderr, "usage: %s <param file>\n", argc > 0 ? argv[0] : "main" );
    7.26 +      exit(1);
    7.27 +    }
    7.28 +   DEBUG__printf2(TRUE, "arguments: %s | %s", argv[0], argv[1] );
    7.29 +
    7.30 +   paramBag = makeParamBag();
    7.31 +   readParamFileIntoBag( argv[1], paramBag );
    7.32 +   initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
    7.33 +   resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix );
    7.34 +   printf("\nresult matrix: \n");
    7.35 +   printMatrix( resultMatrix );
    7.36 +   fflush(stdout); //was fflush(stdin), which is undefined for input streams
    7.37 +   exit(0); //cleans up
    7.38 + }