Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench

changeset 1:47802166a7ae
Either working correctly, or close..
author: Me
date: Thu, 14 Oct 2010 17:09:22 -0700
parents: b8b71da62a09
children: f33a9cba5d89
files: src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/EntryPoint.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/Vector_Pr.c src/Application/SSR_Matrix_Mult/subMatrix_Pr.c
diffstat: 8 files changed, 808 insertions(+), 122 deletions(-) [+]
[-]

src/Application/Matrix_Mult.c 22

src/Application/Matrix_Mult.h 2

src/Application/SSR_Matrix_Mult/Divide_Pr.c 483

src/Application/SSR_Matrix_Mult/EntryPoint.c 3

src/Application/SSR_Matrix_Mult/Result_Pr.c 75

src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h 53

src/Application/SSR_Matrix_Mult/Vector_Pr.c 47

src/Application/SSR_Matrix_Mult/subMatrix_Pr.c 245 src/Application/Matrix_Mult.c 22 src/Application/Matrix_Mult.h 2 src/Application/SSR_Matrix_Mult/Divide_Pr.c 483 src/Application/SSR_Matrix_Mult/EntryPoint.c 3 src/Application/SSR_Matrix_Mult/Result_Pr.c 75 src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h 53 src/Application/SSR_Matrix_Mult/Vector_Pr.c 47 src/Application/SSR_Matrix_Mult/subMatrix_Pr.c 245
src/Application/Matrix_Mult.c 22
src/Application/Matrix_Mult.h 2
     1.1 --- a/src/Application/Matrix_Mult.c	Tue Oct 05 10:00:11 2010 -0700
     1.2 +++ b/src/Application/Matrix_Mult.c	Thu Oct 14 17:09:22 2010 -0700
     1.3 @@ -61,7 +61,7 @@
     1.4     
     1.5     numRows = matrixStruc->numRows;
     1.6     numCols = matrixStruc->numCols;
     1.7 -   matrixStart = matrixStruc->matrix;
     1.8 +   matrixStart = matrixStruc->array;
     1.9  
    1.10     file = fopen( matrixFileName, "r" );
    1.11     if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);}
    1.12 @@ -131,7 +131,7 @@
    1.13     retMatrix = malloc( sizeof( Matrix ) );
    1.14     retMatrix->numRows = numRows;
    1.15     retMatrix->numCols = numCols;
    1.16 -   retMatrix->matrix  = malloc( numRows * numCols * sizeof(float32) );
    1.17 +   retMatrix->array  = malloc( numRows * numCols * sizeof(float32) );
    1.18  
    1.19     return retMatrix;
    1.20   }
    1.21 @@ -142,22 +142,24 @@
    1.22   }
    1.23   void
    1.24  freeMatrix( Matrix * matrix )
    1.25 - { free( matrix->matrix );
    1.26 + { free( matrix->array );
    1.27     free( matrix );
    1.28   }
    1.29  
    1.30  void
    1.31  printMatrix( Matrix *matrix )
    1.32 - { int r, c, numRows, numCols;
    1.33 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr;
    1.34     float32 *matrixArray;
    1.35  
    1.36 -   numRows = matrix->numRows;
    1.37 -   numCols = matrix->numCols;
    1.38 -   matrixArray = matrix->matrix;
    1.39 +   numRows = rowsToPrint = matrix->numRows;
    1.40 +   numCols = colsToPrint = matrix->numCols;
    1.41 +   matrixArray = matrix->array;
    1.42  
    1.43 -   for( r = 0; r < numRows; r++ )
    1.44 -    { for( c = 0; c < numCols; c++ )
    1.45 -       { printf( "%f | ", *(matrixArray + r*numCols + c) );
    1.46 +   rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed
    1.47 +   colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed
    1.48 +   for( r = 0; r < numRows; r += rowIncr )
    1.49 +    { for( c = 0; c < numCols; c += colIncr )
    1.50 +       { printf( "%3.1f | ", matrixArray[ r * numCols + c ] );
    1.51         }
    1.52        printf("\n");
    1.53      }

     2.1 --- a/src/Application/Matrix_Mult.h	Tue Oct 05 10:00:11 2010 -0700
     2.2 +++ b/src/Application/Matrix_Mult.h	Thu Oct 14 17:09:22 2010 -0700
     2.3 @@ -19,7 +19,7 @@
     2.4  struct
     2.5   { int32 numRows;
     2.6     int32 numCols;
     2.7 -   float32 *matrix;  //2D, but dynamically sized, so use addr arith
     2.8 +   float32 *array;  //2D, but dynamically sized, so use addr arith
     2.9   }
    2.10  Matrix;
    2.11  

     3.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Tue Oct 05 10:00:11 2010 -0700
     3.2 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Thu Oct 14 17:09:22 2010 -0700
     3.3 @@ -8,78 +8,485 @@
     3.4  
     3.5  
     3.6  #include "SSR_Matrix_Mult.h"
     3.7 +#include <math.h>
     3.8  
     3.9 -/*Divider creates one processor for every row-col pair.
    3.10 +   //The time to compute this many result values should equal the time to
    3.11 +   // perform this division on a matrix of size gives that many result calcs
    3.12 +   //IE, size this so that sequential time to calc equals divide time
    3.13 +   // find the value by experimenting -- but divide time and calc time scale
    3.14 +   // same way, so this value should remain valid across hardware
    3.15 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
    3.16 +
    3.17 +
    3.18 +int
    3.19 +measureMatrixMultPrimitive();
    3.20 +
    3.21 +
    3.22 +SlicingStrucCarrier *
    3.23 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix );
    3.24 +
    3.25 +SlicingStruc *
    3.26 +sliceUpDimension( float32 idealSizeOfPiece, int startVal, int endVal );
    3.27 +
    3.28 +SubMatrix **
    3.29 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    3.30 +                   Matrix *origMatrix );
    3.31 +
    3.32 +
    3.33 +void
    3.34 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    3.35 +                                    SubMatrix **rightSubMatrices,
    3.36 +                                    int32 numRowIdxs, int32 numColIdxs,
    3.37 +                                    int32 numVecIdxs,
    3.38 +                                    VirtProcr *resultPr,
    3.39 +                                    VirtProcr *animatingPr );
    3.40 +
    3.41 +void
    3.42 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix,
    3.43 +            SlicingStrucCarrier *slicingStrucCarrier,
    3.44 +            VirtProcr *resultPr, VirtProcr *animatingPr );
    3.45 +
    3.46 +
    3.47 +
    3.48 +/*Divider creates one processor for every sub-matrix
    3.49   * It hands them:
    3.50   *  the name of the result processor that they should send their results to,
    3.51 - *  the left and right matrices, and the row and col they should multiply
    3.52 - *  the length of the vector
    3.53 - * It first creates the result processor, then all the vector processors,
    3.54 + *  the left and right matrices, and the rows and cols they should multiply
    3.55 + * It first creates the result processor, then all the sub-matrixPair
    3.56 + *  processors,
    3.57   *  then does a receive of a message from the result processor that gives
    3.58   *  the divider ownership of the result matrix.
    3.59   * Finally, the divider returns the result matrix out of the SSR system.
    3.60 + *
    3.61 + * Divider chooses the size of sub-matrices via an algorithm that tries to
    3.62 + *  keep the minimum work above a threshold.  The threshold is machine-
    3.63 + *  dependent, so ask SSR for min work-unit time to get a
    3.64 + *  given overhead
    3.65 + *
    3.66 + * Divide min work-unit cycles by measured-cycles for one matrix-cell
    3.67 + *  product -- gives the number of products need to have in min size
    3.68 + *  matrix.
    3.69 + *
    3.70 + * So then, take cubed root of this to get the size of a side of min sub-
    3.71 + *  matrix.  That is the size of the ideal square sub-matrix -- so tile
    3.72 + *  up the two input matrices into ones as close as possible to that size,
    3.73 + *  and create the pairs of sub-matrices.
    3.74 + *
    3.75 + *========================  STRATEGIC OVERVIEW  =======================
    3.76 + *
    3.77 + *This division is a bit tricky, because have to create things in advance
    3.78 + * that it's not at first obvious need to be created..
    3.79 + *
    3.80 + *First slice up each dimension -- three of them..  this is because will have
    3.81 + * to create the sub-matrix's data-structures before pairing the sub-matrices
    3.82 + * with each other -- so, have three dimensions to slice up before can
    3.83 + * create the sub-matrix data-strucs -- also, have to be certain that the
    3.84 + * cols of the left input have the exact same slicing as the rows of the
    3.85 + * left matrix, so just to be sure, do the slicing calc once, then use it
    3.86 + * for both.
    3.87 + *
    3.88 + *So, goes like this:
    3.89 + *1) calculate the start & end values of each dimension in each matrix.
    3.90 + *2) use those values to create sub-matrix structures
    3.91 + *3) combine sub-matrices into pairs, as the tasks to perform.
    3.92 + *
    3.93 + *Have to calculate separately from creating the sub-matrices because of the
    3.94 + * nature of the nesting -- would either end up creating the same sub-matrix
    3.95 + * multiple times, or else would have to put in detection of whether had
    3.96 + * made a particular one already if tried to combine steps 1 and 2.
    3.97 + *
    3.98 + *Step 3 has to be separate because of the nesting, as well -- same reason,
    3.99 + * would either create same sub-matrix multiple times, or else have to
   3.100 + * add detection of whether was already created.
   3.101 + *
   3.102 + *Another way to look at it: there's one level of loop to divide dimensions,
   3.103 + * two levels of nesting to create sub-matrices, and three levels to pair
   3.104 + * up the sub-matrices.
   3.105   */
   3.106 -void divideIntoVectors( void *_dividerParams, VirtProcr *animatingPr )
   3.107 +
   3.108 +void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
   3.109 +                                        VirtProcr *animatingPr )
   3.110   { VirtProcr       *resultPr;
   3.111     DividerParams   *dividerParams;
   3.112     ResultsParams   *resultsParams;
   3.113 -   VectorParams    *vectParams;
   3.114     Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
   3.115     void            *msg;
   3.116 +   SlicingStrucCarrier *slicingStrucCarrier;
   3.117 +   float32         *resultArray; //points to array to be put inside result
   3.118 +                                 // matrix
   3.119 +   
   3.120 +         PRINT_DEBUG("start divide\n")
   3.121  
   3.122 -//   printf("start divide\n"); fflush(stdin);
   3.123 -   
   3.124 +
   3.125 +   //=========== Setup -- make local copies of ptd-to-things, malloc, aso
   3.126 +
   3.127     dividerParams   = (DividerParams *)_dividerParams;
   3.128     
   3.129     leftMatrix      = dividerParams->leftMatrix;
   3.130     rightMatrix     = dividerParams->rightMatrix;
   3.131  
   3.132 -   resultsParams            = SSR__malloc_size_to( sizeof(ResultsParams),
   3.133 -                                                               animatingPr );
   3.134 -   resultsParams->dividerPr = animatingPr;
   3.135 -   resultsParams->numCols   = rightMatrix->numCols;
   3.136 -   resultsParams->numRows   = leftMatrix->numRows;
   3.137 -   
   3.138 -   resultPr = SSR__create_procr_with(&gatherResults, resultsParams,
   3.139 -                                        animatingPr);
   3.140  
   3.141 -   int row, col;
   3.142 -   for( row = 0; row < leftMatrix->numRows; row++ )
   3.143 -    { for( col = 0; col < rightMatrix->numCols; col++ )
   3.144 -       {
   3.145 -         vectParams             = SSR__malloc_size_to(sizeof(VectorParams),
   3.146 -                                                                animatingPr);
   3.147 -         vectParams->resultPr   = resultPr;
   3.148 -         vectParams->myCol      = col;
   3.149 -         vectParams->myRow      = row;
   3.150 -         vectParams->vectLength = leftMatrix->numCols;
   3.151 -         vectParams->leftMatrix = leftMatrix;
   3.152 -         vectParams->rightMatrix = rightMatrix;
   3.153 -         
   3.154 -         SSR__create_procr_with( &calcVector, vectParams, animatingPr );
   3.155 -         //vectParams ownership transferred to the newly created processor
   3.156 -       }
   3.157 +   //==============  Do either sequential mult or do division ==============
   3.158 +
   3.159 +      //Check if input matrices too small -- if yes, just do sequential
   3.160 +   if( leftMatrix->numRows * leftMatrix->numCols * rightMatrix->numCols
   3.161 +       < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) //curoff is determined by overhead
   3.162 +       // of this divider -- relatively machine-independent
   3.163 +    { int32 vectLength, numResRows, numResCols;
   3.164 +
   3.165 +      //====== Do sequential multiply on a single core
   3.166 +
   3.167 +      vectLength = leftMatrix->numCols;
   3.168 +      numResRows = leftMatrix->numRows;
   3.169 +      numResCols = rightMatrix->numCols;
   3.170 +
   3.171 +      resultArray = malloc( numResRows * numResCols * sizeof(float32) );
   3.172 +
   3.173 +      multiplyMatrixArrays( vectLength, numResRows, numResCols,
   3.174 +                            leftMatrix->array, rightMatrix->array,
   3.175 +                            resultArray );
   3.176 +    }
   3.177 +   else
   3.178 +    {
   3.179 +      //====== Do parallel multiply across cores
   3.180 +
   3.181 +         //Calc the ideal size of sub-matrix and slice up the dimensions of
   3.182 +         // the two matrices.
   3.183 +         //The ideal size is the one takes the number of cycles to calculate
   3.184 +         // such that calc time is equal or greater than min work-unit size
   3.185 +      slicingStrucCarrier =
   3.186 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix );
   3.187 +
   3.188 +         //Make the results processor, now that know how many to wait for
   3.189 +      resultsParams = SSR__malloc_size_to(sizeof(ResultsParams),animatingPr);
   3.190 +      resultsParams->dividerPr = animatingPr;
   3.191 +      resultsParams->numSubMatrixPairs  =
   3.192 +         slicingStrucCarrier->leftRowSlices->numVals *
   3.193 +         slicingStrucCarrier->rightColSlices->numVals *
   3.194 +         slicingStrucCarrier->vecSlices->numVals;
   3.195 +      resultsParams->numCols   = rightMatrix->numCols;
   3.196 +      resultsParams->numRows   = leftMatrix->numRows;
   3.197 +
   3.198 +      resultPr =
   3.199 +         SSR__create_procr_with( &gatherResults, resultsParams, animatingPr);
   3.200 +
   3.201 +         //Make the sub-matrices, and pair them up, and make processor to
   3.202 +         // calc product of each pair.
   3.203 +      makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   3.204 +                                    slicingStrucCarrier,
   3.205 +                                    resultPr, animatingPr);
   3.206 + 
   3.207 +         //Get result from result procr
   3.208 +      msg = SSR__receive_from_to( resultPr, animatingPr );
   3.209 +      resultArray = (float32 *) msg;
   3.210      }
   3.211  
   3.212 -      //Get result from result procr
   3.213 -   msg = SSR__receive_from_to( resultPr, animatingPr );
   3.214 +
   3.215 +   //===============  Work done -- send results back =================
   3.216 +
   3.217  
   3.218        //prepare results to persist outside of SSR when return from entry pt
   3.219        //The results of the all the work have to be linked-to from the data
   3.220        // struc given to the seed procr -- this divide func is animated by
   3.221        // that seed procr, so have to link results to the _dividerParams.
   3.222 -   resultMatrix            = SSR__malloc_size_to( sizeof(Matrix),
   3.223 -                                                               animatingPr );
   3.224 +   resultMatrix            = SSR__malloc_size_to(sizeof(Matrix),animatingPr);
   3.225 +   resultMatrix->array     = resultArray;
   3.226     resultMatrix->numCols   = rightMatrix->numCols;
   3.227     resultMatrix->numRows   = leftMatrix->numRows;
   3.228 +
   3.229 +
   3.230     dividerParams->resultMatrix   = resultMatrix;
   3.231 -   resultMatrix->matrix          = (float32 *) msg;
   3.232     SSR__transfer_ownership_to_outside( msg ); //so not freed
   3.233     SSR__transfer_ownership_to_outside( resultMatrix );
   3.234  
   3.235 -   //printf("end divide\n"); fflush(stdin);
   3.236 +         PRINT_DEBUG("end divide\n")
   3.237  
   3.238     SSR__dissipate_procr( animatingPr );  //all procrs dissipate self at end
   3.239        //when all of the processors have dissipated, the "create seed and do
   3.240        // work" call in the entry point function returns
   3.241   }
   3.242 +
   3.243 +
   3.244 +SlicingStrucCarrier *
   3.245 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix )
   3.246 +{
   3.247 +   float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   3.248 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   3.249 +   SlicingStrucCarrier *slicingStrucCarrier =
   3.250 +                                         malloc(sizeof(SlicingStrucCarrier));
   3.251 +
   3.252 +   int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   3.253 +   float64 numPrimitiveOpsInMinWorkUnit;
   3.254 +
   3.255 +
   3.256 +   //=======  Calc ideal size of min-sized sub-matrix  ========
   3.257 +
   3.258 +      //ask SSR for the number of cycles of the minimum work unit, at given
   3.259 +      // percent overhead then add a guess at overhead from this divider
   3.260 +   minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   3.261 +
   3.262 +      //ask SSR for number of cycles of the "primitive" op of matrix mult
   3.263 +   primitiveCycles = measureMatrixMultPrimitive();
   3.264 +
   3.265 +   numPrimitiveOpsInMinWorkUnit =
   3.266 +      (float64)minWorkUnitCycles / (float64)primitiveCycles;
   3.267 +
   3.268 +      //take cubed root -- that's number of these in a "side" of sub-matrix
   3.269 +      // then multiply by 5 because the primitive is 5x5
   3.270 +   idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit );
   3.271 +
   3.272 +   idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   3.273 +   
   3.274 +   idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   3.275 +
   3.276 +   if( idealSizeOfSide1 > idealSizeOfSide2 )
   3.277 +      idealSizeOfSide = idealSizeOfSide1;
   3.278 +   else
   3.279 +      idealSizeOfSide = idealSizeOfSide2;
   3.280 +
   3.281 +      //The multiply inner loop blocks the array to fit into L1 cache
   3.282 +//   if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK;
   3.283 +
   3.284 +   //============  Slice up dimensions, now that know target size ===========
   3.285 +
   3.286 +      //Tell the slicer the target size of a side (floating pt), the start
   3.287 +      // value to start slicing at, and the end value to stop slicing at
   3.288 +      //It returns an array of start value of each chunk, plus number of them
   3.289 +   int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol;
   3.290 +   startLeftRow  = 0;
   3.291 +   endLeftRow    = leftMatrix->numRows -1;
   3.292 +   startVec      = 0;
   3.293 +   endVec        = leftMatrix->numCols -1;
   3.294 +   startRightCol = 0;
   3.295 +   endRightCol   = rightMatrix->numCols -1;
   3.296 +
   3.297 +   leftRowSlices =
   3.298 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow );
   3.299 +
   3.300 +   vecSlices =
   3.301 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec );
   3.302 +
   3.303 +   rightColSlices =
   3.304 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol );
   3.305 +
   3.306 +   slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   3.307 +   slicingStrucCarrier->vecSlices      = vecSlices;
   3.308 +   slicingStrucCarrier->rightColSlices = rightColSlices;
   3.309 +
   3.310 +   return slicingStrucCarrier;
   3.311 +}
   3.312 +
   3.313 +
   3.314 +void
   3.315 +makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   3.316 +            SlicingStrucCarrier *slicingStrucCarrier,
   3.317 +            VirtProcr *resultPr,   VirtProcr *animatingPr )
   3.318 + {
   3.319 +   SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   3.320 +   
   3.321 +   leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   3.322 +   vecSlices      = slicingStrucCarrier->vecSlices;
   3.323 +   rightColSlices = slicingStrucCarrier->rightColSlices;
   3.324 +   
   3.325 +   //================  Make sub-matrices, given the slicing  ================
   3.326 +   SubMatrix **leftSubMatrices, **rightSubMatrices;
   3.327 +   leftSubMatrices =
   3.328 +      createSubMatrices( leftRowSlices, vecSlices,
   3.329 +                         leftMatrix );
   3.330 +   rightSubMatrices =
   3.331 +      createSubMatrices( vecSlices, rightColSlices,
   3.332 +                         rightMatrix );
   3.333 +
   3.334 +   //==============  pair the sub-matrices and make processors ==============
   3.335 +   int32 numRowIdxs, numColIdxs, numVecIdxs;
   3.336 +
   3.337 +   numRowIdxs = leftRowSlices->numVals;
   3.338 +   numColIdxs = rightColSlices->numVals;
   3.339 +   numVecIdxs = vecSlices->numVals;
   3.340 +   pairUpSubMatricesAndMakeProcessors( leftSubMatrices,
   3.341 +                                       rightSubMatrices,
   3.342 +                                       numRowIdxs, numColIdxs,
   3.343 +                                       numVecIdxs,
   3.344 +                                       resultPr,
   3.345 +                                       animatingPr );
   3.346 + }
   3.347 +
   3.348 +
   3.349 +
   3.350 +
   3.351 +void
   3.352 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
   3.353 +                                    SubMatrix **rightSubMatrices,
   3.354 +                                    int32 numRowIdxs, int32 numColIdxs,
   3.355 +                                    int32 numVecIdxs,
   3.356 +                                    VirtProcr *resultPr,
   3.357 +                                    VirtProcr *animatingPr )
   3.358 + {
   3.359 +   int32 resRowIdx, resColIdx, vecIdx;
   3.360 +   int32 numLeftColIdxs, numRightColIdxs;
   3.361 +   int32 leftRowIdxOffset;
   3.362 +   SMPairParams *subMatrixPairParams;
   3.363 +
   3.364 +   numLeftColIdxs  = numColIdxs;
   3.365 +   numRightColIdxs = numVecIdxs;
   3.366 +
   3.367 +   for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   3.368 +    {
   3.369 +      leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   3.370 +
   3.371 +      for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   3.372 +       {
   3.373 +
   3.374 +         for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   3.375 +          {
   3.376 +               //Make the processor for the pair of sub-matrices
   3.377 +            subMatrixPairParams  = SSR__malloc_size_to(sizeof(SMPairParams),
   3.378 +                                                               animatingPr);
   3.379 +            subMatrixPairParams->leftSubMatrix  =
   3.380 +               leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   3.381 +
   3.382 +            subMatrixPairParams->rightSubMatrix =
   3.383 +               rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ];
   3.384 +
   3.385 +            subMatrixPairParams->resultPr = resultPr;
   3.386 +
   3.387 +            SSR__create_procr_with( &calcSubMatrixProduct,
   3.388 +                                    subMatrixPairParams,
   3.389 +                                    animatingPr );
   3.390 +          }
   3.391 +       }
   3.392 +    }
   3.393 +
   3.394 + }
   3.395 +
   3.396 +
   3.397 +
   3.398 +/*Walk through the two slice-strucs, making sub-matrix strucs as go
   3.399 + */
   3.400 +SubMatrix **
   3.401 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   3.402 +                   Matrix *origMatrix )
   3.403 + {
   3.404 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   3.405 +   int32 startRow, endRow, startCol, endCol;
   3.406 +   int32 *rowStartVals, *colStartVals;
   3.407 +   int32 rowOffset;
   3.408 +   SubMatrix **subMatrices, *newSubMatrix;
   3.409 +
   3.410 +   numRowIdxs = rowSlices->numVals;
   3.411 +   numColIdxs = colSlices->numVals;
   3.412 +
   3.413 +   rowStartVals = rowSlices->startVals;
   3.414 +   colStartVals = colSlices->startVals;
   3.415 +
   3.416 +   subMatrices = malloc( numRowIdxs * numColIdxs * sizeof(SubMatrix *) );
   3.417 +
   3.418 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   3.419 +    {
   3.420 +      rowOffset = rowIdx * numColIdxs;
   3.421 +      
   3.422 +      startRow  = rowStartVals[rowIdx];
   3.423 +      endRow    = rowStartVals[rowIdx + 1] -1; //"fake" start above last is
   3.424 +                                               // at last valid idx + 1 & is
   3.425 +                                               // 1 greater than end value
   3.426 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   3.427 +       {
   3.428 +         startCol = colStartVals[colIdx];
   3.429 +         endCol   = colStartVals[colIdx + 1] -1;
   3.430 +
   3.431 +         newSubMatrix = malloc( sizeof(SubMatrix) );
   3.432 +         newSubMatrix->numRows       = endRow - startRow +1;
   3.433 +         newSubMatrix->numCols       = endCol - startCol +1;
   3.434 +         newSubMatrix->origMatrix    = origMatrix;
   3.435 +         newSubMatrix->origStartRow  = startRow;
   3.436 +         newSubMatrix->origStartCol  = startCol;
   3.437 +         newSubMatrix->alreadyCopied = FALSE;
   3.438 +
   3.439 +         subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   3.440 +       }
   3.441 +    }
   3.442 +   return subMatrices;
   3.443 + }
   3.444 +
   3.445 +
   3.446 +
   3.447 +
   3.448 +SlicingStruc *
   3.449 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal )
   3.450 + { float32 residualAcc = 0;
   3.451 +   int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   3.452 +   SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) );
   3.453 +
   3.454 +      //calc size of matrix need to hold start vals --
   3.455 +   numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   3.456 +
   3.457 +   startVals = malloc( (numSlices + 1) * sizeof(int32) );
   3.458 +
   3.459 +      //Calc the upper limit of start value -- when get above this, end loop
   3.460 +      // by saving highest value of the matrix dimension to access, plus 1
   3.461 +      // as the start point of the imaginary slice following the last one
   3.462 +      //Plus 1 because go up to value but not include when process last slice
   3.463 +      //The stopping condition is half-a-size less than highest value because
   3.464 +      // don't want any pieces smaller than half the ideal size -- just tack
   3.465 +      // little ones onto end of last one
   3.466 +   endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size
   3.467 +   for( i = 0; startVal <= endVal; i++ )
   3.468 +    {
   3.469 +      startVals[i] = startVal;
   3.470 +      residualAcc += idealSizeOfSide;
   3.471 +      sizeOfSlice  = (int)residualAcc;
   3.472 +      residualAcc -= (float32)sizeOfSlice;
   3.473 +      startVal    += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8..
   3.474 +
   3.475 +      if( startVal > endCondition )
   3.476 +       { startVal = endVal + 1;
   3.477 +         startVals[ i + 1 ] = startVal;
   3.478 +       }
   3.479 +    }
   3.480 +
   3.481 +   slicingStruc->startVals = startVals;
   3.482 +   slicingStruc->numVals   = i;  //loop incr'd, so == last valid start idx+1
   3.483 +                                 // which means is num sub-matrices in dim
   3.484 +                                 // also == idx of the fake start just above
   3.485 +   return slicingStruc;
   3.486 + }
   3.487 +
   3.488 +
   3.489 +int inline
   3.490 +measureMatrixMultPrimitive()
   3.491 + {
   3.492 +   int r, c, v, numCycles;
   3.493 +   float32 *res, *left, *right;
   3.494 +
   3.495 +      //setup inputs
   3.496 +   left  = malloc( 5 * 5 * sizeof( float32 ) );
   3.497 +   right = malloc( 5 * 5 * sizeof( float32 ) );
   3.498 +   res   = malloc( 5 * 5 * sizeof( float32 ) );
   3.499 +
   3.500 +   for( r = 0; r < 5; r++ )
   3.501 +    {
   3.502 +      for( c = 0; c < 5; c++ )
   3.503 +       {
   3.504 +         left[  r * 5 + c ] = r;
   3.505 +         right[ r * 5 + c ] = c;
   3.506 +       }
   3.507 +    }
   3.508 +
   3.509 +      //do primitive
   3.510 +   SSR__start_primitive();  //for now, just takes time stamp
   3.511 +   for( r = 0; r < 5; r++ )
   3.512 +    {
   3.513 +      for( c = 0; c < 5; c++ )
   3.514 +       {
   3.515 +         for( v = 0; v < 5; v++ )
   3.516 +          {
   3.517 +            res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ];
   3.518 +          }
   3.519 +       }
   3.520 +    }
   3.521 +   numCycles =
   3.522 +      SSR__end_primitive_and_give_cycles(); 
   3.523 +
   3.524 +   return numCycles;
   3.525 + }
   3.526 +

     4.1 --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c	Tue Oct 05 10:00:11 2010 -0700
     4.2 +++ b/src/Application/SSR_Matrix_Mult/EntryPoint.c	Thu Oct 14 17:09:22 2010 -0700
     4.3 @@ -39,7 +39,8 @@
     4.4  
     4.5        //create divider processor, start doing the work, and wait till done
     4.6        //This function is the "border crossing" between normal code and SSR
     4.7 -   SSR__create_seed_procr_and_do_work( &divideIntoVectors, dividerParams );
     4.8 +   SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
     4.9 +                                       dividerParams );
    4.10     
    4.11        //get result matrix and return it
    4.12     resMatrix = dividerParams->resultMatrix;

     5.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c	Tue Oct 05 10:00:11 2010 -0700
     5.2 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c	Thu Oct 14 17:09:22 2010 -0700
     5.3 @@ -8,6 +8,15 @@
     5.4  
     5.5  #include "SSR_Matrix_Mult.h"
     5.6  
     5.7 +void inline
     5.8 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
     5.9 +                  int32    startRow,
    5.10 +                  int32    numRows,
    5.11 +                  int32    startCol,
    5.12 +                  int32    numCols,
    5.13 +                  int32    numOrigCols );
    5.14 +
    5.15 +
    5.16  /*The Result Processor gets a message from each of the vector processors,
    5.17   * puts the result from the message in its location in the result-
    5.18   * matrix, and increments the count of results.
    5.19 @@ -18,34 +27,68 @@
    5.20  void gatherResults( void *_params, VirtProcr *animatingPr )
    5.21   { VirtProcr *dividerPr;
    5.22     ResultsParams  *params;
    5.23 -   int             numRows, numCols, numCells, count=0;
    5.24 -   float32        *resultMatrix;
    5.25 +   int             row, col, numRows, numCols, numSubMatrixPairs, count=0;
    5.26 +   float32        *resultArray;
    5.27     void           *msg;
    5.28 -   VectorParams   *aResult;
    5.29 +   SMPairParams   *resParams;
    5.30  
    5.31 -//   printf("start resultPr\n"); fflush(stdin);
    5.32 -
    5.33 +         PRINT_DEBUG("start resultPr\n")
    5.34 +         
    5.35     params    = (ResultsParams *)_params;
    5.36     dividerPr = params->dividerPr;
    5.37 -   numCols   = params->numCols;
    5.38 -   numRows   = params->numRows;
    5.39 -   numCells  = numRows * numCols;
    5.40 +   numSubMatrixPairs = params->numSubMatrixPairs;
    5.41 +   numRows = params->numRows;
    5.42 +   numCols = params->numCols;
    5.43  
    5.44 -   resultMatrix = SSR__malloc_size_to( numCells*sizeof( float32 ), animatingPr);
    5.45 -   
    5.46 -   while( count < numCells )
    5.47 +   resultArray = SSR__malloc_size_to( numRows * numCols * sizeof(float32),
    5.48 +                                       animatingPr );
    5.49 +
    5.50 +      //zero out the results array -- will be accumulating, so must start 0
    5.51 +   for( row = 0; row < numRows; row++ )
    5.52 +    {
    5.53 +      for( col = 0; col < numCols; col++ )
    5.54 +       {
    5.55 +         resultArray[ row * numCols + col ] = 0;
    5.56 +       }
    5.57 +    }
    5.58 +
    5.59 +   while( count < numSubMatrixPairs )
    5.60      {
    5.61        msg = SSR__receive_type_to( RESULTS_MSG, animatingPr );
    5.62  
    5.63 -      aResult = (VectorParams *)msg;
    5.64 -      *(resultMatrix + aResult->myRow * numCols + aResult->myCol) =
    5.65 -                                                             aResult->result;
    5.66 +      resParams = (SMPairParams *)msg;
    5.67 +      accumulateResult( resultArray, resParams->resultArray,
    5.68 +                        resParams->leftSubMatrix->origStartRow,
    5.69 +                        resParams->leftSubMatrix->numRows,
    5.70 +                        resParams->rightSubMatrix->origStartCol,
    5.71 +                        resParams->rightSubMatrix->numCols,
    5.72 +                        resParams->rightSubMatrix->origMatrix->numCols );
    5.73        count++;
    5.74      }
    5.75        //if were real lang, would have auto-nested transfer -- but HelloWorld
    5.76        // language, so have to transfer ownership of each allocated block of
    5.77        // locations separately
    5.78 -   SSR__transfer_ownership_of_from_to( resultMatrix, animatingPr, dividerPr );
    5.79 -   SSR__send_from_to( resultMatrix, animatingPr, dividerPr );
    5.80 +   SSR__transfer_ownership_of_from_to( resultArray, animatingPr, dividerPr );
    5.81 +   SSR__send_from_to( resultArray, animatingPr, dividerPr );
    5.82     SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
    5.83   }
    5.84 +
    5.85 +void inline
    5.86 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    5.87 +                  int32    startRow,
    5.88 +                  int32    numRows,
    5.89 +                  int32    startCol,
    5.90 +                  int32    numCols,
    5.91 +                  int32    numOrigCols )
    5.92 + { int32 row, col;
    5.93 +
    5.94 +   for( row = 0; row < numRows; row++ )
    5.95 +    {
    5.96 +      for( col = 0; col < numCols; col++ )
    5.97 +       {
    5.98 +         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
    5.99 +            subMatrixResultArray[ row * numCols + col ];
   5.100 +       }
   5.101 +    }
   5.102 +
   5.103 + }

     6.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Tue Oct 05 10:00:11 2010 -0700
     6.2 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Thu Oct 14 17:09:22 2010 -0700
     6.3 @@ -11,6 +11,15 @@
     6.4  #include "../../SSR_lib/SSR.h"
     6.5  #include "../Matrix_Mult.h"
     6.6  
     6.7 +
     6.8 +//===============================  Defines  ==============================
     6.9 +#define ROWS_IN_BLOCK 32
    6.10 +#define COLS_IN_BLOCK 32
    6.11 +#define VEC_IN_BLOCK  32
    6.12 +
    6.13 +
    6.14 +#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
    6.15 +
    6.16  //==============================  Structures  ==============================
    6.17  typedef struct
    6.18   {
    6.19 @@ -25,19 +34,45 @@
    6.20     VirtProcr *dividerPr;
    6.21     int numRows;
    6.22     int numCols;
    6.23 +   int numSubMatrixPairs;
    6.24   }
    6.25  ResultsParams;
    6.26  
    6.27 +typedef
    6.28 +struct
    6.29 + { int32    numRows;
    6.30 +   int32    numCols;
    6.31 +   Matrix  *origMatrix;
    6.32 +   int32    origStartRow;
    6.33 +   int32    origStartCol;
    6.34 +   int32    alreadyCopied;
    6.35 +   float32 *array;  //2D, but dynamically sized, so use addr arith
    6.36 + }
    6.37 +SubMatrix;
    6.38 +
    6.39  typedef struct
    6.40   { VirtProcr *resultPr;
    6.41 -   int        myCol;
    6.42 -   int        myRow;
    6.43 -   int        vectLength;
    6.44 -   Matrix    *leftMatrix;
    6.45 -   Matrix    *rightMatrix;
    6.46 -   float32    result;
    6.47 +   SubMatrix *leftSubMatrix;
    6.48 +   SubMatrix *rightSubMatrix;
    6.49 +   float32   *resultArray;
    6.50   }
    6.51 -VectorParams;
    6.52 +SMPairParams;
    6.53 +
    6.54 +typedef
    6.55 +struct
    6.56 + { int32    numVals;
    6.57 +   int32   *startVals;
    6.58 + }
    6.59 +SlicingStruc;
    6.60 +
    6.61 +typedef
    6.62 +struct
    6.63 + {
    6.64 +   SlicingStruc *leftRowSlices;
    6.65 +   SlicingStruc *vecSlices;
    6.66 +   SlicingStruc *rightColSlices;
    6.67 + }
    6.68 +SlicingStrucCarrier;
    6.69  
    6.70  enum MMMsgType
    6.71   {
    6.72 @@ -45,8 +80,8 @@
    6.73   };
    6.74  
    6.75  //============================= Processor Functions =========================
    6.76 -void divideIntoVectors( void *data, VirtProcr *animatingPr );
    6.77 -void calcVector(        void *data, VirtProcr *animatingPr );
    6.78 +void divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr );
    6.79 +void calcSubMatrixProduct(        void *data, VirtProcr *animatingPr );
    6.80  void gatherResults(     void *data, VirtProcr *animatingPr );
    6.81  
    6.82  

     7.1 --- a/src/Application/SSR_Matrix_Mult/Vector_Pr.c	Tue Oct 05 10:00:11 2010 -0700
     7.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.3 @@ -1,47 +0,0 @@
     7.4 -/* 
     7.5 - *  Copyright 2009 OpenSourceStewardshipFoundation.org
     7.6 - *  Licensed under GNU General Public License version 2
     7.7 - *
     7.8 - * Author: SeanHalle@yahoo.com
     7.9 - *
    7.10 - */
    7.11 -
    7.12 -#include "SSR_Matrix_Mult.h"
    7.13 -
    7.14 -/*A Vector processor is created with an environment that holds two matrices,
    7.15 - * the row and col that it owns, and the name of a result gathering
    7.16 - * processor.
    7.17 - *It calculates its vector product then sends the result to the result
    7.18 - * processor, which puts it into the result matrix and returns that matrix
    7.19 - * when all is done.
    7.20 - */
    7.21 - void
    7.22 -calcVector( void *data, VirtProcr *animatingPr )
    7.23 - { 
    7.24 -   VectorParams   *params;
    7.25 -   VirtProcr      *resultPr;
    7.26 -   int             myRow, myCol, vectLength, pos;
    7.27 -   float32        *leftMatrixArray, *rightMatrixArray, result = 0.0;
    7.28 -   Matrix         *leftMatrix, *rightMatrix;
    7.29 -
    7.30 -//   printf("start vector\n"); fflush(stdin);
    7.31 -
    7.32 -   params      = (VectorParams *)data;
    7.33 -   resultPr    = params->resultPr;
    7.34 -   myCol       = params->myCol;
    7.35 -   myRow       = params->myRow;
    7.36 -   vectLength  = params->vectLength;
    7.37 -   leftMatrix  = params->leftMatrix;
    7.38 -   rightMatrix = params->rightMatrix;
    7.39 -   leftMatrixArray  = leftMatrix->matrix;
    7.40 -   rightMatrixArray = rightMatrix->matrix;
    7.41 -
    7.42 -   for( pos = 0; pos < vectLength; pos++ )
    7.43 -    {
    7.44 -      result += *(leftMatrixArray  + myRow * vectLength + pos)  *
    7.45 -                *(rightMatrixArray + pos   * vectLength + myCol);
    7.46 -    }
    7.47 -   params->result = result;
    7.48 -   SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
    7.49 -   SSR__dissipate_procr( animatingPr );
    7.50 - }

     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Thu Oct 14 17:09:22 2010 -0700
     8.3 @@ -0,0 +1,245 @@
     8.4 +/* 
     8.5 + *  Copyright 2009 OpenSourceStewardshipFoundation.org
     8.6 + *  Licensed under GNU General Public License version 2
     8.7 + *
     8.8 + * Author: SeanHalle@yahoo.com
     8.9 + *
    8.10 + */
    8.11 +
    8.12 +#include "SSR_Matrix_Mult.h"
    8.13 +
    8.14 +
    8.15 +void inline
    8.16 +copyFromOrig( SubMatrix *subMatrix );
    8.17 +
    8.18 +void inline
    8.19 +copyTransposeFromOrig( SubMatrix *subMatrix );
    8.20 +
    8.21 +void inline
    8.22 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    8.23 +                     float32 *resArray,
    8.24 +                     int startRow,  int endRow,
    8.25 +                     int startCol,  int endCol,
    8.26 +                     int startVec,  int endVec,
    8.27 +                     int resStride, int inpStride );
    8.28 +
    8.29 +void inline
    8.30 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    8.31 +                      float32 *leftArray, float32 *rightArray,
    8.32 +                      float32 *resArray );
    8.33 +
    8.34 +
    8.35 +/*A  processor is created with an environment that holds two matrices,
    8.36 + * the row and col that it owns, and the name of a result gathering
    8.37 + * processor.
    8.38 + *It calculates the product of two sub-portions of the input matrices
    8.39 + * by using Intel's mkl library for single-core.
    8.40 + *
    8.41 + *This demonstrates using optimized single-threaded code inside scheduled
    8.42 + * work-units.
    8.43 + *
    8.44 + *When done, it sends the result to the result processor
    8.45 + */
    8.46 +void
    8.47 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr )
    8.48 + { 
    8.49 +   SMPairParams   *params;
    8.50 +   VirtProcr      *resultPr;
    8.51 +   float32        *leftArray,  *rightArray, *resArray;
    8.52 +   SubMatrix      *leftSubMatrix, *rightSubMatrix;
    8.53 +
    8.54 +         PRINT_DEBUG("start sub-matrix mult\n")
    8.55 +
    8.56 +   params         = (SMPairParams *)data;
    8.57 +   resultPr       = params->resultPr;
    8.58 +   leftSubMatrix  = params->leftSubMatrix;
    8.59 +   rightSubMatrix = params->rightSubMatrix;
    8.60 +
    8.61 +      //make sure the input sub-matrices have been copied out of orig
    8.62 +   copyFromOrig( leftSubMatrix );
    8.63 +   copyTransposeFromOrig( rightSubMatrix );
    8.64 +   
    8.65 +   leftArray      = leftSubMatrix->array;
    8.66 +   rightArray     = rightSubMatrix->array;
    8.67 +
    8.68 +   resArray = malloc( leftSubMatrix->numRows * rightSubMatrix->numCols *
    8.69 +                         sizeof( float32 ) );
    8.70 +
    8.71 +
    8.72 +   int32 numResRows, numResCols, vectLength;
    8.73 +   
    8.74 +   vectLength = leftSubMatrix->numCols;
    8.75 +   numResRows = leftSubMatrix->numRows;
    8.76 +   numResCols = rightSubMatrix->numCols;
    8.77 +
    8.78 +   multiplyMatrixArrays( vectLength, numResRows, numResCols,
    8.79 +                         leftArray, rightArray,
    8.80 +                         resArray );
    8.81 +
    8.82 +   //send result to result processor
    8.83 +   params->resultArray = resArray;
    8.84 +   SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
    8.85 +   SSR__dissipate_procr( animatingPr );
    8.86 + }
    8.87 +
    8.88 +
    8.89 +/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
    8.90 + * Would be nice to embed this within another level that divided into
    8.91 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
    8.92 + *
    8.93 + *Eventually want these divisions to be automatic, using DKU pattern
    8.94 + * embedded into SSR, and with VMS controlling the divisions according to
    8.95 + * the cache sizes, which it knows about.
    8.96 + *And, want VMS to work with language to split among main-mems, so a socket
    8.97 + * only cranks on data in its local segment of main mem
    8.98 + *
    8.99 + */
   8.100 +void inline
   8.101 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
   8.102 +                      float32 *leftArray, float32 *rightArray,
   8.103 +                      float32 *resArray )
   8.104 + {
   8.105 +   int resStride, inpStride;
   8.106 +   int startRow, startCol, endRow, endCol, startVec, endVec;
   8.107 +
   8.108 +   resStride  = numResCols;
   8.109 +   inpStride  = vecLength;
   8.110 +
   8.111 +   for( startRow = 0; startRow < numResRows; )
   8.112 +    {
   8.113 +      endRow = startRow + ROWS_IN_BLOCK;
   8.114 +      if( endRow > numResRows ) endRow = numResRows;
   8.115 +
   8.116 +      for( startCol = 0; startCol < numResCols; )
   8.117 +       {
   8.118 +         endCol   = startCol + COLS_IN_BLOCK;
   8.119 +         if( endCol > numResCols ) endCol = numResCols;
   8.120 +
   8.121 +         for( startVec = 0; startVec < vecLength; )
   8.122 +          {
   8.123 +            endVec   = startVec + VEC_IN_BLOCK;
   8.124 +            if( endVec > vecLength ) endVec = vecLength;
   8.125 +
   8.126 +               //By having the "vector" of sub-blocks in a sub-block slice
   8.127 +               // be marched down in inner loop, are re-using the result
   8.128 +               // matrix, which stays in L1 cache -- can only re-use one of
   8.129 +               // the three, so this is the most important -- avoids writing
   8.130 +               // dirty blocks until those result-locations fully done
   8.131 +               //Row and Col is position in result matrix -- so row and vec
   8.132 +               // for left array, then vec and col for right array
   8.133 +            multiplySubBlocksTransposed( leftArray, rightArray,
   8.134 +                                         resArray,
   8.135 +                                         startRow,  endRow,
   8.136 +                                         startCol,  endCol,
   8.137 +                                         startVec,  endVec,
   8.138 +                                         resStride, inpStride );
   8.139 +            startVec = endVec;
   8.140 +          }
   8.141 +         startCol = endCol;
   8.142 +       }
   8.143 +      startRow = endRow;
   8.144 +    }
   8.145 + }
   8.146 +
   8.147 +
   8.148 +void inline
   8.149 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   8.150 +                     float32 *resArray,
   8.151 +                     int startRow,  int endRow,
   8.152 +                     int startCol,  int endCol,
   8.153 +                     int startVec,  int endVec,
   8.154 +                     int resStride, int inpStride )
   8.155 + {
   8.156 +   int row,    col,        vec;
   8.157 +   int leftOffset, rightOffset;
   8.158 +   float32 result;
   8.159 +   
   8.160 +   for( row = startRow; row < endRow; row++ )
   8.161 +    { 
   8.162 +      for( col = startCol; col < endCol; col++ )
   8.163 +       { 
   8.164 +         leftOffset  = row * inpStride;//left & right inp strides always same
   8.165 +         rightOffset = col * inpStride;// because right is transposed
   8.166 +         result = 0;
   8.167 +         for( vec = startVec; vec < endVec; vec++ )
   8.168 +          {
   8.169 +            result +=
   8.170 +               leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   8.171 +          }
   8.172 +         
   8.173 +         resArray[ row * resStride + col ] += result;
   8.174 +       }
   8.175 +    }
   8.176 + }
   8.177 +
   8.178 +void inline
   8.179 +copyTransposeFromOrig( SubMatrix *subMatrix )
   8.180 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
   8.181 +   Matrix *origMatrix;
   8.182 +   float32 *origArray, *subArray;
   8.183 +
   8.184 +   if( subMatrix->alreadyCopied ) return;
   8.185 +
   8.186 +   subMatrix->alreadyCopied = TRUE;
   8.187 +
   8.188 +   origMatrix   = subMatrix->origMatrix;
   8.189 +   origArray     = origMatrix->array;
   8.190 +   numCols      = subMatrix->numCols;
   8.191 +   numRows      = subMatrix->numRows;
   8.192 +   stride       = numRows;
   8.193 +   origStartRow = subMatrix->origStartRow;
   8.194 +   origStartCol = subMatrix->origStartCol;
   8.195 +   origStride   = origMatrix->numCols;
   8.196 +
   8.197 +   subArray      = malloc( numRows * numCols * sizeof(float32) );
   8.198 +   subMatrix->array = subArray;
   8.199 +
   8.200 +      //copy values from orig matrix to local
   8.201 +   int row, col, origOffset;
   8.202 +   for( row = 0; row < numRows; row++ )
   8.203 +    {
   8.204 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.205 +      for( col = 0; col < numCols; col++ )
   8.206 +       {
   8.207 +            //transpose means swap row & col -- traverse orig matrix normally
   8.208 +            // but put into reversed place in local array -- means the
   8.209 +            // stride is the num rows now, so col * numRows + row
   8.210 +         subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   8.211 +       }      
   8.212 +    }
   8.213 + }
   8.214 +
   8.215 +void inline
   8.216 +copyFromOrig( SubMatrix *subMatrix )
   8.217 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   8.218 +   Matrix *origMatrix;
   8.219 +   float32 *origArray, *subArray;
   8.220 +
   8.221 +   if( subMatrix->alreadyCopied ) return;
   8.222 +
   8.223 +   subMatrix->alreadyCopied = TRUE;
   8.224 +
   8.225 +   origMatrix    = subMatrix->origMatrix;
   8.226 +   origArray     = origMatrix->array;
   8.227 +   numCols       = subMatrix->numCols;
   8.228 +   numRows       = subMatrix->numRows;
   8.229 +   origStartRow  = subMatrix->origStartRow;
   8.230 +   origStartCol  = subMatrix->origStartCol;
   8.231 +   stride        = numCols;
   8.232 +   origStride    = origMatrix->numCols;
   8.233 +
   8.234 +   subArray      = malloc( numRows * numCols * sizeof(float32) );
   8.235 +   subMatrix->array = subArray;
   8.236 +
   8.237 +      //copy values from orig matrix to local
   8.238 +   int row, col, offset, origOffset;
   8.239 +   for( row = 0; row < numRows; row++ )
   8.240 +    {
   8.241 +      offset     = row * stride;
   8.242 +      origOffset = (row + origStartRow) * origStride + origStartCol;
   8.243 +      for( col = 0; col < numCols; col++ )
   8.244 +       {
   8.245 +         subArray[ offset + col ]  =  origArray[ origOffset + col ];
   8.246 +       }
   8.247 +    }
   8.248 + }