Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench

changeset 3:4e14e2663af9
Fixed concurrency bug -- added singleton to SSR -- works! 3.4x speedup
author: Me
date: Tue, 02 Nov 2010 17:00:50 -0700
parents: f33a9cba5d89
children: cbd8db6b8657
files: src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/EntryPoint.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/subMatrix_Pr.c
diffstat: 5 files changed, 305 insertions(+), 134 deletions(-) [+]
[-]

src/Application/SSR_Matrix_Mult/Divide_Pr.c 256

src/Application/SSR_Matrix_Mult/EntryPoint.c 17

src/Application/SSR_Matrix_Mult/Result_Pr.c 46

src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h 8

src/Application/SSR_Matrix_Mult/subMatrix_Pr.c 112

src/Application/SSR_Matrix_Mult/Divide_Pr.c 256
src/Application/SSR_Matrix_Mult/EntryPoint.c 17
src/Application/SSR_Matrix_Mult/Result_Pr.c 46
src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h 8
src/Application/SSR_Matrix_Mult/subMatrix_Pr.c 112
src/Application/SSR_Matrix_Mult/Divide_Pr.c 256
src/Application/SSR_Matrix_Mult/EntryPoint.c 17
     1.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Thu Oct 14 17:10:17 2010 -0700
     1.2 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Tue Nov 02 17:00:50 2010 -0700
     1.3 @@ -18,20 +18,28 @@
     1.4  #define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000
     1.5  
     1.6  
     1.7 -int
     1.8 -measureMatrixMultPrimitive();
     1.9 -
    1.10 +//===========================================================================
    1.11 +int inline
    1.12 +measureMatrixMultPrimitive( VirtProcr *animPr );
    1.13  
    1.14  SlicingStrucCarrier *
    1.15 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix );
    1.16 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
    1.17 +                                 VirtProcr *animPr );
    1.18  
    1.19  SlicingStruc *
    1.20 -sliceUpDimension( float32 idealSizeOfPiece, int startVal, int endVal );
    1.21 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
    1.22 +                  VirtProcr *animPr );
    1.23 +
    1.24 +void
    1.25 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr );
    1.26  
    1.27  SubMatrix **
    1.28  createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    1.29 -                   Matrix *origMatrix );
    1.30 +                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr );
    1.31  
    1.32 +void
    1.33 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
    1.34 +                 SubMatrix **subMatrices, VirtProcr *animPr );
    1.35  
    1.36  void
    1.37  pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices,
    1.38 @@ -105,44 +113,59 @@
    1.39   */
    1.40  
    1.41  void divideWorkIntoSubMatrixPairProcrs( void      *_dividerParams,
    1.42 -                                        VirtProcr *animatingPr )
    1.43 +                                        VirtProcr *animPr )
    1.44   { VirtProcr       *resultPr;
    1.45     DividerParams   *dividerParams;
    1.46     ResultsParams   *resultsParams;
    1.47     Matrix          *leftMatrix, *rightMatrix, *resultMatrix;
    1.48     void            *msg;
    1.49     SlicingStrucCarrier *slicingStrucCarrier;
    1.50 -   float32         *resultArray; //points to array to be put inside result
    1.51 -                                 // matrix
    1.52 +   float32         *resultArray; //points to array inside result matrix
    1.53     
    1.54 -         PRINT_DEBUG("start divide\n")
    1.55 +         DEBUG("start divide\n")
    1.56  
    1.57 +         int32
    1.58 +         divideProbe = VMS__create_single_interval_probe( "divideProbe",
    1.59 +                                                          animPr );
    1.60 +         VMS__record_sched_choice_into_probe( divideProbe, animPr );
    1.61 +         VMS__record_interval_start_in_probe( divideProbe );
    1.62  
    1.63     //=========== Setup -- make local copies of ptd-to-things, malloc, aso
    1.64 +   int32 numResRows, numResCols, vectLength;
    1.65  
    1.66     dividerParams   = (DividerParams *)_dividerParams;
    1.67     
    1.68     leftMatrix      = dividerParams->leftMatrix;
    1.69     rightMatrix     = dividerParams->rightMatrix;
    1.70  
    1.71 +   vectLength = leftMatrix->numCols;
    1.72 +   numResRows = leftMatrix->numRows;
    1.73 +   numResCols = rightMatrix->numCols;
    1.74 +   resultArray     = dividerParams->resultMatrix->array;
    1.75  
    1.76     //==============  Do either sequential mult or do division ==============
    1.77  
    1.78        //Check if input matrices too small -- if yes, just do sequential
    1.79 -   if( leftMatrix->numRows * leftMatrix->numCols * rightMatrix->numCols
    1.80 -       < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) //curoff is determined by overhead
    1.81 -       // of this divider -- relatively machine-independent
    1.82 -    { int32 vectLength, numResRows, numResCols;
    1.83 +      //Cutoff is determined by overhead of this divider -- relatively
    1.84 +      // machine-independent
    1.85 +   if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols *
    1.86 +       (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
    1.87 +    {
    1.88 +      //====== Do sequential multiply on a single core
    1.89 +            DEBUG("doing sequential")
    1.90  
    1.91 -      //====== Do sequential multiply on a single core
    1.92 +      //have to transpose the right matrix first
    1.93 +      float32 *
    1.94 +      transRightArray  = SSR__malloc_to( rightMatrix->numRows *
    1.95 +                                         rightMatrix->numCols *
    1.96 +                                         sizeof(float32),        animPr );
    1.97  
    1.98 -      vectLength = leftMatrix->numCols;
    1.99 -      numResRows = leftMatrix->numRows;
   1.100 -      numResCols = rightMatrix->numCols;
   1.101 -
   1.102 -      resultArray = malloc( numResRows * numResCols * sizeof(float32) );
   1.103 -
   1.104 -      multiplyMatrixArrays( vectLength, numResRows, numResCols,
   1.105 +         //copy values from orig matrix to local
   1.106 +      copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
   1.107 +                     0, 0, rightMatrix->numRows,
   1.108 +                     transRightArray, rightMatrix->array );
   1.109 +      
   1.110 +      multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
   1.111                              leftMatrix->array, rightMatrix->array,
   1.112                              resultArray );
   1.113      }
   1.114 @@ -155,65 +178,62 @@
   1.115           //The ideal size is the one takes the number of cycles to calculate
   1.116           // such that calc time is equal or greater than min work-unit size
   1.117        slicingStrucCarrier =
   1.118 -         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix );
   1.119 +         calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr );
   1.120  
   1.121           //Make the results processor, now that know how many to wait for
   1.122 -      resultsParams = SSR__malloc_size_to(sizeof(ResultsParams),animatingPr);
   1.123 -      resultsParams->dividerPr = animatingPr;
   1.124 +      resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr );
   1.125        resultsParams->numSubMatrixPairs  =
   1.126           slicingStrucCarrier->leftRowSlices->numVals *
   1.127           slicingStrucCarrier->rightColSlices->numVals *
   1.128           slicingStrucCarrier->vecSlices->numVals;
   1.129 -      resultsParams->numCols   = rightMatrix->numCols;
   1.130 -      resultsParams->numRows   = leftMatrix->numRows;
   1.131 +      resultsParams->dividerPr   = animPr;
   1.132 +      resultsParams->numCols     = rightMatrix->numCols;
   1.133 +      resultsParams->numRows     = leftMatrix->numRows;
   1.134 +      resultsParams->resultArray = resultArray;
   1.135 +
   1.136  
   1.137        resultPr =
   1.138 -         SSR__create_procr_with( &gatherResults, resultsParams, animatingPr);
   1.139 +         SSR__create_procr_with( &gatherResults, resultsParams, animPr);
   1.140  
   1.141           //Make the sub-matrices, and pair them up, and make processor to
   1.142           // calc product of each pair.
   1.143        makeSubMatricesAndProcrs( leftMatrix, rightMatrix,
   1.144                                      slicingStrucCarrier,
   1.145 -                                    resultPr, animatingPr);
   1.146 +                                    resultPr, animPr);
   1.147   
   1.148 -         //Get result from result procr
   1.149 -      msg = SSR__receive_from_to( resultPr, animatingPr );
   1.150 -      resultArray = (float32 *) msg;
   1.151 -    }
   1.152 +         //result array is allocated externally, so no message from resultPr
   1.153 +         // however, do have to wait before printing out stats, so wait
   1.154 +         // for an empty handshake message
   1.155 +      msg = SSR__receive_from_to( resultPr, animPr );
   1.156 +   }
   1.157  
   1.158  
   1.159     //===============  Work done -- send results back =================
   1.160  
   1.161  
   1.162 -      //prepare results to persist outside of SSR when return from entry pt
   1.163 -      //The results of the all the work have to be linked-to from the data
   1.164 -      // struc given to the seed procr -- this divide func is animated by
   1.165 -      // that seed procr, so have to link results to the _dividerParams.
   1.166 -   resultMatrix            = SSR__malloc_size_to(sizeof(Matrix),animatingPr);
   1.167 -   resultMatrix->array     = resultArray;
   1.168 -   resultMatrix->numCols   = rightMatrix->numCols;
   1.169 -   resultMatrix->numRows   = leftMatrix->numRows;
   1.170 +         DEBUG_MSG( dbgAppFlow, "end divide\n")
   1.171  
   1.172 +         VMS__record_interval_end_in_probe( divideProbe );
   1.173 +         VMS__print_stats_of_all_probes();
   1.174  
   1.175 -   dividerParams->resultMatrix   = resultMatrix;
   1.176 -   SSR__transfer_ownership_to_outside( msg ); //so not freed
   1.177 -   SSR__transfer_ownership_to_outside( resultMatrix );
   1.178 +      //nothing left to do, so dissipate -- SSR waits to shut down (and hence
   1.179 +      // to make results available to outside) until all the processors have
   1.180 +      // dissipated -- so no need to wait for results processor
   1.181  
   1.182 -         PRINT_DEBUG("end divide\n")
   1.183 -
   1.184 -   SSR__dissipate_procr( animatingPr );  //all procrs dissipate self at end
   1.185 +   SSR__dissipate_procr( animPr );  //all procrs dissipate self at end
   1.186        //when all of the processors have dissipated, the "create seed and do
   1.187        // work" call in the entry point function returns
   1.188   }
   1.189  
   1.190  
   1.191  SlicingStrucCarrier *
   1.192 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix )
   1.193 -{
   1.194 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix,
   1.195 +                                 VirtProcr *animPr )
   1.196 + {
   1.197     float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2;
   1.198     SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   1.199     SlicingStrucCarrier *slicingStrucCarrier =
   1.200 -                                         malloc(sizeof(SlicingStrucCarrier));
   1.201 +                         SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr);
   1.202  
   1.203     int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits;
   1.204     float64 numPrimitiveOpsInMinWorkUnit;
   1.205 @@ -226,7 +246,7 @@
   1.206     minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 );
   1.207  
   1.208        //ask SSR for number of cycles of the "primitive" op of matrix mult
   1.209 -   primitiveCycles = measureMatrixMultPrimitive();
   1.210 +   primitiveCycles = measureMatrixMultPrimitive( animPr );
   1.211  
   1.212     numPrimitiveOpsInMinWorkUnit =
   1.213        (float64)minWorkUnitCycles / (float64)primitiveCycles;
   1.214 @@ -238,6 +258,7 @@
   1.215     idealNumWorkUnits = SSR__giveIdealNumWorkUnits();
   1.216     
   1.217     idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits ));
   1.218 +   idealSizeOfSide2 *= 0.6; //finer granularity to help load balance
   1.219  
   1.220     if( idealSizeOfSide1 > idealSizeOfSide2 )
   1.221        idealSizeOfSide = idealSizeOfSide1;
   1.222 @@ -261,41 +282,47 @@
   1.223     endRightCol   = rightMatrix->numCols -1;
   1.224  
   1.225     leftRowSlices =
   1.226 -      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow );
   1.227 +      sliceUpDimension( idealSizeOfSide,  startLeftRow, endLeftRow, animPr );
   1.228  
   1.229     vecSlices =
   1.230 -      sliceUpDimension( idealSizeOfSide,  startVec, endVec );
   1.231 +      sliceUpDimension( idealSizeOfSide,  startVec, endVec, animPr );
   1.232  
   1.233     rightColSlices =
   1.234 -      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol );
   1.235 +      sliceUpDimension( idealSizeOfSide,  startRightCol, endRightCol,animPr);
   1.236  
   1.237     slicingStrucCarrier->leftRowSlices  = leftRowSlices;
   1.238     slicingStrucCarrier->vecSlices      = vecSlices;
   1.239     slicingStrucCarrier->rightColSlices = rightColSlices;
   1.240  
   1.241     return slicingStrucCarrier;
   1.242 -}
   1.243 + }
   1.244  
   1.245  
   1.246  void
   1.247  makeSubMatricesAndProcrs( Matrix    *leftMatrix, Matrix    *rightMatrix,
   1.248              SlicingStrucCarrier *slicingStrucCarrier,
   1.249 -            VirtProcr *resultPr,   VirtProcr *animatingPr )
   1.250 +            VirtProcr *resultPr,   VirtProcr *animPr )
   1.251   {
   1.252     SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices;
   1.253     
   1.254     leftRowSlices  = slicingStrucCarrier->leftRowSlices;
   1.255     vecSlices      = slicingStrucCarrier->vecSlices;
   1.256     rightColSlices = slicingStrucCarrier->rightColSlices;
   1.257 +   SSR__free( slicingStrucCarrier, animPr );
   1.258     
   1.259     //================  Make sub-matrices, given the slicing  ================
   1.260     SubMatrix **leftSubMatrices, **rightSubMatrices;
   1.261     leftSubMatrices =
   1.262 -      createSubMatrices( leftRowSlices, vecSlices,
   1.263 -                         leftMatrix );
   1.264 +      createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals,
   1.265 +                         leftMatrix, animPr );
   1.266 +   //double_check_that_always_numRows_in_right_same_as_numCols_in_left();
   1.267     rightSubMatrices =
   1.268 -      createSubMatrices( vecSlices, rightColSlices,
   1.269 -                         rightMatrix );
   1.270 +      createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals,
   1.271 +                         rightMatrix, animPr );
   1.272 +
   1.273 +   freeSlicingStruc( leftRowSlices, animPr );
   1.274 +   freeSlicingStruc( vecSlices, animPr );
   1.275 +   freeSlicingStruc( rightColSlices, animPr );
   1.276  
   1.277     //==============  pair the sub-matrices and make processors ==============
   1.278     int32 numRowIdxs, numColIdxs, numVecIdxs;
   1.279 @@ -308,7 +335,7 @@
   1.280                                         numRowIdxs, numColIdxs,
   1.281                                         numVecIdxs,
   1.282                                         resultPr,
   1.283 -                                       animatingPr );
   1.284 +                                       animPr );
   1.285   }
   1.286  
   1.287  
   1.288 @@ -326,21 +353,30 @@
   1.289     int32 numLeftColIdxs, numRightColIdxs;
   1.290     int32 leftRowIdxOffset;
   1.291     SMPairParams *subMatrixPairParams;
   1.292 +   float32 numToPutOntoEachCore, leftOverFraction;
   1.293 +   int32 numCores, coreToScheduleOnto, numVecOnCurrCore;
   1.294  
   1.295     numLeftColIdxs  = numColIdxs;
   1.296     numRightColIdxs = numVecIdxs;
   1.297  
   1.298 +   numCores = SSR__give_number_of_cores_to_schedule_onto();
   1.299 +
   1.300 +   numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores;
   1.301 +   leftOverFraction = 0;
   1.302 +   numVecOnCurrCore = 0;
   1.303 +   coreToScheduleOnto = 0;
   1.304 +
   1.305     for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ )
   1.306      {
   1.307        leftRowIdxOffset = resRowIdx * numLeftColIdxs;
   1.308  
   1.309        for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ )
   1.310         {
   1.311 -
   1.312 +         
   1.313           for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ )
   1.314            {
   1.315                 //Make the processor for the pair of sub-matrices
   1.316 -            subMatrixPairParams  = SSR__malloc_size_to(sizeof(SMPairParams),
   1.317 +            subMatrixPairParams  = SSR__malloc_to( sizeof(SMPairParams),
   1.318                                                                 animatingPr);
   1.319              subMatrixPairParams->leftSubMatrix  =
   1.320                 leftSubMatrices[ leftRowIdxOffset + vecIdx ];
   1.321 @@ -350,9 +386,36 @@
   1.322  
   1.323              subMatrixPairParams->resultPr = resultPr;
   1.324  
   1.325 -            SSR__create_procr_with( &calcSubMatrixProduct,
   1.326 -                                    subMatrixPairParams,
   1.327 -                                    animatingPr );
   1.328 +               //put all pairs from the same vector onto same core
   1.329 +            SSR__create_procr_with_affinity( &calcSubMatrixProduct,
   1.330 +                                             subMatrixPairParams,
   1.331 +                                             animatingPr,
   1.332 +                                             coreToScheduleOnto );
   1.333 +          }
   1.334 +
   1.335 +            //Trying to distribute the subMatrix-vectors across the cores, so
   1.336 +            // that each core gets the same number of vectors, with a max
   1.337 +            // imbalance of 1 vector more on some cores than others
   1.338 +         numVecOnCurrCore += 1;
   1.339 +         if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 )
   1.340 +          {
   1.341 +               //deal with fractional part, to ensure that imbalance is 1 max
   1.342 +               // IE, core with most has only 1 more than core with least
   1.343 +            leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore;
   1.344 +            if( leftOverFraction >= 1 )
   1.345 +             { leftOverFraction -= 1;
   1.346 +               numVecOnCurrCore = -1;
   1.347 +             }
   1.348 +            else
   1.349 +             { numVecOnCurrCore = 0;
   1.350 +             }
   1.351 +               //Move to next core, max core-value to incr to is numCores -1
   1.352 +            if( coreToScheduleOnto >= numCores -1 )
   1.353 +             { coreToScheduleOnto = 0;
   1.354 +             }
   1.355 +            else
   1.356 +             { coreToScheduleOnto += 1;
   1.357 +             }
   1.358            }
   1.359         }
   1.360      }
   1.361 @@ -365,7 +428,7 @@
   1.362   */
   1.363  SubMatrix **
   1.364  createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   1.365 -                   Matrix *origMatrix )
   1.366 +                   int32 numUses, Matrix *origMatrix, VirtProcr *animPr )
   1.367   {
   1.368     int32 numRowIdxs, numColIdxs, rowIdx, colIdx;
   1.369     int32 startRow, endRow, startCol, endCol;
   1.370 @@ -379,7 +442,8 @@
   1.371     rowStartVals = rowSlices->startVals;
   1.372     colStartVals = colSlices->startVals;
   1.373  
   1.374 -   subMatrices = malloc( numRowIdxs * numColIdxs * sizeof(SubMatrix *) );
   1.375 +   subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*),
   1.376 +                                 animPr );
   1.377  
   1.378     for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   1.379      {
   1.380 @@ -394,13 +458,14 @@
   1.381           startCol = colStartVals[colIdx];
   1.382           endCol   = colStartVals[colIdx + 1] -1;
   1.383  
   1.384 -         newSubMatrix = malloc( sizeof(SubMatrix) );
   1.385 +         newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr );
   1.386           newSubMatrix->numRows       = endRow - startRow +1;
   1.387           newSubMatrix->numCols       = endCol - startCol +1;
   1.388           newSubMatrix->origMatrix    = origMatrix;
   1.389           newSubMatrix->origStartRow  = startRow;
   1.390           newSubMatrix->origStartCol  = startCol;
   1.391           newSubMatrix->alreadyCopied = FALSE;
   1.392 +         newSubMatrix->numUsesLeft   = numUses; //can free after this many
   1.393  
   1.394           subMatrices[ rowOffset + colIdx ] = newSubMatrix;
   1.395         }
   1.396 @@ -409,18 +474,43 @@
   1.397   }
   1.398  
   1.399  
   1.400 +void
   1.401 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices,
   1.402 +                 SubMatrix **subMatrices, VirtProcr *animPr )
   1.403 + {
   1.404 +   int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset;
   1.405 +   SubMatrix *subMatrix;
   1.406 +
   1.407 +   numRowIdxs = rowSlices->numVals;
   1.408 +   numColIdxs = colSlices->numVals;
   1.409 +
   1.410 +   for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ )
   1.411 +    {
   1.412 +      rowOffset = rowIdx * numColIdxs;
   1.413 +      for( colIdx = 0; colIdx < numColIdxs; colIdx++ )
   1.414 +       {
   1.415 +         subMatrix = subMatrices[ rowOffset + colIdx ];
   1.416 +         if( subMatrix->alreadyCopied )
   1.417 +            SSR__free( subMatrix->array, animPr );
   1.418 +         SSR__free( subMatrix, animPr );
   1.419 +       }
   1.420 +    }
   1.421 +   SSR__free( subMatrices, animPr );
   1.422 + }
   1.423 +
   1.424  
   1.425  
   1.426  SlicingStruc *
   1.427 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal )
   1.428 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal,
   1.429 +                  VirtProcr *animPr )
   1.430   { float32 residualAcc = 0;
   1.431     int     numSlices, i, *startVals, sizeOfSlice, endCondition;
   1.432 -   SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) );
   1.433 +   SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr);
   1.434  
   1.435        //calc size of matrix need to hold start vals --
   1.436     numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide);
   1.437  
   1.438 -   startVals = malloc( (numSlices + 1) * sizeof(int32) );
   1.439 +   startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr );
   1.440  
   1.441        //Calc the upper limit of start value -- when get above this, end loop
   1.442        // by saving highest value of the matrix dimension to access, plus 1
   1.443 @@ -451,17 +541,24 @@
   1.444     return slicingStruc;
   1.445   }
   1.446  
   1.447 +void
   1.448 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr )
   1.449 + {
   1.450 +   SSR__free( slicingStruc->startVals, animPr );
   1.451 +   SSR__free( slicingStruc, animPr );
   1.452 + }
   1.453 +
   1.454  
   1.455  int inline
   1.456 -measureMatrixMultPrimitive()
   1.457 +measureMatrixMultPrimitive( VirtProcr *animPr )
   1.458   {
   1.459     int r, c, v, numCycles;
   1.460     float32 *res, *left, *right;
   1.461  
   1.462        //setup inputs
   1.463 -   left  = malloc( 5 * 5 * sizeof( float32 ) );
   1.464 -   right = malloc( 5 * 5 * sizeof( float32 ) );
   1.465 -   res   = malloc( 5 * 5 * sizeof( float32 ) );
   1.466 +   left  = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   1.467 +   right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   1.468 +   res   = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr );
   1.469  
   1.470     for( r = 0; r < 5; r++ )
   1.471      {
   1.472 @@ -485,8 +582,11 @@
   1.473         }
   1.474      }
   1.475     numCycles =
   1.476 -      SSR__end_primitive_and_give_cycles(); 
   1.477 +      SSR__end_primitive_and_give_cycles();
   1.478 +
   1.479 +   SSR__free( left, animPr );
   1.480 +   SSR__free( right, animPr );
   1.481 +   SSR__free( res, animPr );
   1.482  
   1.483     return numCycles;
   1.484   }
   1.485 -

     2.1 --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c	Thu Oct 14 17:10:17 2010 -0700
     2.2 +++ b/src/Application/SSR_Matrix_Mult/EntryPoint.c	Tue Nov 02 17:00:50 2010 -0700
     2.3 @@ -30,6 +30,7 @@
     2.4  multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix )
     2.5   { Matrix          *resMatrix;
     2.6     DividerParams   *dividerParams;
     2.7 +   int32            numResRows, numResCols;
     2.8  
     2.9  
    2.10     dividerParams              = malloc( sizeof( DividerParams ) );
    2.11 @@ -37,13 +38,25 @@
    2.12     dividerParams->rightMatrix = rightMatrix;
    2.13  
    2.14  
    2.15 +   numResRows  = leftMatrix->numRows;
    2.16 +   numResCols  = rightMatrix->numCols;
    2.17 +
    2.18 +      //VMS has its own separate internal malloc, so to get results out,
    2.19 +      // have to pass in empty array for it to fill up
    2.20 +      //The alternative is internally telling SSR make external space to use
    2.21 +   resMatrix            = malloc( sizeof(Matrix) );
    2.22 +   resMatrix->array     = malloc( numResRows * numResCols * sizeof(float32));
    2.23 +   resMatrix->numCols   = rightMatrix->numCols;
    2.24 +   resMatrix->numRows   = leftMatrix->numRows;
    2.25 +
    2.26 +
    2.27 +   dividerParams->resultMatrix   = resMatrix;
    2.28 +
    2.29        //create divider processor, start doing the work, and wait till done
    2.30        //This function is the "border crossing" between normal code and SSR
    2.31     SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs,
    2.32                                         dividerParams );
    2.33     
    2.34 -      //get result matrix and return it
    2.35 -   resMatrix = dividerParams->resultMatrix;
    2.36     free( dividerParams );
    2.37     return resMatrix;
    2.38   }

     3.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c	Thu Oct 14 17:10:17 2010 -0700
     3.2 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c	Tue Nov 02 17:00:50 2010 -0700
     3.3 @@ -8,6 +8,7 @@
     3.4  
     3.5  #include "SSR_Matrix_Mult.h"
     3.6  
     3.7 +//=====================
     3.8  void inline
     3.9  accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    3.10                    int32    startRow,
    3.11 @@ -16,6 +17,7 @@
    3.12                    int32    numCols,
    3.13                    int32    numOrigCols );
    3.14  
    3.15 +//===========================================================================
    3.16  
    3.17  /*The Result Processor gets a message from each of the vector processors,
    3.18   * puts the result from the message in its location in the result-
    3.19 @@ -32,7 +34,7 @@
    3.20     void           *msg;
    3.21     SMPairParams   *resParams;
    3.22  
    3.23 -         PRINT_DEBUG("start resultPr\n")
    3.24 +         DEBUG("start resultPr\n")
    3.25           
    3.26     params    = (ResultsParams *)_params;
    3.27     dividerPr = params->dividerPr;
    3.28 @@ -40,8 +42,7 @@
    3.29     numRows = params->numRows;
    3.30     numCols = params->numCols;
    3.31  
    3.32 -   resultArray = SSR__malloc_size_to( numRows * numCols * sizeof(float32),
    3.33 -                                       animatingPr );
    3.34 +   resultArray = params->resultArray;
    3.35  
    3.36        //zero out the results array -- will be accumulating, so must start 0
    3.37     for( row = 0; row < numRows; row++ )
    3.38 @@ -57,24 +58,45 @@
    3.39        msg = SSR__receive_type_to( RESULTS_MSG, animatingPr );
    3.40  
    3.41        resParams = (SMPairParams *)msg;
    3.42 -      accumulateResult( resultArray, resParams->resultArray,
    3.43 +      accumulateResult( resultArray, resParams->partialResultArray,
    3.44                          resParams->leftSubMatrix->origStartRow,
    3.45                          resParams->leftSubMatrix->numRows,
    3.46                          resParams->rightSubMatrix->origStartCol,
    3.47                          resParams->rightSubMatrix->numCols,
    3.48                          resParams->rightSubMatrix->origMatrix->numCols );
    3.49 +
    3.50 +      SSR__free( resParams->partialResultArray, animatingPr );
    3.51 +      
    3.52 +         //there is only one copy of results procr, so can update numUsesLeft
    3.53 +         // without concurrency worries.  When zero, free the sub-matrix
    3.54 +      resParams->leftSubMatrix->numUsesLeft -= 1;
    3.55 +      if( resParams->leftSubMatrix->numUsesLeft == 0 )
    3.56 +       {
    3.57 +         SSR__free( resParams->leftSubMatrix->array, animatingPr );
    3.58 +         SSR__free( resParams->leftSubMatrix, animatingPr );
    3.59 +       }
    3.60 +
    3.61 +      resParams->rightSubMatrix->numUsesLeft -= 1;
    3.62 +      if( resParams->rightSubMatrix->numUsesLeft == 0 )
    3.63 +       {
    3.64 +         SSR__free( resParams->rightSubMatrix->array, animatingPr );
    3.65 +         SSR__free( resParams->rightSubMatrix, animatingPr );
    3.66 +       }
    3.67 +
    3.68 +         //count of how many sub-matrix pairs accumulated so know when done
    3.69        count++;
    3.70      }
    3.71 -      //if were real lang, would have auto-nested transfer -- but HelloWorld
    3.72 -      // language, so have to transfer ownership of each allocated block of
    3.73 -      // locations separately
    3.74 -   SSR__transfer_ownership_of_from_to( resultArray, animatingPr, dividerPr );
    3.75 -   SSR__send_from_to( resultArray, animatingPr, dividerPr );
    3.76 +
    3.77 +      //Done -- could just dissipate -- SSR will wait for all processors to
    3.78 +      // dissipate before shutting down, thereby making results available to
    3.79 +      // outside, so no need to stop the divider from dissipating, so no need
    3.80 +      // to send a hand-shake message to it -- but it makes debug easier
    3.81 +   SSR__send_from_to( NULL, animatingPr, dividerPr );
    3.82     SSR__dissipate_procr( animatingPr );  //frees any data owned by procr
    3.83   }
    3.84  
    3.85  void inline
    3.86 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray,
    3.87 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray,
    3.88                    int32    startRow,
    3.89                    int32    numRows,
    3.90                    int32    startCol,
    3.91 @@ -86,8 +108,8 @@
    3.92      {
    3.93        for( col = 0; col < numCols; col++ )
    3.94         {
    3.95 -         resultArray[ (row + startRow) * numOrigCols + col + startCol ] +=
    3.96 -            subMatrixResultArray[ row * numCols + col ];
    3.97 +         resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] +=
    3.98 +            subMatrixPairResultArray[ row * numCols + col ];
    3.99         }
   3.100      }
   3.101  

     4.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Thu Oct 14 17:10:17 2010 -0700
     4.2 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Tue Nov 02 17:00:50 2010 -0700
     4.3 @@ -17,8 +17,10 @@
     4.4  #define COLS_IN_BLOCK 32
     4.5  #define VEC_IN_BLOCK  32
     4.6  
     4.7 +#define copyMatrixSingleton 1
     4.8 +#define copyTransposeSingleton 2
     4.9  
    4.10 -#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin);
    4.11 +#define DEBUG(msg) //printf(msg); fflush(stdin);
    4.12  
    4.13  //==============================  Structures  ==============================
    4.14  typedef struct
    4.15 @@ -35,6 +37,7 @@
    4.16     int numRows;
    4.17     int numCols;
    4.18     int numSubMatrixPairs;
    4.19 +   float32 *resultArray;
    4.20   }
    4.21  ResultsParams;
    4.22  
    4.23 @@ -46,6 +49,7 @@
    4.24     int32    origStartRow;
    4.25     int32    origStartCol;
    4.26     int32    alreadyCopied;
    4.27 +   int32    numUsesLeft; //must update via message to avoid multiple writers
    4.28     float32 *array;  //2D, but dynamically sized, so use addr arith
    4.29   }
    4.30  SubMatrix;
    4.31 @@ -54,7 +58,7 @@
    4.32   { VirtProcr *resultPr;
    4.33     SubMatrix *leftSubMatrix;
    4.34     SubMatrix *rightSubMatrix;
    4.35 -   float32   *resultArray;
    4.36 +   float32   *partialResultArray;
    4.37   }
    4.38  SMPairParams;
    4.39  

     5.1 --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Thu Oct 14 17:10:17 2010 -0700
     5.2 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Tue Nov 02 17:00:50 2010 -0700
     5.3 @@ -10,10 +10,10 @@
     5.4  
     5.5  
     5.6  void inline
     5.7 -copyFromOrig( SubMatrix *subMatrix );
     5.8 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
     5.9  
    5.10  void inline
    5.11 -copyTransposeFromOrig( SubMatrix *subMatrix );
    5.12 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr );
    5.13  
    5.14  void inline
    5.15  multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
    5.16 @@ -24,7 +24,7 @@
    5.17                       int resStride, int inpStride );
    5.18  
    5.19  void inline
    5.20 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    5.21 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols,
    5.22                        float32 *leftArray, float32 *rightArray,
    5.23                        float32 *resArray );
    5.24  
    5.25 @@ -48,7 +48,7 @@
    5.26     float32        *leftArray,  *rightArray, *resArray;
    5.27     SubMatrix      *leftSubMatrix, *rightSubMatrix;
    5.28  
    5.29 -         PRINT_DEBUG("start sub-matrix mult\n")
    5.30 +         DEBUG("start sub-matrix mult\n")
    5.31  
    5.32     params         = (SMPairParams *)data;
    5.33     resultPr       = params->resultPr;
    5.34 @@ -56,14 +56,15 @@
    5.35     rightSubMatrix = params->rightSubMatrix;
    5.36  
    5.37        //make sure the input sub-matrices have been copied out of orig
    5.38 -   copyFromOrig( leftSubMatrix );
    5.39 -   copyTransposeFromOrig( rightSubMatrix );
    5.40 +      //do it here, inside sub-matrix pair to hopefully gain reuse in cache
    5.41 +   copyFromOrig( leftSubMatrix, animatingPr );
    5.42 +   copyTransposeFromOrig( rightSubMatrix, animatingPr );
    5.43     
    5.44     leftArray      = leftSubMatrix->array;
    5.45     rightArray     = rightSubMatrix->array;
    5.46  
    5.47 -   resArray = malloc( leftSubMatrix->numRows * rightSubMatrix->numCols *
    5.48 -                         sizeof( float32 ) );
    5.49 +   resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols
    5.50 +                             * sizeof( float32 ), animatingPr );
    5.51  
    5.52  
    5.53     int32 numResRows, numResCols, vectLength;
    5.54 @@ -72,12 +73,12 @@
    5.55     numResRows = leftSubMatrix->numRows;
    5.56     numResCols = rightSubMatrix->numCols;
    5.57  
    5.58 -   multiplyMatrixArrays( vectLength, numResRows, numResCols,
    5.59 -                         leftArray, rightArray,
    5.60 +   multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    5.61 +                         leftArray,  rightArray,
    5.62                           resArray );
    5.63  
    5.64     //send result to result processor
    5.65 -   params->resultArray = resArray;
    5.66 +   params->partialResultArray = resArray;
    5.67     SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr );
    5.68     SSR__dissipate_procr( animatingPr );
    5.69   }
    5.70 @@ -95,7 +96,8 @@
    5.71   *
    5.72   */
    5.73  void inline
    5.74 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols,
    5.75 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
    5.76 +                                int32 numResCols,
    5.77                        float32 *leftArray, float32 *rightArray,
    5.78                        float32 *resArray )
    5.79   {
    5.80 @@ -172,29 +174,15 @@
    5.81      }
    5.82   }
    5.83  
    5.84 +
    5.85 +/*Reuse this in divider when doing the sequential multiply case
    5.86 + */
    5.87  void inline
    5.88 -copyTransposeFromOrig( SubMatrix *subMatrix )
    5.89 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride;
    5.90 -   Matrix *origMatrix;
    5.91 -   float32 *origArray, *subArray;
    5.92 -
    5.93 -   if( subMatrix->alreadyCopied ) return;
    5.94 -
    5.95 -   subMatrix->alreadyCopied = TRUE;
    5.96 -
    5.97 -   origMatrix   = subMatrix->origMatrix;
    5.98 -   origArray     = origMatrix->array;
    5.99 -   numCols      = subMatrix->numCols;
   5.100 -   numRows      = subMatrix->numRows;
   5.101 -   stride       = numRows;
   5.102 -   origStartRow = subMatrix->origStartRow;
   5.103 -   origStartCol = subMatrix->origStartCol;
   5.104 -   origStride   = origMatrix->numCols;
   5.105 -
   5.106 -   subArray      = malloc( numRows * numCols * sizeof(float32) );
   5.107 -   subMatrix->array = subArray;
   5.108 -
   5.109 -      //copy values from orig matrix to local
   5.110 +copyTranspose( int32 numRows, int32 numCols,
   5.111 +               int32 origStartRow, int32 origStartCol, int32 origStride,
   5.112 +               float32 *subArray, float32 *origArray )
   5.113 + { int32 stride = numRows;
   5.114 + 
   5.115     int row, col, origOffset;
   5.116     for( row = 0; row < numRows; row++ )
   5.117      {
   5.118 @@ -203,21 +191,60 @@
   5.119         {
   5.120              //transpose means swap row & col -- traverse orig matrix normally
   5.121              // but put into reversed place in local array -- means the
   5.122 -            // stride is the num rows now, so col * numRows + row
   5.123 +            // stride is the numRows now, so col * numRows + row
   5.124           subArray[ col * stride + row ]  =  origArray[ origOffset + col ];
   5.125 -       }      
   5.126 +       }
   5.127      }
   5.128   }
   5.129  
   5.130  void inline
   5.131 -copyFromOrig( SubMatrix *subMatrix )
   5.132 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   5.133 + { int numCols, numRows, origStartRow, origStartCol, origStride;
   5.134 +   Matrix *origMatrix;
   5.135 +   float32 *origArray, *subArray;
   5.136 +      //Fills subMatrix->array with a transposed copy of its region of orig
   5.137 +   if( subMatrix->alreadyCopied ) return;
   5.138 +   SSR__start_singleton( copyTransposeSingleton, &&EndOfTransSingleton, animPr);
   5.139 +      //uses its own singleton ID -- copyFromOrig starts copyMatrixSingleton
   5.140 +   origMatrix   = subMatrix->origMatrix;
   5.141 +   origArray    = origMatrix->array;
   5.142 +   numCols      = subMatrix->numCols;
   5.143 +   numRows      = subMatrix->numRows;
   5.144 +   origStartRow = subMatrix->origStartRow;
   5.145 +   origStartCol = subMatrix->origStartCol;
   5.146 +   origStride   = origMatrix->numCols;
   5.147 +
   5.148 +   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   5.149 +   subMatrix->array = subArray;
   5.150 +
   5.151 +      //copy values from orig matrix to local
   5.152 +   copyTranspose( numRows, numCols,
   5.153 +                  origStartRow, origStartCol, origStride,
   5.154 +                  subArray, origArray );
   5.155 +
   5.156 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   5.157 +   EndOfTransSingleton:
   5.158 +   return;
   5.159 + }
   5.160 +
   5.161 +
   5.162 +void inline
   5.163 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr )
   5.164   { int numCols, numRows, origStartRow, origStartCol, stride, origStride;
   5.165     Matrix *origMatrix;
   5.166     float32 *origArray, *subArray;
   5.167  
   5.168 +
   5.169 +      //This lets only a single VP execute the code between start and
   5.170 +      // end -- using start and end so that work runs outside the master.
   5.171 +      //Inside, if a second VP ever executes the start, it will be returned
   5.172 +      // from the end-point.
   5.173 +      //Note, for non-GCC, can add a second SSR call at the end, and inside
   5.174 +      // that one, look at the stack at the return addr & save that in an
   5.175 +      // array indexed by singletonID
   5.176     if( subMatrix->alreadyCopied ) return;
   5.177 +   SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr );
   5.178  
   5.179 -   subMatrix->alreadyCopied = TRUE;
   5.180  
   5.181     origMatrix    = subMatrix->origMatrix;
   5.182     origArray     = origMatrix->array;
   5.183 @@ -225,13 +252,14 @@
   5.184     numRows       = subMatrix->numRows;
   5.185     origStartRow  = subMatrix->origStartRow;
   5.186     origStartCol  = subMatrix->origStartCol;
   5.187 -   stride        = numCols;
   5.188     origStride    = origMatrix->numCols;
   5.189  
   5.190 -   subArray      = malloc( numRows * numCols * sizeof(float32) );
   5.191 +   subArray     = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr);
   5.192     subMatrix->array = subArray;
   5.193  
   5.194        //copy values from orig matrix to local
   5.195 +   stride        = numCols;
   5.196 +
   5.197     int row, col, offset, origOffset;
   5.198     for( row = 0; row < numRows; row++ )
   5.199      {
   5.200 @@ -242,4 +270,8 @@
   5.201           subArray[ offset + col ]  =  origArray[ origOffset + col ];
   5.202         }
   5.203      }
   5.204 +
   5.205 +   subMatrix->alreadyCopied = TRUE; //must be last thing before label
   5.206 +   EndOfCopySingleton:
   5.207 +   return;
   5.208   }