Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench
changeset 3:4e14e2663af9
Fixed concurrency bug -- added singleton to SSR -- works! 3.4x speedup
| author | Me |
|---|---|
| date | Tue, 02 Nov 2010 17:00:50 -0700 |
| parents | f33a9cba5d89 |
| children | cbd8db6b8657 |
| files | src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/EntryPoint.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/subMatrix_Pr.c |
| diffstat | 5 files changed, 305 insertions(+), 134 deletions(-) [+] |
line diff
1.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c Thu Oct 14 17:10:17 2010 -0700 1.2 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c Tue Nov 02 17:00:50 2010 -0700 1.3 @@ -18,20 +18,28 @@ 1.4 #define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 1.5 1.6 1.7 -int 1.8 -measureMatrixMultPrimitive(); 1.9 - 1.10 +//=========================================================================== 1.11 +int inline 1.12 +measureMatrixMultPrimitive( VirtProcr *animPr ); 1.13 1.14 SlicingStrucCarrier * 1.15 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ); 1.16 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 1.17 + VirtProcr *animPr ); 1.18 1.19 SlicingStruc * 1.20 -sliceUpDimension( float32 idealSizeOfPiece, int startVal, int endVal ); 1.21 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 1.22 + VirtProcr *animPr ); 1.23 + 1.24 +void 1.25 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ); 1.26 1.27 SubMatrix ** 1.28 createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 1.29 - Matrix *origMatrix ); 1.30 + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ); 1.31 1.32 +void 1.33 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 1.34 + SubMatrix **subMatrices, VirtProcr *animPr ); 1.35 1.36 void 1.37 pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 1.38 @@ -105,44 +113,59 @@ 1.39 */ 1.40 1.41 void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 1.42 - VirtProcr *animatingPr ) 1.43 + VirtProcr *animPr ) 1.44 { VirtProcr *resultPr; 1.45 DividerParams *dividerParams; 1.46 ResultsParams *resultsParams; 1.47 Matrix *leftMatrix, *rightMatrix, *resultMatrix; 1.48 void *msg; 1.49 SlicingStrucCarrier *slicingStrucCarrier; 1.50 - float32 *resultArray; //points to array to be put inside result 1.51 - // matrix 1.52 + float32 *resultArray; //points to array inside result matrix 1.53 1.54 - PRINT_DEBUG("start divide\n") 1.55 + DEBUG("start 
divide\n") 1.56 1.57 + int32 1.58 + divideProbe = VMS__create_single_interval_probe( "divideProbe", 1.59 + animPr ); 1.60 + VMS__record_sched_choice_into_probe( divideProbe, animPr ); 1.61 + VMS__record_interval_start_in_probe( divideProbe ); 1.62 1.63 //=========== Setup -- make local copies of ptd-to-things, malloc, aso 1.64 + int32 numResRows, numResCols, vectLength; 1.65 1.66 dividerParams = (DividerParams *)_dividerParams; 1.67 1.68 leftMatrix = dividerParams->leftMatrix; 1.69 rightMatrix = dividerParams->rightMatrix; 1.70 1.71 + vectLength = leftMatrix->numCols; 1.72 + numResRows = leftMatrix->numRows; 1.73 + numResCols = rightMatrix->numCols; 1.74 + resultArray = dividerParams->resultMatrix->array; 1.75 1.76 //============== Do either sequential mult or do division ============== 1.77 1.78 //Check if input matrices too small -- if yes, just do sequential 1.79 - if( leftMatrix->numRows * leftMatrix->numCols * rightMatrix->numCols 1.80 - < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) //curoff is determined by overhead 1.81 - // of this divider -- relatively machine-independent 1.82 - { int32 vectLength, numResRows, numResCols; 1.83 + //Cutoff is determined by overhead of this divider -- relatively 1.84 + // machine-independent 1.85 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 1.86 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 1.87 + { 1.88 + //====== Do sequential multiply on a single core 1.89 + DEBUG("doing sequential") 1.90 1.91 - //====== Do sequential multiply on a single core 1.92 + //have to transpose the right matrix first 1.93 + float32 * 1.94 + transRightArray = SSR__malloc_to( rightMatrix->numRows * 1.95 + rightMatrix->numCols * 1.96 + sizeof(float32), animPr ); 1.97 1.98 - vectLength = leftMatrix->numCols; 1.99 - numResRows = leftMatrix->numRows; 1.100 - numResCols = rightMatrix->numCols; 1.101 - 1.102 - resultArray = malloc( numResRows * numResCols * sizeof(float32) ); 1.103 - 1.104 - multiplyMatrixArrays( 
vectLength, numResRows, numResCols, 1.105 + //copy values from orig matrix to local 1.106 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 1.107 + 0, 0, rightMatrix->numRows, 1.108 + transRightArray, rightMatrix->array ); 1.109 + 1.110 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 1.111 leftMatrix->array, rightMatrix->array, 1.112 resultArray ); 1.113 } 1.114 @@ -155,65 +178,62 @@ 1.115 //The ideal size is the one takes the number of cycles to calculate 1.116 // such that calc time is equal or greater than min work-unit size 1.117 slicingStrucCarrier = 1.118 - calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix ); 1.119 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); 1.120 1.121 //Make the results processor, now that know how many to wait for 1.122 - resultsParams = SSR__malloc_size_to(sizeof(ResultsParams),animatingPr); 1.123 - resultsParams->dividerPr = animatingPr; 1.124 + resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); 1.125 resultsParams->numSubMatrixPairs = 1.126 slicingStrucCarrier->leftRowSlices->numVals * 1.127 slicingStrucCarrier->rightColSlices->numVals * 1.128 slicingStrucCarrier->vecSlices->numVals; 1.129 - resultsParams->numCols = rightMatrix->numCols; 1.130 - resultsParams->numRows = leftMatrix->numRows; 1.131 + resultsParams->dividerPr = animPr; 1.132 + resultsParams->numCols = rightMatrix->numCols; 1.133 + resultsParams->numRows = leftMatrix->numRows; 1.134 + resultsParams->resultArray = resultArray; 1.135 + 1.136 1.137 resultPr = 1.138 - SSR__create_procr_with( &gatherResults, resultsParams, animatingPr); 1.139 + SSR__create_procr_with( &gatherResults, resultsParams, animPr); 1.140 1.141 //Make the sub-matrices, and pair them up, and make processor to 1.142 // calc product of each pair. 
1.143 makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 1.144 slicingStrucCarrier, 1.145 - resultPr, animatingPr); 1.146 + resultPr, animPr); 1.147 1.148 - //Get result from result procr 1.149 - msg = SSR__receive_from_to( resultPr, animatingPr ); 1.150 - resultArray = (float32 *) msg; 1.151 - } 1.152 + //result array is allocated externally, so no message from resultPr 1.153 + // however, do have to wait before printing out stats, so wait 1.154 + // for an empty handshake message 1.155 + msg = SSR__receive_from_to( resultPr, animPr ); 1.156 + } 1.157 1.158 1.159 //=============== Work done -- send results back ================= 1.160 1.161 1.162 - //prepare results to persist outside of SSR when return from entry pt 1.163 - //The results of the all the work have to be linked-to from the data 1.164 - // struc given to the seed procr -- this divide func is animated by 1.165 - // that seed procr, so have to link results to the _dividerParams. 1.166 - resultMatrix = SSR__malloc_size_to(sizeof(Matrix),animatingPr); 1.167 - resultMatrix->array = resultArray; 1.168 - resultMatrix->numCols = rightMatrix->numCols; 1.169 - resultMatrix->numRows = leftMatrix->numRows; 1.170 + DEBUG_MSG( dbgAppFlow, "end divide\n") 1.171 1.172 + VMS__record_interval_end_in_probe( divideProbe ); 1.173 + VMS__print_stats_of_all_probes(); 1.174 1.175 - dividerParams->resultMatrix = resultMatrix; 1.176 - SSR__transfer_ownership_to_outside( msg ); //so not freed 1.177 - SSR__transfer_ownership_to_outside( resultMatrix ); 1.178 + //nothing left to do so dissipate, SSR will wait to shutdown and hence 1.179 + // make results available to outside until all the processors have 1.180 + // dissipated -- so no need to wait for results processor 1.181 1.182 - PRINT_DEBUG("end divide\n") 1.183 - 1.184 - SSR__dissipate_procr( animatingPr ); //all procrs dissipate self at end 1.185 + SSR__dissipate_procr( animPr ); //all procrs dissipate self at end 1.186 //when all of the processors have dissipated, the 
"create seed and do 1.187 // work" call in the entry point function returns 1.188 } 1.189 1.190 1.191 SlicingStrucCarrier * 1.192 -calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ) 1.193 -{ 1.194 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 1.195 + VirtProcr *animPr ) 1.196 + { 1.197 float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 1.198 SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 1.199 SlicingStrucCarrier *slicingStrucCarrier = 1.200 - malloc(sizeof(SlicingStrucCarrier)); 1.201 + SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); 1.202 1.203 int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 1.204 float64 numPrimitiveOpsInMinWorkUnit; 1.205 @@ -226,7 +246,7 @@ 1.206 minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 1.207 1.208 //ask SSR for number of cycles of the "primitive" op of matrix mult 1.209 - primitiveCycles = measureMatrixMultPrimitive(); 1.210 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 1.211 1.212 numPrimitiveOpsInMinWorkUnit = 1.213 (float64)minWorkUnitCycles / (float64)primitiveCycles; 1.214 @@ -238,6 +258,7 @@ 1.215 idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 1.216 1.217 idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 1.218 + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 1.219 1.220 if( idealSizeOfSide1 > idealSizeOfSide2 ) 1.221 idealSizeOfSide = idealSizeOfSide1; 1.222 @@ -261,41 +282,47 @@ 1.223 endRightCol = rightMatrix->numCols -1; 1.224 1.225 leftRowSlices = 1.226 - sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow ); 1.227 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 1.228 1.229 vecSlices = 1.230 - sliceUpDimension( idealSizeOfSide, startVec, endVec ); 1.231 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 1.232 1.233 rightColSlices = 1.234 - sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol ); 1.235 + 
sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 1.236 1.237 slicingStrucCarrier->leftRowSlices = leftRowSlices; 1.238 slicingStrucCarrier->vecSlices = vecSlices; 1.239 slicingStrucCarrier->rightColSlices = rightColSlices; 1.240 1.241 return slicingStrucCarrier; 1.242 -} 1.243 + } 1.244 1.245 1.246 void 1.247 makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 1.248 SlicingStrucCarrier *slicingStrucCarrier, 1.249 - VirtProcr *resultPr, VirtProcr *animatingPr ) 1.250 + VirtProcr *resultPr, VirtProcr *animPr ) 1.251 { 1.252 SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 1.253 1.254 leftRowSlices = slicingStrucCarrier->leftRowSlices; 1.255 vecSlices = slicingStrucCarrier->vecSlices; 1.256 rightColSlices = slicingStrucCarrier->rightColSlices; 1.257 + SSR__free( slicingStrucCarrier, animPr ); 1.258 1.259 //================ Make sub-matrices, given the slicing ================ 1.260 SubMatrix **leftSubMatrices, **rightSubMatrices; 1.261 leftSubMatrices = 1.262 - createSubMatrices( leftRowSlices, vecSlices, 1.263 - leftMatrix ); 1.264 + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 1.265 + leftMatrix, animPr ); 1.266 + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 1.267 rightSubMatrices = 1.268 - createSubMatrices( vecSlices, rightColSlices, 1.269 - rightMatrix ); 1.270 + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 1.271 + rightMatrix, animPr ); 1.272 + 1.273 + freeSlicingStruc( leftRowSlices, animPr ); 1.274 + freeSlicingStruc( vecSlices, animPr ); 1.275 + freeSlicingStruc( rightColSlices, animPr ); 1.276 1.277 //============== pair the sub-matrices and make processors ============== 1.278 int32 numRowIdxs, numColIdxs, numVecIdxs; 1.279 @@ -308,7 +335,7 @@ 1.280 numRowIdxs, numColIdxs, 1.281 numVecIdxs, 1.282 resultPr, 1.283 - animatingPr ); 1.284 + animPr ); 1.285 } 1.286 1.287 1.288 @@ -326,21 +353,30 @@ 1.289 int32 numLeftColIdxs, numRightColIdxs; 
1.290 int32 leftRowIdxOffset; 1.291 SMPairParams *subMatrixPairParams; 1.292 + float32 numToPutOntoEachCore, leftOverFraction; 1.293 + int32 numCores, coreToScheduleOnto, numVecOnCurrCore; 1.294 1.295 numLeftColIdxs = numColIdxs; 1.296 numRightColIdxs = numVecIdxs; 1.297 1.298 + numCores = SSR__give_number_of_cores_to_schedule_onto(); 1.299 + 1.300 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 1.301 + leftOverFraction = 0; 1.302 + numVecOnCurrCore = 0; 1.303 + coreToScheduleOnto = 0; 1.304 + 1.305 for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 1.306 { 1.307 leftRowIdxOffset = resRowIdx * numLeftColIdxs; 1.308 1.309 for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 1.310 { 1.311 - 1.312 + 1.313 for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 1.314 { 1.315 //Make the processor for the pair of sub-matrices 1.316 - subMatrixPairParams = SSR__malloc_size_to(sizeof(SMPairParams), 1.317 + subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), 1.318 animatingPr); 1.319 subMatrixPairParams->leftSubMatrix = 1.320 leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 1.321 @@ -350,9 +386,36 @@ 1.322 1.323 subMatrixPairParams->resultPr = resultPr; 1.324 1.325 - SSR__create_procr_with( &calcSubMatrixProduct, 1.326 - subMatrixPairParams, 1.327 - animatingPr ); 1.328 + //put all pairs from the same vector onto same core 1.329 + SSR__create_procr_with_affinity( &calcSubMatrixProduct, 1.330 + subMatrixPairParams, 1.331 + animatingPr, 1.332 + coreToScheduleOnto ); 1.333 + } 1.334 + 1.335 + //Trying to distribute the subMatrix-vectors across the cores, so 1.336 + // that each core gets the same number of vectors, with a max 1.337 + // imbalance of 1 vector more on some cores than others 1.338 + numVecOnCurrCore += 1; 1.339 + if( numVecOnCurrCore + leftOverFraction >= numToPutOntoEachCore -1 ) 1.340 + { 1.341 + //deal with fractional part, to ensure that imbalance is 1 max 1.342 + // IE, core with most has only 1 more than core with least 1.343 + 
leftOverFraction += numToPutOntoEachCore - numVecOnCurrCore; 1.344 + if( leftOverFraction >= 1 ) 1.345 + { leftOverFraction -= 1; 1.346 + numVecOnCurrCore = -1; 1.347 + } 1.348 + else 1.349 + { numVecOnCurrCore = 0; 1.350 + } 1.351 + //Move to next core, max core-value to incr to is numCores -1 1.352 + if( coreToScheduleOnto >= numCores -1 ) 1.353 + { coreToScheduleOnto = 0; 1.354 + } 1.355 + else 1.356 + { coreToScheduleOnto += 1; 1.357 + } 1.358 } 1.359 } 1.360 } 1.361 @@ -365,7 +428,7 @@ 1.362 */ 1.363 SubMatrix ** 1.364 createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 1.365 - Matrix *origMatrix ) 1.366 + int32 numUses, Matrix *origMatrix, VirtProcr *animPr ) 1.367 { 1.368 int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 1.369 int32 startRow, endRow, startCol, endCol; 1.370 @@ -379,7 +442,8 @@ 1.371 rowStartVals = rowSlices->startVals; 1.372 colStartVals = colSlices->startVals; 1.373 1.374 - subMatrices = malloc( numRowIdxs * numColIdxs * sizeof(SubMatrix *) ); 1.375 + subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 1.376 + animPr ); 1.377 1.378 for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 1.379 { 1.380 @@ -394,13 +458,14 @@ 1.381 startCol = colStartVals[colIdx]; 1.382 endCol = colStartVals[colIdx + 1] -1; 1.383 1.384 - newSubMatrix = malloc( sizeof(SubMatrix) ); 1.385 + newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); 1.386 newSubMatrix->numRows = endRow - startRow +1; 1.387 newSubMatrix->numCols = endCol - startCol +1; 1.388 newSubMatrix->origMatrix = origMatrix; 1.389 newSubMatrix->origStartRow = startRow; 1.390 newSubMatrix->origStartCol = startCol; 1.391 newSubMatrix->alreadyCopied = FALSE; 1.392 + newSubMatrix->numUsesLeft = numUses; //can free after this many 1.393 1.394 subMatrices[ rowOffset + colIdx ] = newSubMatrix; 1.395 } 1.396 @@ -409,18 +474,43 @@ 1.397 } 1.398 1.399 1.400 +void 1.401 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 1.402 + SubMatrix **subMatrices, 
VirtProcr *animPr ) 1.403 + { 1.404 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 1.405 + SubMatrix *subMatrix; 1.406 + 1.407 + numRowIdxs = rowSlices->numVals; 1.408 + numColIdxs = colSlices->numVals; 1.409 + 1.410 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 1.411 + { 1.412 + rowOffset = rowIdx * numColIdxs; 1.413 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 1.414 + { 1.415 + subMatrix = subMatrices[ rowOffset + colIdx ]; 1.416 + if( subMatrix->alreadyCopied ) 1.417 + SSR__free( subMatrix->array, animPr ); 1.418 + SSR__free( subMatrix, animPr ); 1.419 + } 1.420 + } 1.421 + SSR__free( subMatrices, animPr ); 1.422 + } 1.423 + 1.424 1.425 1.426 SlicingStruc * 1.427 -sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ) 1.428 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 1.429 + VirtProcr *animPr ) 1.430 { float32 residualAcc = 0; 1.431 int numSlices, i, *startVals, sizeOfSlice, endCondition; 1.432 - SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) ); 1.433 + SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); 1.434 1.435 //calc size of matrix need to hold start vals -- 1.436 numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 1.437 1.438 - startVals = malloc( (numSlices + 1) * sizeof(int32) ); 1.439 + startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); 1.440 1.441 //Calc the upper limit of start value -- when get above this, end loop 1.442 // by saving highest value of the matrix dimension to access, plus 1 1.443 @@ -451,17 +541,24 @@ 1.444 return slicingStruc; 1.445 } 1.446 1.447 +void 1.448 +freeSlicingStruc( SlicingStruc *slicingStruc, VirtProcr *animPr ) 1.449 + { 1.450 + SSR__free( slicingStruc->startVals, animPr ); 1.451 + SSR__free( slicingStruc, animPr ); 1.452 + } 1.453 + 1.454 1.455 int inline 1.456 -measureMatrixMultPrimitive() 1.457 +measureMatrixMultPrimitive( VirtProcr *animPr ) 1.458 { 1.459 int r, c, v, numCycles; 
1.460 float32 *res, *left, *right; 1.461 1.462 //setup inputs 1.463 - left = malloc( 5 * 5 * sizeof( float32 ) ); 1.464 - right = malloc( 5 * 5 * sizeof( float32 ) ); 1.465 - res = malloc( 5 * 5 * sizeof( float32 ) ); 1.466 + left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 1.467 + right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 1.468 + res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 1.469 1.470 for( r = 0; r < 5; r++ ) 1.471 { 1.472 @@ -485,8 +582,11 @@ 1.473 } 1.474 } 1.475 numCycles = 1.476 - SSR__end_primitive_and_give_cycles(); 1.477 + SSR__end_primitive_and_give_cycles(); 1.478 + 1.479 + SSR__free( left, animPr ); 1.480 + SSR__free( right, animPr ); 1.481 + SSR__free( res, animPr ); 1.482 1.483 return numCycles; 1.484 } 1.485 -
2.1 --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c Thu Oct 14 17:10:17 2010 -0700 2.2 +++ b/src/Application/SSR_Matrix_Mult/EntryPoint.c Tue Nov 02 17:00:50 2010 -0700 2.3 @@ -30,6 +30,7 @@ 2.4 multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 2.5 { Matrix *resMatrix; 2.6 DividerParams *dividerParams; 2.7 + int32 numResRows, numResCols; 2.8 2.9 2.10 dividerParams = malloc( sizeof( DividerParams ) ); 2.11 @@ -37,13 +38,25 @@ 2.12 dividerParams->rightMatrix = rightMatrix; 2.13 2.14 2.15 + numResRows = leftMatrix->numRows; 2.16 + numResCols = rightMatrix->numCols; 2.17 + 2.18 + //VMS has its own separate internal malloc, so to get results out, 2.19 + // have to pass in empty array for it to fill up 2.20 + //The alternative is internally telling SSR make external space to use 2.21 + resMatrix = malloc( sizeof(Matrix) ); 2.22 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 2.23 + resMatrix->numCols = rightMatrix->numCols; 2.24 + resMatrix->numRows = leftMatrix->numRows; 2.25 + 2.26 + 2.27 + dividerParams->resultMatrix = resMatrix; 2.28 + 2.29 //create divider processor, start doing the work, and wait till done 2.30 //This function is the "border crossing" between normal code and SSR 2.31 SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs, 2.32 dividerParams ); 2.33 2.34 - //get result matrix and return it 2.35 - resMatrix = dividerParams->resultMatrix; 2.36 free( dividerParams ); 2.37 return resMatrix; 2.38 }
3.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c Thu Oct 14 17:10:17 2010 -0700 3.2 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c Tue Nov 02 17:00:50 2010 -0700 3.3 @@ -8,6 +8,7 @@ 3.4 3.5 #include "SSR_Matrix_Mult.h" 3.6 3.7 +//===================== 3.8 void inline 3.9 accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 3.10 int32 startRow, 3.11 @@ -16,6 +17,7 @@ 3.12 int32 numCols, 3.13 int32 numOrigCols ); 3.14 3.15 +//=========================================================================== 3.16 3.17 /*The Result Processor gets a message from each of the vector processors, 3.18 * puts the result from the message in its location in the result- 3.19 @@ -32,7 +34,7 @@ 3.20 void *msg; 3.21 SMPairParams *resParams; 3.22 3.23 - PRINT_DEBUG("start resultPr\n") 3.24 + DEBUG("start resultPr\n") 3.25 3.26 params = (ResultsParams *)_params; 3.27 dividerPr = params->dividerPr; 3.28 @@ -40,8 +42,7 @@ 3.29 numRows = params->numRows; 3.30 numCols = params->numCols; 3.31 3.32 - resultArray = SSR__malloc_size_to( numRows * numCols * sizeof(float32), 3.33 - animatingPr ); 3.34 + resultArray = params->resultArray; 3.35 3.36 //zero out the results array -- will be accumulating, so must start 0 3.37 for( row = 0; row < numRows; row++ ) 3.38 @@ -57,24 +58,45 @@ 3.39 msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); 3.40 3.41 resParams = (SMPairParams *)msg; 3.42 - accumulateResult( resultArray, resParams->resultArray, 3.43 + accumulateResult( resultArray, resParams->partialResultArray, 3.44 resParams->leftSubMatrix->origStartRow, 3.45 resParams->leftSubMatrix->numRows, 3.46 resParams->rightSubMatrix->origStartCol, 3.47 resParams->rightSubMatrix->numCols, 3.48 resParams->rightSubMatrix->origMatrix->numCols ); 3.49 + 3.50 + SSR__free( resParams->partialResultArray, animatingPr ); 3.51 + 3.52 + //there is only one copy of results procr, so can update numUsesLeft 3.53 + // without concurrency worries. 
When zero, free the sub-matrix 3.54 + resParams->leftSubMatrix->numUsesLeft -= 1; 3.55 + if( resParams->leftSubMatrix->numUsesLeft == 0 ) 3.56 + { 3.57 + SSR__free( resParams->leftSubMatrix->array, animatingPr ); 3.58 + SSR__free( resParams->leftSubMatrix, animatingPr ); 3.59 + } 3.60 + 3.61 + resParams->rightSubMatrix->numUsesLeft -= 1; 3.62 + if( resParams->rightSubMatrix->numUsesLeft == 0 ) 3.63 + { 3.64 + SSR__free( resParams->rightSubMatrix->array, animatingPr ); 3.65 + SSR__free( resParams->rightSubMatrix, animatingPr ); 3.66 + } 3.67 + 3.68 + //count of how many sub-matrix pairs accumulated so know when done 3.69 count++; 3.70 } 3.71 - //if were real lang, would have auto-nested transfer -- but HelloWorld 3.72 - // language, so have to transfer ownership of each allocated block of 3.73 - // locations separately 3.74 - SSR__transfer_ownership_of_from_to( resultArray, animatingPr, dividerPr ); 3.75 - SSR__send_from_to( resultArray, animatingPr, dividerPr ); 3.76 + 3.77 + //Done -- could just dissipate -- SSR will wait for all processors to 3.78 + // dissipate before shutting down, and thereby making results avaial to 3.79 + // outside, so no need to stop the divider from dissipating, so no need 3.80 + // to send a hand-shake message to it -- bug makes debug easier 3.81 + SSR__send_from_to( NULL, animatingPr, dividerPr ); 3.82 SSR__dissipate_procr( animatingPr ); //frees any data owned by procr 3.83 } 3.84 3.85 void inline 3.86 -accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 3.87 +accumulateResult( float32 *resultArray, float32 *subMatrixPairResultArray, 3.88 int32 startRow, 3.89 int32 numRows, 3.90 int32 startCol, 3.91 @@ -86,8 +108,8 @@ 3.92 { 3.93 for( col = 0; col < numCols; col++ ) 3.94 { 3.95 - resultArray[ (row + startRow) * numOrigCols + col + startCol ] += 3.96 - subMatrixResultArray[ row * numCols + col ]; 3.97 + resultArray[ (row + startRow) * numOrigCols + (col + startCol) ] += 3.98 + subMatrixPairResultArray[ row * numCols + 
col ]; 3.99 } 3.100 } 3.101
4.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Thu Oct 14 17:10:17 2010 -0700 4.2 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Tue Nov 02 17:00:50 2010 -0700 4.3 @@ -17,8 +17,10 @@ 4.4 #define COLS_IN_BLOCK 32 4.5 #define VEC_IN_BLOCK 32 4.6 4.7 +#define copyMatrixSingleton 1 4.8 +#define copyTransposeSingleton 2 4.9 4.10 -#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin); 4.11 +#define DEBUG(msg) //printf(msg); fflush(stdin); 4.12 4.13 //============================== Structures ============================== 4.14 typedef struct 4.15 @@ -35,6 +37,7 @@ 4.16 int numRows; 4.17 int numCols; 4.18 int numSubMatrixPairs; 4.19 + float32 *resultArray; 4.20 } 4.21 ResultsParams; 4.22 4.23 @@ -46,6 +49,7 @@ 4.24 int32 origStartRow; 4.25 int32 origStartCol; 4.26 int32 alreadyCopied; 4.27 + int32 numUsesLeft; //have update via message to avoid multiple writers 4.28 float32 *array; //2D, but dynamically sized, so use addr arith 4.29 } 4.30 SubMatrix; 4.31 @@ -54,7 +58,7 @@ 4.32 { VirtProcr *resultPr; 4.33 SubMatrix *leftSubMatrix; 4.34 SubMatrix *rightSubMatrix; 4.35 - float32 *resultArray; 4.36 + float32 *partialResultArray; 4.37 } 4.38 SMPairParams; 4.39
5.1 --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Thu Oct 14 17:10:17 2010 -0700 5.2 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Tue Nov 02 17:00:50 2010 -0700 5.3 @@ -10,10 +10,10 @@ 5.4 5.5 5.6 void inline 5.7 -copyFromOrig( SubMatrix *subMatrix ); 5.8 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 5.9 5.10 void inline 5.11 -copyTransposeFromOrig( SubMatrix *subMatrix ); 5.12 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ); 5.13 5.14 void inline 5.15 multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 5.16 @@ -24,7 +24,7 @@ 5.17 int resStride, int inpStride ); 5.18 5.19 void inline 5.20 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 5.21 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, int32 numResCols, 5.22 float32 *leftArray, float32 *rightArray, 5.23 float32 *resArray ); 5.24 5.25 @@ -48,7 +48,7 @@ 5.26 float32 *leftArray, *rightArray, *resArray; 5.27 SubMatrix *leftSubMatrix, *rightSubMatrix; 5.28 5.29 - PRINT_DEBUG("start sub-matrix mult\n") 5.30 + DEBUG("start sub-matrix mult\n") 5.31 5.32 params = (SMPairParams *)data; 5.33 resultPr = params->resultPr; 5.34 @@ -56,14 +56,15 @@ 5.35 rightSubMatrix = params->rightSubMatrix; 5.36 5.37 //make sure the input sub-matrices have been copied out of orig 5.38 - copyFromOrig( leftSubMatrix ); 5.39 - copyTransposeFromOrig( rightSubMatrix ); 5.40 + //do it here, inside sub-matrix pair to hopefully gain reuse in cache 5.41 + copyFromOrig( leftSubMatrix, animatingPr ); 5.42 + copyTransposeFromOrig( rightSubMatrix, animatingPr ); 5.43 5.44 leftArray = leftSubMatrix->array; 5.45 rightArray = rightSubMatrix->array; 5.46 5.47 - resArray = malloc( leftSubMatrix->numRows * rightSubMatrix->numCols * 5.48 - sizeof( float32 ) ); 5.49 + resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols 5.50 + * sizeof( float32 ), animatingPr ); 5.51 5.52 5.53 int32 numResRows, numResCols, vectLength; 5.54 @@ 
-72,12 +73,12 @@ 5.55 numResRows = leftSubMatrix->numRows; 5.56 numResCols = rightSubMatrix->numCols; 5.57 5.58 - multiplyMatrixArrays( vectLength, numResRows, numResCols, 5.59 - leftArray, rightArray, 5.60 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 5.61 + leftArray, rightArray, 5.62 resArray ); 5.63 5.64 //send result to result processor 5.65 - params->resultArray = resArray; 5.66 + params->partialResultArray = resArray; 5.67 SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 5.68 SSR__dissipate_procr( animatingPr ); 5.69 } 5.70 @@ -95,7 +96,8 @@ 5.71 * 5.72 */ 5.73 void inline 5.74 -multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 5.75 +multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 5.76 + int32 numResCols, 5.77 float32 *leftArray, float32 *rightArray, 5.78 float32 *resArray ) 5.79 { 5.80 @@ -172,29 +174,15 @@ 5.81 } 5.82 } 5.83 5.84 + 5.85 +/*Reuse this in divider when do the sequential multiply case 5.86 + */ 5.87 void inline 5.88 -copyTransposeFromOrig( SubMatrix *subMatrix ) 5.89 - { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 5.90 - Matrix *origMatrix; 5.91 - float32 *origArray, *subArray; 5.92 - 5.93 - if( subMatrix->alreadyCopied ) return; 5.94 - 5.95 - subMatrix->alreadyCopied = TRUE; 5.96 - 5.97 - origMatrix = subMatrix->origMatrix; 5.98 - origArray = origMatrix->array; 5.99 - numCols = subMatrix->numCols; 5.100 - numRows = subMatrix->numRows; 5.101 - stride = numRows; 5.102 - origStartRow = subMatrix->origStartRow; 5.103 - origStartCol = subMatrix->origStartCol; 5.104 - origStride = origMatrix->numCols; 5.105 - 5.106 - subArray = malloc( numRows * numCols * sizeof(float32) ); 5.107 - subMatrix->array = subArray; 5.108 - 5.109 - //copy values from orig matrix to local 5.110 +copyTranspose( int32 numRows, int32 numCols, 5.111 + int32 origStartRow, int32 origStartCol, int32 origStride, 5.112 + float32 *subArray, float32 *origArray ) 5.113 + { 
int32 stride = numRows; 5.114 + 5.115 int row, col, origOffset; 5.116 for( row = 0; row < numRows; row++ ) 5.117 { 5.118 @@ -203,21 +191,60 @@ 5.119 { 5.120 //transpose means swap row & col -- traverse orig matrix normally 5.121 // but put into reversed place in local array -- means the 5.122 - // stride is the num rows now, so col * numRows + row 5.123 + // stride is the numRows now, so col * numRows + row 5.124 subArray[ col * stride + row ] = origArray[ origOffset + col ]; 5.125 - } 5.126 + } 5.127 } 5.128 } 5.129 5.130 void inline 5.131 -copyFromOrig( SubMatrix *subMatrix ) 5.132 +copyTransposeFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 5.133 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 5.134 + Matrix *origMatrix; 5.135 + float32 *origArray, *subArray; 5.136 + 5.137 + if( subMatrix->alreadyCopied ) return; 5.138 + SSR__start_singleton( copyMatrixSingleton, &&EndOfTransSingleton, animPr); 5.139 + 5.140 + origMatrix = subMatrix->origMatrix; 5.141 + origArray = origMatrix->array; 5.142 + numCols = subMatrix->numCols; 5.143 + numRows = subMatrix->numRows; 5.144 + origStartRow = subMatrix->origStartRow; 5.145 + origStartCol = subMatrix->origStartCol; 5.146 + origStride = origMatrix->numCols; 5.147 + 5.148 + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 5.149 + subMatrix->array = subArray; 5.150 + 5.151 + //copy values from orig matrix to local 5.152 + copyTranspose( numRows, numCols, 5.153 + origStartRow, origStartCol, origStride, 5.154 + subArray, origArray ); 5.155 + 5.156 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 5.157 + EndOfTransSingleton: 5.158 + return; 5.159 + } 5.160 + 5.161 + 5.162 +void inline 5.163 +copyFromOrig( SubMatrix *subMatrix, VirtProcr *animPr ) 5.164 { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 5.165 Matrix *origMatrix; 5.166 float32 *origArray, *subArray; 5.167 5.168 + 5.169 + //This lets only a single VP execute the code between 
start and 5.170 + // end -- using start and end so that work runs outside the master. 5.171 + //Inside, if a second VP ever executes the start, it will be returned 5.172 + // from the end-point. 5.173 + //Note, for non-GCC, can add a second SSR call at the end, and inside 5.174 + // that one, look at the stack at the return addr & save that in an 5.175 + // array indexed by singletonID 5.176 if( subMatrix->alreadyCopied ) return; 5.177 + SSR__start_singleton( copyMatrixSingleton, &&EndOfCopySingleton, animPr ); 5.178 5.179 - subMatrix->alreadyCopied = TRUE; 5.180 5.181 origMatrix = subMatrix->origMatrix; 5.182 origArray = origMatrix->array; 5.183 @@ -225,13 +252,14 @@ 5.184 numRows = subMatrix->numRows; 5.185 origStartRow = subMatrix->origStartRow; 5.186 origStartCol = subMatrix->origStartCol; 5.187 - stride = numCols; 5.188 origStride = origMatrix->numCols; 5.189 5.190 - subArray = malloc( numRows * numCols * sizeof(float32) ); 5.191 + subArray = SSR__malloc_to( numRows * numCols *sizeof(float32),animPr); 5.192 subMatrix->array = subArray; 5.193 5.194 //copy values from orig matrix to local 5.195 + stride = numCols; 5.196 + 5.197 int row, col, offset, origOffset; 5.198 for( row = 0; row < numRows; row++ ) 5.199 { 5.200 @@ -242,4 +270,8 @@ 5.201 subArray[ offset + col ] = origArray[ origOffset + col ]; 5.202 } 5.203 } 5.204 + 5.205 + subMatrix->alreadyCopied = TRUE; //must be last thing before label 5.206 + EndOfCopySingleton: 5.207 + return; 5.208 }
