Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench
changeset 1:47802166a7ae
Either working correctly, or close..
| author | Me |
|---|---|
| date | Thu, 14 Oct 2010 17:09:22 -0700 |
| parents | b8b71da62a09 |
| children | f33a9cba5d89 |
| files | src/Application/Matrix_Mult.c src/Application/Matrix_Mult.h src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/EntryPoint.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/Vector_Pr.c src/Application/SSR_Matrix_Mult/subMatrix_Pr.c |
| diffstat | 8 files changed, 808 insertions(+), 122 deletions(-) [+] |
line diff
1.1 --- a/src/Application/Matrix_Mult.c Tue Oct 05 10:00:11 2010 -0700 1.2 +++ b/src/Application/Matrix_Mult.c Thu Oct 14 17:09:22 2010 -0700 1.3 @@ -61,7 +61,7 @@ 1.4 1.5 numRows = matrixStruc->numRows; 1.6 numCols = matrixStruc->numCols; 1.7 - matrixStart = matrixStruc->matrix; 1.8 + matrixStart = matrixStruc->array; 1.9 1.10 file = fopen( matrixFileName, "r" ); 1.11 if( file == NULL ) { printf( "\nCouldn't open file!!\n"); exit(1);} 1.12 @@ -131,7 +131,7 @@ 1.13 retMatrix = malloc( sizeof( Matrix ) ); 1.14 retMatrix->numRows = numRows; 1.15 retMatrix->numCols = numCols; 1.16 - retMatrix->matrix = malloc( numRows * numCols * sizeof(float32) ); 1.17 + retMatrix->array = malloc( numRows * numCols * sizeof(float32) ); 1.18 1.19 return retMatrix; 1.20 } 1.21 @@ -142,22 +142,24 @@ 1.22 } 1.23 void 1.24 freeMatrix( Matrix * matrix ) 1.25 - { free( matrix->matrix ); 1.26 + { free( matrix->array ); 1.27 free( matrix ); 1.28 } 1.29 1.30 void 1.31 printMatrix( Matrix *matrix ) 1.32 - { int r, c, numRows, numCols; 1.33 + { int r, c, numRows, numCols, rowsToPrint, colsToPrint, rowIncr, colIncr; 1.34 float32 *matrixArray; 1.35 1.36 - numRows = matrix->numRows; 1.37 - numCols = matrix->numCols; 1.38 - matrixArray = matrix->matrix; 1.39 + numRows = rowsToPrint = matrix->numRows; 1.40 + numCols = colsToPrint = matrix->numCols; 1.41 + matrixArray = matrix->array; 1.42 1.43 - for( r = 0; r < numRows; r++ ) 1.44 - { for( c = 0; c < numCols; c++ ) 1.45 - { printf( "%f | ", *(matrixArray + r*numCols + c) ); 1.46 + rowIncr = numRows/20; if(rowIncr == 0) rowIncr = 1;//20 to 39 rows printed 1.47 + colIncr = numCols/20; if(colIncr == 0) colIncr = 1;//20 to 39 cols printed 1.48 + for( r = 0; r < numRows; r += rowIncr ) 1.49 + { for( c = 0; c < numCols; c += colIncr ) 1.50 + { printf( "%3.1f | ", matrixArray[ r * numCols + c ] ); 1.51 } 1.52 printf("\n"); 1.53 }
2.1 --- a/src/Application/Matrix_Mult.h Tue Oct 05 10:00:11 2010 -0700 2.2 +++ b/src/Application/Matrix_Mult.h Thu Oct 14 17:09:22 2010 -0700 2.3 @@ -19,7 +19,7 @@ 2.4 struct 2.5 { int32 numRows; 2.6 int32 numCols; 2.7 - float32 *matrix; //2D, but dynamically sized, so use addr arith 2.8 + float32 *array; //2D, but dynamically sized, so use addr arith 2.9 } 2.10 Matrix; 2.11
3.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c Tue Oct 05 10:00:11 2010 -0700 3.2 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c Thu Oct 14 17:09:22 2010 -0700 3.3 @@ -8,78 +8,485 @@ 3.4 3.5 3.6 #include "SSR_Matrix_Mult.h" 3.7 +#include <math.h> 3.8 3.9 -/*Divider creates one processor for every row-col pair. 3.10 + //The time to compute this many result values should equal the time to 3.11 + // perform this division on a matrix of size gives that many result calcs 3.12 + //IE, size this so that sequential time to calc equals divide time 3.13 + // find the value by experimenting -- but divide time and calc time scale 3.14 + // same way, so this value should remain valid across hardware 3.15 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 3.16 + 3.17 + 3.18 +int 3.19 +measureMatrixMultPrimitive(); 3.20 + 3.21 + 3.22 +SlicingStrucCarrier * 3.23 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ); 3.24 + 3.25 +SlicingStruc * 3.26 +sliceUpDimension( float32 idealSizeOfPiece, int startVal, int endVal ); 3.27 + 3.28 +SubMatrix ** 3.29 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.30 + Matrix *origMatrix ); 3.31 + 3.32 + 3.33 +void 3.34 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 3.35 + SubMatrix **rightSubMatrices, 3.36 + int32 numRowIdxs, int32 numColIdxs, 3.37 + int32 numVecIdxs, 3.38 + VirtProcr *resultPr, 3.39 + VirtProcr *animatingPr ); 3.40 + 3.41 +void 3.42 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 3.43 + SlicingStrucCarrier *slicingStrucCarrier, 3.44 + VirtProcr *resultPr, VirtProcr *animatingPr ); 3.45 + 3.46 + 3.47 + 3.48 +/*Divider creates one processor for every sub-matrix 3.49 * It hands them: 3.50 * the name of the result processor that they should send their results to, 3.51 - * the left and right matrices, and the row and col they should multiply 3.52 - * the length of the vector 3.53 - * It first creates the result processor, then all the vector 
processors, 3.54 + * the left and right matrices, and the rows and cols they should multiply 3.55 + * It first creates the result processor, then all the sub-matrixPair 3.56 + * processors, 3.57 * then does a receive of a message from the result processor that gives 3.58 * the divider ownership of the result matrix. 3.59 * Finally, the divider returns the result matrix out of the SSR system. 3.60 + * 3.61 + * Divider chooses the size of sub-matrices via an algorithm that tries to 3.62 + * keep the minimum work above a threshold. The threshold is machine- 3.63 + * dependent, so ask SSR for min work-unit time to get a 3.64 + * given overhead 3.65 + * 3.66 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 3.67 + * product -- gives the number of products need to have in min size 3.68 + * matrix. 3.69 + * 3.70 + * So then, take cubed root of this to get the size of a side of min sub- 3.71 + * matrix. That is the size of the ideal square sub-matrix -- so tile 3.72 + * up the two input matrices into ones as close as possible to that size, 3.73 + * and create the pairs of sub-matrices. 3.74 + * 3.75 + *======================== STRATEGIC OVERVIEW ======================= 3.76 + * 3.77 + *This division is a bit tricky, because have to create things in advance 3.78 + * that it's not at first obvious need to be created.. 3.79 + * 3.80 + *First slice up each dimension -- three of them.. this is because will have 3.81 + * to create the sub-matrix's data-structures before pairing the sub-matrices 3.82 + * with each other -- so, have three dimensions to slice up before can 3.83 + * create the sub-matrix data-strucs -- also, have to be certain that the 3.84 + * cols of the left input have the exact same slicing as the rows of the 3.85 + * left matrix, so just to be sure, do the slicing calc once, then use it 3.86 + * for both. 3.87 + * 3.88 + *So, goes like this: 3.89 + *1) calculate the start & end values of each dimension in each matrix. 
3.90 + *2) use those values to create sub-matrix structures 3.91 + *3) combine sub-matrices into pairs, as the tasks to perform. 3.92 + * 3.93 + *Have to calculate separately from creating the sub-matrices because of the 3.94 + * nature of the nesting -- would either end up creating the same sub-matrix 3.95 + * multiple times, or else would have to put in detection of whether had 3.96 + * made a particular one already if tried to combine steps 1 and 2. 3.97 + * 3.98 + *Step 3 has to be separate because of the nesting, as well -- same reason, 3.99 + * would either create same sub-matrix multiple times, or else have to 3.100 + * add detection of whether was already created. 3.101 + * 3.102 + *Another way to look at it: there's one level of loop to divide dimensions, 3.103 + * two levels of nesting to create sub-matrices, and three levels to pair 3.104 + * up the sub-matrices. 3.105 */ 3.106 -void divideIntoVectors( void *_dividerParams, VirtProcr *animatingPr ) 3.107 + 3.108 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 3.109 + VirtProcr *animatingPr ) 3.110 { VirtProcr *resultPr; 3.111 DividerParams *dividerParams; 3.112 ResultsParams *resultsParams; 3.113 - VectorParams *vectParams; 3.114 Matrix *leftMatrix, *rightMatrix, *resultMatrix; 3.115 void *msg; 3.116 + SlicingStrucCarrier *slicingStrucCarrier; 3.117 + float32 *resultArray; //points to array to be put inside result 3.118 + // matrix 3.119 + 3.120 + PRINT_DEBUG("start divide\n") 3.121 3.122 -// printf("start divide\n"); fflush(stdin); 3.123 - 3.124 + 3.125 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 3.126 + 3.127 dividerParams = (DividerParams *)_dividerParams; 3.128 3.129 leftMatrix = dividerParams->leftMatrix; 3.130 rightMatrix = dividerParams->rightMatrix; 3.131 3.132 - resultsParams = SSR__malloc_size_to( sizeof(ResultsParams), 3.133 - animatingPr ); 3.134 - resultsParams->dividerPr = animatingPr; 3.135 - resultsParams->numCols = rightMatrix->numCols; 3.136 - 
resultsParams->numRows = leftMatrix->numRows; 3.137 - 3.138 - resultPr = SSR__create_procr_with(&gatherResults, resultsParams, 3.139 - animatingPr); 3.140 3.141 - int row, col; 3.142 - for( row = 0; row < leftMatrix->numRows; row++ ) 3.143 - { for( col = 0; col < rightMatrix->numCols; col++ ) 3.144 - { 3.145 - vectParams = SSR__malloc_size_to(sizeof(VectorParams), 3.146 - animatingPr); 3.147 - vectParams->resultPr = resultPr; 3.148 - vectParams->myCol = col; 3.149 - vectParams->myRow = row; 3.150 - vectParams->vectLength = leftMatrix->numCols; 3.151 - vectParams->leftMatrix = leftMatrix; 3.152 - vectParams->rightMatrix = rightMatrix; 3.153 - 3.154 - SSR__create_procr_with( &calcVector, vectParams, animatingPr ); 3.155 - //vectParams ownership transferred to the newly created processor 3.156 - } 3.157 + //============== Do either sequential mult or do division ============== 3.158 + 3.159 + //Check if input matrices too small -- if yes, just do sequential 3.160 + if( leftMatrix->numRows * leftMatrix->numCols * rightMatrix->numCols 3.161 + < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) //curoff is determined by overhead 3.162 + // of this divider -- relatively machine-independent 3.163 + { int32 vectLength, numResRows, numResCols; 3.164 + 3.165 + //====== Do sequential multiply on a single core 3.166 + 3.167 + vectLength = leftMatrix->numCols; 3.168 + numResRows = leftMatrix->numRows; 3.169 + numResCols = rightMatrix->numCols; 3.170 + 3.171 + resultArray = malloc( numResRows * numResCols * sizeof(float32) ); 3.172 + 3.173 + multiplyMatrixArrays( vectLength, numResRows, numResCols, 3.174 + leftMatrix->array, rightMatrix->array, 3.175 + resultArray ); 3.176 + } 3.177 + else 3.178 + { 3.179 + //====== Do parallel multiply across cores 3.180 + 3.181 + //Calc the ideal size of sub-matrix and slice up the dimensions of 3.182 + // the two matrices. 
3.183 + //The ideal size is the one takes the number of cycles to calculate 3.184 + // such that calc time is equal or greater than min work-unit size 3.185 + slicingStrucCarrier = 3.186 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix ); 3.187 + 3.188 + //Make the results processor, now that know how many to wait for 3.189 + resultsParams = SSR__malloc_size_to(sizeof(ResultsParams),animatingPr); 3.190 + resultsParams->dividerPr = animatingPr; 3.191 + resultsParams->numSubMatrixPairs = 3.192 + slicingStrucCarrier->leftRowSlices->numVals * 3.193 + slicingStrucCarrier->rightColSlices->numVals * 3.194 + slicingStrucCarrier->vecSlices->numVals; 3.195 + resultsParams->numCols = rightMatrix->numCols; 3.196 + resultsParams->numRows = leftMatrix->numRows; 3.197 + 3.198 + resultPr = 3.199 + SSR__create_procr_with( &gatherResults, resultsParams, animatingPr); 3.200 + 3.201 + //Make the sub-matrices, and pair them up, and make processor to 3.202 + // calc product of each pair. 3.203 + makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 3.204 + slicingStrucCarrier, 3.205 + resultPr, animatingPr); 3.206 + 3.207 + //Get result from result procr 3.208 + msg = SSR__receive_from_to( resultPr, animatingPr ); 3.209 + resultArray = (float32 *) msg; 3.210 } 3.211 3.212 - //Get result from result procr 3.213 - msg = SSR__receive_from_to( resultPr, animatingPr ); 3.214 + 3.215 + //=============== Work done -- send results back ================= 3.216 + 3.217 3.218 //prepare results to persist outside of SSR when return from entry pt 3.219 //The results of the all the work have to be linked-to from the data 3.220 // struc given to the seed procr -- this divide func is animated by 3.221 // that seed procr, so have to link results to the _dividerParams. 
3.222 - resultMatrix = SSR__malloc_size_to( sizeof(Matrix), 3.223 - animatingPr ); 3.224 + resultMatrix = SSR__malloc_size_to(sizeof(Matrix),animatingPr); 3.225 + resultMatrix->array = resultArray; 3.226 resultMatrix->numCols = rightMatrix->numCols; 3.227 resultMatrix->numRows = leftMatrix->numRows; 3.228 + 3.229 + 3.230 dividerParams->resultMatrix = resultMatrix; 3.231 - resultMatrix->matrix = (float32 *) msg; 3.232 SSR__transfer_ownership_to_outside( msg ); //so not freed 3.233 SSR__transfer_ownership_to_outside( resultMatrix ); 3.234 3.235 - //printf("end divide\n"); fflush(stdin); 3.236 + PRINT_DEBUG("end divide\n") 3.237 3.238 SSR__dissipate_procr( animatingPr ); //all procrs dissipate self at end 3.239 //when all of the processors have dissipated, the "create seed and do 3.240 // work" call in the entry point function returns 3.241 } 3.242 + 3.243 + 3.244 +SlicingStrucCarrier * 3.245 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix ) 3.246 +{ 3.247 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 3.248 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 3.249 + SlicingStrucCarrier *slicingStrucCarrier = 3.250 + malloc(sizeof(SlicingStrucCarrier)); 3.251 + 3.252 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 3.253 + float64 numPrimitiveOpsInMinWorkUnit; 3.254 + 3.255 + 3.256 + //======= Calc ideal size of min-sized sub-matrix ======== 3.257 + 3.258 + //ask SSR for the number of cycles of the minimum work unit, at given 3.259 + // percent overhead then add a guess at overhead from this divider 3.260 + minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 3.261 + 3.262 + //ask SSR for number of cycles of the "primitive" op of matrix mult 3.263 + primitiveCycles = measureMatrixMultPrimitive(); 3.264 + 3.265 + numPrimitiveOpsInMinWorkUnit = 3.266 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 3.267 + 3.268 + //take cubed root -- that's number of these in a "side" of sub-matrix 3.269 + // then 
multiply by 5 because the primitive is 5x5 3.270 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 3.271 + 3.272 + idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 3.273 + 3.274 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 3.275 + 3.276 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 3.277 + idealSizeOfSide = idealSizeOfSide1; 3.278 + else 3.279 + idealSizeOfSide = idealSizeOfSide2; 3.280 + 3.281 + //The multiply inner loop blocks the array to fit into L1 cache 3.282 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 3.283 + 3.284 + //============ Slice up dimensions, now that know target size =========== 3.285 + 3.286 + //Tell the slicer the target size of a side (floating pt), the start 3.287 + // value to start slicing at, and the end value to stop slicing at 3.288 + //It returns an array of start value of each chunk, plus number of them 3.289 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 3.290 + startLeftRow = 0; 3.291 + endLeftRow = leftMatrix->numRows -1; 3.292 + startVec = 0; 3.293 + endVec = leftMatrix->numCols -1; 3.294 + startRightCol = 0; 3.295 + endRightCol = rightMatrix->numCols -1; 3.296 + 3.297 + leftRowSlices = 3.298 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow ); 3.299 + 3.300 + vecSlices = 3.301 + sliceUpDimension( idealSizeOfSide, startVec, endVec ); 3.302 + 3.303 + rightColSlices = 3.304 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol ); 3.305 + 3.306 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 3.307 + slicingStrucCarrier->vecSlices = vecSlices; 3.308 + slicingStrucCarrier->rightColSlices = rightColSlices; 3.309 + 3.310 + return slicingStrucCarrier; 3.311 +} 3.312 + 3.313 + 3.314 +void 3.315 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 3.316 + SlicingStrucCarrier *slicingStrucCarrier, 3.317 + VirtProcr *resultPr, VirtProcr *animatingPr ) 3.318 + { 3.319 + SlicingStruc *leftRowSlices, 
*vecSlices, *rightColSlices; 3.320 + 3.321 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 3.322 + vecSlices = slicingStrucCarrier->vecSlices; 3.323 + rightColSlices = slicingStrucCarrier->rightColSlices; 3.324 + 3.325 + //================ Make sub-matrices, given the slicing ================ 3.326 + SubMatrix **leftSubMatrices, **rightSubMatrices; 3.327 + leftSubMatrices = 3.328 + createSubMatrices( leftRowSlices, vecSlices, 3.329 + leftMatrix ); 3.330 + rightSubMatrices = 3.331 + createSubMatrices( vecSlices, rightColSlices, 3.332 + rightMatrix ); 3.333 + 3.334 + //============== pair the sub-matrices and make processors ============== 3.335 + int32 numRowIdxs, numColIdxs, numVecIdxs; 3.336 + 3.337 + numRowIdxs = leftRowSlices->numVals; 3.338 + numColIdxs = rightColSlices->numVals; 3.339 + numVecIdxs = vecSlices->numVals; 3.340 + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 3.341 + rightSubMatrices, 3.342 + numRowIdxs, numColIdxs, 3.343 + numVecIdxs, 3.344 + resultPr, 3.345 + animatingPr ); 3.346 + } 3.347 + 3.348 + 3.349 + 3.350 + 3.351 +void 3.352 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 3.353 + SubMatrix **rightSubMatrices, 3.354 + int32 numRowIdxs, int32 numColIdxs, 3.355 + int32 numVecIdxs, 3.356 + VirtProcr *resultPr, 3.357 + VirtProcr *animatingPr ) 3.358 + { 3.359 + int32 resRowIdx, resColIdx, vecIdx; 3.360 + int32 numLeftColIdxs, numRightColIdxs; 3.361 + int32 leftRowIdxOffset; 3.362 + SMPairParams *subMatrixPairParams; 3.363 + 3.364 + numLeftColIdxs = numColIdxs; 3.365 + numRightColIdxs = numVecIdxs; 3.366 + 3.367 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 3.368 + { 3.369 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 3.370 + 3.371 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 3.372 + { 3.373 + 3.374 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 3.375 + { 3.376 + //Make the processor for the pair of sub-matrices 3.377 + subMatrixPairParams = 
SSR__malloc_size_to(sizeof(SMPairParams), 3.378 + animatingPr); 3.379 + subMatrixPairParams->leftSubMatrix = 3.380 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 3.381 + 3.382 + subMatrixPairParams->rightSubMatrix = 3.383 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 3.384 + 3.385 + subMatrixPairParams->resultPr = resultPr; 3.386 + 3.387 + SSR__create_procr_with( &calcSubMatrixProduct, 3.388 + subMatrixPairParams, 3.389 + animatingPr ); 3.390 + } 3.391 + } 3.392 + } 3.393 + 3.394 + } 3.395 + 3.396 + 3.397 + 3.398 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 3.399 + */ 3.400 +SubMatrix ** 3.401 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 3.402 + Matrix *origMatrix ) 3.403 + { 3.404 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 3.405 + int32 startRow, endRow, startCol, endCol; 3.406 + int32 *rowStartVals, *colStartVals; 3.407 + int32 rowOffset; 3.408 + SubMatrix **subMatrices, *newSubMatrix; 3.409 + 3.410 + numRowIdxs = rowSlices->numVals; 3.411 + numColIdxs = colSlices->numVals; 3.412 + 3.413 + rowStartVals = rowSlices->startVals; 3.414 + colStartVals = colSlices->startVals; 3.415 + 3.416 + subMatrices = malloc( numRowIdxs * numColIdxs * sizeof(SubMatrix *) ); 3.417 + 3.418 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 3.419 + { 3.420 + rowOffset = rowIdx * numColIdxs; 3.421 + 3.422 + startRow = rowStartVals[rowIdx]; 3.423 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 3.424 + // at last valid idx + 1 & is 3.425 + // 1 greater than end value 3.426 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 3.427 + { 3.428 + startCol = colStartVals[colIdx]; 3.429 + endCol = colStartVals[colIdx + 1] -1; 3.430 + 3.431 + newSubMatrix = malloc( sizeof(SubMatrix) ); 3.432 + newSubMatrix->numRows = endRow - startRow +1; 3.433 + newSubMatrix->numCols = endCol - startCol +1; 3.434 + newSubMatrix->origMatrix = origMatrix; 3.435 + newSubMatrix->origStartRow = startRow; 3.436 + 
newSubMatrix->origStartCol = startCol; 3.437 + newSubMatrix->alreadyCopied = FALSE; 3.438 + 3.439 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 3.440 + } 3.441 + } 3.442 + return subMatrices; 3.443 + } 3.444 + 3.445 + 3.446 + 3.447 + 3.448 +SlicingStruc * 3.449 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal ) 3.450 + { float32 residualAcc = 0; 3.451 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 3.452 + SlicingStruc *slicingStruc = malloc( sizeof(SlicingStruc) ); 3.453 + 3.454 + //calc size of matrix need to hold start vals -- 3.455 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 3.456 + 3.457 + startVals = malloc( (numSlices + 1) * sizeof(int32) ); 3.458 + 3.459 + //Calc the upper limit of start value -- when get above this, end loop 3.460 + // by saving highest value of the matrix dimension to access, plus 1 3.461 + // as the start point of the imaginary slice following the last one 3.462 + //Plus 1 because go up to value but not include when process last slice 3.463 + //The stopping condition is half-a-size less than highest value because 3.464 + // don't want any pieces smaller than half the ideal size -- just tack 3.465 + // little ones onto end of last one 3.466 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 3.467 + for( i = 0; startVal <= endVal; i++ ) 3.468 + { 3.469 + startVals[i] = startVal; 3.470 + residualAcc += idealSizeOfSide; 3.471 + sizeOfSlice = (int)residualAcc; 3.472 + residualAcc -= (float32)sizeOfSlice; 3.473 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 
3.474 + 3.475 + if( startVal > endCondition ) 3.476 + { startVal = endVal + 1; 3.477 + startVals[ i + 1 ] = startVal; 3.478 + } 3.479 + } 3.480 + 3.481 + slicingStruc->startVals = startVals; 3.482 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 3.483 + // which means is num sub-matrices in dim 3.484 + // also == idx of the fake start just above 3.485 + return slicingStruc; 3.486 + } 3.487 + 3.488 + 3.489 +int inline 3.490 +measureMatrixMultPrimitive() 3.491 + { 3.492 + int r, c, v, numCycles; 3.493 + float32 *res, *left, *right; 3.494 + 3.495 + //setup inputs 3.496 + left = malloc( 5 * 5 * sizeof( float32 ) ); 3.497 + right = malloc( 5 * 5 * sizeof( float32 ) ); 3.498 + res = malloc( 5 * 5 * sizeof( float32 ) ); 3.499 + 3.500 + for( r = 0; r < 5; r++ ) 3.501 + { 3.502 + for( c = 0; c < 5; c++ ) 3.503 + { 3.504 + left[ r * 5 + c ] = r; 3.505 + right[ r * 5 + c ] = c; 3.506 + } 3.507 + } 3.508 + 3.509 + //do primitive 3.510 + SSR__start_primitive(); //for now, just takes time stamp 3.511 + for( r = 0; r < 5; r++ ) 3.512 + { 3.513 + for( c = 0; c < 5; c++ ) 3.514 + { 3.515 + for( v = 0; v < 5; v++ ) 3.516 + { 3.517 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 3.518 + } 3.519 + } 3.520 + } 3.521 + numCycles = 3.522 + SSR__end_primitive_and_give_cycles(); 3.523 + 3.524 + return numCycles; 3.525 + } 3.526 +
4.1 --- a/src/Application/SSR_Matrix_Mult/EntryPoint.c Tue Oct 05 10:00:11 2010 -0700 4.2 +++ b/src/Application/SSR_Matrix_Mult/EntryPoint.c Thu Oct 14 17:09:22 2010 -0700 4.3 @@ -39,7 +39,8 @@ 4.4 4.5 //create divider processor, start doing the work, and wait till done 4.6 //This function is the "border crossing" between normal code and SSR 4.7 - SSR__create_seed_procr_and_do_work( &divideIntoVectors, dividerParams ); 4.8 + SSR__create_seed_procr_and_do_work( &divideWorkIntoSubMatrixPairProcrs, 4.9 + dividerParams ); 4.10 4.11 //get result matrix and return it 4.12 resMatrix = dividerParams->resultMatrix;
5.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c Tue Oct 05 10:00:11 2010 -0700 5.2 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c Thu Oct 14 17:09:22 2010 -0700 5.3 @@ -8,6 +8,15 @@ 5.4 5.5 #include "SSR_Matrix_Mult.h" 5.6 5.7 +void inline 5.8 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 5.9 + int32 startRow, 5.10 + int32 numRows, 5.11 + int32 startCol, 5.12 + int32 numCols, 5.13 + int32 numOrigCols ); 5.14 + 5.15 + 5.16 /*The Result Processor gets a message from each of the vector processors, 5.17 * puts the result from the message in its location in the result- 5.18 * matrix, and increments the count of results. 5.19 @@ -18,34 +27,68 @@ 5.20 void gatherResults( void *_params, VirtProcr *animatingPr ) 5.21 { VirtProcr *dividerPr; 5.22 ResultsParams *params; 5.23 - int numRows, numCols, numCells, count=0; 5.24 - float32 *resultMatrix; 5.25 + int row, col, numRows, numCols, numSubMatrixPairs, count=0; 5.26 + float32 *resultArray; 5.27 void *msg; 5.28 - VectorParams *aResult; 5.29 + SMPairParams *resParams; 5.30 5.31 -// printf("start resultPr\n"); fflush(stdin); 5.32 - 5.33 + PRINT_DEBUG("start resultPr\n") 5.34 + 5.35 params = (ResultsParams *)_params; 5.36 dividerPr = params->dividerPr; 5.37 - numCols = params->numCols; 5.38 - numRows = params->numRows; 5.39 - numCells = numRows * numCols; 5.40 + numSubMatrixPairs = params->numSubMatrixPairs; 5.41 + numRows = params->numRows; 5.42 + numCols = params->numCols; 5.43 5.44 - resultMatrix = SSR__malloc_size_to( numCells*sizeof( float32 ), animatingPr); 5.45 - 5.46 - while( count < numCells ) 5.47 + resultArray = SSR__malloc_size_to( numRows * numCols * sizeof(float32), 5.48 + animatingPr ); 5.49 + 5.50 + //zero out the results array -- will be accumulating, so must start 0 5.51 + for( row = 0; row < numRows; row++ ) 5.52 + { 5.53 + for( col = 0; col < numCols; col++ ) 5.54 + { 5.55 + resultArray[ row * numCols + col ] = 0; 5.56 + } 5.57 + } 5.58 + 5.59 + while( count < 
numSubMatrixPairs ) 5.60 { 5.61 msg = SSR__receive_type_to( RESULTS_MSG, animatingPr ); 5.62 5.63 - aResult = (VectorParams *)msg; 5.64 - *(resultMatrix + aResult->myRow * numCols + aResult->myCol) = 5.65 - aResult->result; 5.66 + resParams = (SMPairParams *)msg; 5.67 + accumulateResult( resultArray, resParams->resultArray, 5.68 + resParams->leftSubMatrix->origStartRow, 5.69 + resParams->leftSubMatrix->numRows, 5.70 + resParams->rightSubMatrix->origStartCol, 5.71 + resParams->rightSubMatrix->numCols, 5.72 + resParams->rightSubMatrix->origMatrix->numCols ); 5.73 count++; 5.74 } 5.75 //if were real lang, would have auto-nested transfer -- but HelloWorld 5.76 // language, so have to transfer ownership of each allocated block of 5.77 // locations separately 5.78 - SSR__transfer_ownership_of_from_to( resultMatrix, animatingPr, dividerPr ); 5.79 - SSR__send_from_to( resultMatrix, animatingPr, dividerPr ); 5.80 + SSR__transfer_ownership_of_from_to( resultArray, animatingPr, dividerPr ); 5.81 + SSR__send_from_to( resultArray, animatingPr, dividerPr ); 5.82 SSR__dissipate_procr( animatingPr ); //frees any data owned by procr 5.83 } 5.84 + 5.85 +void inline 5.86 +accumulateResult( float32 *resultArray, float32 *subMatrixResultArray, 5.87 + int32 startRow, 5.88 + int32 numRows, 5.89 + int32 startCol, 5.90 + int32 numCols, 5.91 + int32 numOrigCols ) 5.92 + { int32 row, col; 5.93 + 5.94 + for( row = 0; row < numRows; row++ ) 5.95 + { 5.96 + for( col = 0; col < numCols; col++ ) 5.97 + { 5.98 + resultArray[ (row + startRow) * numOrigCols + col + startCol ] += 5.99 + subMatrixResultArray[ row * numCols + col ]; 5.100 + } 5.101 + } 5.102 + 5.103 + }
6.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Tue Oct 05 10:00:11 2010 -0700 6.2 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Thu Oct 14 17:09:22 2010 -0700 6.3 @@ -11,6 +11,15 @@ 6.4 #include "../../SSR_lib/SSR.h" 6.5 #include "../Matrix_Mult.h" 6.6 6.7 + 6.8 +//=============================== Defines ============================== 6.9 +#define ROWS_IN_BLOCK 32 6.10 +#define COLS_IN_BLOCK 32 6.11 +#define VEC_IN_BLOCK 32 6.12 + 6.13 + 6.14 +#define PRINT_DEBUG(msg) //printf(msg); fflush(stdin); 6.15 + 6.16 //============================== Structures ============================== 6.17 typedef struct 6.18 { 6.19 @@ -25,19 +34,45 @@ 6.20 VirtProcr *dividerPr; 6.21 int numRows; 6.22 int numCols; 6.23 + int numSubMatrixPairs; 6.24 } 6.25 ResultsParams; 6.26 6.27 +typedef 6.28 +struct 6.29 + { int32 numRows; 6.30 + int32 numCols; 6.31 + Matrix *origMatrix; 6.32 + int32 origStartRow; 6.33 + int32 origStartCol; 6.34 + int32 alreadyCopied; 6.35 + float32 *array; //2D, but dynamically sized, so use addr arith 6.36 + } 6.37 +SubMatrix; 6.38 + 6.39 typedef struct 6.40 { VirtProcr *resultPr; 6.41 - int myCol; 6.42 - int myRow; 6.43 - int vectLength; 6.44 - Matrix *leftMatrix; 6.45 - Matrix *rightMatrix; 6.46 - float32 result; 6.47 + SubMatrix *leftSubMatrix; 6.48 + SubMatrix *rightSubMatrix; 6.49 + float32 *resultArray; 6.50 } 6.51 -VectorParams; 6.52 +SMPairParams; 6.53 + 6.54 +typedef 6.55 +struct 6.56 + { int32 numVals; 6.57 + int32 *startVals; 6.58 + } 6.59 +SlicingStruc; 6.60 + 6.61 +typedef 6.62 +struct 6.63 + { 6.64 + SlicingStruc *leftRowSlices; 6.65 + SlicingStruc *vecSlices; 6.66 + SlicingStruc *rightColSlices; 6.67 + } 6.68 +SlicingStrucCarrier; 6.69 6.70 enum MMMsgType 6.71 { 6.72 @@ -45,8 +80,8 @@ 6.73 }; 6.74 6.75 //============================= Processor Functions ========================= 6.76 -void divideIntoVectors( void *data, VirtProcr *animatingPr ); 6.77 -void calcVector( void *data, VirtProcr *animatingPr ); 6.78 +void 
divideWorkIntoSubMatrixPairProcrs( void *data, VirtProcr *animatingPr ); 6.79 +void calcSubMatrixProduct( void *data, VirtProcr *animatingPr ); 6.80 void gatherResults( void *data, VirtProcr *animatingPr ); 6.81 6.82
7.1 --- a/src/Application/SSR_Matrix_Mult/Vector_Pr.c Tue Oct 05 10:00:11 2010 -0700 7.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 7.3 @@ -1,47 +0,0 @@ 7.4 -/* 7.5 - * Copyright 2009 OpenSourceStewardshipFoundation.org 7.6 - * Licensed under GNU General Public License version 2 7.7 - * 7.8 - * Author: SeanHalle@yahoo.com 7.9 - * 7.10 - */ 7.11 - 7.12 -#include "SSR_Matrix_Mult.h" 7.13 - 7.14 -/*A Vector processor is created with an environment that holds two matrices, 7.15 - * the row and col that it owns, and the name of a result gathering 7.16 - * processor. 7.17 - *It calculates its vector product then sends the result to the result 7.18 - * processor, which puts it into the result matrix and returns that matrix 7.19 - * when all is done. 7.20 - */ 7.21 - void 7.22 -calcVector( void *data, VirtProcr *animatingPr ) 7.23 - { 7.24 - VectorParams *params; 7.25 - VirtProcr *resultPr; 7.26 - int myRow, myCol, vectLength, pos; 7.27 - float32 *leftMatrixArray, *rightMatrixArray, result = 0.0; 7.28 - Matrix *leftMatrix, *rightMatrix; 7.29 - 7.30 -// printf("start vector\n"); fflush(stdin); 7.31 - 7.32 - params = (VectorParams *)data; 7.33 - resultPr = params->resultPr; 7.34 - myCol = params->myCol; 7.35 - myRow = params->myRow; 7.36 - vectLength = params->vectLength; 7.37 - leftMatrix = params->leftMatrix; 7.38 - rightMatrix = params->rightMatrix; 7.39 - leftMatrixArray = leftMatrix->matrix; 7.40 - rightMatrixArray = rightMatrix->matrix; 7.41 - 7.42 - for( pos = 0; pos < vectLength; pos++ ) 7.43 - { 7.44 - result += *(leftMatrixArray + myRow * vectLength + pos) * 7.45 - *(rightMatrixArray + pos * vectLength + myCol); 7.46 - } 7.47 - params->result = result; 7.48 - SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 7.49 - SSR__dissipate_procr( animatingPr ); 7.50 - }
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 8.2 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Thu Oct 14 17:09:22 2010 -0700 8.3 @@ -0,0 +1,245 @@ 8.4 +/* 8.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 8.6 + * Licensed under GNU General Public License version 2 8.7 + * 8.8 + * Author: SeanHalle@yahoo.com 8.9 + * 8.10 + */ 8.11 + 8.12 +#include "SSR_Matrix_Mult.h" 8.13 + 8.14 + 8.15 +void inline 8.16 +copyFromOrig( SubMatrix *subMatrix ); 8.17 + 8.18 +void inline 8.19 +copyTransposeFromOrig( SubMatrix *subMatrix ); 8.20 + 8.21 +void inline 8.22 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.23 + float32 *resArray, 8.24 + int startRow, int endRow, 8.25 + int startCol, int endCol, 8.26 + int startVec, int endVec, 8.27 + int resStride, int inpStride ); 8.28 + 8.29 +void inline 8.30 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 8.31 + float32 *leftArray, float32 *rightArray, 8.32 + float32 *resArray ); 8.33 + 8.34 + 8.35 +/*A processor is created with an environment that holds two matrices, 8.36 + * the row and col that it owns, and the name of a result gathering 8.37 + * processor. 8.38 + *It calculates the product of two sub-portions of the input matrices 8.39 + * by using Intel's mkl library for single-core. 8.40 + * 8.41 + *This demonstrates using optimized single-threaded code inside scheduled 8.42 + * work-units. 
8.43 + * 8.44 + *When done, it sends the result to the result processor 8.45 + */ 8.46 +void 8.47 +calcSubMatrixProduct( void *data, VirtProcr *animatingPr ) 8.48 + { 8.49 + SMPairParams *params; 8.50 + VirtProcr *resultPr; 8.51 + float32 *leftArray, *rightArray, *resArray; 8.52 + SubMatrix *leftSubMatrix, *rightSubMatrix; 8.53 + 8.54 + PRINT_DEBUG("start sub-matrix mult\n") 8.55 + 8.56 + params = (SMPairParams *)data; 8.57 + resultPr = params->resultPr; 8.58 + leftSubMatrix = params->leftSubMatrix; 8.59 + rightSubMatrix = params->rightSubMatrix; 8.60 + 8.61 + //make sure the input sub-matrices have been copied out of orig 8.62 + copyFromOrig( leftSubMatrix ); 8.63 + copyTransposeFromOrig( rightSubMatrix ); 8.64 + 8.65 + leftArray = leftSubMatrix->array; 8.66 + rightArray = rightSubMatrix->array; 8.67 + 8.68 + resArray = malloc( leftSubMatrix->numRows * rightSubMatrix->numCols * 8.69 + sizeof( float32 ) ); 8.70 + 8.71 + 8.72 + int32 numResRows, numResCols, vectLength; 8.73 + 8.74 + vectLength = leftSubMatrix->numCols; 8.75 + numResRows = leftSubMatrix->numRows; 8.76 + numResCols = rightSubMatrix->numCols; 8.77 + 8.78 + multiplyMatrixArrays( vectLength, numResRows, numResCols, 8.79 + leftArray, rightArray, 8.80 + resArray ); 8.81 + 8.82 + //send result to result processor 8.83 + params->resultArray = resArray; 8.84 + SSR__send_of_type_to( animatingPr, params, RESULTS_MSG, resultPr ); 8.85 + SSR__dissipate_procr( animatingPr ); 8.86 + } 8.87 + 8.88 + 8.89 +/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache 8.90 + * Would be nice to embed this within another level that divided into 8.91 + * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 8.92 + * 8.93 + *Eventually want these divisions to be automatic, using DKU pattern 8.94 + * embedded into SSR, and with VMS controlling the divisions according to 8.95 + * the cache sizes, which it knows about. 
8.96 + *And, want VMS to work with language to split among main-mems, so a socket 8.97 + * only cranks on data in its local segment of main mem 8.98 + * 8.99 + */ 8.100 +void inline 8.101 +multiplyMatrixArrays( int32 vecLength, int32 numResRows, int32 numResCols, 8.102 + float32 *leftArray, float32 *rightArray, 8.103 + float32 *resArray ) 8.104 + { 8.105 + int resStride, inpStride; 8.106 + int startRow, startCol, endRow, endCol, startVec, endVec; 8.107 + 8.108 + resStride = numResCols; 8.109 + inpStride = vecLength; 8.110 + 8.111 + for( startRow = 0; startRow < numResRows; ) 8.112 + { 8.113 + endRow = startRow + ROWS_IN_BLOCK; 8.114 + if( endRow > numResRows ) endRow = numResRows; 8.115 + 8.116 + for( startCol = 0; startCol < numResCols; ) 8.117 + { 8.118 + endCol = startCol + COLS_IN_BLOCK; 8.119 + if( endCol > numResCols ) endCol = numResCols; 8.120 + 8.121 + for( startVec = 0; startVec < vecLength; ) 8.122 + { 8.123 + endVec = startVec + VEC_IN_BLOCK; 8.124 + if( endVec > vecLength ) endVec = vecLength; 8.125 + 8.126 + //By having the "vector" of sub-blocks in a sub-block slice 8.127 + // be marched down in inner loop, are re-using the result 8.128 + // matrix, which stays in L1 cache -- can only re-use one of 8.129 + // the three, so this is the most important -- avoids writing 8.130 + // dirty blocks until those result-locations fully done 8.131 + //Row and Col is position in result matrix -- so row and vec 8.132 + // for left array, then vec and col for right array 8.133 + multiplySubBlocksTransposed( leftArray, rightArray, 8.134 + resArray, 8.135 + startRow, endRow, 8.136 + startCol, endCol, 8.137 + startVec, endVec, 8.138 + resStride, inpStride ); 8.139 + startVec = endVec; 8.140 + } 8.141 + startCol = endCol; 8.142 + } 8.143 + startRow = endRow; 8.144 + } 8.145 + } 8.146 + 8.147 + 8.148 +void inline 8.149 +multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 8.150 + float32 *resArray, 8.151 + int startRow, int endRow, 8.152 + int startCol, 
int endCol, 8.153 + int startVec, int endVec, 8.154 + int resStride, int inpStride ) 8.155 + { 8.156 + int row, col, vec; 8.157 + int leftOffset, rightOffset; 8.158 + float32 result; 8.159 + 8.160 + for( row = startRow; row < endRow; row++ ) 8.161 + { 8.162 + for( col = startCol; col < endCol; col++ ) 8.163 + { 8.164 + leftOffset = row * inpStride;//left & right inp strides always same 8.165 + rightOffset = col * inpStride;// because right is transposed 8.166 + result = 0; 8.167 + for( vec = startVec; vec < endVec; vec++ ) 8.168 + { 8.169 + result += 8.170 + leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 8.171 + } 8.172 + 8.173 + resArray[ row * resStride + col ] += result; 8.174 + } 8.175 + } 8.176 + } 8.177 + 8.178 +void inline 8.179 +copyTransposeFromOrig( SubMatrix *subMatrix ) 8.180 + { int numCols, numRows, origStartRow, origStartCol, origStride, stride; 8.181 + Matrix *origMatrix; 8.182 + float32 *origArray, *subArray; 8.183 + 8.184 + if( subMatrix->alreadyCopied ) return; 8.185 + 8.186 + subMatrix->alreadyCopied = TRUE; 8.187 + 8.188 + origMatrix = subMatrix->origMatrix; 8.189 + origArray = origMatrix->array; 8.190 + numCols = subMatrix->numCols; 8.191 + numRows = subMatrix->numRows; 8.192 + stride = numRows; 8.193 + origStartRow = subMatrix->origStartRow; 8.194 + origStartCol = subMatrix->origStartCol; 8.195 + origStride = origMatrix->numCols; 8.196 + 8.197 + subArray = malloc( numRows * numCols * sizeof(float32) ); 8.198 + subMatrix->array = subArray; 8.199 + 8.200 + //copy values from orig matrix to local 8.201 + int row, col, origOffset; 8.202 + for( row = 0; row < numRows; row++ ) 8.203 + { 8.204 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.205 + for( col = 0; col < numCols; col++ ) 8.206 + { 8.207 + //transpose means swap row & col -- traverse orig matrix normally 8.208 + // but put into reversed place in local array -- means the 8.209 + // stride is the num rows now, so col * numRows + row 8.210 + subArray[ col * 
stride + row ] = origArray[ origOffset + col ]; 8.211 + } 8.212 + } 8.213 + } 8.214 + 8.215 +void inline 8.216 +copyFromOrig( SubMatrix *subMatrix ) 8.217 + { int numCols, numRows, origStartRow, origStartCol, stride, origStride; 8.218 + Matrix *origMatrix; 8.219 + float32 *origArray, *subArray; 8.220 + 8.221 + if( subMatrix->alreadyCopied ) return; 8.222 + 8.223 + subMatrix->alreadyCopied = TRUE; 8.224 + 8.225 + origMatrix = subMatrix->origMatrix; 8.226 + origArray = origMatrix->array; 8.227 + numCols = subMatrix->numCols; 8.228 + numRows = subMatrix->numRows; 8.229 + origStartRow = subMatrix->origStartRow; 8.230 + origStartCol = subMatrix->origStartCol; 8.231 + stride = numCols; 8.232 + origStride = origMatrix->numCols; 8.233 + 8.234 + subArray = malloc( numRows * numCols * sizeof(float32) ); 8.235 + subMatrix->array = subArray; 8.236 + 8.237 + //copy values from orig matrix to local 8.238 + int row, col, offset, origOffset; 8.239 + for( row = 0; row < numRows; row++ ) 8.240 + { 8.241 + offset = row * stride; 8.242 + origOffset = (row + origStartRow) * origStride + origStartCol; 8.243 + for( col = 0; col < numCols; col++ ) 8.244 + { 8.245 + subArray[ offset + col ] = origArray[ origOffset + col ]; 8.246 + } 8.247 + } 8.248 + }
