Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > VSs > VSs__Hello_World__LangDev
changeset 0:9cf4c84a3091
Initial add of copied code -- nonsense code still
| author | Some Random Person <seanhalle@yahoo.com> |
|---|---|
| date | Wed, 23 May 2012 12:39:19 -0700 |
| parents | |
| children | 9ad1a6186956 |
| files | .hgeol .hgignore VSs__Hello_World/EntryPoint.c VSs__Hello_World/SeedVP.c VSs__Hello_World/VSs__Hello_World.h __brch__default main.c |
| diffstat | 7 files changed, 812 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.hgeol Wed May 23 12:39:19 2012 -0700 1.3 @@ -0,0 +1,14 @@ 1.4 + 1.5 +[patterns] 1.6 +**.py = native 1.7 +**.txt = native 1.8 +**.c = native 1.9 +**.h = native 1.10 +**.cpp = native 1.11 +**.java = native 1.12 +**.class = bin 1.13 +**.jar = bin 1.14 +**.sh = native 1.15 +**.pl = native 1.16 +**.jpg = bin 1.17 +**.gif = bin
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/.hgignore Wed May 23 12:39:19 2012 -0700 2.3 @@ -0,0 +1,12 @@ 2.4 +nbproject 2.5 +Makefile 2.6 +build 2.7 +dist 2.8 +src/Default 2.9 +src/.settings 2.10 +src/.cproject 2.11 +src/.project 2.12 +.dep.inc 2.13 +glob:.cproject 2.14 +glob:.project 2.15 +glob:Debug
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 3.2 +++ b/VSs__Hello_World/EntryPoint.c Wed May 23 12:39:19 2012 -0700 3.3 @@ -0,0 +1,62 @@ 3.4 +/* 3.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 3.6 + * Licensed under GNU General Public License version 2 3.7 + * 3.8 + * Author: seanhalle@yahoo.com 3.9 + * 3.10 + */ 3.11 + 3.12 +#include <math.h> 3.13 + 3.14 +#include "SSR_Matrix_Mult.h" 3.15 + 3.16 + 3.17 + 3.18 +/*Every SSR system has an "entry point" function that creates the first 3.19 + * processor, which starts the chain of creating more processors.. 3.20 + * eventually all of the processors will dissipate themselves, and 3.21 + * return. 3.22 + * 3.23 + *This entry-point function follows the same pattern as all entry-point 3.24 + * functions do: 3.25 + *1) it creates the params for the seed processor, from the 3.26 + * parameters passed into the entry-point function 3.27 + *2) it calls SSR__create_seed_procr_and_do_work 3.28 + *3) it gets the return value from the params struc, frees the params struc, 3.29 + * and returns the value from the function 3.30 + * 3.31 + */ 3.32 +Matrix * 3.33 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ) 3.34 + { Matrix *resMatrix; 3.35 + DividerParams *dividerParams; 3.36 + int32 numResRows, numResCols; 3.37 + 3.38 + 3.39 + dividerParams = malloc( sizeof( DividerParams ) ); 3.40 + dividerParams->leftMatrix = leftMatrix; 3.41 + dividerParams->rightMatrix = rightMatrix; 3.42 + 3.43 + 3.44 + numResRows = leftMatrix->numRows; 3.45 + numResCols = rightMatrix->numCols; 3.46 + 3.47 + //VMS has its own separate internal malloc, so to get results out, 3.48 + // have to pass in empty array for it to fill up 3.49 + //The alternative is internally telling SSR make external space to use 3.50 + resMatrix = malloc( sizeof(Matrix) ); 3.51 + resMatrix->array = malloc( numResRows * numResCols * sizeof(float32)); 3.52 + resMatrix->numCols = rightMatrix->numCols; 3.53 + resMatrix->numRows = leftMatrix->numRows; 3.54 + 3.55 + 3.56 + dividerParams->resultMatrix = resMatrix; 3.57 + 3.58 + //create divider processor, start doing the work, and wait till done 3.59 + //This function is the "border crossing" between normal code and SSR 3.60 + SSR__create_seed_procr_and_do_work( ÷WorkIntoSubMatrixPairProcrs, 3.61 + dividerParams ); 3.62 + 3.63 + free( dividerParams ); 3.64 + return resMatrix; 3.65 + }
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 4.2 +++ b/VSs__Hello_World/SeedVP.c Wed May 23 12:39:19 2012 -0700 4.3 @@ -0,0 +1,594 @@ 4.4 +/* 4.5 + * Copyright 2009 OpenSourceStewardshipFoundation.org 4.6 + * Licensed under GNU General Public License version 2 4.7 + * 4.8 + * Author: seanhalle@yahoo.com 4.9 + * 4.10 + */ 4.11 + 4.12 + 4.13 +#include <math.h> 4.14 +#include <string.h> 4.15 +#include "SSR_Matrix_Mult.h" 4.16 + 4.17 + //The time to compute this many result values should equal the time to 4.18 + // perform this division on a matrix of size gives that many result calcs 4.19 + //IE, size this so that sequential time to calc equals divide time 4.20 + // find the value by experimenting -- but divide time and calc time scale 4.21 + // same way, so this value might remain the same across hardware 4.22 +#define NUM_CELLS_IN_SEQUENTIAL_CUTOFF 1000 4.23 + 4.24 + 4.25 +//=========================================================================== 4.26 +int inline 4.27 +measureMatrixMultPrimitive( SlaveVP *animPr ); 4.28 + 4.29 +SlicingStrucCarrier * 4.30 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.31 + SlaveVP *animPr ); 4.32 + 4.33 +SlicingStruc * 4.34 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.35 + SlaveVP *animPr ); 4.36 + 4.37 +void 4.38 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr ); 4.39 + 4.40 +SubMatrix ** 4.41 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.42 + int32 numUses, Matrix *origMatrix, SlaveVP *animPr ); 4.43 + 4.44 +void 4.45 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.46 + SubMatrix **subMatrices, SlaveVP *animPr ); 4.47 + 4.48 +void 4.49 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 4.50 + SubMatrix **rightSubMatrices, 4.51 + int32 numRowIdxs, int32 numColIdxs, 4.52 + int32 numVecIdxs, 4.53 + SlaveVP *resultPr, 4.54 + SlaveVP *animatingPr ); 4.55 + 4.56 +void 4.57 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 4.58 + SlicingStrucCarrier *slicingStrucCarrier, 4.59 + SlaveVP *resultPr, SlaveVP *animatingPr ); 4.60 + 4.61 + 4.62 + 4.63 +/*Divider creates one processor for every sub-matrix 4.64 + * It hands them: 4.65 + * the name of the result processor that they should send their results to, 4.66 + * the left and right matrices, and the rows and cols they should multiply 4.67 + * It first creates the result processor, then all the sub-matrixPair 4.68 + * processors, 4.69 + * then does a receive of a message from the result processor that gives 4.70 + * the divider ownership of the result matrix. 4.71 + * Finally, the divider returns the result matrix out of the SSR system. 4.72 + * 4.73 + * Divider chooses the size of sub-matrices via an algorithm that tries to 4.74 + * keep the minimum work above a threshold. The threshold is machine- 4.75 + * dependent, so ask SSR for min work-unit time to get a 4.76 + * given overhead 4.77 + * 4.78 + * Divide min work-unit cycles by measured-cycles for one matrix-cell 4.79 + * product -- gives the number of products need to have in min size 4.80 + * matrix. 4.81 + * 4.82 + * So then, take cubed root of this to get the size of a side of min sub- 4.83 + * matrix. That is the size of the ideal square sub-matrix -- so tile 4.84 + * up the two input matrices into ones as close as possible to that size, 4.85 + * and create the pairs of sub-matrices. 4.86 + * 4.87 + *======================== STRATEGIC OVERVIEW ======================= 4.88 + * 4.89 + *This division is a bit tricky, because have to create things in advance 4.90 + * that it's not at first obvious need to be created.. 4.91 + * 4.92 + *First slice up each dimension -- three of them.. this is because will have 4.93 + * to create the sub-matrix's data-structures before pairing the sub-matrices 4.94 + * with each other -- so, have three dimensions to slice up before can 4.95 + * create the sub-matrix data-strucs -- also, have to be certain that the 4.96 + * cols of the left input have the exact same slicing as the rows of the 4.97 + * left matrix, so just to be sure, do the slicing calc once, then use it 4.98 + * for both. 4.99 + * 4.100 + *So, goes like this: 4.101 + *1) calculate the start & end values of each dimension in each matrix. 4.102 + *2) use those values to create sub-matrix structures 4.103 + *3) combine sub-matrices into pairs, as the tasks to perform. 4.104 + * 4.105 + *Have to calculate separately from creating the sub-matrices because of the 4.106 + * nature of the nesting -- would either end up creating the same sub-matrix 4.107 + * multiple times, or else would have to put in detection of whether had 4.108 + * made a particular one already if tried to combine steps 1 and 2. 4.109 + * 4.110 + *Step 3 has to be separate because of the nesting, as well -- same reason, 4.111 + * would either create same sub-matrix multiple times, or else have to 4.112 + * add detection of whether was already created. 4.113 + * 4.114 + *Another way to look at it: there's one level of loop to divide dimensions, 4.115 + * two levels of nesting to create sub-matrices, and three levels to pair 4.116 + * up the sub-matrices. 4.117 + */ 4.118 +void divideWorkIntoSubMatrixPairProcrs( void *_dividerParams, 4.119 + SlaveVP *animPr ) 4.120 + { SlaveVP *resultPr; 4.121 + DividerParams *dividerParams; 4.122 + ResultsParams *resultsParams; 4.123 + Matrix *leftMatrix, *rightMatrix, *resultMatrix; 4.124 + void *msg; 4.125 + SlicingStrucCarrier *slicingStrucCarrier; 4.126 + float32 *resultArray; //points to array inside result matrix 4.127 + 4.128 + DEBUG__printf( dbgAppFlow, "start divide") 4.129 + 4.130 + int32 4.131 + divideProbe = VMS_App__create_single_interval_probe( "divideProbe", 4.132 + animPr ); 4.133 + VMS_App__record_sched_choice_into_probe( divideProbe, animPr ); 4.134 + VMS_App__record_interval_start_in_probe( divideProbe ); 4.135 + 4.136 + //=========== Setup -- make local copies of ptd-to-things, malloc, aso 4.137 + int32 numResRows, numResCols, vectLength; 4.138 + 4.139 + dividerParams = (DividerParams *)_dividerParams; 4.140 + 4.141 + leftMatrix = dividerParams->leftMatrix; 4.142 + rightMatrix = dividerParams->rightMatrix; 4.143 + 4.144 + vectLength = leftMatrix->numCols; 4.145 + numResRows = leftMatrix->numRows; 4.146 + numResCols = rightMatrix->numCols; 4.147 + resultArray = dividerParams->resultMatrix->array; 4.148 + 4.149 + //zero the result array 4.150 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 4.151 + 4.152 + //============== Do either sequential mult or do division ============== 4.153 + 4.154 + //Check if input matrices too small -- if yes, just do sequential 4.155 + //Cutoff is determined by overhead of this divider -- relatively 4.156 + // machine-independent 4.157 + if( (float32)leftMatrix->numRows * (float32)leftMatrix->numCols * 4.158 + (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 4.159 + { 4.160 + //====== Do sequential multiply on a single core 4.161 + DEBUG__printf( dbgAppFlow, "doing sequential") 4.162 + 4.163 + //transpose the right matrix 4.164 + float32 * 4.165 + transRightArray = SSR__malloc_to( rightMatrix->numRows * 4.166 + rightMatrix->numCols * sizeof(float32), 4.167 + animPr ); 4.168 + 4.169 + //copy values from orig matrix to local 4.170 + copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 4.171 + 0, 0, rightMatrix->numRows, 4.172 + transRightArray, rightMatrix->array ); 4.173 + 4.174 + multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 4.175 + leftMatrix->array, transRightArray, 4.176 + resultArray ); 4.177 + } 4.178 + else 4.179 + { 4.180 + //====== Do parallel multiply across cores 4.181 + 4.182 + //Calc the ideal size of sub-matrix and slice up the dimensions of 4.183 + // the two matrices. 4.184 + //The ideal size is the one takes the number of cycles to calculate 4.185 + // such that calc time is equal or greater than min work-unit size 4.186 + slicingStrucCarrier = 4.187 + calcIdealSizeAndSliceDimensions( leftMatrix, rightMatrix, animPr ); 4.188 + 4.189 + //Make the results processor, now that know how many to wait for 4.190 + resultsParams = SSR__malloc_to( sizeof(ResultsParams), animPr ); 4.191 + resultsParams->numSubMatrixPairs = 4.192 + slicingStrucCarrier->leftRowSlices->numVals * 4.193 + slicingStrucCarrier->rightColSlices->numVals * 4.194 + slicingStrucCarrier->vecSlices->numVals; 4.195 + resultsParams->dividerPr = animPr; 4.196 + resultsParams->numCols = rightMatrix->numCols; 4.197 + resultsParams->numRows = leftMatrix->numRows; 4.198 + resultsParams->resultArray = resultArray; 4.199 + 4.200 + DEBUG__printf(dbgAppFlow,"**create result Pr**") 4.201 + resultPr = 4.202 + SSR__create_procr_with( &gatherResults, resultsParams, animPr); 4.203 + 4.204 + //Make the sub-matrices, and pair them up, and make processor to 4.205 + // calc product of each pair. 4.206 + makeSubMatricesAndProcrs( leftMatrix, rightMatrix, 4.207 + slicingStrucCarrier, 4.208 + resultPr, animPr); 4.209 + 4.210 + //result array is allocated externally, so no message from resultPr 4.211 + // however, do have to wait before printing out stats, so wait 4.212 + // for an empty handshake message 4.213 + msg = SSR__receive_from_to( resultPr, animPr ); 4.214 + } 4.215 + 4.216 + 4.217 + //=============== Work done -- send results back ================= 4.218 + 4.219 + 4.220 + DEBUG__printf( dbgAppFlow, "end divide") 4.221 + 4.222 + VMS_App__record_interval_end_in_probe( divideProbe ); 4.223 + VMS_App__print_stats_of_all_probes(); 4.224 + 4.225 + //nothing left to do so dissipate, SSR will wait to shutdown and hence 4.226 + // make results available to outside until all the processors have 4.227 + // dissipated -- so no need to wait for results processor 4.228 + 4.229 + SSR__dissipate_procr( animPr ); //all procrs dissipate self at end 4.230 + //when all of the processors have dissipated, the "create seed and do 4.231 + // work" call in the entry point function returns 4.232 + } 4.233 + 4.234 + 4.235 +SlicingStrucCarrier * 4.236 +calcIdealSizeAndSliceDimensions( Matrix *leftMatrix, Matrix *rightMatrix, 4.237 + SlaveVP *animPr ) 4.238 + { 4.239 + float32 idealSizeOfSide, idealSizeOfSide1, idealSizeOfSide2; 4.240 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.241 + SlicingStrucCarrier *slicingStrucCarrier = 4.242 + SSR__malloc_to(sizeof(SlicingStrucCarrier), animPr); 4.243 + 4.244 + int minWorkUnitCycles, primitiveCycles, idealNumWorkUnits; 4.245 + float64 numPrimitiveOpsInMinWorkUnit; 4.246 + 4.247 + 4.248 + //======= Calc ideal size of min-sized sub-matrix ======== 4.249 + 4.250 + //ask SSR for the number of cycles of the minimum work unit, at given 4.251 + // percent overhead then add a guess at overhead from this divider 4.252 + minWorkUnitCycles = SSR__giveMinWorkUnitCycles( .05 ); 4.253 + 4.254 + //ask SSR for number of cycles of the "primitive" op of matrix mult 4.255 + primitiveCycles = measureMatrixMultPrimitive( animPr ); 4.256 + 4.257 + numPrimitiveOpsInMinWorkUnit = 4.258 + (float64)minWorkUnitCycles / (float64)primitiveCycles; 4.259 + 4.260 + //take cubed root -- that's number of these in a "side" of sub-matrix 4.261 + // then multiply by 5 because the primitive is 5x5 4.262 + idealSizeOfSide1 = 5 * cbrt( numPrimitiveOpsInMinWorkUnit ); 4.263 + 4.264 + idealNumWorkUnits = SSR__giveIdealNumWorkUnits(); 4.265 + 4.266 + idealSizeOfSide2 = leftMatrix->numRows / rint(cbrt( idealNumWorkUnits )); 4.267 + idealSizeOfSide2 *= 0.6; //finer granularity to help load balance 4.268 + 4.269 + if( idealSizeOfSide1 > idealSizeOfSide2 ) 4.270 + idealSizeOfSide = idealSizeOfSide1; 4.271 + else 4.272 + idealSizeOfSide = idealSizeOfSide2; 4.273 + 4.274 + //The multiply inner loop blocks the array to fit into L1 cache 4.275 +// if( idealSizeOfSide < ROWS_IN_BLOCK ) idealSizeOfSide = ROWS_IN_BLOCK; 4.276 + 4.277 + //============ Slice up dimensions, now that know target size =========== 4.278 + 4.279 + //Tell the slicer the target size of a side (floating pt), the start 4.280 + // value to start slicing at, and the end value to stop slicing at 4.281 + //It returns an array of start value of each chunk, plus number of them 4.282 + int32 startLeftRow, endLeftRow, startVec,endVec,startRightCol,endRightCol; 4.283 + startLeftRow = 0; 4.284 + endLeftRow = leftMatrix->numRows -1; 4.285 + startVec = 0; 4.286 + endVec = leftMatrix->numCols -1; 4.287 + startRightCol = 0; 4.288 + endRightCol = rightMatrix->numCols -1; 4.289 + 4.290 + leftRowSlices = 4.291 + sliceUpDimension( idealSizeOfSide, startLeftRow, endLeftRow, animPr ); 4.292 + 4.293 + vecSlices = 4.294 + sliceUpDimension( idealSizeOfSide, startVec, endVec, animPr ); 4.295 + 4.296 + rightColSlices = 4.297 + sliceUpDimension( idealSizeOfSide, startRightCol, endRightCol,animPr); 4.298 + 4.299 + slicingStrucCarrier->leftRowSlices = leftRowSlices; 4.300 + slicingStrucCarrier->vecSlices = vecSlices; 4.301 + slicingStrucCarrier->rightColSlices = rightColSlices; 4.302 + 4.303 + return slicingStrucCarrier; 4.304 + } 4.305 + 4.306 + 4.307 +void 4.308 +makeSubMatricesAndProcrs( Matrix *leftMatrix, Matrix *rightMatrix, 4.309 + SlicingStrucCarrier *slicingStrucCarrier, 4.310 + SlaveVP *resultPr, SlaveVP *animPr ) 4.311 + { 4.312 + SlicingStruc *leftRowSlices, *vecSlices, *rightColSlices; 4.313 + 4.314 + leftRowSlices = slicingStrucCarrier->leftRowSlices; 4.315 + vecSlices = slicingStrucCarrier->vecSlices; 4.316 + rightColSlices = slicingStrucCarrier->rightColSlices; 4.317 + SSR__free( slicingStrucCarrier, animPr ); 4.318 + 4.319 + //================ Make sub-matrices, given the slicing ================ 4.320 + SubMatrix **leftSubMatrices, **rightSubMatrices; 4.321 + leftSubMatrices = 4.322 + createSubMatrices( leftRowSlices, vecSlices, rightColSlices->numVals, 4.323 + leftMatrix, animPr ); 4.324 + //double_check_that_always_numRows_in_right_same_as_numCols_in_left(); 4.325 + rightSubMatrices = 4.326 + createSubMatrices( vecSlices, rightColSlices, leftRowSlices->numVals, 4.327 + rightMatrix, animPr ); 4.328 + 4.329 + 4.330 + //============== pair the sub-matrices and make processors ============== 4.331 + int32 numRowIdxs, numColIdxs, numVecIdxs; 4.332 + 4.333 + numRowIdxs = leftRowSlices->numVals; 4.334 + numColIdxs = rightColSlices->numVals; 4.335 + numVecIdxs = vecSlices->numVals; 4.336 + 4.337 + 4.338 + freeSlicingStruc( leftRowSlices, animPr ); 4.339 + freeSlicingStruc( vecSlices, animPr ); 4.340 + freeSlicingStruc( rightColSlices, animPr ); 4.341 + 4.342 + pairUpSubMatricesAndMakeProcessors( leftSubMatrices, 4.343 + rightSubMatrices, 4.344 + numRowIdxs, numColIdxs, 4.345 + numVecIdxs, 4.346 + resultPr, 4.347 + animPr ); 4.348 + } 4.349 + 4.350 + 4.351 + 4.352 + 4.353 +void 4.354 +pairUpSubMatricesAndMakeProcessors( SubMatrix **leftSubMatrices, 4.355 + SubMatrix **rightSubMatrices, 4.356 + int32 numRowIdxs, int32 numColIdxs, 4.357 + int32 numVecIdxs, 4.358 + SlaveVP *resultPr, 4.359 + SlaveVP *animatingPr ) 4.360 + { 4.361 + int32 resRowIdx, resColIdx, vecIdx; 4.362 + int32 numLeftColIdxs, numRightColIdxs; 4.363 + int32 leftRowIdxOffset; 4.364 + SMPairParams *subMatrixPairParams; 4.365 + float32 numToPutOntoEachCore, leftOverFraction, numVecOnCurrCore; 4.366 + int32 numCores, coreToAssignOnto; 4.367 + 4.368 + numLeftColIdxs = numColIdxs; 4.369 + numRightColIdxs = numVecIdxs; 4.370 + 4.371 + numCores = SSR__give_number_of_cores_to_schedule_onto(); 4.372 + 4.373 + numToPutOntoEachCore = numRowIdxs*numColIdxs/numCores; 4.374 + leftOverFraction = 0; 4.375 + numVecOnCurrCore = 0; 4.376 + coreToAssignOnto = 0; 4.377 + 4.378 + for( resRowIdx = 0; resRowIdx < numRowIdxs; resRowIdx++ ) 4.379 + { 4.380 + leftRowIdxOffset = resRowIdx * numLeftColIdxs; 4.381 + 4.382 + for( resColIdx = 0; resColIdx < numColIdxs; resColIdx++ ) 4.383 + { 4.384 + 4.385 + for( vecIdx = 0; vecIdx < numVecIdxs; vecIdx++ ) 4.386 + { 4.387 + //Make the processor for the pair of sub-matrices 4.388 + subMatrixPairParams = SSR__malloc_to( sizeof(SMPairParams), 4.389 + animatingPr); 4.390 + subMatrixPairParams->leftSubMatrix = 4.391 + leftSubMatrices[ leftRowIdxOffset + vecIdx ]; 4.392 + 4.393 + subMatrixPairParams->rightSubMatrix = 4.394 + rightSubMatrices[ vecIdx * numRightColIdxs + resColIdx ]; 4.395 + 4.396 + subMatrixPairParams->resultPr = resultPr; 4.397 + 4.398 + //put all pairs from the same vector onto same core 4.399 + SSR__create_procr_with_affinity( &calcSubMatrixProduct, 4.400 + subMatrixPairParams, 4.401 + animatingPr, 4.402 + coreToAssignOnto ); 4.403 + 4.404 + //Trying to distribute the subMatrix-vectors across the cores, so 4.405 + // that each core gets the same number of vectors, with a max 4.406 + // imbalance of 1 vector more on some cores than others 4.407 + numVecOnCurrCore += 1; //incr before checking, so 4.408 + if( numVecOnCurrCore > numToPutOntoEachCore ) //actual num 1 less 4.409 + { 4.410 + //deal with fractional part, to ensure that imbalance is 1 max 4.411 + // IE, core with most has only 1 more than core with least 4.412 + leftOverFraction = numToPutOntoEachCore - numVecOnCurrCore; 4.413 + if( leftOverFraction > 1 ) ERROR("division alg messed up\n"); 4.414 + numVecOnCurrCore = leftOverFraction; //accumulates "extra" 4.415 + 4.416 + //Move to next core, max core-value to incr to is numCores -1 4.417 + coreToAssignOnto += 1; 4.418 + if( coreToAssignOnto >= numCores ) coreToAssignOnto = 0; 4.419 + } //if 4.420 + } //for( vecIdx 4.421 + } //for( resColIdx 4.422 + } //for( resRowIdx 4.423 + 4.424 + } 4.425 + 4.426 + 4.427 + 4.428 +/*Walk through the two slice-strucs, making sub-matrix strucs as go 4.429 + */ 4.430 +SubMatrix ** 4.431 +createSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.432 + int32 numUses, Matrix *origMatrix, SlaveVP *animPr ) 4.433 + { 4.434 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx; 4.435 + int32 startRow, endRow, startCol, endCol; 4.436 + int32 *rowStartVals, *colStartVals; 4.437 + int32 rowOffset; 4.438 + SubMatrix **subMatrices, *newSubMatrix; 4.439 + 4.440 + numRowIdxs = rowSlices->numVals; 4.441 + numColIdxs = colSlices->numVals; 4.442 + 4.443 + rowStartVals = rowSlices->startVals; 4.444 + colStartVals = colSlices->startVals; 4.445 + 4.446 + subMatrices = SSR__malloc_to(numRowIdxs * numColIdxs * sizeof(SubMatrix*), 4.447 + animPr ); 4.448 + 4.449 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.450 + { 4.451 + rowOffset = rowIdx * numColIdxs; 4.452 + 4.453 + startRow = rowStartVals[rowIdx]; 4.454 + endRow = rowStartVals[rowIdx + 1] -1; //"fake" start above last is 4.455 + // at last valid idx + 1 & is 4.456 + // 1 greater than end value 4.457 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.458 + { 4.459 + startCol = colStartVals[colIdx]; 4.460 + endCol = colStartVals[colIdx + 1] -1; 4.461 + 4.462 + newSubMatrix = SSR__malloc_to( sizeof(SubMatrix), animPr ); 4.463 + newSubMatrix->numRows = endRow - startRow +1; 4.464 + newSubMatrix->numCols = endCol - startCol +1; 4.465 + newSubMatrix->origMatrix = origMatrix; 4.466 + newSubMatrix->origStartRow = startRow; 4.467 + newSubMatrix->origStartCol = startCol; 4.468 + newSubMatrix->copySingleton = NULL; 4.469 + newSubMatrix->numUsesLeft = numUses; //can free after this many 4.470 + //Prevent uninitialized memory 4.471 + newSubMatrix->copySingleton = NULL; 4.472 + newSubMatrix->copyTransSingleton = NULL; 4.473 + 4.474 + subMatrices[ rowOffset + colIdx ] = newSubMatrix; 4.475 + } 4.476 + } 4.477 + return subMatrices; 4.478 + } 4.479 + 4.480 + 4.481 +void 4.482 +freeSubMatrices( SlicingStruc *rowSlices, SlicingStruc *colSlices, 4.483 + SubMatrix **subMatrices, SlaveVP *animPr ) 4.484 + { 4.485 + int32 numRowIdxs, numColIdxs, rowIdx, colIdx, rowOffset; 4.486 + SubMatrix *subMatrix; 4.487 + 4.488 + numRowIdxs = rowSlices->numVals; 4.489 + numColIdxs = colSlices->numVals; 4.490 + 4.491 + for( rowIdx = 0; rowIdx < numRowIdxs; rowIdx++ ) 4.492 + { 4.493 + rowOffset = rowIdx * numColIdxs; 4.494 + for( colIdx = 0; colIdx < numColIdxs; colIdx++ ) 4.495 + { 4.496 + subMatrix = subMatrices[ rowOffset + colIdx ]; 4.497 + if( subMatrix->alreadyCopied ) 4.498 + SSR__free( subMatrix->array, animPr ); 4.499 + SSR__free( subMatrix, animPr ); 4.500 + } 4.501 + } 4.502 + SSR__free( subMatrices, animPr ); 4.503 + } 4.504 + 4.505 + 4.506 + 4.507 +SlicingStruc * 4.508 +sliceUpDimension( float32 idealSizeOfSide, int startVal, int endVal, 4.509 + SlaveVP *animPr ) 4.510 + { float32 residualAcc = 0; 4.511 + int numSlices, i, *startVals, sizeOfSlice, endCondition; 4.512 + SlicingStruc *slicingStruc = SSR__malloc_to(sizeof(SlicingStruc), animPr); 4.513 + 4.514 + //calc size of matrix need to hold start vals -- 4.515 + numSlices = (int32)( (float32)(endVal -startVal +1) / idealSizeOfSide); 4.516 + 4.517 + startVals = SSR__malloc_to( (numSlices + 1) * sizeof(int32), animPr ); 4.518 + 4.519 + //Calc the upper limit of start value -- when get above this, end loop 4.520 + // by saving highest value of the matrix dimension to access, plus 1 4.521 + // as the start point of the imaginary slice following the last one 4.522 + //Plus 1 because go up to value but not include when process last slice 4.523 + //The stopping condition is half-a-size less than highest value because 4.524 + // don't want any pieces smaller than half the ideal size -- just tack 4.525 + // little ones onto end of last one 4.526 + endCondition = endVal - (int) (idealSizeOfSide/2); //end *value*, not size 4.527 + for( i = 0; startVal <= endVal; i++ ) 4.528 + { 4.529 + startVals[i] = startVal; 4.530 + residualAcc += idealSizeOfSide; 4.531 + sizeOfSlice = (int)residualAcc; 4.532 + residualAcc -= (float32)sizeOfSlice; 4.533 + startVal += sizeOfSlice; //ex @size = 2 get 0, 2, 4, 6, 8.. 4.534 + 4.535 + if( startVal > endCondition ) 4.536 + { startVal = endVal + 1; 4.537 + startVals[ i + 1 ] = startVal; 4.538 + } 4.539 + } 4.540 + 4.541 + slicingStruc->startVals = startVals; 4.542 + slicingStruc->numVals = i; //loop incr'd, so == last valid start idx+1 4.543 + // which means is num sub-matrices in dim 4.544 + // also == idx of the fake start just above 4.545 + return slicingStruc; 4.546 + } 4.547 + 4.548 +void 4.549 +freeSlicingStruc( SlicingStruc *slicingStruc, SlaveVP *animPr ) 4.550 + { 4.551 + SSR__free( slicingStruc->startVals, animPr ); 4.552 + SSR__free( slicingStruc, animPr ); 4.553 + } 4.554 + 4.555 + 4.556 +inline int 4.557 +measureMatrixMultPrimitive( SlaveVP *animPr ) 4.558 + { 4.559 + int r, c, v, numCycles; 4.560 + float32 *res, *left, *right; 4.561 + 4.562 + //setup inputs 4.563 + left = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.564 + right = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.565 + res = SSR__malloc_to( 5 * 5 * sizeof( float32 ), animPr ); 4.566 + 4.567 + for( r = 0; r < 5; r++ ) 4.568 + { 4.569 + for( c = 0; c < 5; c++ ) 4.570 + { 4.571 + left[ r * 5 + c ] = r; 4.572 + right[ r * 5 + c ] = c; 4.573 + } 4.574 + } 4.575 + 4.576 + //do primitive 4.577 + SSR__start_primitive(); //for now, just takes time stamp 4.578 + for( r = 0; r < 5; r++ ) 4.579 + { 4.580 + for( c = 0; c < 5; c++ ) 4.581 + { 4.582 + for( v = 0; v < 5; v++ ) 4.583 + { 4.584 + res[ r * 5 + c ] = left[ r * 5 + v ] * right[ v * 5 + c ]; 4.585 + } 4.586 + } 4.587 + } 4.588 + numCycles = 4.589 + SSR__end_primitive_and_give_cycles(); 4.590 + 4.591 + SSR__free( left, animPr ); 4.592 + SSR__free( right, animPr ); 4.593 + SSR__free( res, animPr ); 4.594 + 4.595 + return numCycles; 4.596 + } 4.597 +
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 5.2 +++ b/VSs__Hello_World/VSs__Hello_World.h Wed May 23 12:39:19 2012 -0700 5.3 @@ -0,0 +1,94 @@ 5.4 +/* 5.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 5.6 + * Licensed under GNU General Public License version 2 5.7 + */ 5.8 + 5.9 +#ifndef _SSR_MATRIX_MULT_H_ 5.10 +#define _SSR_MATRIX_MULT_H_ 5.11 + 5.12 +#include <stdio.h> 5.13 + 5.14 +#include "SSR_impl/SSR.h" 5.15 +#include "../Matrix_Mult.h" 5.16 + 5.17 + 5.18 +//=============================== Defines ============================== 5.19 +#define ROWS_IN_BLOCK 32 5.20 +#define COLS_IN_BLOCK 32 5.21 +#define VEC_IN_BLOCK 32 5.22 + 5.23 +#define copyMatrixSingleton 1 5.24 +#define copyTransposeSingleton 2 5.25 + 5.26 +//============================== Structures ============================== 5.27 +typedef struct 5.28 + { 5.29 + Matrix *leftMatrix; 5.30 + Matrix *rightMatrix; 5.31 + Matrix *resultMatrix; 5.32 + } 5.33 +DividerParams; 5.34 + 5.35 +typedef struct 5.36 + { 5.37 + SlaveVP *dividerPr; 5.38 + int numRows; 5.39 + int numCols; 5.40 + int numSubMatrixPairs; 5.41 + float32 *resultArray; 5.42 + } 5.43 +ResultsParams; 5.44 + 5.45 +typedef struct 5.46 + { int32 numRows; 5.47 + int32 numCols; 5.48 + Matrix *origMatrix; 5.49 + int32 origStartRow; 5.50 + int32 origStartCol; 5.51 + int32 alreadyCopied; 5.52 + int32 numUsesLeft; //have update via message to avoid multiple writers 5.53 + SSRSingleton *copySingleton; 5.54 + SSRSingleton *copyTransSingleton; 5.55 + float32 *array; //2D, but dynamically sized, so use addr arith 5.56 + } 5.57 +SubMatrix; 5.58 + 5.59 +typedef struct 5.60 + { SlaveVP *resultPr; 5.61 + SubMatrix *leftSubMatrix; 5.62 + SubMatrix *rightSubMatrix; 5.63 + float32 *partialResultArray; 5.64 + } 5.65 +SMPairParams; 5.66 + 5.67 +typedef struct 5.68 + { int32 numVals; 5.69 + int32 *startVals; 5.70 + } 5.71 +SlicingStruc; 5.72 + 5.73 +typedef struct 5.74 + { 5.75 + SlicingStruc *leftRowSlices; 5.76 + SlicingStruc *vecSlices; 5.77 + SlicingStruc *rightColSlices; 5.78 + } 5.79 +SlicingStrucCarrier; 5.80 + 5.81 +enum MMMsgType 5.82 + { 5.83 + RESULTS_MSG = 1 5.84 + }; 5.85 + 5.86 +//============================= Processor Functions ========================= 5.87 +void divideWorkIntoSubMatrixPairProcrs( void *data, SlaveVP *animatingPr ); 5.88 +void calcSubMatrixProduct( void *data, SlaveVP *animatingPr ); 5.89 +void gatherResults( void *data, SlaveVP *animatingPr ); 5.90 + 5.91 + 5.92 +//================================ Entry Point ============================== 5.93 +Matrix * 5.94 +multiplyTheseMatrices( Matrix *leftMatrix, Matrix *rightMatrix ); 5.95 + 5.96 + 5.97 +#endif /*_SSR_MATRIX_MULT_H_*/
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 6.2 +++ b/__brch__default Wed May 23 12:39:19 2012 -0700 6.3 @@ -0,0 +1,1 @@ 6.4 +Applications normally have only the default branch -- they shouldn't be affected by any choices in VMS or language..
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 7.2 +++ b/main.c Wed May 23 12:39:19 2012 -0700 7.3 @@ -0,0 +1,35 @@ 7.4 +/* 7.5 + * Copyright Oct 24, 2009 OpenSourceStewardshipFoundation.org 7.6 + * Licensed under GNU General Public License version 2 7.7 + * 7.8 + * author seanhalle@yahoo.com 7.9 + */ 7.10 + 7.11 +#include <malloc.h> 7.12 +#include <stdlib.h> 7.13 + 7.14 +#include "Matrix_Mult.h" 7.15 +#include "SSR_Matrix_Mult/SSR_Matrix_Mult.h" 7.16 + 7.17 +/** 7.18 + * 7.19 + */ 7.20 +int main( int argc, char **argv ) 7.21 + { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 7.22 + ParamBag *paramBag; 7.23 + 7.24 + DEBUG__printf2(TRUE, "arguments: %s | %s", argv[0], argv[1] ); 7.25 + 7.26 + paramBag = makeParamBag(); 7.27 + readParamFileIntoBag( argv[1], paramBag ); 7.28 + initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag ); 7.29 + 7.30 + resultMatrix = multiplyTheseMatrices( leftMatrix, rightMatrix ); 7.31 + 7.32 + printf("\nresult matrix: \n"); 7.33 + printMatrix( resultMatrix ); 7.34 + 7.35 + fflush(stdin); 7.36 + 7.37 + exit(0); //cleans up 7.38 + }
