Mercurial > cgi-bin > hgwebdir.cgi > PR > Applications > SSR > SSR__Blocked_Matrix_Mult__Bench
changeset 4:cbd8db6b8657
Fixed last bugs in matrix multiply code -- gives correct answers consistently
Needed to initialize the result matrices to 0, because the multiply accumulates into them;
also fixed a bug in the sequential bypass, where the wrong (untransposed) right array was passed,
and fixed a problem with the end-conditions in the blocked multiply loop nest
| author | Me |
|---|---|
| date | Thu, 04 Nov 2010 17:44:15 -0700 |
| parents | 4e14e2663af9 |
| children | 0a8667d13627 |
| files | src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/subMatrix_Pr.c src/Application/main.c |
| diffstat | 5 files changed, 66 insertions(+), 50 deletions(-) [+] |
line diff
1.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c Tue Nov 02 17:00:50 2010 -0700 1.2 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c Thu Nov 04 17:44:15 2010 -0700 1.3 @@ -122,7 +122,7 @@ 1.4 SlicingStrucCarrier *slicingStrucCarrier; 1.5 float32 *resultArray; //points to array inside result matrix 1.6 1.7 - DEBUG("start divide\n") 1.8 + DEBUG( dbgAppFlow, "start divide\n") 1.9 1.10 int32 1.11 divideProbe = VMS__create_single_interval_probe( "divideProbe", 1.12 @@ -152,13 +152,16 @@ 1.13 (float32)rightMatrix->numCols < NUM_CELLS_IN_SEQUENTIAL_CUTOFF ) 1.14 { 1.15 //====== Do sequential multiply on a single core 1.16 - DEBUG("doing sequential") 1.17 + DEBUG( dbgAppFlow, "doing sequential") 1.18 1.19 - //have to transpose the right matrix first 1.20 + //zero the result array 1.21 + memset( resultArray, 0, numResRows * numResCols * sizeof(float32) ); 1.22 + 1.23 + //transpose the right matrix 1.24 float32 * 1.25 - transRightArray = SSR__malloc_to( rightMatrix->numRows * 1.26 - rightMatrix->numCols * 1.27 - sizeof(float32), animPr ); 1.28 + transRightArray = SSR__malloc_to( rightMatrix->numRows * 1.29 + rightMatrix->numCols * sizeof(float32), 1.30 + animPr ); 1.31 1.32 //copy values from orig matrix to local 1.33 copyTranspose( rightMatrix->numRows, rightMatrix->numCols, 1.34 @@ -166,7 +169,7 @@ 1.35 transRightArray, rightMatrix->array ); 1.36 1.37 multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols, 1.38 - leftMatrix->array, rightMatrix->array, 1.39 + leftMatrix->array, transRightArray, 1.40 resultArray ); 1.41 } 1.42 else 1.43 @@ -211,7 +214,7 @@ 1.44 //=============== Work done -- send results back ================= 1.45 1.46 1.47 - DEBUG_MSG( dbgAppFlow, "end divide\n") 1.48 + DEBUG( dbgAppFlow, "end divide\n") 1.49 1.50 VMS__record_interval_end_in_probe( divideProbe ); 1.51 VMS__print_stats_of_all_probes(); 1.52 @@ -417,6 +420,7 @@ 1.53 { coreToScheduleOnto += 1; 1.54 } 1.55 } 1.56 + 1.57 } 1.58 } 1.59
2.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c Tue Nov 02 17:00:50 2010 -0700 2.2 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c Thu Nov 04 17:44:15 2010 -0700 2.3 @@ -34,7 +34,7 @@ 2.4 void *msg; 2.5 SMPairParams *resParams; 2.6 2.7 - DEBUG("start resultPr\n") 2.8 + DEBUG( dbgAppFlow, "start resultPr\n") 2.9 2.10 params = (ResultsParams *)_params; 2.11 dividerPr = params->dividerPr;
3.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Tue Nov 02 17:00:50 2010 -0700 3.2 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h Thu Nov 04 17:44:15 2010 -0700 3.3 @@ -20,8 +20,6 @@ 3.4 #define copyMatrixSingleton 1 3.5 #define copyTransposeSingleton 2 3.6 3.7 -#define DEBUG(msg) //printf(msg); fflush(stdin); 3.8 - 3.9 //============================== Structures ============================== 3.10 typedef struct 3.11 {
4.1 --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Tue Nov 02 17:00:50 2010 -0700 4.2 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c Thu Nov 04 17:44:15 2010 -0700 4.3 @@ -48,7 +48,7 @@ 4.4 float32 *leftArray, *rightArray, *resArray; 4.5 SubMatrix *leftSubMatrix, *rightSubMatrix; 4.6 4.7 - DEBUG("start sub-matrix mult\n") 4.8 + DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID) 4.9 4.10 params = (SMPairParams *)data; 4.11 resultPr = params->resultPr; 4.12 @@ -63,8 +63,10 @@ 4.13 leftArray = leftSubMatrix->array; 4.14 rightArray = rightSubMatrix->array; 4.15 4.16 - resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols 4.17 - * sizeof( float32 ), animatingPr ); 4.18 + int32 4.19 + resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32); 4.20 + resArray = SSR__malloc_to( resSize, animatingPr ); 4.21 + memset( resArray, 0, resSize ); 4.22 4.23 4.24 int32 numResRows, numResCols, vectLength; 4.25 @@ -84,97 +86,107 @@ 4.26 } 4.27 4.28 4.29 -/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache 4.30 - * Would be nice to embed this within another level that divided into 4.31 + 4.32 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into 4.33 + * the 32KB L1 cache. 4.34 + *Would be nice to embed this within another level that divided into 4.35 * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache 4.36 * 4.37 *Eventually want these divisions to be automatic, using DKU pattern 4.38 - * embedded into SSR, and with VMS controlling the divisions according to 4.39 - * the cache sizes, which it knows about. 4.40 - *And, want VMS to work with language to split among main-mems, so a socket 4.41 + * embedded into VMS and exposed in the language, and with VMS controlling the 4.42 + * divisions according to the cache sizes, which it knows about. 
4.43 + *Also, want VMS to work with language to split among main-mems, so a socket 4.44 * only cranks on data in its local segment of main mem 4.45 * 4.46 + *So, outer two loops determine start and end points within the result matrix. 4.47 + * Inside that, a loop dets the start and end points along the shared dimensions 4.48 + * of the two input matrices. 4.49 */ 4.50 void inline 4.51 multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows, 4.52 int32 numResCols, 4.53 - float32 *leftArray, float32 *rightArray, 4.54 - float32 *resArray ) 4.55 + float32 *leftArray, float32 *rightArray, 4.56 + float32 *resArray ) 4.57 { 4.58 int resStride, inpStride; 4.59 - int startRow, startCol, endRow, endCol, startVec, endVec; 4.60 + int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec; 4.61 4.62 resStride = numResCols; 4.63 inpStride = vecLength; 4.64 4.65 - for( startRow = 0; startRow < numResRows; ) 4.66 + for( resStartRow = 0; resStartRow < numResRows; ) 4.67 { 4.68 - endRow = startRow + ROWS_IN_BLOCK; 4.69 - if( endRow > numResRows ) endRow = numResRows; 4.70 + resEndRow = resStartRow + ROWS_IN_BLOCK -1; //start at zero, so -1 4.71 + if( resEndRow > numResRows ) resEndRow = numResRows -1; 4.72 4.73 - for( startCol = 0; startCol < numResCols; ) 4.74 + for( resStartCol = 0; resStartCol < numResCols; ) 4.75 { 4.76 - endCol = startCol + COLS_IN_BLOCK; 4.77 - if( endCol > numResCols ) endCol = numResCols; 4.78 + resEndCol = resStartCol + COLS_IN_BLOCK -1; 4.79 + if( resEndCol > numResCols ) resEndCol = numResCols -1; 4.80 4.81 for( startVec = 0; startVec < vecLength; ) 4.82 { 4.83 - endVec = startVec + VEC_IN_BLOCK; 4.84 - if( endVec > vecLength ) endVec = vecLength; 4.85 + endVec = startVec + VEC_IN_BLOCK -1; 4.86 + if( endVec > vecLength ) endVec = vecLength -1; 4.87 4.88 //By having the "vector" of sub-blocks in a sub-block slice 4.89 // be marched down in inner loop, are re-using the result 4.90 - // matrix, which stays in L1 cache -- can only re-use 
one of 4.91 - // the three, so this is the most important -- avoids writing 4.92 + // matrix, which stays in L1 cache and re-using the left sub-mat 4.93 + // which repeats for each right sub-mat -- can only re-use two of 4.94 + // the three, so result is the most important -- avoids writing 4.95 // dirty blocks until those result-locations fully done 4.96 //Row and Col is position in result matrix -- so row and vec 4.97 // for left array, then vec and col for right array 4.98 multiplySubBlocksTransposed( leftArray, rightArray, 4.99 resArray, 4.100 - startRow, endRow, 4.101 - startCol, endCol, 4.102 + resStartRow, resEndRow, 4.103 + resStartCol, resEndCol, 4.104 startVec, endVec, 4.105 resStride, inpStride ); 4.106 - startVec = endVec; 4.107 + startVec = endVec +1; 4.108 } 4.109 - startCol = endCol; 4.110 + resStartCol = resEndCol +1; 4.111 } 4.112 - startRow = endRow; 4.113 + resStartRow = resEndRow +1; 4.114 } 4.115 } 4.116 4.117 4.118 + 4.119 void inline 4.120 multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray, 4.121 float32 *resArray, 4.122 - int startRow, int endRow, 4.123 - int startCol, int endCol, 4.124 + int resStartRow, int resEndRow, 4.125 + int resStartCol, int resEndCol, 4.126 int startVec, int endVec, 4.127 int resStride, int inpStride ) 4.128 { 4.129 - int row, col, vec; 4.130 + int resRow, resCol, vec; 4.131 int leftOffset, rightOffset; 4.132 float32 result; 4.133 - 4.134 - for( row = startRow; row < endRow; row++ ) 4.135 - { 4.136 - for( col = startCol; col < endCol; col++ ) 4.137 - { 4.138 - leftOffset = row * inpStride;//left & right inp strides always same 4.139 - rightOffset = col * inpStride;// because right is transposed 4.140 + 4.141 + //The result row is used only for the left matrix, res col for the right 4.142 + for( resCol = resStartCol; resCol <= resEndCol; resCol++ ) 4.143 + { 4.144 + for( resRow = resStartRow; resRow <= resEndRow; resRow++ ) 4.145 + { 4.146 + leftOffset = resRow * inpStride;//left & right inp strides 
always same 4.147 + rightOffset = resCol * inpStride;// because right is transposed 4.148 result = 0; 4.149 - for( vec = startVec; vec < endVec; vec++ ) 4.150 + for( vec = startVec; vec <= endVec; vec++ ) 4.151 { 4.152 result += 4.153 leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec]; 4.154 } 4.155 - 4.156 - resArray[ row * resStride + col ] += result; 4.157 + 4.158 + resArray[ resRow * resStride + resCol ] += result; 4.159 } 4.160 } 4.161 } 4.162 4.163 4.164 + 4.165 + 4.166 /*Reuse this in divider when do the sequential multiply case 4.167 */ 4.168 void inline
5.1 --- a/src/Application/main.c Tue Nov 02 17:00:50 2010 -0700 5.2 +++ b/src/Application/main.c Thu Nov 04 17:44:15 2010 -0700 5.3 @@ -19,6 +19,8 @@ 5.4 { Matrix *leftMatrix, *rightMatrix, *resultMatrix; 5.5 ParamBag *paramBag; 5.6 5.7 + printf( "arguments: %s | %s\n", argv[0], argv[1] ); 5.8 + 5.9 paramBag = makeParamBag(); 5.10 readParamFileIntoBag( argv[1], paramBag ); 5.11 initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );
