changeset 4:cbd8db6b8657

Fixed last bugs in matrix multiply code -- it now gives correct answers consistently. Needed to initialize the result matrices to 0 because the multiply accumulates into them; fixed a bug in the sequential bypass where the wrong array was passed; and fixed a problem with the end-conditions in the blocked multiply loop nest.
author Me
date Thu, 04 Nov 2010 17:44:15 -0700
parents 4e14e2663af9
children 0a8667d13627
files src/Application/SSR_Matrix_Mult/Divide_Pr.c src/Application/SSR_Matrix_Mult/Result_Pr.c src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h src/Application/SSR_Matrix_Mult/subMatrix_Pr.c src/Application/main.c
diffstat 5 files changed, 66 insertions(+), 50 deletions(-) [+]
line diff
     1.1 --- a/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Tue Nov 02 17:00:50 2010 -0700
     1.2 +++ b/src/Application/SSR_Matrix_Mult/Divide_Pr.c	Thu Nov 04 17:44:15 2010 -0700
     1.3 @@ -122,7 +122,7 @@
     1.4     SlicingStrucCarrier *slicingStrucCarrier;
     1.5     float32         *resultArray; //points to array inside result matrix
     1.6     
     1.7 -         DEBUG("start divide\n")
     1.8 +         DEBUG( dbgAppFlow, "start divide\n")
     1.9  
    1.10           int32
    1.11           divideProbe = VMS__create_single_interval_probe( "divideProbe",
    1.12 @@ -152,13 +152,16 @@
    1.13         (float32)rightMatrix->numCols  < NUM_CELLS_IN_SEQUENTIAL_CUTOFF )
    1.14      {
    1.15        //====== Do sequential multiply on a single core
    1.16 -            DEBUG("doing sequential")
    1.17 +            DEBUG( dbgAppFlow, "doing sequential")
    1.18  
    1.19 -      //have to transpose the right matrix first
    1.20 +         //zero the result array
    1.21 +      memset( resultArray, 0, numResRows * numResCols * sizeof(float32) );
    1.22 +            
    1.23 +         //transpose the right matrix
    1.24        float32 *
    1.25 -      transRightArray  = SSR__malloc_to( rightMatrix->numRows *
    1.26 -                                         rightMatrix->numCols *
    1.27 -                                         sizeof(float32),        animPr );
    1.28 +      transRightArray  = SSR__malloc_to( rightMatrix->numRows * 
    1.29 +                                         rightMatrix->numCols * sizeof(float32),
    1.30 +                                         animPr );
    1.31  
    1.32           //copy values from orig matrix to local
    1.33        copyTranspose( rightMatrix->numRows, rightMatrix->numCols,
    1.34 @@ -166,7 +169,7 @@
    1.35                       transRightArray, rightMatrix->array );
    1.36        
    1.37        multiplyMatrixArraysTransposed( vectLength, numResRows, numResCols,
    1.38 -                            leftMatrix->array, rightMatrix->array,
    1.39 +                            leftMatrix->array, transRightArray,
    1.40                              resultArray );
    1.41      }
    1.42     else
    1.43 @@ -211,7 +214,7 @@
    1.44     //===============  Work done -- send results back =================
    1.45  
    1.46  
    1.47 -         DEBUG_MSG( dbgAppFlow, "end divide\n")
    1.48 +         DEBUG( dbgAppFlow, "end divide\n")
    1.49  
    1.50           VMS__record_interval_end_in_probe( divideProbe );
    1.51           VMS__print_stats_of_all_probes();
    1.52 @@ -417,6 +420,7 @@
    1.53               { coreToScheduleOnto += 1;
    1.54               }
    1.55            }
    1.56 + 
    1.57         }
    1.58      }
    1.59  
     2.1 --- a/src/Application/SSR_Matrix_Mult/Result_Pr.c	Tue Nov 02 17:00:50 2010 -0700
     2.2 +++ b/src/Application/SSR_Matrix_Mult/Result_Pr.c	Thu Nov 04 17:44:15 2010 -0700
     2.3 @@ -34,7 +34,7 @@
     2.4     void           *msg;
     2.5     SMPairParams   *resParams;
     2.6  
     2.7 -         DEBUG("start resultPr\n")
     2.8 +         DEBUG( dbgAppFlow, "start resultPr\n")
     2.9           
    2.10     params    = (ResultsParams *)_params;
    2.11     dividerPr = params->dividerPr;
     3.1 --- a/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Tue Nov 02 17:00:50 2010 -0700
     3.2 +++ b/src/Application/SSR_Matrix_Mult/SSR_Matrix_Mult.h	Thu Nov 04 17:44:15 2010 -0700
     3.3 @@ -20,8 +20,6 @@
     3.4  #define copyMatrixSingleton 1
     3.5  #define copyTransposeSingleton 2
     3.6  
     3.7 -#define DEBUG(msg) //printf(msg); fflush(stdin);
     3.8 -
     3.9  //==============================  Structures  ==============================
    3.10  typedef struct
    3.11   {
     4.1 --- a/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Tue Nov 02 17:00:50 2010 -0700
     4.2 +++ b/src/Application/SSR_Matrix_Mult/subMatrix_Pr.c	Thu Nov 04 17:44:15 2010 -0700
     4.3 @@ -48,7 +48,7 @@
     4.4     float32        *leftArray,  *rightArray, *resArray;
     4.5     SubMatrix      *leftSubMatrix, *rightSubMatrix;
     4.6  
     4.7 -         DEBUG("start sub-matrix mult\n")
     4.8 +         DEBUG1(dbgAppFlow, "start sub-matrix mult: %d\n", animatingPr->procrID)
     4.9  
    4.10     params         = (SMPairParams *)data;
    4.11     resultPr       = params->resultPr;
    4.12 @@ -63,8 +63,10 @@
    4.13     leftArray      = leftSubMatrix->array;
    4.14     rightArray     = rightSubMatrix->array;
    4.15  
    4.16 -   resArray = SSR__malloc_to(leftSubMatrix->numRows * rightSubMatrix->numCols
    4.17 -                             * sizeof( float32 ), animatingPr );
    4.18 +   int32
    4.19 +   resSize = leftSubMatrix->numRows * rightSubMatrix->numCols * sizeof(float32);
    4.20 +   resArray = SSR__malloc_to( resSize, animatingPr );
    4.21 +   memset( resArray, 0, resSize );
    4.22  
    4.23  
    4.24     int32 numResRows, numResCols, vectLength;
    4.25 @@ -84,97 +86,107 @@
    4.26   }
    4.27  
    4.28  
    4.29 -/*Divides into 32x32 sub-matrices, 3 of which fit into 32KB L1 cache
    4.30 - * Would be nice to embed this within another level that divided into
    4.31 +
    4.32 +/*Divides result and each input into 32x32 sub-matrices, 3 of which fit into
    4.33 + * the 32KB L1 cache.
    4.34 + *Would be nice to embed this within another level that divided into
    4.35   * 8x8 tiles of those, where one 8x8 tile fits within 2MB L2 cache
    4.36   *
    4.37   *Eventually want these divisions to be automatic, using DKU pattern
    4.38 - * embedded into SSR, and with VMS controlling the divisions according to
    4.39 - * the cache sizes, which it knows about.
    4.40 - *And, want VMS to work with language to split among main-mems, so a socket
    4.41 + * embedded into VMS and exposed in the language, and with VMS controlling the
    4.42 + * divisions according to the cache sizes, which it knows about.
    4.43 + *Also, want VMS to work with language to split among main-mems, so a socket
    4.44   * only cranks on data in its local segment of main mem
    4.45   *
    4.46 + *So, outer two loops determine start and end points within the result matrix.
    4.47 + * Inside that, a loop dets the start and end points along the shared dimensions
    4.48 + * of the two input matrices.
    4.49   */
    4.50  void inline
    4.51  multiplyMatrixArraysTransposed( int32 vecLength, int32 numResRows,
    4.52                                  int32 numResCols,
    4.53 -                      float32 *leftArray, float32 *rightArray,
    4.54 -                      float32 *resArray )
    4.55 +                                float32 *leftArray, float32 *rightArray,
    4.56 +                                float32 *resArray )
    4.57   {
    4.58     int resStride, inpStride;
    4.59 -   int startRow, startCol, endRow, endCol, startVec, endVec;
    4.60 +   int resStartRow, resStartCol, resEndRow, resEndCol, startVec, endVec;
    4.61  
    4.62     resStride  = numResCols;
    4.63     inpStride  = vecLength;
    4.64  
    4.65 -   for( startRow = 0; startRow < numResRows; )
    4.66 +   for( resStartRow = 0; resStartRow < numResRows; )
    4.67      {
    4.68 -      endRow = startRow + ROWS_IN_BLOCK;
    4.69 -      if( endRow > numResRows ) endRow = numResRows;
    4.70 +      resEndRow = resStartRow + ROWS_IN_BLOCK -1;  //start at zero, so -1
    4.71 +      if( resEndRow > numResRows ) resEndRow = numResRows -1;
    4.72  
    4.73 -      for( startCol = 0; startCol < numResCols; )
    4.74 +      for( resStartCol = 0; resStartCol < numResCols; )
    4.75         {
    4.76 -         endCol   = startCol + COLS_IN_BLOCK;
    4.77 -         if( endCol > numResCols ) endCol = numResCols;
    4.78 +         resEndCol   = resStartCol + COLS_IN_BLOCK -1;
    4.79 +         if( resEndCol > numResCols ) resEndCol = numResCols -1;
    4.80  
    4.81           for( startVec = 0; startVec < vecLength; )
    4.82            {
    4.83 -            endVec   = startVec + VEC_IN_BLOCK;
    4.84 -            if( endVec > vecLength ) endVec = vecLength;
    4.85 +            endVec   = startVec + VEC_IN_BLOCK -1;
    4.86 +            if( endVec > vecLength ) endVec = vecLength -1;
    4.87  
    4.88                 //By having the "vector" of sub-blocks in a sub-block slice
    4.89                 // be marched down in inner loop, are re-using the result
    4.90 -               // matrix, which stays in L1 cache -- can only re-use one of
    4.91 -               // the three, so this is the most important -- avoids writing
    4.92 +               // matrix, which stays in L1 cache and re-using the left sub-mat
    4.93 +               // which repeats for each right sub-mat -- can only re-use two of
    4.94 +               // the three, so result is the most important -- avoids writing
    4.95                 // dirty blocks until those result-locations fully done
    4.96                 //Row and Col is position in result matrix -- so row and vec
    4.97                 // for left array, then vec and col for right array
    4.98              multiplySubBlocksTransposed( leftArray, rightArray,
    4.99                                           resArray,
   4.100 -                                         startRow,  endRow,
   4.101 -                                         startCol,  endCol,
   4.102 +                                         resStartRow,  resEndRow,
   4.103 +                                         resStartCol,  resEndCol,
   4.104                                           startVec,  endVec,
   4.105                                           resStride, inpStride );
   4.106 -            startVec = endVec;
   4.107 +            startVec = endVec +1;
   4.108            }
   4.109 -         startCol = endCol;
   4.110 +         resStartCol = resEndCol +1;
   4.111         }
   4.112 -      startRow = endRow;
   4.113 +      resStartRow = resEndRow +1;
   4.114      }
   4.115   }
   4.116  
   4.117  
   4.118 +
   4.119  void inline
   4.120  multiplySubBlocksTransposed( float32 *leftArray, float32 *rightArray,
   4.121                       float32 *resArray,
   4.122 -                     int startRow,  int endRow,
   4.123 -                     int startCol,  int endCol,
   4.124 +                     int resStartRow,  int resEndRow,
   4.125 +                     int resStartCol,  int resEndCol,
   4.126                       int startVec,  int endVec,
   4.127                       int resStride, int inpStride )
   4.128   {
   4.129 -   int row,    col,        vec;
   4.130 +   int resRow,     resCol,        vec;
   4.131     int leftOffset, rightOffset;
   4.132     float32 result;
   4.133 -   
   4.134 -   for( row = startRow; row < endRow; row++ )
   4.135 -    { 
   4.136 -      for( col = startCol; col < endCol; col++ )
   4.137 -       { 
   4.138 -         leftOffset  = row * inpStride;//left & right inp strides always same
   4.139 -         rightOffset = col * inpStride;// because right is transposed
   4.140 +
   4.141 +      //The result row is used only for the left matrix, res col for the right
   4.142 +   for( resCol = resStartCol; resCol <= resEndCol; resCol++ )
   4.143 +    {
   4.144 +      for( resRow = resStartRow; resRow <= resEndRow; resRow++ )
   4.145 +       {
   4.146 +         leftOffset  = resRow * inpStride;//left & right inp strides always same
   4.147 +         rightOffset = resCol * inpStride;// because right is transposed
   4.148           result = 0;
   4.149 -         for( vec = startVec; vec < endVec; vec++ )
   4.150 +         for( vec = startVec; vec <= endVec; vec++ )
   4.151            {
   4.152              result +=
   4.153                 leftArray[ leftOffset + vec] * rightArray[ rightOffset + vec];
   4.154            }
   4.155 -         
   4.156 -         resArray[ row * resStride + col ] += result;
   4.157 +
   4.158 +         resArray[ resRow * resStride + resCol ] += result;
   4.159         }
   4.160      }
   4.161   }
   4.162  
   4.163  
   4.164 +
   4.165 +
   4.166  /*Reuse this in divider when do the sequential multiply case
   4.167   */
   4.168  void inline
     5.1 --- a/src/Application/main.c	Tue Nov 02 17:00:50 2010 -0700
     5.2 +++ b/src/Application/main.c	Thu Nov 04 17:44:15 2010 -0700
     5.3 @@ -19,6 +19,8 @@
     5.4   { Matrix      *leftMatrix, *rightMatrix, *resultMatrix;
     5.5     ParamBag    *paramBag;
     5.6     
     5.7 +   printf( "arguments: %s | %s\n", argv[0], argv[1] );
     5.8 +
     5.9     paramBag = makeParamBag();
    5.10     readParamFileIntoBag( argv[1], paramBag );
    5.11     initialize_Input_Matrices_Via( &leftMatrix, &rightMatrix, paramBag );