Hiding computation in a subroutine

我们首先重写基本实现，将内部循环隐藏在子程序AddDot中:
Optimization1 · flame/how-to-optimize-gemm Wiki (github.com)
Optimization2 · flame/how-to-optimize-gemm Wiki (github.com)

这一步不会产生任何性能提升：

它其实是为我们下一步做好准备。

Optimization1

这里最原始的矩阵乘代码：

/* Create macros so that the matrices are stored in column-major order */

//创建宏，使矩阵是列主序
#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Routine for computing C = A * B + C */

void MY_MMult( int m, int n, int k, double *a, int lda, 
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  int i, j, p;
  //loop i j p
  for ( i=0; i<m; i++ ){        /* Loop over the rows of C 遍历C的行 */   
    for ( j=0; j<n; j++ ){        /* Loop over the columns of C 遍历C的列 */
      for ( p=0; p<k; p++ ){        /* Update C( i,j ) with the inner
				       product of the ith row of A and
				       the jth column of B */
    //A的一行B的一列更新C(i,j)
	C( i,j ) = C( i,j ) +  A( i,p ) * B( p,j );
      }
    }
  }
}

拆分内部循环，把乘加运算放在子程序AddDot中:

/* Create macros so that the matrices are stored in column-major order */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Routine for computing C = A * B + C */

void AddDot( int, double *, int, double *, double * );

void MY_MMult( int m, int n, int k, double *a, int lda, 
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  int i, j;
	
   //loop j i p   在这里更改了循环变量的顺序
  for ( j=0; j<n; j+=1 ){        /* Loop over the columns of C */
    for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
      /* Update the C( i,j ) with the inner product of the ith row of A
	 and the jth column of B */
	  //拆分内部循环(循环变量p)，把乘加运算放在子程序AddDot中:
      //A的第i行，B的第j列
      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );
    }
  }
}


/* Create macro to let X( i ) equal the ith element of x */

#define X(i) x[ (i)*incx ]

void AddDot( int k, double *x, int incx,  double *y, double *gamma )
{
  /* compute gamma := x' * y + gamma with vectors x and y of length n.

     Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
  */
 
  int p;
  //列主序，同行访问带跨步，同列访问无需跨步。跨步大小lda
  for ( p=0; p<k; p++ ){
    *gamma += X( p ) * y[ p ];     
  }
}

Optimization2

/* Create macros so that the matrices are stored in column-major order */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Routine for computing C = A * B + C */

void AddDot( int, double *, int, double *, double * );

void MY_MMult( int m, int n, int k, double *a, int lda, 
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  int i, j;
  //在这里对C的列进行了循环展开，展开数为4。列主序
  for ( j=0; j<n; j+=4 ){        /* Loop over the columns of C, unrolled by 4 */
    for ( i=0; i<m; i+=1 ){        /* Loop over the rows of C */
      /* Update the C( i,j ) with the inner product of the ith row of A
	 and the jth column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j ), &C( i,j ) );

      /* Update the C( i,j+1 ) with the inner product of the ith row of A
	 and the (j+1)th column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j+1 ), &C( i,j+1 ) );

      /* Update the C( i,j+2 ) with the inner product of the ith row of A
	 and the (j+2)th column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j+2 ), &C( i,j+2 ) );

      /* Update the C( i,j+3 ) with the inner product of the ith row of A
	 and the (j+1)th column of B */

      AddDot( k, &A( i,0 ), lda, &B( 0,j+3 ), &C( i,j+3 ) );
    }
  }
}


/* Create macro to let X( i ) equal the ith element of x */

#define X(i) x[ (i)*incx ]

//内层核心相较于上次来说，并没有修改
void AddDot( int k, double *x, int incx,  double *y, double *gamma )
{
  /* compute gamma := x' * y + gamma with vectors x and y of length n.

     Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1.
  */
 
  int p;

  for ( p=0; p<k; p++ ){
    *gamma += X( p ) * y[ p ];     
  }
}

HPC

blas gemm gemv

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

Computing four elements at a time 上一篇

BLAS（Basic Linear Algebra Subprograms）-基础线性代数子程序库下一篇