Further optimizing

Further optimizing

现在我们开始以不同的方式优化1x4的情况。

请注意,我们现在使用的常规寄存器比物理上可用的寄存器多得多……

We notice a considerable performance boost:

img

img

不过,还有很大的改进空间。

Optimization_4x4_8

现在我们使用寄存器来存储B当前行的元素。(注意,对于一次计算C四个元素的情况,我们没有这样做。)性能实际上略有下降。但是这个步骤支持进一步的优化。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

/* Create macros so that the matrices are stored in column-major order */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Routine for computing C = A * B + C */

void AddDot4x4( int, double *, int, double *, int, double *, int );

void MY_MMult( int m, int n, int k, double *a, int lda,
double *b, int ldb,
double *c, int ldc )
{
int i, j;

for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
/* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
one routine (four inner products) */

AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}


void AddDot4x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc )
{
/* So, this routine computes a 4x4 block of matrix A

C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).
C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).
C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).
C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).

Notice that this routine is called with c = C( i, j ) in the
previous routine, so these are actually the elements

C( i , j ), C( i , j+1 ), C( i , j+2 ), C( i , j+3 )
C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 )
C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 )
C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 )

in the original matrix C

In this version, we use registers for elements in the current row
of B as well */

int p;
register double
/* hold contributions to
C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 )
C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 )
C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 )
C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ) */
c_00_reg, c_01_reg, c_02_reg, c_03_reg,
c_10_reg, c_11_reg, c_12_reg, c_13_reg,
c_20_reg, c_21_reg, c_22_reg, c_23_reg,
c_30_reg, c_31_reg, c_32_reg, c_33_reg,
/* hold
A( 0, p )
A( 1, p )
A( 2, p )
A( 3, p ) */
a_0p_reg,
a_1p_reg,
a_2p_reg,
a_3p_reg,
b_p0_reg,
b_p1_reg,
b_p2_reg,
b_p3_reg;

double
/* Point to the current elements in the four columns of B */
*b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr;

b_p0_pntr = &B( 0, 0 );
b_p1_pntr = &B( 0, 1 );
b_p2_pntr = &B( 0, 2 );
b_p3_pntr = &B( 0, 3 );

c_00_reg = 0.0; c_01_reg = 0.0; c_02_reg = 0.0; c_03_reg = 0.0;
c_10_reg = 0.0; c_11_reg = 0.0; c_12_reg = 0.0; c_13_reg = 0.0;
c_20_reg = 0.0; c_21_reg = 0.0; c_22_reg = 0.0; c_23_reg = 0.0;
c_30_reg = 0.0; c_31_reg = 0.0; c_32_reg = 0.0; c_33_reg = 0.0;

for ( p=0; p<k; p++ ){
a_0p_reg = A( 0, p );
a_1p_reg = A( 1, p );
a_2p_reg = A( 2, p );
a_3p_reg = A( 3, p );

b_p0_reg = *b_p0_pntr++;
b_p1_reg = *b_p1_pntr++;
b_p2_reg = *b_p2_pntr++;
b_p3_reg = *b_p3_pntr++;

/* First row */
c_00_reg += a_0p_reg * b_p0_reg;
c_01_reg += a_0p_reg * b_p1_reg;
c_02_reg += a_0p_reg * b_p2_reg;
c_03_reg += a_0p_reg * b_p3_reg;

/* Second row */
c_10_reg += a_1p_reg * b_p0_reg;
c_11_reg += a_1p_reg * b_p1_reg;
c_12_reg += a_1p_reg * b_p2_reg;
c_13_reg += a_1p_reg * b_p3_reg;

/* Third row */
c_20_reg += a_2p_reg * b_p0_reg;
c_21_reg += a_2p_reg * b_p1_reg;
c_22_reg += a_2p_reg * b_p2_reg;
c_23_reg += a_2p_reg * b_p3_reg;

/* Four row */
c_30_reg += a_3p_reg * b_p0_reg;
c_31_reg += a_3p_reg * b_p1_reg;
c_32_reg += a_3p_reg * b_p2_reg;
c_33_reg += a_3p_reg * b_p3_reg;
}

C( 0, 0 ) += c_00_reg; C( 0, 1 ) += c_01_reg; C( 0, 2 ) += c_02_reg; C( 0, 3 ) += c_03_reg;
C( 1, 0 ) += c_10_reg; C( 1, 1 ) += c_11_reg; C( 1, 2 ) += c_12_reg; C( 1, 3 ) += c_13_reg;
C( 2, 0 ) += c_20_reg; C( 2, 1 ) += c_21_reg; C( 2, 2 ) += c_22_reg; C( 2, 3 ) += c_23_reg;
C( 3, 0 ) += c_30_reg; C( 3, 1 ) += c_31_reg; C( 3, 2 ) += c_32_reg; C( 3, 3 ) += c_33_reg;
}

Optimization_4x4_9

从4x4_8到4x4_9是一个微妙的变化:我们不是一次一行地更新4x4块C的行,而是一次计算两行。这为我们使用向量操作做好了准备,我们用向量操作更新对C(0,j)和C(1,j) (j =0,…,3)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

/* Create macros so that the matrices are stored in column-major order */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Routine for computing C = A * B + C */

void AddDot4x4( int, double *, int, double *, int, double *, int );

void MY_MMult( int m, int n, int k, double *a, int lda,
double *b, int ldb,
double *c, int ldc )
{
int i, j;

for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
/* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
one routine (four inner products) */

AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}


void AddDot4x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc )
{
/* So, this routine computes a 4x4 block of matrix A

C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).
C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).
C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).
C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).

Notice that this routine is called with c = C( i, j ) in the
previous routine, so these are actually the elements

C( i , j ), C( i , j+1 ), C( i , j+2 ), C( i , j+3 )
C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 )
C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 )
C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 )

in the original matrix C

A simple rearrangement to prepare for the use of vector registers */

int p;
register double
/* hold contributions to
C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 )
C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 )
C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 )
C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ) */
c_00_reg, c_01_reg, c_02_reg, c_03_reg,
c_10_reg, c_11_reg, c_12_reg, c_13_reg,
c_20_reg, c_21_reg, c_22_reg, c_23_reg,
c_30_reg, c_31_reg, c_32_reg, c_33_reg,
/* hold
A( 0, p )
A( 1, p )
A( 2, p )
A( 3, p ) */
a_0p_reg,
a_1p_reg,
a_2p_reg,
a_3p_reg,
b_p0_reg,
b_p1_reg,
b_p2_reg,
b_p3_reg;

double
/* Point to the current elements in the four columns of B */
*b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr;

b_p0_pntr = &B( 0, 0 );
b_p1_pntr = &B( 0, 1 );
b_p2_pntr = &B( 0, 2 );
b_p3_pntr = &B( 0, 3 );

c_00_reg = 0.0; c_01_reg = 0.0; c_02_reg = 0.0; c_03_reg = 0.0;
c_10_reg = 0.0; c_11_reg = 0.0; c_12_reg = 0.0; c_13_reg = 0.0;
c_20_reg = 0.0; c_21_reg = 0.0; c_22_reg = 0.0; c_23_reg = 0.0;
c_30_reg = 0.0; c_31_reg = 0.0; c_32_reg = 0.0; c_33_reg = 0.0;

for ( p=0; p<k; p++ ){
a_0p_reg = A( 0, p );
a_1p_reg = A( 1, p );
a_2p_reg = A( 2, p );
a_3p_reg = A( 3, p );

b_p0_reg = *b_p0_pntr++;
b_p1_reg = *b_p1_pntr++;
b_p2_reg = *b_p2_pntr++;
b_p3_reg = *b_p3_pntr++;

/* First row and second rows */
c_00_reg += a_0p_reg * b_p0_reg;
c_10_reg += a_1p_reg * b_p0_reg;

c_01_reg += a_0p_reg * b_p1_reg;
c_11_reg += a_1p_reg * b_p1_reg;

c_02_reg += a_0p_reg * b_p2_reg;
c_12_reg += a_1p_reg * b_p2_reg;

c_03_reg += a_0p_reg * b_p3_reg;
c_13_reg += a_1p_reg * b_p3_reg;

/* Third and fourth rows */
c_20_reg += a_2p_reg * b_p0_reg;
c_30_reg += a_3p_reg * b_p0_reg;

c_21_reg += a_2p_reg * b_p1_reg;
c_31_reg += a_3p_reg * b_p1_reg;

c_22_reg += a_2p_reg * b_p2_reg;
c_32_reg += a_3p_reg * b_p2_reg;

c_23_reg += a_2p_reg * b_p3_reg;
c_33_reg += a_3p_reg * b_p3_reg;
}

C( 0, 0 ) += c_00_reg; C( 0, 1 ) += c_01_reg; C( 0, 2 ) += c_02_reg; C( 0, 3 ) += c_03_reg;
C( 1, 0 ) += c_10_reg; C( 1, 1 ) += c_11_reg; C( 1, 2 ) += c_12_reg; C( 1, 3 ) += c_13_reg;
C( 2, 0 ) += c_20_reg; C( 2, 1 ) += c_21_reg; C( 2, 2 ) += c_22_reg; C( 2, 3 ) += c_23_reg;
C( 3, 0 ) += c_30_reg; C( 3, 1 ) += c_31_reg; C( 3, 2 ) += c_32_reg; C( 3, 3 ) += c_33_reg;
}

Optimization_4x4_10

在这里,我们开始使用向量寄存器和向量操作。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

/* Create macros so that the matrices are stored in column-major order */

#define A(i,j) a[ (j)*lda + (i) ]
#define B(i,j) b[ (j)*ldb + (i) ]
#define C(i,j) c[ (j)*ldc + (i) ]

/* Routine for computing C = A * B + C */

void AddDot4x4( int, double *, int, double *, int, double *, int );

void MY_MMult( int m, int n, int k, double *a, int lda,
double *b, int ldb,
double *c, int ldc )
{
int i, j;

for ( j=0; j<n; j+=4 ){ /* Loop over the columns of C, unrolled by 4 */
for ( i=0; i<m; i+=4 ){ /* Loop over the rows of C */
/* Update C( i,j ), C( i,j+1 ), C( i,j+2 ), and C( i,j+3 ) in
one routine (four inner products) */

AddDot4x4( k, &A( i,0 ), lda, &B( 0,j ), ldb, &C( i,j ), ldc );
}
}
}

#include <mmintrin.h>
#include <xmmintrin.h> // SSE
#include <pmmintrin.h> // SSE2
#include <emmintrin.h> // SSE3

typedef union
{
__m128d v;
double d[2];
} v2df_t;

void AddDot4x4( int k, double *a, int lda, double *b, int ldb, double *c, int ldc )
{
/* So, this routine computes a 4x4 block of matrix A

C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).
C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).
C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).
C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).

Notice that this routine is called with c = C( i, j ) in the
previous routine, so these are actually the elements

C( i , j ), C( i , j+1 ), C( i , j+2 ), C( i , j+3 )
C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 )
C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 )
C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 )

in the original matrix C

And now we use vector registers and instructions */

int p;

v2df_t
c_00_c_10_vreg, c_01_c_11_vreg, c_02_c_12_vreg, c_03_c_13_vreg,
c_20_c_30_vreg, c_21_c_31_vreg, c_22_c_32_vreg, c_23_c_33_vreg,
a_0p_a_1p_vreg,
a_2p_a_3p_vreg,
b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg;

double
/* Point to the current elements in the four columns of B */
*b_p0_pntr, *b_p1_pntr, *b_p2_pntr, *b_p3_pntr;

b_p0_pntr = &B( 0, 0 );
b_p1_pntr = &B( 0, 1 );
b_p2_pntr = &B( 0, 2 );
b_p3_pntr = &B( 0, 3 );

c_00_c_10_vreg.v = _mm_setzero_pd();
c_01_c_11_vreg.v = _mm_setzero_pd();
c_02_c_12_vreg.v = _mm_setzero_pd();
c_03_c_13_vreg.v = _mm_setzero_pd();
c_20_c_30_vreg.v = _mm_setzero_pd();
c_21_c_31_vreg.v = _mm_setzero_pd();
c_22_c_32_vreg.v = _mm_setzero_pd();
c_23_c_33_vreg.v = _mm_setzero_pd();

for ( p=0; p<k; p++ ){
a_0p_a_1p_vreg.v = _mm_load_pd( (double *) &A( 0, p ) );
a_2p_a_3p_vreg.v = _mm_load_pd( (double *) &A( 2, p ) );

b_p0_vreg.v = _mm_loaddup_pd( (double *) b_p0_pntr++ ); /* load and duplicate */
b_p1_vreg.v = _mm_loaddup_pd( (double *) b_p1_pntr++ ); /* load and duplicate */
b_p2_vreg.v = _mm_loaddup_pd( (double *) b_p2_pntr++ ); /* load and duplicate */
b_p3_vreg.v = _mm_loaddup_pd( (double *) b_p3_pntr++ ); /* load and duplicate */

/* First row and second rows * 向量化,一次计算两个double/
c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;

/* Third and fourth rows */
c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
}

C( 0, 0 ) += c_00_c_10_vreg.d[0]; C( 0, 1 ) += c_01_c_11_vreg.d[0];
C( 0, 2 ) += c_02_c_12_vreg.d[0]; C( 0, 3 ) += c_03_c_13_vreg.d[0];

C( 1, 0 ) += c_00_c_10_vreg.d[1]; C( 1, 1 ) += c_01_c_11_vreg.d[1];
C( 1, 2 ) += c_02_c_12_vreg.d[1]; C( 1, 3 ) += c_03_c_13_vreg.d[1];

C( 2, 0 ) += c_20_c_30_vreg.d[0]; C( 2, 1 ) += c_21_c_31_vreg.d[0];
C( 2, 2 ) += c_22_c_32_vreg.d[0]; C( 2, 3 ) += c_23_c_33_vreg.d[0];

C( 3, 0 ) += c_20_c_30_vreg.d[1]; C( 3, 1 ) += c_21_c_31_vreg.d[1];
C( 3, 2 ) += c_22_c_32_vreg.d[1]; C( 3, 3 ) += c_23_c_33_vreg.d[1];
}

本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!