BLISlab dgemm优化

BLISlab dgemm优化

参考资料:

Git地址:https://github.com/flame/blislab

视频教程:https://www.bilibili.com/video/BV1c94y117Uw?vd_source=3ae32e36058f58c5b85935fca9b77797【澎峰科技-张先轶老师】

阅读:tutorial.pdf【位于代码包中】

Step0

1.克隆项目到本地

1
git clone https://github.com/flame/blislab.git

2.代码结构

image-20230202155728203

3.编译环境

image-20230202155820411

4.运行环境配置脚本

image-20230202160027957

1
2
3
4
5
6
[root@hadoop1 step0]# source ./sourceme.sh 
BLISLAB_DIR = .
BLISLAB_USE_INTEL = false
BLISLAB_USE_BLAS = false
COMPILER_OPT_LEVEL = O3
BLAS_DIR = /u/jianyu/lib/openblas

5.Makefile

image-20230202160244490

1
2
3
4
5
6
7
8
9
10
11
[root@hadoop1 step0]# make
gcc -O3 -march=core-avx2 -fPIC -c dgemm/my_dgemm.c -o dgemm/my_dgemm.o -I./include -I./kernels -I/u/jianyu/lib/openblas/include
gcc -O3 -march=core-avx2 -fPIC -c dgemm/bl_dgemm_ref.c -o dgemm/bl_dgemm_ref.o -I./include -I./kernels -I/u/jianyu/lib/openblas/include
gcc -O3 -march=core-avx2 -fPIC -c dgemm/bl_dgemm_util.c -o dgemm/bl_dgemm_util.o -I./include -I./kernels -I/u/jianyu/lib/openblas/include
ar cr lib/libblislab.a dgemm/my_dgemm.o dgemm/bl_dgemm_ref.o dgemm/bl_dgemm_util.o
ranlib lib/libblislab.a
gcc -O3 -march=core-avx2 -fPIC -shared -o lib/libblislab.so dgemm/my_dgemm.o dgemm/bl_dgemm_ref.o dgemm/bl_dgemm_util.o ./lib/libblislab.a -lpthread -lm -lrt
cd ./test && make && cd . -I./include -I./kernels -I/u/jianyu/lib/openblas/include
make[1]: 进入目录“/root/blislab/step0/test”
gcc -O3 -march=core-avx2 -fPIC test_bl_dgemm.c -o test_bl_dgemm.x -I../include -I../kernels -I/u/jianyu/lib/openblas/include ../lib/libblislab.a -lpthread -lm -lrt
make[1]: 离开目录“/root/blislab/step0/test”

6.make.gnu.inc

image-20230202160608401

7.ref参考实现是否调用BLAS

image-20230202160904158

8.my_dgemm.c

image-20230202160939940

9.相关数据变量含义

image-20230202161055971

10.代码采用列主元

image-20230202161219553

11.dgemm使用脚本测试

Test目录下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
[root@hadoop1 test]# ./run_bl_dgemm.sh 
result=[
%m %n %k %MY_GFLOPS %REF_GFLOPS
16 16 16 7.35 1.93
32 32 32 7.88 1.29
48 48 48 9.81 1.17
64 64 64 8.98 1.20
80 80 80 9.59 1.11
96 96 96 7.74 1.08
112 112 112 8.32 0.95
128 128 128 7.74 1.04
144 144 144 7.39 0.99
160 160 160 7.13 1.06
176 176 176 7.58 1.06
192 192 192 7.73 0.98
208 208 208 7.29 1.01
224 224 224 7.90 0.97
240 240 240 7.61 1.00
256 256 256 6.72 0.96
272 272 272 7.51 0.99
288 288 288 7.39 0.99
304 304 304 7.84 0.99
320 320 320 7.61 0.97
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
//run_bl_dgemm.sh
#!/bin/bash

#For Mac OS only
export DYLD_LIBRARY_PATH=/opt/intel/lib:/opt/intel/mkl/lib

#Single Thread
export KMP_AFFINITY=compact #Rule to bind core to thread for OMP thread with Intel compiler for parallel version
export OMP_NUM_THREADS=1 #Set OMP number of threads for parallel version
export BLISLAB_IC_NT=1 #Set BLISLAB number of threads for parallel version
k_start=16 //起始大小
k_end=1024 //结束大小
k_blocksize=16 //步长
echo "result=["
echo -e "%m\t%n\t%k\t%MY_GFLOPS\t%REF_GFLOPS"
for (( k=k_start; k<=k_end; k+=k_blocksize ))
do
./test_bl_dgemm.x $k $k $k
done
echo "];"

12.dgemm手动指定参数测试

Test目录下

1
2
[root@hadoop1 test]# ./test_bl_dgemm.x 256 256 256 
256 256 256 5.00 0.84
1
2
[root@hadoop1 test]# ./test_bl_dgemm.x 16 32 128
16 32 128 8.77 1.02

这里要注意哪个代表m,n,k?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
int main( int argc, char *argv[] )
{
int m, n, k;

if ( argc != 4 ) {
printf( "Error: require 3 arguments, but only %d provided.\n", argc - 1 );
exit( 0 );
}

sscanf( argv[ 1 ], "%d", &m );
sscanf( argv[ 2 ], "%d", &n );
sscanf( argv[ 3 ], "%d", &k );

test_bl_dgemm( m, n, k );

return 0;
}

13.计时区域

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
for ( i = 0; i < nrepeats; i ++ ) {
ref_beg = bl_clock(); //blislib提供的封装
{
bl_dgemm_ref(
m,
n,
k,
A,
lda,
B,
ldb,
C_ref,
ldc_ref
);
}
ref_time = bl_clock() - ref_beg;

if ( i == 0 ) {
ref_rectime = ref_time;
} else {
ref_rectime = ref_time < ref_rectime ? ref_time : ref_rectime; //多次计时取最优
}
}

14.正确性检验

Test目录下Test_bl_gemm.c

结果比较:通过比较你的优化计算结果和参考计算结果对比

Gflops的计算

  • 有效浮点次数 = 2*m*n*k
  • Gflops = 有效浮点次数 / 时间
1
2
3
4
5
6
7
8
9
10
11
12
13
14
computeError(
ldc,
ldc_ref,
m,
n,
C,
C_ref
);

// Compute overall floating point operations.
flops = ( m * n / ( 1000.0 * 1000.0 * 1000.0 ) ) * ( 2 * k );

printf( "%5d\t %5d\t %5d\t %5.2lf\t %5.2lf\n",
m, n, k, flops / bl_dgemm_rectime, flops / ref_rectime );

15.课后作业

perf工具的用法

perf-系统级性能分析工具 - Amicoyuan (xingyuanjie.top)

分析不同的j,p,i循环顺序的性能:

原因cache miss造成的差异

Step1

1.与Step0比较

左边是Step1右边是Step0

image-20230202165659964

2.基本分块

image-20230202170723811

3.反汇编

1
[root@hadoop1 dgemm]# objdump -d ./my_dgemm.o > my_dgemm.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
./my_dgemm.o:     文件格式 elf64-x86-64


Disassembly of section .text:

0000000000000000 <AddDot>:
0: 85 ff test %edi,%edi
2: 7e 2e jle 32 <AddDot+0x32>
4: 48 63 d2 movslq %edx,%rdx
7: c4 c1 7b 10 01 vmovsd (%r9),%xmm0
c: 31 c0 xor %eax,%eax
e: 48 c1 e2 03 shl $0x3,%rdx
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: c5 fb 10 0e vmovsd (%rsi),%xmm1
1c: 48 01 d6 add %rdx,%rsi
1f: c4 e2 f1 b9 04 c1 vfmadd231sd (%rcx,%rax,8),%xmm1,%xmm0
25: 48 83 c0 01 add $0x1,%rax
29: c4 c1 7b 11 01 vmovsd %xmm0,(%r9)
2e: 39 c7 cmp %eax,%edi
30: 7f e6 jg 18 <AddDot+0x18>
32: f3 c3 repz retq
34: 66 90 xchg %ax,%ax
36: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
3d: 00 00 00

0000000000000040 <AddDot_MRxNR>:
40: e9 00 00 00 00 jmpq 45 <AddDot_MRxNR+0x5>
45: 90 nop
46: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4d: 00 00 00

4.反汇编(-fPIC引入的差异)

1
[root@hadoop1 test]# objdump -d ./test_bl_dgemm.x  > test.S

image-20230202171234007

5.Gcc生成汇编

1
[root@hadoop1 step1]# gcc -O3 -march=core-avx2 -fPIC -S dgemm/my_dgemm.c -o dgemm/my_dgemm1.S -I./include -I./kernels -I/u/jianyu/lib/openblas/include

image-20230202171758087

6.Step0与Step1比较

image-20230202172816584

7.分块,修改MR, NR为4×4

image-20230202173132817

8.分块(2×2)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
void AddDot_2×2( int k, double *A, int lda, double *B, int ldb, double *C,int ldc ) {
register double C00, C01, C10, C11;
int p;
C00=0.0;
C01=0.0;
C10=0.0;
C11=0.0;

for( p=0 ;p < k; p++)
{
C00 += A( 0, p) * B( p, 0);
C01 += A( 0, p) * B( p, 1);
C10 += A( 1, p) * B( p, 0);
C11 += A( 1, p) * B( p, 1);
}
C(0, 0) +=C00;
C(0, 1) +=C01;
C(1, 0) +=C10;
C(1, 1) +=C11;

}

image-20230202173645399

9.AddDot_2x2汇编代码

image-20230202191728644

10.AddDot_2x2最内层循环展开

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
void AddDot_2×2( int k, double *A, int lda, double *B, int ldb, double *C,int ldc ) {
register double C00, C01, C10, C11;
int p;
C00=0.0;
C01=0.0;
C10=0.0;
C11=0.0;

for( p=0 ;p < k; p+=2)
{
C00 += A( 0, p) * B( p, 0) + A( 0, p+1) * B( p+1, 0);
C01 += A( 0, p) * B( p, 1) + A( 0, p+1) * B( p+1, 1);
C10 += A( 1, p) * B( p, 0) + A( 1, p+1) * B( p+1, 0);
C11 += A( 1, p) * B( p, 1) + A( 1, p+1) * B( p+1, 1);
}
C(0, 0) +=C00;
C(0, 1) +=C01;
C(1, 0) +=C10;
C(1, 1) +=C11;

}

image-20230202192033816

11.AddDot_2x2汇编代码【最内层循环展开】

image-20230202192319993

Step2

1.与Step1的性能比较

image-20230206163219051

2.优化kernel/bl_dgemm_ukr.c

image-20230206163713966

image-20230206163803615

3.优化后性能对比

image-20230206163833950

4.下降原因分析

image-20230206164335213

image-20230206164734050

5.如何进行分块

image-20230206165017812

image-20230206165103417

image-20230206170050450

image-20230206170105998

image-20230206170126588

image-20230206170153251

image-20230206170226248

image-20230206170242872

image-20230206170320842

image-20230206170334857

6.Dgemm代码

image-20230206171538013

image-20230206171556767

image-20230206171622104

image-20230206171639209

7.Dgemm macro kenrel代码

image-20230206171903844

image-20230206171940282

8.Gemm汇总

image-20230206172351890

9.拓展

image-20230206172425329

10.双缓冲优化

image-20230206172846408


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!