AVX向量化学习(五)-INT型数组相加操作

使用AVX指令集对两个int型数组执行逐元素相加操作（其中整数加法 _mm256_add_epi32 需要CPU支持AVX2）

使用到的AVX函数介绍

1.

`__m256i _mm256_loadu_si256 (__m256i const * mem_addr)`

Synopsis

__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
#include <immintrin.h>
Instruction: vmovdqu ymm, m256
CPUID Flags: AVX

Description

Load 256-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.

Operation

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput (CPI)
Icelake	7	0.5
Skylake	7	0.5
Broadwell	1	0.25
Haswell	1	0.25
Ivy Bridge	1	0.5

2.

`__m256i _mm256_add_epi32 (__m256i a, __m256i b)`

Synopsis

__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2

Description

Add packed 32-bit integers in a and b, and store the results in dst.

Operation

FOR j := 0 to 7 
	     i := j*32 
	     dst[i+31:i] := a[i+31:i] + b[i+31:i] 
ENDFOR 
dst[MAX:256] := 0

Performance

Architecture	Latency	Throughput (CPI)
Icelake	1	0.33
Skylake	1	0.33
Broadwell	1	0.5
Haswell	1	0.5

程序源代码

#include<stdio.h>
#include <immintrin.h>
/*
 * Adds two int arrays element by element twice -- once with a plain scalar
 * loop and once with 256-bit SIMD intrinsics -- and prints both results so
 * they can be compared.
 *
 * _mm256_add_epi32 is an AVX2 instruction, so the program must be built with
 * AVX2 enabled (e.g. -mavx2 on GCC/Clang) and run on a CPU that supports it.
 *
 * Returns 0.
 */
int main(void)
{
	enum {
		N = 40,		/* number of elements in each array */
		LANES = 8	/* 32-bit ints held by one 256-bit register */
	};
	int a[N];
	int b[N];
	int ans1[N];	/* result of the scalar (serial) pass */
	int ans2[N];	/* result of the vectorized pass */
	int i;

	for (i = 0; i < N; i++) {
		a[i] = i;
		b[i] = 2 * i;
	}

	/* Reference result, computed one element at a time. */
	for (i = 0; i < N; i++) {
		ans1[i] = a[i] + b[i];
	}
	printf("串行计算结果：\n");
	for (i = 0; i < N; i++) {
		printf("%d ", ans1[i]);
	}
	printf("\n");

	/*
	 * Vector pass: handle LANES ints per iteration. The bound
	 * i + LANES <= N keeps every complete group in this loop, including
	 * the last one; a bound of i < N - LANES would push the final full
	 * group into the scalar tail below.
	 */
	for (i = 0; i + LANES <= N; i += LANES) {
		/* Unaligned loads: the stack arrays carry no 32-byte alignment
		 * guarantee. The cast reinterprets the int pointer as the
		 * vector pointer type the intrinsic expects. */
		__m256i va = _mm256_loadu_si256((const __m256i *)(a + i));
		__m256i vb = _mm256_loadu_si256((const __m256i *)(b + i));
		__m256i sum = _mm256_add_epi32(va, vb);	/* lane-wise va + vb */

		_mm256_storeu_si256((__m256i *)(ans2 + i), sum);
	}

	/* Scalar tail: the N % LANES leftover elements (none when N is a
	 * multiple of LANES). */
	for (; i < N; i++) {
		ans2[i] = a[i] + b[i];
	}

	printf("并行计算结果：\n");
	for (i = 0; i < N; i++) {
		printf("%d ", ans2[i]);
	}
	printf("\n");	/* end the final line, matching the serial output */
	return 0;
}

程序输出

串行计算结果：
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57 60 63 66 69 72 75 78 81 84 87 90 93 96 99 102 105 108 111 114 117
并行计算结果：
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57 60 63 66 69 72 75 78 81 84 87 90 93 96 99 102 105 108 111 114 117

AVX向量化学习(五)-INT型数组相加操作

AVX向量化学习(五)-INT型数组相加操作

使用到的AVX函数介绍

1.

2.

程序源代码

程序输出

相关链接