AVX向量化学习(五)-INT型数组相加操作
使用AVX/AVX2指令集对两个int型数组进行逐元素相加(其中整数加法 _mm256_add_epi32 需要AVX2支持)
使用到的AVX函数介绍
1.
 | __m256i _mm256_loadu_si256 (__m256i const * mem_addr)
 
  | 
Synopsis
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
#include <immintrin.h>
Instruction: vmovdqu ymm, m256
CPUID Flags: AVX
Description
Load 256-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.
Operation
```
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
```
Performance
| Architecture | Latency | Throughput (CPI) | 
|---|---|---|
| Icelake | 7 | 0.5 | 
| Skylake | 7 | 0.5 | 
| Broadwell | 1 | 0.25 | 
| Haswell | 1 | 0.25 | 
| Ivy Bridge | 1 | 0.5 | 
2.
 | __m256i _mm256_add_epi32 (__m256i a, __m256i b)
 
  | 
Synopsis
__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include <immintrin.h>
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
```
FOR j := 0 to 7
    i := j*32
    dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
```
Performance
| Architecture | Latency | Throughput (CPI) | 
|---|---|---|
| Icelake | 1 | 0.33 | 
| Skylake | 1 | 0.33 | 
| Broadwell | 1 | 0.5 | 
| Haswell | 1 | 0.5 | 
程序源代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
   #include <stdio.h>
   #include <immintrin.h>

   /*
    * Adds two int arrays element-wise twice -- once with a plain scalar loop
    * and once with 256-bit integer intrinsics -- and prints both results so
    * they can be compared.
    *
    * _mm256_add_epi32 is an AVX2 instruction, so the CPU must support AVX2
    * and the compiler must be told to enable it (e.g. -mavx2 on GCC/Clang).
    */

   enum {
       ARRAY_LEN = 40, /* number of elements in every array */
       LANES = 8       /* 32-bit ints held by one 256-bit register */
   };

   int main(void)
   {
       int a[ARRAY_LEN];
       int b[ARRAY_LEN];
       int ans1[ARRAY_LEN]; /* scalar reference result */
       int ans2[ARRAY_LEN]; /* vectorized result */
       int i;

       /* Fill the inputs: a[i] = i, b[i] = 2*i, so every sum is 3*i. */
       for (i = 0; i < ARRAY_LEN; i++) {
           a[i] = i;
           b[i] = 2 * i;
       }

       /* Scalar version. */
       for (i = 0; i < ARRAY_LEN; i++) {
           ans1[i] = a[i] + b[i];
       }

       printf("串行计算结果:\n");
       for (i = 0; i < ARRAY_LEN; i++) {
           printf("%d ", ans1[i]);
       }
       printf("\n");

       /*
        * Vector version: handle LANES elements per iteration for as long as
        * a complete group remains. The condition is "i + LANES <= ARRAY_LEN"
        * (not "i < ARRAY_LEN - LANES"), otherwise the last complete group
        * would be left to the scalar tail loop below.
        * The unaligned load/store variants are used because the stack arrays
        * are not guaranteed to be 32-byte aligned.
        */
       for (i = 0; i + LANES <= ARRAY_LEN; i += LANES) {
           __m256i v0 = _mm256_loadu_si256((const __m256i *)(a + i));
           __m256i v1 = _mm256_loadu_si256((const __m256i *)(b + i));
           __m256i v2 = _mm256_add_epi32(v0, v1);
           _mm256_storeu_si256((__m256i *)(ans2 + i), v2);
       }

       /* Scalar tail for lengths that are not a multiple of LANES. */
       for (; i < ARRAY_LEN; i++) {
           ans2[i] = a[i] + b[i];
       }

       printf("并行计算结果:\n");
       for (i = 0; i < ARRAY_LEN; i++) {
           printf("%d ", ans2[i]);
       }
       /* Terminate the last output line. */
       printf("\n");

       return 0;
   }
 
  | 
程序输出
```
串行计算结果:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57 60 63 66 69 72 75 78 81 84 87 90 93 96 99 102 105 108 111 114 117
并行计算结果:
0 3 6 9 12 15 18 21 24 27 30 33 36 39 42 45 48 51 54 57 60 63 66 69 72 75 78 81 84 87 90 93 96 99 102 105 108 111 114 117
```
相关链接
[Intel® Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/)