AVX向量化学习(三)-if判断的处理

AVX-if判断的处理

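使用AVX指令集对if判断进行处理:先用 _mm256_cmp_pd 对两个向量逐元素比较生成掩码(条件成立的元素全为1,否则全为0),再用 _mm256_blendv_pd 根据掩码每个元素的最高位在两个向量之间逐元素选择,从而用无分支的向量操作代替逐元素的 if/else。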

使用到的AVX函数介绍

1.

1
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)

Description

Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.

Operation

1
2
3
4
5
6
7
8
9
FOR j := 0 to 3
i := j*64
IF mask[i+63]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0

2.

1
__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)

Description

Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.

Operation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
CASE (imm8[4:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

程序源代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#include<stdio.h>
#include <immintrin.h>
/* Number of elements in the demo arrays. */
enum { ELEM_COUNT = 9 };

/*
 * Scalar reference for the branch being vectorized:
 *     dst[i] = (a[i] > b[i]) ? a[i] : b[i]
 * for i in [first, n). Also used for the tail the vector loop cannot cover.
 */
static void select_greater_scalar(const double *a, const double *b,
                                  double *dst, size_t first, size_t n)
{
    for (size_t i = first; i < n; i++) {
        dst[i] = (a[i] > b[i]) ? a[i] : b[i];
    }
}

/*
 * AVX version of select_greater_scalar over [0, n).
 *
 * Each iteration handles four doubles: _mm256_cmp_pd builds a per-element
 * all-ones/all-zeros mask for "a > b", and _mm256_blendv_pd takes its second
 * operand where the mask's top bit is set and its first operand otherwise.
 * Using GT with blendv(b, a, mask) mirrors the scalar condition exactly,
 * including the case where the comparison is false because an operand is NaN
 * (the ordered predicate yields false, so b is selected, as in the scalar code).
 */
static void select_greater_avx(const double *a, const double *b,
                               double *dst, size_t n)
{
    const size_t lanes = 4; /* doubles per 256-bit vector */
    size_t i = 0;

    /* Run while a full vector still fits; unaligned loads/stores are used
     * because the arrays carry no 32-byte alignment guarantee. */
    for (; i + lanes <= n; i += lanes) {
        __m256d va = _mm256_loadu_pd(a + i);
        __m256d vb = _mm256_loadu_pd(b + i);
        __m256d a_is_greater = _mm256_cmp_pd(va, vb, _CMP_GT_OQ);

        _mm256_storeu_pd(dst + i, _mm256_blendv_pd(vb, va, a_is_greater));
    }

    /* Remaining 0..3 elements are handled with the scalar code. */
    select_greater_scalar(a, b, dst, i, n);
}

/* Prints a title line followed by one value per line. */
static void print_values(const char *title, const double *values, size_t n)
{
    printf("%s\n", title);
    for (size_t i = 0; i < n; i++) {
        printf("%lf\n", values[i]);
    }
}

/*
 * Computes the element-wise "pick the greater of a and b" twice, once with a
 * plain if/else loop and once with AVX compare + blend, and prints both
 * results so they can be compared.
 */
int main(void)
{
    double a[ELEM_COUNT] = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 2.1};
    double b[ELEM_COUNT] = {2.1, 3.2, 6.4, 8.6, 3.7, 9.9, 5.1, 4.2, 6.6};
    double d[ELEM_COUNT] = {0}; /* result of the plain if/else version */
    double e[ELEM_COUNT] = {0}; /* result of the AVX version */

    select_greater_scalar(a, b, d, 0, ELEM_COUNT);
    select_greater_avx(a, b, e, ELEM_COUNT);

    print_values("this is d.", d, ELEM_COUNT);
    print_values("this is e with AVX.", e, ELEM_COUNT);
    return 0;
}

程序输出

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
this is d.
2.100000
3.200000
6.400000
8.600000
5.500000
9.900000
7.700000
8.800000
6.600000
this is e with AVX.
2.100000
3.200000
6.400000
8.600000
5.500000
9.900000
7.700000
8.800000
6.600000

相关链接

[Intel® Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/)

[simd - How to choose AVX compare predicate variants - Stack Overflow]: “Stack Overflow”

[blendvpd (officedaytime.com)]: “_mm256_Blendv_pd()原理解释”


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!