• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2 // REQUIRES: powerpc-registered-target
3 
4 // RUN: %clang -S -emit-llvm -target powerpc64-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
5 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
6 // RUN: %clang -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
7 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE
8 
9 #include <tmmintrin.h>
10 
11 __m64 res, m1, m2;
12 __m128i resi, mi1, mi2;
13 
14 void __attribute__((noinline))
test_abs()15 test_abs() {
16   resi = _mm_abs_epi16(mi1);
17   resi = _mm_abs_epi32(mi1);
18   resi = _mm_abs_epi8(mi1);
19   res = _mm_abs_pi16(m1);
20   res = _mm_abs_pi32(m1);
21   res = _mm_abs_pi8(m1);
22 }
23 
24 // CHECK-LABEL: @test_abs
25 
26 // CHECK: define available_externally <2 x i64> @_mm_abs_epi16(<2 x i64> [[REG1:[0-9a-zA-Z_%.]+]])
27 // CHECK: store <2 x i64> [[REG1]], <2 x i64>* [[REG2:[0-9a-zA-Z_%.]+]], align 16
28 // CHECK-NEXT: [[REG3:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG2]], align 16
29 // CHECK-NEXT: [[REG4:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG3]] to <8 x i16>
30 // CHECK-NEXT: [[REG5:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_abs(short vector[8])(<8 x i16> [[REG4]])
31 // CHECK-NEXT: [[REG6:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG5]] to <2 x i64>
32 // CHECK-NEXT: ret <2 x i64> [[REG6]]
33 
34 // CHECK: define available_externally <2 x i64> @_mm_abs_epi32(<2 x i64> [[REG7:[0-9a-zA-Z_%.]+]])
35 // CHECK: store <2 x i64> [[REG7]], <2 x i64>* [[REG8:[0-9a-zA-Z_%.]+]], align 16
36 // CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG8]], align 16
37 // CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG9]] to <4 x i32>
38 // CHECK-NEXT: [[REG11:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_abs(int vector[4])(<4 x i32> [[REG10]])
39 // CHECK-NEXT: [[REG12:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG11]] to <2 x i64>
40 // CHECK-NEXT: ret <2 x i64> [[REG12]]
41 
42 // CHECK: define available_externally <2 x i64> @_mm_abs_epi8(<2 x i64> [[REG13:[0-9a-zA-Z_%.]+]])
43 // CHECK: store <2 x i64> [[REG13]], <2 x i64>* [[REG14:[0-9a-zA-Z_%.]+]], align 16
44 // CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG14]], align 16
45 // CHECK-NEXT: [[REG16:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG15]] to <16 x i8>
46 // CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_abs(signed char vector[16])(<16 x i8> [[REG16]])
47 // CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG17]] to <2 x i64>
48 // CHECK-NEXT: ret <2 x i64> [[REG18]]
49 
50 // CHECK: define available_externally i64 @_mm_abs_pi16(i64 [[REG19:[0-9a-zA-Z_%.]+]])
51 // CHECK: store i64 [[REG19]], i64* [[REG20:[0-9a-zA-Z_%.]+]], align 8
52 // CHECK-NEXT: [[REG21:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG20]], align 8
53 // CHECK-NEXT: [[REG22:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG21]], i32 0
54 // CHECK-NEXT: [[REG23:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG20]], align 8
55 // CHECK-NEXT: [[REG24:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG22]], i64 [[REG23]], i32 1
56 // CHECK-NEXT: store <2 x i64> [[REG24]], <2 x i64>* [[REG25:[0-9a-zA-Z_%.]+]], align 16
57 // CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG25]], align 16
58 // CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG26]] to <8 x i16>
59 // CHECK-NEXT: store <8 x i16> [[REG27]], <8 x i16>* [[REG28:[0-9a-zA-Z_%.]+]], align 16
60 // CHECK-NEXT: [[REG29:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG28]], align 16
61 // CHECK-NEXT: [[REG30:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_abs(short vector[8])(<8 x i16> [[REG29]])
62 // CHECK-NEXT: [[REG31:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG30]] to <2 x i64>
63 // CHECK-NEXT: [[REG32:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG31]], i32 0
64 // CHECK-NEXT: ret i64 [[REG32]]
65 
66 // CHECK: define available_externally i64 @_mm_abs_pi32(i64 [[REG33:[0-9a-zA-Z_%.]+]])
67 // CHECK: store i64 [[REG33]], i64* [[REG34:[0-9a-zA-Z_%.]+]], align 8
68 // CHECK-NEXT: [[REG35:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG34]], align 8
69 // CHECK-NEXT: [[REG36:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG35]], i32 0
70 // CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG34]], align 8
71 // CHECK-NEXT: [[REG38:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG36]], i64 [[REG37]], i32 1
72 // CHECK-NEXT: store <2 x i64> [[REG38]], <2 x i64>* [[REG39:[0-9a-zA-Z_%.]+]], align 16
73 // CHECK-NEXT: [[REG40:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG39]], align 16
74 // CHECK-NEXT: [[REG41:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG40]] to <4 x i32>
75 // CHECK-NEXT: store <4 x i32> [[REG41]], <4 x i32>* [[REG42:[0-9a-zA-Z_%.]+]], align 16
76 // CHECK-NEXT: [[REG43:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG42]], align 16
77 // CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_abs(int vector[4])(<4 x i32> [[REG43]])
78 // CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG44]] to <2 x i64>
79 // CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG45]], i32 0
80 // CHECK-NEXT: ret i64 [[REG46]]
81 
82 // CHECK: define available_externally i64 @_mm_abs_pi8(i64 [[REG47:[0-9a-zA-Z_%.]+]])
83 // CHECK: store i64 [[REG47]], i64* [[REG48:[0-9a-zA-Z_%.]+]], align 8
84 // CHECK-NEXT: [[REG49:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG48]], align 8
85 // CHECK-NEXT: [[REG50:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG49]], i32 0
86 // CHECK-NEXT: [[REG51:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG48]], align 8
87 // CHECK-NEXT: [[REG52:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG50]], i64 [[REG51]], i32 1
88 // CHECK-NEXT: store <2 x i64> [[REG52]], <2 x i64>* [[REG53:[0-9a-zA-Z_%.]+]], align 16
89 // CHECK-NEXT: [[REG54:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG53]], align 16
90 // CHECK-NEXT: [[REG55:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG54]] to <16 x i8>
91 // CHECK-NEXT: store <16 x i8> [[REG55]], <16 x i8>* [[REG56:[0-9a-zA-Z_%.]+]], align 16
92 // CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG56]], align 16
93 // CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_abs(signed char vector[16])(<16 x i8> [[REG57]])
94 // CHECK-NEXT: [[REG59:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG58]] to <2 x i64>
95 // CHECK-NEXT: [[REG60:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG59]], i32 0
96 // CHECK-NEXT: ret i64 [[REG60]]
97 
98 void __attribute__((noinline))
test_alignr()99 test_alignr() {
100   resi = _mm_alignr_epi8(mi1, mi2, 1U);
101   res = _mm_alignr_pi8(m1, m2, 1U);
102 }
103 
104 // CHECK-LABEL: @test_alignr
105 
106 // CHECK: define available_externally <2 x i64> @_mm_alignr_epi8(<2 x i64> [[REG61:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG62:[0-9a-zA-Z_%.]+]], i32 zeroext [[REG63:[0-9a-zA-Z_%.]+]])
107 // CHECK: [[REG64:[0-9a-zA-Z_%.]+]] = alloca i32, align 4
108 // CHECK: store <2 x i64> [[REG61]], <2 x i64>* [[REG65:[0-9a-zA-Z_%.]+]], align 16
109 // CHECK-NEXT: store <2 x i64> [[REG62]], <2 x i64>* [[REG66:[0-9a-zA-Z_%.]+]], align 16
110 // CHECK-NEXT: store i32 [[REG63]], i32* [[REG64:[0-9a-zA-Z_%.]+]], align 4
111 // CHECK: %[[REG64b:[0-9a-zA-Z_%.]+]] = call i1 @llvm.is.constant.i32(i32 %0)
112 // CHECK-NEXT: br i1 %[[REG64b]], label %[[REG67:[0-9a-zA-Z_%.]+]], label %[[REG68:[0-9a-zA-Z_%.]+]]
113 
114 // CHECK: [[REG67]]:
115 // CHECK-NEXT: [[REG69:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4
116 // CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = icmp ult i32 [[REG69]], 16
117 // CHECK-NEXT: br i1 [[REG70]], label %[[REG71:[0-9a-zA-Z_%.]+]], label %[[REG68:[0-9a-zA-Z_%.]+]]
118 
119 // CHECK: [[REG71]]:
120 // CHECK-BE-NEXT: load <2 x i64>, <2 x i64>* [[REG66]], align 16
121 // CHECK-BE: call <16 x i8> @vec_sld(unsigned char vector[16], unsigned char vector[16], unsigned int)
122 // CHECK-LE-NEXT: load <2 x i64>, <2 x i64>* [[REG65]], align 16
123 // CHECK-LE: call <16 x i8> @vec_reve(unsigned char vector[16])
124 // CHECK-LE: call <16 x i8> @vec_reve(unsigned char vector[16])
125 // CHECk-LE: call <16 x i8> @vec_sld(unsigned char vector[16], unsigned char vector[16], unsigned int)
126 // CHECK-LE: call <16 x i8> @vec_reve(unsigned char vector[16])
127 // CHECK: br label %[[REG72:[0-9a-zA-Z_%.]+]]
128 
129 // CHECK: [[REG68]]:
130 // CHECK-NEXT: [[REG73:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4
131 // CHECK-NEXT: [[REG74:[0-9a-zA-Z_%.]+]] = icmp eq i32 [[REG73]], 0
132 // CHECK-NEXT: br i1 [[REG74]], label %[[REG75:[0-9a-zA-Z_%.]+]], label %[[REG76:[0-9a-zA-Z_%.]+]]
133 
134 // CHECK: [[REG75]]:
135 // CHECK-NEXT: [[REG77:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG66]], align 16
136 // CHECK-NEXT: store <2 x i64> [[REG77]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
137 // CHECK-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]]
138 
139 // CHECK: [[REG76]]:
140 // CHECK-NEXT: [[REG78:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4
141 // CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = icmp uge i32 [[REG78]], 16
142 // CHECK-NEXT: br i1 [[REG79]], label %[[REG80:[0-9a-zA-Z_%.]+]], label %[[REG81:[0-9a-zA-Z_%.]+]]
143 
144 // CHECK: [[REG80]]:
145 // CHECK-NEXT: [[REG82:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4
146 // CHECK-NEXT: [[REG83:[0-9a-zA-Z_%.]+]] = icmp uge i32 [[REG82]], 32
147 // CHECK-NEXT: br i1 [[REG83]], label %[[REG84:[0-9a-zA-Z_%.]+]], label %[[REG85:[0-9a-zA-Z_%.]+]]
148 
149 // CHECK: [[REG84]]:
150 // CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG86:[0-9a-zA-Z_%.]+]], align 16
151 // CHECK-NEXT: store <2 x i64> zeroinitializer, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
152 // CHECK-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]]
153 
154 // CHECK: [[REG85]]:
155 // CHECK-NEXT: [[REG87:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4
156 // CHECK-NEXT: [[REG88:[0-9a-zA-Z_%.]+]] = sub i32 [[REG87]], 16
157 // CHECK-NEXT: [[REG89:[0-9a-zA-Z_%.]+]] = mul i32 [[REG88]], 8
158 // CHECK-NEXT: [[REG90:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG89]] to i8
159 // CHECK-NEXT: [[REG91:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 zeroext [[REG90]])
160 // CHECK-NEXT: store <16 x i8> [[REG91]], <16 x i8>* [[REG92:[0-9a-zA-Z_%.]+]], align 16
161 // CHECK-NEXT: [[REG93:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG65]], align 16
162 // CHECK-NEXT: [[REG94:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG93]] to <16 x i8>
163 // CHECK-NEXT: [[REG95:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG92]], align 16
164 // CHECK-BE-NEXT: [[REG96:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16])
165 // CHECK-LE-NEXT: [[REG96:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16])
166 // CHECK-NEXT: [[REG97:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG96]] to <2 x i64>
167 // CHECK-NEXT: store <2 x i64> [[REG97]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
168 // CHECK-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]]
169 
170 // CHECK: [[REG81]]:
171 // CHECK-NEXT: [[REG98:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG64]], align 4
172 // CHECK-NEXT: [[REG99:[0-9a-zA-Z_%.]+]] = sub i32 16, [[REG98]]
173 // CHECK-NEXT: [[REG100:[0-9a-zA-Z_%.]+]] = mul i32 [[REG99]], 8
174 
175 // CHECK-BE: [[REG101:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG100]] to i8
176 // CHECK-BE: [[REG102:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_splats(unsigned char)(i8 zeroext [[REG101]])
177 // CHECK-BE: mul i32 {{[0-9a-zA-Z_%.]+}}, 8
178 // CHECK-BE: call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16])
179 // CHECK-BE: call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16])
180 // CHECK-BE: [[REG103:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_or(unsigned char vector[16], unsigned char vector[16])
181 // CHECK-BE-NEXT: [[REG104:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG103]] to <2 x i64>
182 // CHECK-BE-NEXT: store <2 x i64> [[REG104]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
183 // CHECK-BE-NEXT: br label %[[REG72:[0-9a-zA-Z_%.]+]]
184 
185 // CHECK-LE: [[REG105:[0-9a-zA-Z_%.]+]] = mul i32 {{[0-9a-zA-Z_%.]+}}, 8
186 // CHECK-LE-NEXT: trunc i32 [[REG105]] to i8
187 // CHECK-LE: call <16 x i8> @vec_splats(unsigned char)
188 // CHECK-LE: call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16])
189 // CHECK-LE: call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16])
190 // CHECK-LE: [[REG106:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_or(unsigned char vector[16], unsigned char vector[16])
191 // CHECK-LE-NEXT: [[REG107:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG106]] to <2 x i64>
192 // CHECK-LE-NEXT: store <2 x i64> [[REG107]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
193 
194 // CHECK: [[REG72]]:
195 // CHECK-NEXT: [[REG108:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
196 // CHECK-NEXT: ret <2 x i64> [[REG108]]
197 
198 // CHECK: define available_externally i64 @_mm_alignr_pi8(i64 [[REG109:[0-9a-zA-Z_%.]+]], i64 [[REG110:[0-9a-zA-Z_%.]+]], i32 zeroext [[REG111:[0-9a-zA-Z_%.]+]])
199 // CHECK: store i64 [[REG109]], i64* [[REG112:[0-9a-zA-Z_%.]+]], align 8
200 // CHECK-NEXT: store i64 [[REG110]], i64* [[REG113:[0-9a-zA-Z_%.]+]], align 8
201 // CHECK-NEXT: store i32 [[REG111]], i32* [[REG114:[0-9a-zA-Z_%.]+]], align 4
202 // CHECK-NEXT: [[REG115:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG114]], align 4
203 // CHECK-NEXT: [[REG116:[0-9a-zA-Z_%.]+]] = icmp ult i32 [[REG115]], 16
204 // CHECK-NEXT: br i1 [[REG116]], label %[[REG117:[0-9a-zA-Z_%.]+]], label %[[REG118:[0-9a-zA-Z_%.]+]]
205 
206 // CHECK: [[REG117]]:
207 // CHECK-NEXT: [[REG119:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG113]], align 8
208 // CHECK-NEXT: [[REG120:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG119]], i32 0
209 // CHECK-NEXT: [[REG121:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG112]], align 8
210 // CHECK-NEXT: [[REG122:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG120]], i64 [[REG121]], i32 1
211 // CHECK-NEXT: store <2 x i64> [[REG122]], <2 x i64>* [[REG123:[0-9a-zA-Z_%.]+]], align 16
212 // CHECK-NEXT: [[REG124:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG114]], align 4
213 // CHECK-NEXT: [[REG125:[0-9a-zA-Z_%.]+]] = shl i32 [[REG124]], 3
214 // CHECK-BE-NEXT: [[REG126:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, i32 [[REG125]], i32 3
215 // CHECK-LE-NEXT: [[REG127:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> undef, i32 [[REG125]], i32 0
216 // CHECK-LE-NEXT: [[REG128:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG127]], i32 0, i32 1
217 // CHECK-LE-NEXT: [[REG129:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG128]], i32 0, i32 2
218 // CHECK-LE-NEXT: [[REG126:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG129]], i32 0, i32 3
219 // CHECK-NEXT: store <4 x i32> [[REG126]], <4 x i32>* [[REG130:[0-9a-zA-Z_%.]+]], align 16
220 // CHECK-NEXT: [[REG131:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG123]], align 16
221 // CHECK-NEXT: [[REG132:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG131]] to <16 x i8>
222 // CHECK-NEXT: [[REG133:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG130]], align 16
223 // CHECK-NEXT: [[REG134:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG133]] to <16 x i8>
224 // CHECK-BE-NEXT: [[REG135:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_slo(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG132]], <16 x i8> [[REG134]])
225 // CHECK-LE-NEXT: [[REG135:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sro(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG132]], <16 x i8> [[REG134]])
226 // CHECK-NEXT: [[REG136:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG135]] to <2 x i64>
227 // CHECK-NEXT: store <2 x i64> [[REG136]], <2 x i64>* [[REG123]], align 16
228 // CHECK-NEXT: [[REG137:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG123]], align 16
229 // CHECK-NEXT: [[REG138:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG137]], i32 0
230 // CHECK-NEXT: store i64 [[REG138]], i64* [[REG139:[0-9a-zA-Z_%.]+]], align 8
231 // CHECK-NEXT: br label %[[REG140:[0-9a-zA-Z_%.]+]]
232 
233 // CHECK: [[REG118]]:
234 // CHECK-NEXT: store i64 0, i64* [[REG141:[0-9a-zA-Z_%.]+]], align 8
235 // CHECK-NEXT: store i64 0, i64* [[REG139]], align 8
236 // CHECK-NEXT: br label %[[REG140:[0-9a-zA-Z_%.]+]]
237 
238 // CHECK: [[REG140]]:
239 // CHECK-NEXT: [[REG142:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG139]], align 8
240 // CHECK-NEXT: ret i64 [[REG142]]
241 
242 void __attribute__((noinline))
test_hadd()243 test_hadd() {
244   resi = _mm_hadd_epi16(mi1, mi2);
245   resi = _mm_hadd_epi32(mi1, mi2);
246   res = _mm_hadd_pi16(m1, m2);
247   res = _mm_hadd_pi32(m1, m2);
248   resi = _mm_hadds_epi16(mi1, mi2);
249   res = _mm_hadds_pi16(m1, m2);
250 }
251 
252 // CHECK-LABEL: @test_hadd
253 
254 // CHECK: define available_externally <2 x i64> @_mm_hadd_epi16(<2 x i64> [[REG143:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG144:[0-9a-zA-Z_%.]+]])
255 // CHECK: store <2 x i64> [[REG143]], <2 x i64>* [[REG145:[0-9a-zA-Z_%.]+]], align 16
256 // CHECK-NEXT: store <2 x i64> [[REG144]], <2 x i64>* [[REG146:[0-9a-zA-Z_%.]+]], align 16
257 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>, <16 x i8>* [[REG147:[0-9a-zA-Z_%.]+]], align 16
258 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>, <16 x i8>* [[REG148:[0-9a-zA-Z_%.]+]], align 16
259 // CHECK-NEXT: [[REG149:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG145]], align 16
260 // CHECK-NEXT: [[REG150:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG149]] to <8 x i16>
261 // CHECK-NEXT: [[REG151:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG146]], align 16
262 // CHECK-NEXT: [[REG152:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG151]] to <8 x i16>
263 // CHECK-NEXT: [[REG153:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG150]], <8 x i16> [[REG152]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>)
264 // CHECK-NEXT: store <8 x i16> [[REG153]], <8 x i16>* [[REG154:[0-9a-zA-Z_%.]+]], align 16
265 // CHECK-NEXT: [[REG155:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG145]], align 16
266 // CHECK-NEXT: [[REG156:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG155]] to <8 x i16>
267 // CHECK-NEXT: [[REG157:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG146]], align 16
268 // CHECK-NEXT: [[REG158:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG157]] to <8 x i16>
269 // CHECK-NEXT: [[REG159:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG156]], <8 x i16> [[REG158]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>)
270 // CHECK-NEXT: store <8 x i16> [[REG159]], <8 x i16>* [[REG160:[0-9a-zA-Z_%.]+]], align 16
271 // CHECK-NEXT: [[REG161:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG154]], align 16
272 // CHECK-NEXT: [[REG162:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG160]], align 16
273 // CHECK-NEXT: [[REG163:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_add(short vector[8], short vector[8])(<8 x i16> [[REG161]], <8 x i16> [[REG162]])
274 // CHECK-NEXT: [[REG164:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG163]] to <2 x i64>
275 // CHECK-NEXT: ret <2 x i64> [[REG164]]
276 
277 // CHECK: define available_externally <2 x i64> @_mm_hadd_epi32(<2 x i64> [[REG165:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG166:[0-9a-zA-Z_%.]+]])
278 // CHECK: store <2 x i64> [[REG165]], <2 x i64>* [[REG167:[0-9a-zA-Z_%.]+]], align 16
279 // CHECK-NEXT: store <2 x i64> [[REG166]], <2 x i64>* [[REG168:[0-9a-zA-Z_%.]+]], align 16
280 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 16, i8 17, i8 18, i8 19, i8 24, i8 25, i8 26, i8 27>, <16 x i8>* [[REG169:[0-9a-zA-Z_%.]+]], align 16
281 // CHECK-NEXT: store <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31>, <16 x i8>* [[REG170:[0-9a-zA-Z_%.]+]], align 16
282 // CHECK-NEXT: [[REG171:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG167]], align 16
283 // CHECK-NEXT: [[REG172:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG171]] to <4 x i32>
284 // CHECK-NEXT: [[REG173:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG168]], align 16
285 // CHECK-NEXT: [[REG174:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG173]] to <4 x i32>
286 // CHECK-NEXT: [[REG175:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG172]], <4 x i32> [[REG174]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 16, i8 17, i8 18, i8 19, i8 24, i8 25, i8 26, i8 27>)
287 // CHECK-NEXT: store <4 x i32> [[REG175]], <4 x i32>* [[REG176:[0-9a-zA-Z_%.]+]], align 16
288 // CHECK-NEXT: [[REG177:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG167]], align 16
289 // CHECK-NEXT: [[REG178:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG177]] to <4 x i32>
290 // CHECK-NEXT: [[REG179:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG168]], align 16
291 // CHECK-NEXT: [[REG180:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG179]] to <4 x i32>
292 // CHECK-NEXT: [[REG181:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG178]], <4 x i32> [[REG180]], <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31>)
293 // CHECK-NEXT: store <4 x i32> [[REG181]], <4 x i32>* [[REG182:[0-9a-zA-Z_%.]+]], align 16
294 // CHECK-NEXT: [[REG183:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG176]], align 16
295 // CHECK-NEXT: [[REG184:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG182]], align 16
296 // CHECK-NEXT: [[REG185:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG183]], <4 x i32> [[REG184]])
297 // CHECK-NEXT: [[REG186:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG185]] to <2 x i64>
298 // CHECK-NEXT: ret <2 x i64> [[REG186]]
299 
300 // CHECK: define available_externally i64 @_mm_hadd_pi16(i64 [[REG187:[0-9a-zA-Z_%.]+]], i64 [[REG188:[0-9a-zA-Z_%.]+]])
301 // CHECK: store i64 [[REG187]], i64* [[REG189:[0-9a-zA-Z_%.]+]], align 8
302 // CHECK-NEXT: store i64 [[REG188]], i64* [[REG190:[0-9a-zA-Z_%.]+]], align 8
303 // CHECK-NEXT: [[REG191:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG189]], align 8
304 // CHECK-NEXT: [[REG192:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG191]], i32 0
305 // CHECK-NEXT: [[REG193:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG190]], align 8
306 // CHECK-NEXT: [[REG194:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG192]], i64 [[REG193]], i32 1
307 // CHECK-NEXT: store <2 x i64> [[REG194]], <2 x i64>* [[REG195:[0-9a-zA-Z_%.]+]], align 16
308 // CHECK-NEXT: [[REG196:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG195]], align 16
309 // CHECK-NEXT: [[REG197:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG196]] to <8 x i16>
310 // CHECK-NEXT: store <8 x i16> [[REG197]], <8 x i16>* [[REG198:[0-9a-zA-Z_%.]+]], align 16
311 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13>, <16 x i8>* [[REG199:[0-9a-zA-Z_%.]+]], align 16
312 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15>, <16 x i8>* [[REG200:[0-9a-zA-Z_%.]+]], align 16
313 // CHECK-NEXT: [[REG201:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16
314 // CHECK-NEXT: [[REG202:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16
315 // CHECK-NEXT: [[REG203:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG201]], <8 x i16> [[REG202]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15>)
316 // CHECK-NEXT: store <8 x i16> [[REG203]], <8 x i16>* [[REG204:[0-9a-zA-Z_%.]+]], align 16
317 // CHECK-NEXT: [[REG205:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16
318 // CHECK-NEXT: [[REG206:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16
319 // CHECK-NEXT: [[REG207:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG205]], <8 x i16> [[REG206]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13>)
320 // CHECK-NEXT: store <8 x i16> [[REG207]], <8 x i16>* [[REG198]], align 16
321 // CHECK-NEXT: [[REG208:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16
322 // CHECK-NEXT: [[REG209:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG204]], align 16
323 // CHECK-NEXT: [[REG210:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_add(short vector[8], short vector[8])(<8 x i16> [[REG208]], <8 x i16> [[REG209]])
324 // CHECK-NEXT: store <8 x i16> [[REG210]], <8 x i16>* [[REG198]], align 16
325 // CHECK-NEXT: [[REG211:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG198]], align 16
326 // CHECK-NEXT: [[REG212:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG211]] to <2 x i64>
327 // CHECK-NEXT: [[REG213:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG212]], i32 1
328 // CHECK-NEXT: ret i64 [[REG213]]
329 
330 // CHECK: define available_externally i64 @_mm_hadd_pi32(i64 [[REG214:[0-9a-zA-Z_%.]+]], i64 [[REG215:[0-9a-zA-Z_%.]+]])
331 // CHECK: store i64 [[REG214]], i64* [[REG216:[0-9a-zA-Z_%.]+]], align 8
332 // CHECK-NEXT: store i64 [[REG215]], i64* [[REG217:[0-9a-zA-Z_%.]+]], align 8
333 // CHECK-NEXT: [[REG218:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG216]], align 8
334 // CHECK-NEXT: [[REG219:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG218]], i32 0
335 // CHECK-NEXT: [[REG220:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG217]], align 8
336 // CHECK-NEXT: [[REG221:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG219]], i64 [[REG220]], i32 1
337 // CHECK-NEXT: store <2 x i64> [[REG221]], <2 x i64>* [[REG222:[0-9a-zA-Z_%.]+]], align 16
338 // CHECK-NEXT: [[REG223:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG222]], align 16
339 // CHECK-NEXT: [[REG224:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG223]] to <4 x i32>
340 // CHECK-NEXT: store <4 x i32> [[REG224]], <4 x i32>* [[REG225:[0-9a-zA-Z_%.]+]], align 16
341 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>, <16 x i8>* [[REG226:[0-9a-zA-Z_%.]+]], align 16
342 // CHECK-NEXT: store <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15>, <16 x i8>* [[REG227:[0-9a-zA-Z_%.]+]], align 16
343 // CHECK-NEXT: [[REG228:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16
344 // CHECK-NEXT: [[REG229:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16
345 // CHECK-NEXT: [[REG230:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG228]], <4 x i32> [[REG229]], <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15>)
346 // CHECK-NEXT: store <4 x i32> [[REG230]], <4 x i32>* [[REG231:[0-9a-zA-Z_%.]+]], align 16
347 // CHECK-NEXT: [[REG232:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16
348 // CHECK-NEXT: [[REG233:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16
349 // CHECK-NEXT: [[REG234:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG232]], <4 x i32> [[REG233]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
350 // CHECK-NEXT: store <4 x i32> [[REG234]], <4 x i32>* [[REG225]], align 16
351 // CHECK-NEXT: [[REG235:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16
352 // CHECK-NEXT: [[REG236:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG231]], align 16
353 // CHECK-NEXT: [[REG237:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG235]], <4 x i32> [[REG236]])
354 // CHECK-NEXT: store <4 x i32> [[REG237]], <4 x i32>* [[REG225]], align 16
355 // CHECK-NEXT: [[REG238:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG225]], align 16
356 // CHECK-NEXT: [[REG239:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG238]] to <2 x i64>
357 // CHECK-NEXT: [[REG240:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG239]], i32 1
358 // CHECK-NEXT: ret i64 [[REG240]]
359 
360 // CHECK: define available_externally <2 x i64> @_mm_hadds_epi16(<2 x i64> [[REG241:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG242:[0-9a-zA-Z_%.]+]])
361 // CHECK: store <2 x i64> [[REG241]], <2 x i64>* [[REG243:[0-9a-zA-Z_%.]+]], align 16
362 // CHECK-NEXT: store <2 x i64> [[REG242]], <2 x i64>* [[REG244:[0-9a-zA-Z_%.]+]], align 16
363 // CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG245:[0-9a-zA-Z_%.]+]], align 16
364 // CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG246:[0-9a-zA-Z_%.]+]], align 16
365 // CHECK-NEXT: [[REG247:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG243]], align 16
366 // CHECK-NEXT: [[REG248:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG247]] to <8 x i16>
367 // CHECK-NEXT: [[REG249:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG245]], align 16
368 // CHECK-NEXT: [[REG250:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(short vector[8], int vector[4])(<8 x i16> [[REG248]], <4 x i32> [[REG249]])
369 // CHECK-NEXT: store <4 x i32> [[REG250]], <4 x i32>* [[REG245]], align 16
370 // CHECK-NEXT: [[REG251:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG244]], align 16
371 // CHECK-NEXT: [[REG252:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG251]] to <8 x i16>
372 // CHECK-NEXT: [[REG253:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG246]], align 16
373 // CHECK-NEXT: [[REG254:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(short vector[8], int vector[4])(<8 x i16> [[REG252]], <4 x i32> [[REG253]])
374 // CHECK-NEXT: store <4 x i32> [[REG254]], <4 x i32>* [[REG246]], align 16
375 // CHECK-NEXT: [[REG255:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG245]], align 16
376 // CHECK-NEXT: [[REG256:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG246]], align 16
377 // CHECK-NEXT: [[REG257:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_packs(int vector[4], int vector[4])(<4 x i32> [[REG255]], <4 x i32> [[REG256]])
378 // CHECK-NEXT: [[REG258:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG257]] to <4 x i32>
379 // CHECK-NEXT: store <4 x i32> [[REG258]], <4 x i32>* [[REG245]], align 16
380 // CHECK-NEXT: [[REG259:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG245]], align 16
381 // CHECK-NEXT: [[REG260:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG259]] to <2 x i64>
382 // CHECK-NEXT: ret <2 x i64> [[REG260]]
383 
384 // CHECK: define available_externally i64 @_mm_hadds_pi16(i64 [[REG261:[0-9a-zA-Z_%.]+]], i64 [[REG262:[0-9a-zA-Z_%.]+]])
385 // CHECK: store i64 [[REG261]], i64* [[REG263:[0-9a-zA-Z_%.]+]], align 8
386 // CHECK-NEXT: store i64 [[REG262]], i64* [[REG264:[0-9a-zA-Z_%.]+]], align 8
387 // CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG265:[0-9a-zA-Z_%.]+]], align 16
388 // CHECK-NEXT: [[REG266:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG263]], align 8
389 // CHECK-NEXT: [[REG267:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG266]], i32 0
390 // CHECK-NEXT: [[REG268:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG264]], align 8
391 // CHECK-NEXT: [[REG269:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG267]], i64 [[REG268]], i32 1
392 // CHECK-NEXT: store <2 x i64> [[REG269]], <2 x i64>* [[REG270:[0-9a-zA-Z_%.]+]], align 16
393 // CHECK-NEXT: [[REG271:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG270]], align 16
394 // CHECK-NEXT: [[REG272:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG271]] to <8 x i16>
395 // CHECK-NEXT: store <8 x i16> [[REG272]], <8 x i16>* [[REG273:[0-9a-zA-Z_%.]+]], align 16
396 // CHECK-NEXT: [[REG274:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG273]], align 16
397 // CHECK-NEXT: [[REG275:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(short vector[8], int vector[4])(<8 x i16> [[REG274]], <4 x i32> zeroinitializer)
398 // CHECK-NEXT: store <4 x i32> [[REG275]], <4 x i32>* [[REG276:[0-9a-zA-Z_%.]+]], align 16
399 // CHECK-NEXT: [[REG277:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG276]], align 16
400 // CHECK-NEXT: [[REG278:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG276]], align 16
401 // CHECK-NEXT: [[REG279:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_packs(int vector[4], int vector[4])(<4 x i32> [[REG277]], <4 x i32> [[REG278]])
402 // CHECK-NEXT: store <8 x i16> [[REG279]], <8 x i16>* [[REG273]], align 16
403 // CHECK-NEXT: [[REG280:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG273]], align 16
404 // CHECK-NEXT: [[REG281:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG280]] to <2 x i64>
405 // CHECK-NEXT: [[REG282:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG281]], i32 1
406 // CHECK-NEXT: ret i64 [[REG282]]
407 
408 void __attribute__((noinline))
test_hsub()409 test_hsub() {
410   resi = _mm_hsub_epi16(mi1, mi2);
411   resi = _mm_hsub_epi32(mi1, mi2);
412   res = _mm_hsub_pi16(m1, m2);
413   res = _mm_hsub_pi32(m1, m2);
414   resi = _mm_hsubs_epi16(mi1, mi2);
415   res = _mm_hsubs_pi16(m1, m2);
416 }
417 
418 // CHECK-LABEL: @test_hsub
419 
420 // CHECK: define available_externally <2 x i64> @_mm_hsub_epi16(<2 x i64> [[REG283:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG284:[0-9a-zA-Z_%.]+]])
421 // CHECK: store <2 x i64> [[REG283]], <2 x i64>* [[REG285:[0-9a-zA-Z_%.]+]], align 16
422 // CHECK-NEXT: store <2 x i64> [[REG284]], <2 x i64>* [[REG286:[0-9a-zA-Z_%.]+]], align 16
423 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>, <16 x i8>* [[REG287:[0-9a-zA-Z_%.]+]], align 16
424 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>, <16 x i8>* [[REG288:[0-9a-zA-Z_%.]+]], align 16
425 // CHECK-NEXT: [[REG289:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG285]], align 16
426 // CHECK-NEXT: [[REG290:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG289]] to <8 x i16>
427 // CHECK-NEXT: [[REG291:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG286]], align 16
428 // CHECK-NEXT: [[REG292:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG291]] to <8 x i16>
429 // CHECK-NEXT: [[REG293:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG290]], <8 x i16> [[REG292]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>)
430 // CHECK-NEXT: store <8 x i16> [[REG293]], <8 x i16>* [[REG294:[0-9a-zA-Z_%.]+]], align 16
431 // CHECK-NEXT: [[REG295:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG285]], align 16
432 // CHECK-NEXT: [[REG296:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG295]] to <8 x i16>
433 // CHECK-NEXT: [[REG297:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG286]], align 16
434 // CHECK-NEXT: [[REG298:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG297]] to <8 x i16>
435 // CHECK-NEXT: [[REG299:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG296]], <8 x i16> [[REG298]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>)
436 // CHECK-NEXT: store <8 x i16> [[REG299]], <8 x i16>* [[REG300:[0-9a-zA-Z_%.]+]], align 16
437 // CHECK-NEXT: [[REG301:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG294]], align 16
438 // CHECK-NEXT: [[REG302:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG300]], align 16
439 // CHECK-NEXT: [[REG303:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sub(short vector[8], short vector[8])(<8 x i16> [[REG301]], <8 x i16> [[REG302]])
440 // CHECK-NEXT: [[REG304:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG303]] to <2 x i64>
441 // CHECK-NEXT: ret <2 x i64> [[REG304]]
442 
443 // CHECK: define available_externally <2 x i64> @_mm_hsub_epi32(<2 x i64> [[REG305:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG306:[0-9a-zA-Z_%.]+]])
444 // CHECK: store <2 x i64> [[REG305]], <2 x i64>* [[REG307:[0-9a-zA-Z_%.]+]], align 16
445 // CHECK-NEXT: store <2 x i64> [[REG306]], <2 x i64>* [[REG308:[0-9a-zA-Z_%.]+]], align 16
446 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 16, i8 17, i8 18, i8 19, i8 24, i8 25, i8 26, i8 27>, <16 x i8>* [[REG309:[0-9a-zA-Z_%.]+]], align 16
447 // CHECK-NEXT: store <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31>, <16 x i8>* [[REG310:[0-9a-zA-Z_%.]+]], align 16
448 // CHECK-NEXT: [[REG311:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG307]], align 16
449 // CHECK-NEXT: [[REG312:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG311]] to <4 x i32>
450 // CHECK-NEXT: [[REG313:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG308]], align 16
451 // CHECK-NEXT: [[REG314:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG313]] to <4 x i32>
452 // CHECK-NEXT: [[REG315:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG312]], <4 x i32> [[REG314]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 16, i8 17, i8 18, i8 19, i8 24, i8 25, i8 26, i8 27>)
453 // CHECK-NEXT: store <4 x i32> [[REG315]], <4 x i32>* [[REG316:[0-9a-zA-Z_%.]+]], align 16
454 // CHECK-NEXT: [[REG317:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG307]], align 16
455 // CHECK-NEXT: [[REG318:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG317]] to <4 x i32>
456 // CHECK-NEXT: [[REG319:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG308]], align 16
457 // CHECK-NEXT: [[REG320:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG319]] to <4 x i32>
458 // CHECK-NEXT: [[REG321:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG318]], <4 x i32> [[REG320]], <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 20, i8 21, i8 22, i8 23, i8 28, i8 29, i8 30, i8 31>)
459 // CHECK-NEXT: store <4 x i32> [[REG321]], <4 x i32>* [[REG322:[0-9a-zA-Z_%.]+]], align 16
460 // CHECK-NEXT: [[REG323:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG316]], align 16
461 // CHECK-NEXT: [[REG324:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG322]], align 16
462 // CHECK-NEXT: [[REG325:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sub(int vector[4], int vector[4])(<4 x i32> [[REG323]], <4 x i32> [[REG324]])
463 // CHECK-NEXT: [[REG326:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG325]] to <2 x i64>
464 // CHECK-NEXT: ret <2 x i64> [[REG326]]
465 
466 // CHECK: define available_externally i64 @_mm_hsub_pi16(i64 [[REG327:[0-9a-zA-Z_%.]+]], i64 [[REG328:[0-9a-zA-Z_%.]+]])
467 // CHECK: store i64 [[REG327]], i64* [[REG329:[0-9a-zA-Z_%.]+]], align 8
468 // CHECK-NEXT: store i64 [[REG328]], i64* [[REG330:[0-9a-zA-Z_%.]+]], align 8
469 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13>, <16 x i8>* [[REG331:[0-9a-zA-Z_%.]+]], align 16
470 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15>, <16 x i8>* [[REG332:[0-9a-zA-Z_%.]+]], align 16
471 // CHECK-NEXT: [[REG333:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG329]], align 8
472 // CHECK-NEXT: [[REG334:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG333]], i32 0
473 // CHECK-NEXT: [[REG335:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG330]], align 8
474 // CHECK-NEXT: [[REG336:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG334]], i64 [[REG335]], i32 1
475 // CHECK-NEXT: store <2 x i64> [[REG336]], <2 x i64>* [[REG337:[0-9a-zA-Z_%.]+]], align 16
476 // CHECK-NEXT: [[REG338:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG337]], align 16
477 // CHECK-NEXT: [[REG339:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG338]] to <8 x i16>
478 // CHECK-NEXT: store <8 x i16> [[REG339]], <8 x i16>* [[REG340:[0-9a-zA-Z_%.]+]], align 16
479 // CHECK-NEXT: [[REG341:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16
480 // CHECK-NEXT: [[REG342:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16
481 // CHECK-NEXT: [[REG343:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG341]], <8 x i16> [[REG342]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15>)
482 // CHECK-NEXT: store <8 x i16> [[REG343]], <8 x i16>* [[REG344:[0-9a-zA-Z_%.]+]], align 16
483 // CHECK-NEXT: [[REG345:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16
484 // CHECK-NEXT: [[REG346:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16
485 // CHECK-NEXT: [[REG347:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG345]], <8 x i16> [[REG346]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13>)
486 // CHECK-NEXT: store <8 x i16> [[REG347]], <8 x i16>* [[REG340]], align 16
487 // CHECK-NEXT: [[REG348:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16
488 // CHECK-NEXT: [[REG349:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG344]], align 16
489 // CHECK-NEXT: [[REG350:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sub(short vector[8], short vector[8])(<8 x i16> [[REG348]], <8 x i16> [[REG349]])
490 // CHECK-NEXT: store <8 x i16> [[REG350]], <8 x i16>* [[REG340]], align 16
491 // CHECK-NEXT: [[REG351:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG340]], align 16
492 // CHECK-NEXT: [[REG352:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG351]] to <2 x i64>
493 // CHECK-NEXT: [[REG353:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG352]], i32 1
494 // CHECK-NEXT: ret i64 [[REG353]]
495 
496 // CHECK: define available_externally i64 @_mm_hsub_pi32(i64 [[REG354:[0-9a-zA-Z_%.]+]], i64 [[REG355:[0-9a-zA-Z_%.]+]])
497 // CHECK: store i64 [[REG354]], i64* [[REG356:[0-9a-zA-Z_%.]+]], align 8
498 // CHECK-NEXT: store i64 [[REG355]], i64* [[REG357:[0-9a-zA-Z_%.]+]], align 8
499 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>, <16 x i8>* [[REG358:[0-9a-zA-Z_%.]+]], align 16
500 // CHECK-NEXT: store <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15>, <16 x i8>* [[REG359:[0-9a-zA-Z_%.]+]], align 16
501 // CHECK-NEXT: [[REG360:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG356]], align 8
502 // CHECK-NEXT: [[REG361:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG360]], i32 0
503 // CHECK-NEXT: [[REG362:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG357]], align 8
504 // CHECK-NEXT: [[REG363:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG361]], i64 [[REG362]], i32 1
505 // CHECK-NEXT: store <2 x i64> [[REG363]], <2 x i64>* [[REG364:[0-9a-zA-Z_%.]+]], align 16
506 // CHECK-NEXT: [[REG365:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG364]], align 16
507 // CHECK-NEXT: [[REG366:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG365]] to <4 x i32>
508 // CHECK-NEXT: store <4 x i32> [[REG366]], <4 x i32>* [[REG367:[0-9a-zA-Z_%.]+]], align 16
509 // CHECK-NEXT: [[REG368:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16
510 // CHECK-NEXT: [[REG369:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16
511 // CHECK-NEXT: [[REG370:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG368]], <4 x i32> [[REG369]], <16 x i8> <i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15, i8 4, i8 5, i8 6, i8 7, i8 12, i8 13, i8 14, i8 15>)
512 // CHECK-NEXT: store <4 x i32> [[REG370]], <4 x i32>* [[REG371:[0-9a-zA-Z_%.]+]], align 16
513 // CHECK-NEXT: [[REG372:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16
514 // CHECK-NEXT: [[REG373:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16
515 // CHECK-NEXT: [[REG374:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(int vector[4], int vector[4], unsigned char vector[16])(<4 x i32> [[REG372]], <4 x i32> [[REG373]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11, i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
516 // CHECK-NEXT: store <4 x i32> [[REG374]], <4 x i32>* [[REG367]], align 16
517 // CHECK-NEXT: [[REG375:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16
518 // CHECK-NEXT: [[REG376:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG371]], align 16
519 // CHECK-NEXT: [[REG377:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sub(int vector[4], int vector[4])(<4 x i32> [[REG375]], <4 x i32> [[REG376]])
520 // CHECK-NEXT: store <4 x i32> [[REG377]], <4 x i32>* [[REG367]], align 16
521 // CHECK-NEXT: [[REG378:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG367]], align 16
522 // CHECK-NEXT: [[REG379:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG378]] to <2 x i64>
523 // CHECK-NEXT: [[REG380:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG379]], i32 1
524 // CHECK-NEXT: ret i64 [[REG380]]
525 
526 // CHECK: define available_externally <2 x i64> @_mm_hsubs_epi16(<2 x i64> [[REG381:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG382:[0-9a-zA-Z_%.]+]])
527 // CHECK: store <2 x i64> [[REG381]], <2 x i64>* [[REG383:[0-9a-zA-Z_%.]+]], align 16
528 // CHECK-NEXT: store <2 x i64> [[REG382]], <2 x i64>* [[REG384:[0-9a-zA-Z_%.]+]], align 16
529 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>, <16 x i8>* [[REG385:[0-9a-zA-Z_%.]+]], align 16
530 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>, <16 x i8>* [[REG386:[0-9a-zA-Z_%.]+]], align 16
531 // CHECK-NEXT: [[REG387:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG383]], align 16
532 // CHECK-NEXT: [[REG388:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG387]] to <8 x i16>
533 // CHECK-NEXT: [[REG389:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG384]], align 16
534 // CHECK-NEXT: [[REG390:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG389]] to <8 x i16>
535 // CHECK-NEXT: [[REG391:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG388]], <8 x i16> [[REG390]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>)
536 // CHECK-NEXT: store <8 x i16> [[REG391]], <8 x i16>* [[REG392:[0-9a-zA-Z_%.]+]], align 16
537 // CHECK-NEXT: [[REG393:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG383]], align 16
538 // CHECK-NEXT: [[REG394:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG393]] to <8 x i16>
539 // CHECK-NEXT: [[REG395:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG384]], align 16
540 // CHECK-NEXT: [[REG396:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG395]] to <8 x i16>
541 // CHECK-NEXT: [[REG397:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG394]], <8 x i16> [[REG396]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>)
542 // CHECK-NEXT: store <8 x i16> [[REG397]], <8 x i16>* [[REG398:[0-9a-zA-Z_%.]+]], align 16
543 // CHECK-NEXT: [[REG399:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG392]], align 16
544 // CHECK-NEXT: [[REG400:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG398]], align 16
545 // CHECK-NEXT: [[REG401:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_subs(short vector[8], short vector[8])(<8 x i16> [[REG399]], <8 x i16> [[REG400]])
546 // CHECK-NEXT: [[REG402:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG401]] to <2 x i64>
547 // CHECK-NEXT: ret <2 x i64> [[REG402]]
548 
549 // CHECK: define available_externally i64 @_mm_hsubs_pi16(i64 [[REG403:[0-9a-zA-Z_%.]+]], i64 [[REG404:[0-9a-zA-Z_%.]+]])
550 // CHECK: store i64 [[REG403]], i64* [[REG405:[0-9a-zA-Z_%.]+]], align 8
551 // CHECK-NEXT: store i64 [[REG404]], i64* [[REG406:[0-9a-zA-Z_%.]+]], align 8
552 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13>, <16 x i8>* [[REG407:[0-9a-zA-Z_%.]+]], align 16
553 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15>, <16 x i8>* [[REG408:[0-9a-zA-Z_%.]+]], align 16
554 // CHECK-NEXT: [[REG409:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG405]], align 8
555 // CHECK-NEXT: [[REG410:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG409]], i32 0
556 // CHECK-NEXT: [[REG411:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG406]], align 8
557 // CHECK-NEXT: [[REG412:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG410]], i64 [[REG411]], i32 1
558 // CHECK-NEXT: store <2 x i64> [[REG412]], <2 x i64>* [[REG413:[0-9a-zA-Z_%.]+]], align 16
559 // CHECK-NEXT: [[REG414:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG413]], align 16
560 // CHECK-NEXT: [[REG415:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG414]] to <8 x i16>
561 // CHECK-NEXT: store <8 x i16> [[REG415]], <8 x i16>* [[REG416:[0-9a-zA-Z_%.]+]], align 16
562 // CHECK-NEXT: [[REG417:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16
563 // CHECK-NEXT: [[REG418:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16
564 // CHECK-NEXT: [[REG419:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG417]], <8 x i16> [[REG418]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13>)
565 // CHECK-NEXT: store <8 x i16> [[REG419]], <8 x i16>* [[REG420:[0-9a-zA-Z_%.]+]], align 16
566 // CHECK-NEXT: [[REG421:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16
567 // CHECK-NEXT: [[REG422:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16
568 // CHECK-NEXT: [[REG423:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG421]], <8 x i16> [[REG422]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15>)
569 // CHECK-NEXT: store <8 x i16> [[REG423]], <8 x i16>* [[REG424:[0-9a-zA-Z_%.]+]], align 16
570 // CHECK-NEXT: [[REG425:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG420]], align 16
571 // CHECK-NEXT: [[REG426:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG424]], align 16
572 // CHECK-NEXT: [[REG427:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_subs(short vector[8], short vector[8])(<8 x i16> [[REG425]], <8 x i16> [[REG426]])
573 // CHECK-NEXT: store <8 x i16> [[REG427]], <8 x i16>* [[REG416]], align 16
574 // CHECK-NEXT: [[REG428:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG416]], align 16
575 // CHECK-NEXT: [[REG429:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG428]] to <2 x i64>
576 // CHECK-NEXT: [[REG430:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG429]], i32 1
577 // CHECK-NEXT: ret i64 [[REG430]]
578 
579 void __attribute__((noinline))
test_shuffle()580 test_shuffle() {
581   resi = _mm_shuffle_epi8(mi1, mi2);
582   res = _mm_shuffle_pi8(m1, m2);
583 }
584 
585 // CHECK-LABEL: @test_shuffle
586 
587 // CHECK: define available_externally <2 x i64> @_mm_shuffle_epi8(<2 x i64> [[REG431:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG432:[0-9a-zA-Z_%.]+]])
588 // CHECK: store <2 x i64> [[REG431]], <2 x i64>* [[REG433:[0-9a-zA-Z_%.]+]], align 16
589 // CHECK-NEXT: store <2 x i64> [[REG432]], <2 x i64>* [[REG434:[0-9a-zA-Z_%.]+]], align 16
590 // CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG435:[0-9a-zA-Z_%.]+]], align 16
591 // CHECK-NEXT: [[REG436:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG434]], align 16
592 // CHECK-NEXT: [[REG437:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG436]] to <16 x i8>
593 // CHECK-NEXT: [[REG438:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG437]], <16 x i8> zeroinitializer)
594 // CHECK-NEXT: store <16 x i8> [[REG438]], <16 x i8>* [[REG439:[0-9a-zA-Z_%.]+]], align 16
595 // CHECK-NEXT: [[REG440:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG433]], align 16
596 // CHECK-NEXT: [[REG441:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG440]] to <16 x i8>
597 // CHECK-NEXT: [[REG442:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG433]], align 16
598 // CHECK-NEXT: [[REG443:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG442]] to <16 x i8>
599 // CHECK-NEXT: [[REG444:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG434]], align 16
600 // CHECK-NEXT: [[REG445:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG444]] to <16 x i8>
601 // CHECK-NEXT: [[REG446:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_perm(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> [[REG441]], <16 x i8> [[REG443]], <16 x i8> [[REG445]])
602 // CHECK-NEXT: store <16 x i8> [[REG446]], <16 x i8>* [[REG447:[0-9a-zA-Z_%.]+]], align 16
603 // CHECK-NEXT: [[REG448:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG447]], align 16
604 // CHECK-NEXT: [[REG449:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG439]], align 16
605 // CHECK-NEXT: [[REG450:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], bool vector[16])(<16 x i8> [[REG448]], <16 x i8> zeroinitializer, <16 x i8> [[REG449]])
606 // CHECK-NEXT: [[REG451:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG450]] to <2 x i64>
607 // CHECK-NEXT: ret <2 x i64> [[REG451]]
608 
609 // CHECK: define available_externally i64 @_mm_shuffle_pi8(i64 [[REG452:[0-9a-zA-Z_%.]+]], i64 [[REG453:[0-9a-zA-Z_%.]+]])
610 // CHECK: store i64 [[REG452]], i64* [[REG454:[0-9a-zA-Z_%.]+]], align 8
611 // CHECK-NEXT: store i64 [[REG453]], i64* [[REG455:[0-9a-zA-Z_%.]+]], align 8
612 // CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG456:[0-9a-zA-Z_%.]+]], align 16
613 // CHECK-NEXT: [[REG457:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG454]], align 8
614 // CHECK-NEXT: [[REG458:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG457]], i32 0
615 // CHECK-NEXT: [[REG459:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG454]], align 8
616 // CHECK-NEXT: [[REG460:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG458]], i64 [[REG459]], i32 1
617 // CHECK-NEXT: store <2 x i64> [[REG460]], <2 x i64>* [[REG461:[0-9a-zA-Z_%.]+]], align 16
618 // CHECK-NEXT: [[REG462:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG461]], align 16
619 // CHECK-NEXT: [[REG463:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG462]] to <16 x i8>
620 // CHECK-NEXT: store <16 x i8> [[REG463]], <16 x i8>* [[REG464:[0-9a-zA-Z_%.]+]], align 16
621 // CHECK-NEXT: [[REG465:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG455]], align 8
622 // CHECK-NEXT: [[REG466:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG465]], i32 0
623 // CHECK-NEXT: [[REG467:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG455]], align 8
624 // CHECK-NEXT: [[REG468:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG466]], i64 [[REG467]], i32 1
625 // CHECK-NEXT: store <2 x i64> [[REG468]], <2 x i64>* [[REG469:[0-9a-zA-Z_%.]+]], align 16
626 // CHECK-NEXT: [[REG470:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG469]], align 16
627 // CHECK-NEXT: [[REG471:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG470]] to <16 x i8>
628 // CHECK-NEXT: store <16 x i8> [[REG471]], <16 x i8>* [[REG472:[0-9a-zA-Z_%.]+]], align 16
629 // CHECK-NEXT: [[REG473:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG472]], align 16
630 // CHECK-NEXT: [[REG474:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG473]], <16 x i8> zeroinitializer)
631 // CHECK-NEXT: store <16 x i8> [[REG474]], <16 x i8>* [[REG475:[0-9a-zA-Z_%.]+]], align 16
632 // CHECK-NEXT: [[REG476:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16
633 // CHECK-NEXT: [[REG477:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16
634 // CHECK-NEXT: [[REG478:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG472]], align 16
635 // CHECK-NEXT: [[REG479:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_perm(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> [[REG476]], <16 x i8> [[REG477]], <16 x i8> [[REG478]])
636 // CHECK-NEXT: store <16 x i8> [[REG479]], <16 x i8>* [[REG464]], align 16
637 // CHECK-NEXT: [[REG480:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16
638 // CHECK-NEXT: [[REG481:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG475]], align 16
639 // CHECK-NEXT: [[REG482:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], bool vector[16])(<16 x i8> [[REG480]], <16 x i8> zeroinitializer, <16 x i8> [[REG481]])
640 // CHECK-NEXT: store <16 x i8> [[REG482]], <16 x i8>* [[REG464]], align 16
641 // CHECK-NEXT: [[REG483:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG464]], align 16
642 // CHECK-NEXT: [[REG484:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG483]] to <2 x i64>
643 // CHECK-NEXT: [[REG485:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG484]], i32 0
644 // CHECK-NEXT: ret i64 [[REG485]]
645 
646 void __attribute__((noinline))
test_sign()647 test_sign() {
648   resi = _mm_sign_epi8(mi1, mi2);
649   resi = _mm_sign_epi16(mi1, mi2);
650   resi = _mm_sign_epi32(mi1, mi2);
651   res = _mm_sign_pi8(m1, m2);
652   res = _mm_sign_pi16(m1, m2);
653   res = _mm_sign_pi32(m1, m2);
654 }
655 
656 // CHECK-LABEL: @test_sign
657 
658 // CHECK: define available_externally <2 x i64> @_mm_sign_epi8(<2 x i64> [[REG486:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG487:[0-9a-zA-Z_%.]+]])
659 // CHECK: store <2 x i64> [[REG486]], <2 x i64>* [[REG488:[0-9a-zA-Z_%.]+]], align 16
660 // CHECK-NEXT: store <2 x i64> [[REG487]], <2 x i64>* [[REG489:[0-9a-zA-Z_%.]+]], align 16
661 // CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG490:[0-9a-zA-Z_%.]+]], align 16
662 // CHECK-NEXT: [[REG491:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG489]], align 16
663 // CHECK-NEXT: [[REG492:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG491]] to <16 x i8>
664 // CHECK-NEXT: [[REG493:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG492]], <16 x i8> zeroinitializer)
665 // CHECK-NEXT: store <16 x i8> [[REG493]], <16 x i8>* [[REG494:[0-9a-zA-Z_%.]+]], align 16
666 // CHECK-NEXT: [[REG495:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG489]], align 16
667 // CHECK-NEXT: [[REG496:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG495]] to <16 x i8>
668 // CHECK-NEXT: [[REG497:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmpgt(signed char vector[16], signed char vector[16])(<16 x i8> [[REG496]], <16 x i8> zeroinitializer)
669 // CHECK-NEXT: [[REG498:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_neg(signed char vector[16])(<16 x i8> [[REG497]])
670 // CHECK-NEXT: store <16 x i8> [[REG498]], <16 x i8>* [[REG499:[0-9a-zA-Z_%.]+]], align 16
671 // CHECK-NEXT: [[REG500:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG494]], align 16
672 // CHECK-NEXT: [[REG501:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG499]], align 16
673 // CHECK-NEXT: [[REG502:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_add(signed char vector[16], signed char vector[16])(<16 x i8> [[REG500]], <16 x i8> [[REG501]])
674 // CHECK-NEXT: store <16 x i8> [[REG502]], <16 x i8>* [[REG503:[0-9a-zA-Z_%.]+]], align 16
675 // CHECK-NEXT: [[REG504:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG488]], align 16
676 // CHECK-NEXT: [[REG505:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG504]] to <16 x i8>
677 // CHECK-NEXT: [[REG506:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG503]], align 16
678 // CHECK-NEXT: [[REG507:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_mul(signed char vector[16], signed char vector[16])(<16 x i8> [[REG505]], <16 x i8> [[REG506]])
679 // CHECK-NEXT: [[REG508:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG507]] to <2 x i64>
680 // CHECK-NEXT: ret <2 x i64> [[REG508]]
681 
682 // CHECK: define available_externally <2 x i64> @_mm_sign_epi16(<2 x i64> [[REG509:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG510:[0-9a-zA-Z_%.]+]])
683 // CHECK: store <2 x i64> [[REG509]], <2 x i64>* [[REG511:[0-9a-zA-Z_%.]+]], align 16
684 // CHECK-NEXT: store <2 x i64> [[REG510]], <2 x i64>* [[REG512:[0-9a-zA-Z_%.]+]], align 16
685 // CHECK-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* [[REG513:[0-9a-zA-Z_%.]+]], align 16
686 // CHECK-NEXT: [[REG514:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG512]], align 16
687 // CHECK-NEXT: [[REG515:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG514]] to <8 x i16>
688 // CHECK-NEXT: [[REG516:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_cmplt(short vector[8], short vector[8])(<8 x i16> [[REG515]], <8 x i16> zeroinitializer)
689 // CHECK-NEXT: store <8 x i16> [[REG516]], <8 x i16>* [[REG517:[0-9a-zA-Z_%.]+]], align 16
690 // CHECK-NEXT: [[REG518:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG512]], align 16
691 // CHECK-NEXT: [[REG519:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG518]] to <8 x i16>
692 // CHECK-NEXT: [[REG520:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_cmpgt(short vector[8], short vector[8])(<8 x i16> [[REG519]], <8 x i16> zeroinitializer)
693 // CHECK-NEXT: [[REG521:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_neg(short vector[8])(<8 x i16> [[REG520]])
694 // CHECK-NEXT: store <8 x i16> [[REG521]], <8 x i16>* [[REG522:[0-9a-zA-Z_%.]+]], align 16
695 // CHECK-NEXT: [[REG523:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG517]], align 16
696 // CHECK-NEXT: [[REG524:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG522]], align 16
697 // CHECK-NEXT: [[REG525:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_add(short vector[8], short vector[8])(<8 x i16> [[REG523]], <8 x i16> [[REG524]])
698 // CHECK-NEXT: store <8 x i16> [[REG525]], <8 x i16>* [[REG526:[0-9a-zA-Z_%.]+]], align 16
699 // CHECK-NEXT: [[REG527:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG511]], align 16
700 // CHECK-NEXT: [[REG528:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG527]] to <8 x i16>
701 // CHECK-NEXT: [[REG529:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG526]], align 16
702 // CHECK-NEXT: [[REG530:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG528]], <8 x i16> [[REG529]])
703 // CHECK-NEXT: [[REG531:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG530]] to <2 x i64>
704 // CHECK-NEXT: ret <2 x i64> [[REG531]]
705 
706 // CHECK: define available_externally <2 x i64> @_mm_sign_epi32(<2 x i64> [[REG532:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG533:[0-9a-zA-Z_%.]+]])
707 // CHECK: store <2 x i64> [[REG532]], <2 x i64>* [[REG534:[0-9a-zA-Z_%.]+]], align 16
708 // CHECK-NEXT: store <2 x i64> [[REG533]], <2 x i64>* [[REG535:[0-9a-zA-Z_%.]+]], align 16
709 // CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG536:[0-9a-zA-Z_%.]+]], align 16
710 // CHECK-NEXT: [[REG537:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG535]], align 16
711 // CHECK-NEXT: [[REG538:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG537]] to <4 x i32>
712 // CHECK-NEXT: [[REG539:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmplt(int vector[4], int vector[4])(<4 x i32> [[REG538]], <4 x i32> zeroinitializer)
713 // CHECK-NEXT: store <4 x i32> [[REG539]], <4 x i32>* [[REG540:[0-9a-zA-Z_%.]+]], align 16
714 // CHECK-NEXT: [[REG541:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG535]], align 16
715 // CHECK-NEXT: [[REG542:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG541]] to <4 x i32>
716 // CHECK-NEXT: [[REG543:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(int vector[4], int vector[4])(<4 x i32> [[REG542]], <4 x i32> zeroinitializer)
717 // CHECK-NEXT: [[REG544:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_neg(int vector[4])(<4 x i32> [[REG543]])
718 // CHECK-NEXT: store <4 x i32> [[REG544]], <4 x i32>* [[REG545:[0-9a-zA-Z_%.]+]], align 16
719 // CHECK-NEXT: [[REG546:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG540]], align 16
720 // CHECK-NEXT: [[REG547:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG545]], align 16
721 // CHECK-NEXT: [[REG548:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG546]], <4 x i32> [[REG547]])
722 // CHECK-NEXT: store <4 x i32> [[REG548]], <4 x i32>* [[REG549:[0-9a-zA-Z_%.]+]], align 16
723 // CHECK-NEXT: [[REG550:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG534]], align 16
724 // CHECK-NEXT: [[REG551:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG550]] to <4 x i32>
725 // CHECK-NEXT: [[REG552:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG549]], align 16
726 // CHECK-NEXT: [[REG553:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG551]], <4 x i32> [[REG552]])
727 // CHECK-NEXT: [[REG554:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG553]] to <2 x i64>
728 // CHECK-NEXT: ret <2 x i64> [[REG554]]
729 
730 // CHECK: define available_externally i64 @_mm_sign_pi8(i64 [[REG555:[0-9a-zA-Z_%.]+]], i64 [[REG556:[0-9a-zA-Z_%.]+]])
731 // CHECK: store i64 [[REG555]], i64* [[REG557:[0-9a-zA-Z_%.]+]], align 8
732 // CHECK-NEXT: store i64 [[REG556]], i64* [[REG558:[0-9a-zA-Z_%.]+]], align 8
733 // CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* [[REG559:[0-9a-zA-Z_%.]+]], align 16
734 // CHECK-NEXT: [[REG560:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG557]], align 8
735 // CHECK-NEXT: [[REG561:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG560]], i32 0
736 // CHECK-NEXT: [[REG562:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG557]], align 8
737 // CHECK-NEXT: [[REG563:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG561]], i64 [[REG562]], i32 1
738 // CHECK-NEXT: store <2 x i64> [[REG563]], <2 x i64>* [[REG564:[0-9a-zA-Z_%.]+]], align 16
739 // CHECK-NEXT: [[REG565:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG564]], align 16
740 // CHECK-NEXT: [[REG566:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG565]] to <16 x i8>
741 // CHECK-NEXT: store <16 x i8> [[REG566]], <16 x i8>* [[REG567:[0-9a-zA-Z_%.]+]], align 16
742 // CHECK-NEXT: [[REG568:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG558]], align 8
743 // CHECK-NEXT: [[REG569:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG568]], i32 0
744 // CHECK-NEXT: [[REG570:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG558]], align 8
745 // CHECK-NEXT: [[REG571:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG569]], i64 [[REG570]], i32 1
746 // CHECK-NEXT: store <2 x i64> [[REG571]], <2 x i64>* [[REG572:[0-9a-zA-Z_%.]+]], align 16
747 // CHECK-NEXT: [[REG573:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG572]], align 16
748 // CHECK-NEXT: [[REG574:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG573]] to <16 x i8>
749 // CHECK-NEXT: store <16 x i8> [[REG574]], <16 x i8>* [[REG575:[0-9a-zA-Z_%.]+]], align 16
750 // CHECK-NEXT: [[REG576:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG567]], align 16
751 // CHECK-NEXT: [[REG577:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG576]] to <2 x i64>
752 // CHECK-NEXT: [[REG578:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG575]], align 16
753 // CHECK-NEXT: [[REG579:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG578]] to <2 x i64>
754 // CHECK-NEXT: [[REG580:[0-9a-zA-Z_%.]+]] = call <2 x i64> @_mm_sign_epi8(<2 x i64> [[REG577]], <2 x i64> [[REG579]])
755 // CHECK-NEXT: [[REG581:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG580]] to <16 x i8>
756 // CHECK-NEXT: store <16 x i8> [[REG581]], <16 x i8>* [[REG567]], align 16
757 // CHECK-NEXT: [[REG582:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* [[REG567]], align 16
758 // CHECK-NEXT: [[REG583:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG582]] to <2 x i64>
759 // CHECK-NEXT: [[REG584:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG583]], i32 0
760 // CHECK-NEXT: ret i64 [[REG584]]
761 
762 // CHECK: define available_externally i64 @_mm_sign_pi16(i64 [[REG585:[0-9a-zA-Z_%.]+]], i64 [[REG586:[0-9a-zA-Z_%.]+]])
763 // CHECK: store i64 [[REG585]], i64* [[REG587:[0-9a-zA-Z_%.]+]], align 8
764 // CHECK-NEXT: store i64 [[REG586]], i64* [[REG588:[0-9a-zA-Z_%.]+]], align 8
765 // CHECK-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* [[REG589:[0-9a-zA-Z_%.]+]], align 16
766 // CHECK-NEXT: [[REG590:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG587]], align 8
767 // CHECK-NEXT: [[REG591:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG590]], i32 0
768 // CHECK-NEXT: [[REG592:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG587]], align 8
769 // CHECK-NEXT: [[REG593:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG591]], i64 [[REG592]], i32 1
770 // CHECK-NEXT: store <2 x i64> [[REG593]], <2 x i64>* [[REG594:[0-9a-zA-Z_%.]+]], align 16
771 // CHECK-NEXT: [[REG595:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG594]], align 16
772 // CHECK-NEXT: [[REG596:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG595]] to <8 x i16>
773 // CHECK-NEXT: store <8 x i16> [[REG596]], <8 x i16>* [[REG597:[0-9a-zA-Z_%.]+]], align 16
774 // CHECK-NEXT: [[REG598:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG588]], align 8
775 // CHECK-NEXT: [[REG599:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG598]], i32 0
776 // CHECK-NEXT: [[REG600:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG588]], align 8
777 // CHECK-NEXT: [[REG601:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG599]], i64 [[REG600]], i32 1
778 // CHECK-NEXT: store <2 x i64> [[REG601]], <2 x i64>* [[REG602:[0-9a-zA-Z_%.]+]], align 16
779 // CHECK-NEXT: [[REG603:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG602]], align 16
780 // CHECK-NEXT: [[REG604:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG603]] to <8 x i16>
781 // CHECK-NEXT: store <8 x i16> [[REG604]], <8 x i16>* [[REG605:[0-9a-zA-Z_%.]+]], align 16
782 // CHECK-NEXT: [[REG606:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG597]], align 16
783 // CHECK-NEXT: [[REG607:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG606]] to <2 x i64>
784 // CHECK-NEXT: [[REG608:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG605]], align 16
785 // CHECK-NEXT: [[REG609:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG608]] to <2 x i64>
786 // CHECK-NEXT: [[REG610:[0-9a-zA-Z_%.]+]] = call <2 x i64> @_mm_sign_epi16(<2 x i64> [[REG607]], <2 x i64> [[REG609]])
787 // CHECK-NEXT: [[REG611:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG610]] to <8 x i16>
788 // CHECK-NEXT: store <8 x i16> [[REG611]], <8 x i16>* [[REG597]], align 16
789 // CHECK-NEXT: [[REG612:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG597]], align 16
790 // CHECK-NEXT: [[REG613:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG612]] to <2 x i64>
791 // CHECK-NEXT: [[REG614:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG613]], i32 0
792 // CHECK-NEXT: ret i64 [[REG614]]
793 
794 // CHECK: define available_externally i64 @_mm_sign_pi32(i64 [[REG615:[0-9a-zA-Z_%.]+]], i64 [[REG616:[0-9a-zA-Z_%.]+]])
795 // CHECK: store i64 [[REG615]], i64* [[REG617:[0-9a-zA-Z_%.]+]], align 8
796 // CHECK-NEXT: store i64 [[REG616]], i64* [[REG618:[0-9a-zA-Z_%.]+]], align 8
797 // CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[REG619:[0-9a-zA-Z_%.]+]], align 16
798 // CHECK-NEXT: [[REG620:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG617]], align 8
799 // CHECK-NEXT: [[REG621:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG620]], i32 0
800 // CHECK-NEXT: [[REG622:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG617]], align 8
801 // CHECK-NEXT: [[REG623:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG621]], i64 [[REG622]], i32 1
802 // CHECK-NEXT: store <2 x i64> [[REG623]], <2 x i64>* [[REG624:[0-9a-zA-Z_%.]+]], align 16
803 // CHECK-NEXT: [[REG625:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG624]], align 16
804 // CHECK-NEXT: [[REG626:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG625]] to <4 x i32>
805 // CHECK-NEXT: store <4 x i32> [[REG626]], <4 x i32>* [[REG627:[0-9a-zA-Z_%.]+]], align 16
806 // CHECK-NEXT: [[REG628:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG618]], align 8
807 // CHECK-NEXT: [[REG629:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG628]], i32 0
808 // CHECK-NEXT: [[REG630:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG618]], align 8
809 // CHECK-NEXT: [[REG631:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG629]], i64 [[REG630]], i32 1
810 // CHECK-NEXT: store <2 x i64> [[REG631]], <2 x i64>* [[REG632:[0-9a-zA-Z_%.]+]], align 16
811 // CHECK-NEXT: [[REG633:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG632]], align 16
812 // CHECK-NEXT: [[REG634:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG633]] to <4 x i32>
813 // CHECK-NEXT: store <4 x i32> [[REG634]], <4 x i32>* [[REG635:[0-9a-zA-Z_%.]+]], align 16
814 // CHECK-NEXT: [[REG636:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG627]], align 16
815 // CHECK-NEXT: [[REG637:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG636]] to <2 x i64>
816 // CHECK-NEXT: [[REG638:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG635]], align 16
817 // CHECK-NEXT: [[REG639:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG638]] to <2 x i64>
818 // CHECK-NEXT: [[REG640:[0-9a-zA-Z_%.]+]] = call <2 x i64> @_mm_sign_epi32(<2 x i64> [[REG637]], <2 x i64> [[REG639]])
819 // CHECK-NEXT: [[REG641:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG640]] to <4 x i32>
820 // CHECK-NEXT: store <4 x i32> [[REG641]], <4 x i32>* [[REG627]], align 16
821 // CHECK-NEXT: [[REG642:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG627]], align 16
822 // CHECK-NEXT: [[REG643:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG642]] to <2 x i64>
823 // CHECK-NEXT: [[REG644:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG643]], i32 0
824 // CHECK-NEXT: ret i64 [[REG644]]
825 
826 void __attribute__((noinline))
test_maddubs()827 test_maddubs() {
828   resi = _mm_maddubs_epi16(mi1, mi2);
829   res = _mm_maddubs_pi16(m1, m2);
830 }
831 
832 // CHECK-LABEL: @test_maddubs
833 
834 // CHECK: define available_externally <2 x i64> @_mm_maddubs_epi16(<2 x i64> [[REG645:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG646:[0-9a-zA-Z_%.]+]])
835 // CHECK: store <2 x i64> [[REG645]], <2 x i64>* [[REG647:[0-9a-zA-Z_%.]+]], align 16
836 // CHECK-NEXT: store <2 x i64> [[REG646]], <2 x i64>* [[REG648:[0-9a-zA-Z_%.]+]], align 16
837 // CHECK-NEXT: [[REG649:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_splats(short)(i16 signext 255)
838 // CHECK-NEXT: store <8 x i16> [[REG649]], <8 x i16>* [[REG650:[0-9a-zA-Z_%.]+]], align 16
839 // CHECK-NEXT: [[REG651:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG647]], align 16
840 // CHECK-NEXT: [[REG652:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG651]] to <16 x i8>
841 // CHECK-NEXT: [[REG653:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> [[REG652]])
842 // CHECK-NEXT: [[REG654:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG650]], align 16
843 // CHECK-NEXT: [[REG655:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_and(short vector[8], short vector[8])(<8 x i16> [[REG653]], <8 x i16> [[REG654]])
844 // CHECK-NEXT: store <8 x i16> [[REG655]], <8 x i16>* [[REG656:[0-9a-zA-Z_%.]+]], align 16
845 // CHECK-NEXT: [[REG657:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG647]], align 16
846 // CHECK-NEXT: [[REG658:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG657]] to <16 x i8>
847 // CHECK-NEXT: [[REG659:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG658]])
848 // CHECK-NEXT: [[REG660:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG650]], align 16
849 // CHECK-NEXT: [[REG661:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_and(short vector[8], short vector[8])(<8 x i16> [[REG659]], <8 x i16> [[REG660]])
850 // CHECK-NEXT: store <8 x i16> [[REG661]], <8 x i16>* [[REG662:[0-9a-zA-Z_%.]+]], align 16
851 // CHECK-NEXT: [[REG663:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG648]], align 16
852 // CHECK-NEXT: [[REG664:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG663]] to <16 x i8>
853 // CHECK-NEXT: [[REG76:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackh(signed char vector[16])(<16 x i8> [[REG664]])
854 // CHECK-NEXT: store <8 x i16> [[REG76]], <8 x i16>* [[REG665:[0-9a-zA-Z_%.]+]], align 16
855 // CHECK-NEXT: [[REG666:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG648]], align 16
856 // CHECK-NEXT: [[REG667:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG666]] to <16 x i8>
857 // CHECK-NEXT: [[REG668:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG667]])
858 // CHECK-NEXT: store <8 x i16> [[REG668]], <8 x i16>* [[REG669:[0-9a-zA-Z_%.]+]], align 16
859 // CHECK-NEXT: [[REG670:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG656]], align 16
860 // CHECK-NEXT: [[REG671:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG665]], align 16
861 // CHECK-NEXT: [[REG672:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG670]], <8 x i16> [[REG671]])
862 // CHECK-NEXT: store <8 x i16> [[REG672]], <8 x i16>* [[REG656]], align 16
863 // CHECK-NEXT: [[REG673:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG662]], align 16
864 // CHECK-NEXT: [[REG674:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG669]], align 16
865 // CHECK-NEXT: [[REG675:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG673]], <8 x i16> [[REG674]])
866 // CHECK-NEXT: store <8 x i16> [[REG675]], <8 x i16>* [[REG662]], align 16
867 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>, <16 x i8>* [[REG676:[0-9a-zA-Z_%.]+]], align 16
868 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>, <16 x i8>* [[REG677:[0-9a-zA-Z_%.]+]], align 16
869 // CHECK-NEXT: [[REG678:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG656]], align 16
870 // CHECK-NEXT: [[REG679:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG662]], align 16
871 // CHECK-NEXT: [[REG680:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG678]], <8 x i16> [[REG679]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>)
872 // CHECK-NEXT: store <8 x i16> [[REG680]], <8 x i16>* [[REG665]], align 16
873 // CHECK-NEXT: [[REG681:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG656]], align 16
874 // CHECK-NEXT: [[REG682:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG662]], align 16
875 // CHECK-NEXT: [[REG683:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG681]], <8 x i16> [[REG682]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>)
876 // CHECK-NEXT: store <8 x i16> [[REG683]], <8 x i16>* [[REG669]], align 16
877 // CHECK-NEXT: [[REG684:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG665]], align 16
878 // CHECK-NEXT: [[REG685:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG669]], align 16
879 // CHECK-NEXT: [[REG686:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_adds(short vector[8], short vector[8])(<8 x i16> [[REG684]], <8 x i16> [[REG685]])
880 // CHECK-NEXT: [[REG687:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG686]] to <2 x i64>
881 // CHECK-NEXT: ret <2 x i64> [[REG687]]
882 
883 // CHECK: define available_externally i64 @_mm_maddubs_pi16(i64 [[REG688:[0-9a-zA-Z_%.]+]], i64 [[REG689:[0-9a-zA-Z_%.]+]])
884 // CHECK: store i64 [[REG688]], i64* [[REG690:[0-9a-zA-Z_%.]+]], align 8
885 // CHECK-NEXT: store i64 [[REG689]], i64* [[REG691:[0-9a-zA-Z_%.]+]], align 8
886 // CHECK-NEXT: [[REG692:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG690]], align 8
887 // CHECK-NEXT: [[REG75:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG692]], i32 0
888 // CHECK-NEXT: [[REG693:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG690]], align 8
889 // CHECK-NEXT: [[REG694:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG75]], i64 [[REG693]], i32 1
890 // CHECK-NEXT: store <2 x i64> [[REG694]], <2 x i64>* [[REG80:[0-9a-zA-Z_%.]+]], align 16
891 // CHECK-NEXT: [[REG695:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG80]], align 16
892 // CHECK-NEXT: [[REG84:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG695]] to <8 x i16>
893 // CHECK-NEXT: store <8 x i16> [[REG84]], <8 x i16>* [[REG696:[0-9a-zA-Z_%.]+]], align 16
894 // CHECK-NEXT: [[REG697:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16
895 // CHECK-NEXT: [[REG698:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG697]] to <16 x i8>
896 // CHECK-NEXT: [[REG699:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG698]])
897 // CHECK-NEXT: store <8 x i16> [[REG699]], <8 x i16>* [[REG696]], align 16
898 // CHECK-NEXT: [[REG700:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_splats(short)(i16 signext 255)
899 // CHECK-NEXT: store <8 x i16> [[REG700]], <8 x i16>* [[REG701:[0-9a-zA-Z_%.]+]], align 16
900 // CHECK-NEXT: [[REG702:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16
901 // CHECK-NEXT: [[REG703:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG701]], align 16
902 // CHECK-NEXT: [[REG704:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_and(short vector[8], short vector[8])(<8 x i16> [[REG702]], <8 x i16> [[REG703]])
903 // CHECK-NEXT: store <8 x i16> [[REG704]], <8 x i16>* [[REG696]], align 16
904 // CHECK-NEXT: [[REG705:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG691]], align 8
905 // CHECK-NEXT: [[REG706:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG705]], i32 0
906 // CHECK-NEXT: [[REG707:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG691]], align 8
907 // CHECK-NEXT: [[REG708:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG706]], i64 [[REG707]], i32 1
908 // CHECK-NEXT: store <2 x i64> [[REG708]], <2 x i64>* [[REG709:[0-9a-zA-Z_%.]+]], align 16
909 // CHECK-NEXT: [[REG710:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG709]], align 16
910 // CHECK-NEXT: [[REG711:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG710]] to <8 x i16>
911 // CHECK-NEXT: store <8 x i16> [[REG711]], <8 x i16>* [[REG712:[0-9a-zA-Z_%.]+]], align 16
912 // CHECK-NEXT: [[REG713:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
913 // CHECK-NEXT: [[REG714:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG713]] to <16 x i8>
914 // CHECK-NEXT: [[REG715:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_unpackl(signed char vector[16])(<16 x i8> [[REG714]])
915 // CHECK-NEXT: store <8 x i16> [[REG715]], <8 x i16>* [[REG712]], align 16
916 // CHECK-NEXT: [[REG716:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16
917 // CHECK-NEXT: [[REG717:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
918 // CHECK-NEXT: [[REG718:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mul(short vector[8], short vector[8])(<8 x i16> [[REG716]], <8 x i16> [[REG717]])
919 // CHECK-NEXT: store <8 x i16> [[REG718]], <8 x i16>* [[REG712]], align 16
920 // CHECK-NEXT: store <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>, <16 x i8>* [[REG719:[0-9a-zA-Z_%.]+]], align 16
921 // CHECK-NEXT: store <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>, <16 x i8>* [[REG720:[0-9a-zA-Z_%.]+]], align 16
922 // CHECK-NEXT: [[REG721:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
923 // CHECK-NEXT: [[REG722:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
924 // CHECK-NEXT: [[REG723:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG721]], <8 x i16> [[REG722]], <16 x i8> <i8 0, i8 1, i8 4, i8 5, i8 8, i8 9, i8 12, i8 13, i8 16, i8 17, i8 20, i8 21, i8 24, i8 25, i8 28, i8 29>)
925 // CHECK-NEXT: store <8 x i16> [[REG723]], <8 x i16>* [[REG696]], align 16
926 // CHECK-NEXT: [[REG724:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
927 // CHECK-NEXT: [[REG725:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
928 // CHECK-NEXT: [[REG726:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_perm(short vector[8], short vector[8], unsigned char vector[16])(<8 x i16> [[REG724]], <8 x i16> [[REG725]], <16 x i8> <i8 2, i8 3, i8 6, i8 7, i8 10, i8 11, i8 14, i8 15, i8 18, i8 19, i8 22, i8 23, i8 26, i8 27, i8 30, i8 31>)
929 // CHECK-NEXT: store <8 x i16> [[REG726]], <8 x i16>* [[REG712]], align 16
930 // CHECK-NEXT: [[REG727:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16
931 // CHECK-NEXT: [[REG728:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG712]], align 16
932 // CHECK-NEXT: [[REG729:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_adds(short vector[8], short vector[8])(<8 x i16> [[REG727]], <8 x i16> [[REG728]])
933 // CHECK-NEXT: store <8 x i16> [[REG729]], <8 x i16>* [[REG696]], align 16
934 // CHECK-NEXT: [[REG730:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG696]], align 16
935 // CHECK-NEXT: [[REG731:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG730]] to <2 x i64>
936 // CHECK-NEXT: [[REG732:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG731]], i32 0
937 // CHECK-NEXT: ret i64 [[REG732]]
938 
939 void __attribute__((noinline))
test_mulhrs()940 test_mulhrs() {
941   resi = _mm_mulhrs_epi16(mi1, mi2);
942   res = _mm_mulhrs_pi16(m1, m2);
943 }
944 
945 // CHECK-LABEL: @test_mulhrs
946 
947 // CHECK: define available_externally <2 x i64> @_mm_mulhrs_epi16(<2 x i64> [[REG733:[0-9a-zA-Z_%.]+]], <2 x i64> [[REG734:[0-9a-zA-Z_%.]+]])
948 // CHECK: store <2 x i64> [[REG733]], <2 x i64>* [[REG735:[0-9a-zA-Z_%.]+]], align 16
949 // CHECK-NEXT: store <2 x i64> [[REG734]], <2 x i64>* [[REG736:[0-9a-zA-Z_%.]+]], align 16
950 // CHECK-NEXT: [[REG737:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG735]], align 16
951 // CHECK-NEXT: [[REG738:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG737]] to <8 x i16>
952 // CHECK-NEXT: [[REG739:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG738]])
953 // CHECK-NEXT: store <4 x i32> [[REG739]], <4 x i32>* [[REG740:[0-9a-zA-Z_%.]+]], align 16
954 // CHECK-NEXT: [[REG741:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG736]], align 16
955 // CHECK-NEXT: [[REG742:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG741]] to <8 x i16>
956 // CHECK-NEXT: [[REG743:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG742]])
957 // CHECK-NEXT: store <4 x i32> [[REG743]], <4 x i32>* [[REG744:[0-9a-zA-Z_%.]+]], align 16
958 // CHECK-NEXT: [[REG745:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16
959 // CHECK-NEXT: [[REG746:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16
960 // CHECK-NEXT: [[REG747:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG745]], <4 x i32> [[REG746]])
961 // CHECK-NEXT: store <4 x i32> [[REG747]], <4 x i32>* [[REG740]], align 16
962 // CHECK-NEXT: [[REG748:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG735]], align 16
963 // CHECK-NEXT: [[REG749:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG748]] to <8 x i16>
964 // CHECK-NEXT: [[REG750:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackl(short vector[8])(<8 x i16> [[REG749]])
965 // CHECK-NEXT: store <4 x i32> [[REG750]], <4 x i32>* [[REG744]], align 16
966 // CHECK-NEXT: [[REG751:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG736]], align 16
967 // CHECK-NEXT: [[REG752:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG751]] to <8 x i16>
968 // CHECK-NEXT: [[REG753:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackl(short vector[8])(<8 x i16> [[REG752]])
969 // CHECK-NEXT: store <4 x i32> [[REG753]], <4 x i32>* [[REG754:[0-9a-zA-Z_%.]+]], align 16
970 // CHECK-NEXT: [[REG755:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16
971 // CHECK-NEXT: [[REG756:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG754]], align 16
972 // CHECK-NEXT: [[REG757:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG755]], <4 x i32> [[REG756]])
973 // CHECK-NEXT: store <4 x i32> [[REG757]], <4 x i32>* [[REG744]], align 16
974 // CHECK-NEXT: [[REG758:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(unsigned int)(i32 zeroext 14)
975 // CHECK-NEXT: store <4 x i32> [[REG758]], <4 x i32>* [[REG759:[0-9a-zA-Z_%.]+]], align 16
976 // CHECK-NEXT: [[REG760:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16
977 // CHECK-NEXT: [[REG761:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG759]], align 16
978 // CHECK-NEXT: [[REG762:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG760]], <4 x i32> [[REG761]])
979 // CHECK-NEXT: store <4 x i32> [[REG762]], <4 x i32>* [[REG740]], align 16
980 // CHECK-NEXT: [[REG763:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16
981 // CHECK-NEXT: [[REG764:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG759]], align 16
982 // CHECK-NEXT: [[REG765:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG763]], <4 x i32> [[REG764]])
983 // CHECK-NEXT: store <4 x i32> [[REG765]], <4 x i32>* [[REG744]], align 16
984 // CHECK-NEXT: [[REG766:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(int)(i32 signext 1)
985 // CHECK-NEXT: store <4 x i32> [[REG766]], <4 x i32>* [[REG767:[0-9a-zA-Z_%.]+]], align 16
986 // CHECK-NEXT: [[REG768:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16
987 // CHECK-NEXT: [[REG769:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16
988 // CHECK-NEXT: [[REG770:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG768]], <4 x i32> [[REG769]])
989 // CHECK-NEXT: store <4 x i32> [[REG770]], <4 x i32>* [[REG740]], align 16
990 // CHECK-NEXT: [[REG771:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16
991 // CHECK-NEXT: [[REG772:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16
992 // CHECK-NEXT: [[REG773:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG771]], <4 x i32> [[REG772]])
993 // CHECK-NEXT: store <4 x i32> [[REG773]], <4 x i32>* [[REG740]], align 16
994 // CHECK-NEXT: [[REG774:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16
995 // CHECK-NEXT: [[REG775:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16
996 // CHECK-NEXT: [[REG776:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG774]], <4 x i32> [[REG775]])
997 // CHECK-NEXT: store <4 x i32> [[REG776]], <4 x i32>* [[REG744]], align 16
998 // CHECK-NEXT: [[REG777:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16
999 // CHECK-NEXT: [[REG778:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG767]], align 16
1000 // CHECK-NEXT: [[REG779:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG777]], <4 x i32> [[REG778]])
1001 // CHECK-NEXT: store <4 x i32> [[REG779]], <4 x i32>* [[REG744]], align 16
1002 // CHECK-NEXT: [[REG780:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG740]], align 16
1003 // CHECK-NEXT: [[REG781:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG744]], align 16
1004 // CHECK-NEXT: [[REG782:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_pack(int vector[4], int vector[4])(<4 x i32> [[REG780]], <4 x i32> [[REG781]])
1005 // CHECK-NEXT: [[REG783:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG782]] to <2 x i64>
1006 // CHECK-NEXT: ret <2 x i64> [[REG783]]
1007 
1008 // CHECK: define available_externally i64 @_mm_mulhrs_pi16(i64 [[REG784:[0-9a-zA-Z_%.]+]], i64 [[REG785:[0-9a-zA-Z_%.]+]])
1009 // CHECK: store i64 [[REG784]], i64* [[REG786:[0-9a-zA-Z_%.]+]], align 8
1010 // CHECK-NEXT: store i64 [[REG785]], i64* [[REG787:[0-9a-zA-Z_%.]+]], align 8
1011 // CHECK-NEXT: [[REG788:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG786]], align 8
1012 // CHECK-NEXT: [[REG789:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG788]], i32 0
1013 // CHECK-NEXT: [[REG790:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG786]], align 8
1014 // CHECK-NEXT: [[REG791:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG789]], i64 [[REG790]], i32 1
1015 // CHECK-NEXT: store <2 x i64> [[REG791]], <2 x i64>* [[REG792:[0-9a-zA-Z_%.]+]], align 16
1016 // CHECK-NEXT: [[REG793:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG792]], align 16
1017 // CHECK-NEXT: [[REG794:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG793]] to <4 x i32>
1018 // CHECK-NEXT: store <4 x i32> [[REG794]], <4 x i32>* [[REG795:[0-9a-zA-Z_%.]+]], align 16
1019 // CHECK-NEXT: [[REG796:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
1020 // CHECK-NEXT: [[REG797:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG796]] to <8 x i16>
1021 // CHECK-NEXT: [[REG798:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG797]])
1022 // CHECK-NEXT: store <4 x i32> [[REG798]], <4 x i32>* [[REG795]], align 16
1023 // CHECK-NEXT: [[REG799:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG787]], align 8
1024 // CHECK-NEXT: [[REG800:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG799]], i32 0
1025 // CHECK-NEXT: [[REG801:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG787]], align 8
1026 // CHECK-NEXT: [[REG802:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG800]], i64 [[REG801]], i32 1
1027 // CHECK-NEXT: store <2 x i64> [[REG802]], <2 x i64>* [[REG803:[0-9a-zA-Z_%.]+]], align 16
1028 // CHECK-NEXT: [[REG804:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG803]], align 16
1029 // CHECK-NEXT: [[REG805:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG804]] to <4 x i32>
1030 // CHECK-NEXT: store <4 x i32> [[REG805]], <4 x i32>* [[REG806:[0-9a-zA-Z_%.]+]], align 16
1031 // CHECK-NEXT: [[REG807:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG806]], align 16
1032 // CHECK-NEXT: [[REG808:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG807]] to <8 x i16>
1033 // CHECK-NEXT: [[REG809:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_unpackh(short vector[8])(<8 x i16> [[REG808]])
1034 // CHECK-NEXT: store <4 x i32> [[REG809]], <4 x i32>* [[REG806]], align 16
1035 // CHECK-NEXT: [[REG810:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
1036 // CHECK-NEXT: [[REG811:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG806]], align 16
1037 // CHECK-NEXT: [[REG812:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_mul(int vector[4], int vector[4])(<4 x i32> [[REG810]], <4 x i32> [[REG811]])
1038 // CHECK-NEXT: store <4 x i32> [[REG812]], <4 x i32>* [[REG795]], align 16
1039 // CHECK-NEXT: [[REG813:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(unsigned int)(i32 zeroext 14)
1040 // CHECK-NEXT: store <4 x i32> [[REG813]], <4 x i32>* [[REG814:[0-9a-zA-Z_%.]+]], align 16
1041 // CHECK-NEXT: [[REG815:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
1042 // CHECK-NEXT: [[REG816:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG814]], align 16
1043 // CHECK-NEXT: [[REG817:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG815]], <4 x i32> [[REG816]])
1044 // CHECK-NEXT: store <4 x i32> [[REG817]], <4 x i32>* [[REG795]], align 16
1045 // CHECK-NEXT: [[REG818:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_splats(int)(i32 signext 1)
1046 // CHECK-NEXT: store <4 x i32> [[REG818]], <4 x i32>* [[REG819:[0-9a-zA-Z_%.]+]], align 16
1047 // CHECK-NEXT: [[REG820:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
1048 // CHECK-NEXT: [[REG821:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG819]], align 16
1049 // CHECK-NEXT: [[REG822:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_add(int vector[4], int vector[4])(<4 x i32> [[REG820]], <4 x i32> [[REG821]])
1050 // CHECK-NEXT: store <4 x i32> [[REG822]], <4 x i32>* [[REG795]], align 16
1051 // CHECK-NEXT: [[REG823:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
1052 // CHECK-NEXT: [[REG824:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG819]], align 16
1053 // CHECK-NEXT: [[REG825:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sr(int vector[4], unsigned int vector[4])(<4 x i32> [[REG823]], <4 x i32> [[REG824]])
1054 // CHECK-NEXT: store <4 x i32> [[REG825]], <4 x i32>* [[REG795]], align 16
1055 // CHECK-NEXT: [[REG826:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG795]], align 16
1056 // CHECK-NEXT: [[REG827:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG806]], align 16
1057 // CHECK-NEXT: [[REG828:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_pack(int vector[4], int vector[4])(<4 x i32> [[REG826]], <4 x i32> [[REG827]])
1058 // CHECK-NEXT: store <8 x i16> [[REG828]], <8 x i16>* [[REG829:[0-9a-zA-Z_%.]+]], align 16
1059 // CHECK-NEXT: [[REG830:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* [[REG829]], align 16
1060 // CHECK-NEXT: [[REG831:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG830]] to <2 x i64>
1061 // CHECK-NEXT: [[REG832:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG831]], i32 0
1062 // CHECK-NEXT: ret i64 [[REG832]]
1063