; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW

; NOTE: We're testing with loads because ABI lowering creates a concat_vectors that extract_vector_elt creation can see through.
; This would require the combine to recreate the concat_vectors.
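;
; Each test below builds the pmaddubsw pattern by hand: split A and B into even and
; odd bytes, sign-extend one input and zero-extend the other, multiply, add the even
; and odd products, clamp the sums to [-32768, 32767], and truncate to i16. That is
; the multiply of unsigned by signed bytes with pairwise saturating add that
; pmaddubsw performs.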
define <8 x i16> @pmaddubsw_128(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_128:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_128:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

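; Same pattern at 256-bit width. The SSE and AVX1 lowerings split the operation into
; two 128-bit pmaddubsw instructions; AVX2 and the AVX-512 configurations use a single
; 256-bit vpmaddubsw.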
define <16 x i16> @pmaddubsw_256(<32 x i8>* %Aptr, <32 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_256:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa 16(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw 16(%rdi), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_256:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX256-LABEL: pmaddubsw_256:
; AVX256:       # %bb.0:
; AVX256-NEXT:    vmovdqa (%rsi), %ymm0
; AVX256-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX256-NEXT:    retq
  %A = load <32 x i8>, <32 x i8>* %Aptr
  %B = load <32 x i8>, <32 x i8>* %Bptr
  %A_even = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %A_odd = shufflevector <32 x i8> %A, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %B_even = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %B_odd = shufflevector <32 x i8> %B, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  %A_even_ext = sext <16 x i8> %A_even to <16 x i32>
  %B_even_ext = zext <16 x i8> %B_even to <16 x i32>
  %A_odd_ext = sext <16 x i8> %A_odd to <16 x i32>
  %B_odd_ext = zext <16 x i8> %B_odd to <16 x i32>
  %even_mul = mul <16 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <16 x i32> %A_odd_ext, %B_odd_ext
  %add = add <16 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <16 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <16 x i1> %cmp_max, <16 x i32> %add, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <16 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <16 x i1> %cmp_min, <16 x i32> %max, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <16 x i32> %min to <16 x i16>
  ret <16 x i16> %trunc
}

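; 512-bit version. SSE uses eight 128-bit pmaddubsw instructions and returns the
; result indirectly through the hidden pointer in %rdi, AVX1 assembles four 256-bit
; results from 128-bit halves, AVX2 and AVX-512 without BWI use four 256-bit
; vpmaddubsw, and AVX-512 with BWI uses two 512-bit vpmaddubsw.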
define <64 x i16> @pmaddubsw_512(<128 x i8>* %Aptr, <128 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_512:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa 112(%rdx), %xmm0
; SSE-NEXT:    movdqa 96(%rdx), %xmm1
; SSE-NEXT:    movdqa 80(%rdx), %xmm2
; SSE-NEXT:    movdqa 64(%rdx), %xmm3
; SSE-NEXT:    movdqa (%rdx), %xmm4
; SSE-NEXT:    movdqa 16(%rdx), %xmm5
; SSE-NEXT:    movdqa 32(%rdx), %xmm6
; SSE-NEXT:    movdqa 48(%rdx), %xmm7
; SSE-NEXT:    pmaddubsw (%rsi), %xmm4
; SSE-NEXT:    pmaddubsw 16(%rsi), %xmm5
; SSE-NEXT:    pmaddubsw 32(%rsi), %xmm6
; SSE-NEXT:    pmaddubsw 48(%rsi), %xmm7
; SSE-NEXT:    pmaddubsw 64(%rsi), %xmm3
; SSE-NEXT:    pmaddubsw 80(%rsi), %xmm2
; SSE-NEXT:    pmaddubsw 96(%rsi), %xmm1
; SSE-NEXT:    pmaddubsw 112(%rsi), %xmm0
; SSE-NEXT:    movdqa %xmm0, 112(%rdi)
; SSE-NEXT:    movdqa %xmm1, 96(%rdi)
; SSE-NEXT:    movdqa %xmm2, 80(%rdi)
; SSE-NEXT:    movdqa %xmm3, 64(%rdi)
; SSE-NEXT:    movdqa %xmm7, 48(%rdi)
; SSE-NEXT:    movdqa %xmm6, 32(%rdi)
; SSE-NEXT:    movdqa %xmm5, 16(%rdi)
; SSE-NEXT:    movdqa %xmm4, (%rdi)
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_512:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT:    vmovdqa 64(%rdi), %ymm2
; AVX1-NEXT:    vmovdqa 96(%rdi), %ymm8
; AVX1-NEXT:    vmovdqa (%rsi), %ymm4
; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm5
; AVX1-NEXT:    vmovdqa 64(%rsi), %ymm6
; AVX1-NEXT:    vmovdqa 96(%rsi), %ymm9
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm7
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm0, %xmm4, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm4
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm8, %xmm9, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_512:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %ymm0
; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX2-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX2-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX2-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX2-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX2-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX2-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pmaddubsw_512:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT:    vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT:    vmovdqa 96(%rsi), %ymm3
; AVX512F-NEXT:    vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT:    vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT:    vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX512F-NEXT:    vpmaddubsw 96(%rdi), %ymm3, %ymm3
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pmaddubsw_512:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT:    vmovdqa64 64(%rsi), %zmm1
; AVX512BW-NEXT:    vpmaddubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmaddubsw 64(%rdi), %zmm1, %zmm1
; AVX512BW-NEXT:    retq
  %A = load <128 x i8>, <128 x i8>* %Aptr
  %B = load <128 x i8>, <128 x i8>* %Bptr
  %A_even = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %A_odd = shufflevector <128 x i8> %A, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %B_even = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %B_odd = shufflevector <128 x i8> %B, <128 x i8> undef, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  %A_even_ext = sext <64 x i8> %A_even to <64 x i32>
  %B_even_ext = zext <64 x i8> %B_even to <64 x i32>
  %A_odd_ext = sext <64 x i8> %A_odd to <64 x i32>
  %B_odd_ext = zext <64 x i8> %B_odd to <64 x i32>
  %even_mul = mul <64 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <64 x i32> %A_odd_ext, %B_odd_ext
  %add = add <64 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <64 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <64 x i1> %cmp_max, <64 x i32> %add, <64 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <64 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <64 x i1> %cmp_min, <64 x i32> %max, <64 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <64 x i32> %min to <64 x i16>
  ret <64 x i16> %trunc
}

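; The even/odd shuffles below use a permuted set of indices, but A and B are shuffled
; with the same indices, so each product still pairs a byte of A with the matching
; byte of B and the pattern can still be matched to pmaddubsw.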
define <8 x i16> @pmaddubsw_swapped_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_indices:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;same indices as A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;same indices as A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

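; Here A is zero-extended and B is sign-extended, the reverse of the tests above, so
; pmaddubsw is still formed but with the operands swapped (note the swapped use of
; (%rdi) and (%rsi) compared to pmaddubsw_128).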
define <8 x i16> @pmaddubsw_swapped_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_swapped_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    pmaddubsw (%rsi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_swapped_extend:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rsi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = zext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = sext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

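; The even multiply has its operands commuted relative to the odd multiply; since mul
; is commutative, the pattern should still be recognized as pmaddubsw.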
define <8 x i16> @pmaddubsw_commuted_mul(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_commuted_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    pmaddubsw (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: pmaddubsw_commuted_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rsi), %xmm0
; AVX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %B_even_ext, %A_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

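; Negative test: the even pair sign-extends A and zero-extends B while the odd pair
; does the opposite, so the signed/unsigned roles are inconsistent, no pmaddubsw can
; be formed, and the generic widened multiply/add/saturate lowering is emitted.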
define <8 x i16> @pmaddubsw_bad_extend(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_extend:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    psllw $8, %xmm3
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_extend:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT:    vpmulld %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_bad_extend:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX2-NEXT:    vpmulld %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: pmaddubsw_bad_extend:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm3
; AVX512-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm2
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; AVX512-NEXT:    vpmulld %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = zext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = sext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}

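; Negative test: A and B are shuffled with different index sets, so the products no
; longer pair matching bytes of A and B and no pmaddubsw is formed.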
define <8 x i16> @pmaddubsw_bad_indices(<16 x i8>* %Aptr, <16 x i8>* %Bptr) {
; SSE-LABEL: pmaddubsw_bad_indices:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    movdqa (%rsi), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm0, %xmm2
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,1,u,2,u,5,u,6,u,9,u,10,u,13,u,14]
; SSE-NEXT:    psraw $8, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pmulhw %xmm2, %xmm4
; SSE-NEXT:    pmullw %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm2
; SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u,0,u,3,u,4,u,7,u,8,u,11,u,12,u,15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm4
; SSE-NEXT:    pmulhw %xmm0, %xmm4
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: pmaddubsw_bad_indices:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[9,10,13,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm0[1,2,5,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[8,10,12,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm1[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm0[8,11,12,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm1[9,11,13,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pmaddubsw_bad_indices:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa (%rsi), %xmm1
; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: pmaddubsw_bad_indices:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[1,2,5,6,9,10,13,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovsxbd %xmm2, %ymm2
; AVX512-NEXT:    vpshufb {{.*#+}} xmm3 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX512-NEXT:    vpmulld %ymm3, %ymm2, %ymm2
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,3,4,7,8,11,12,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %A = load <16 x i8>, <16 x i8>* %Aptr
  %B = load <16 x i8>, <16 x i8>* %Bptr
  %A_even = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> ;indices aren't all even
  %A_odd = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 7, i32 8, i32 11, i32 12, i32 15> ;indices aren't all odd
  %B_even = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> ;different than A
  %B_odd = shufflevector <16 x i8> %B, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ;different than A
  %A_even_ext = sext <8 x i8> %A_even to <8 x i32>
  %B_even_ext = zext <8 x i8> %B_even to <8 x i32>
  %A_odd_ext = sext <8 x i8> %A_odd to <8 x i32>
  %B_odd_ext = zext <8 x i8> %B_odd to <8 x i32>
  %even_mul = mul <8 x i32> %A_even_ext, %B_even_ext
  %odd_mul = mul <8 x i32> %A_odd_ext, %B_odd_ext
  %add = add <8 x i32> %even_mul, %odd_mul
  %cmp_max = icmp sgt <8 x i32> %add, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %max = select <8 x i1> %cmp_max, <8 x i32> %add, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
  %cmp_min = icmp slt <8 x i32> %max, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %min = select <8 x i1> %cmp_min, <8 x i32> %max, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %trunc = trunc <8 x i32> %min to <8 x i16>
  ret <8 x i16> %trunc
}