; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE41
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@src64 = common global [8 x double] zeroinitializer, align 64
@dst64 = common global [8 x double] zeroinitializer, align 64
@src32 = common global [16 x float] zeroinitializer, align 64
@dst32 = common global [16 x float] zeroinitializer, align 64

declare double @llvm.ceil.f64(double %p)
declare double @llvm.floor.f64(double %p)
declare double @llvm.nearbyint.f64(double %p)
declare double @llvm.rint.f64(double %p)
declare double @llvm.trunc.f64(double %p)

declare float @llvm.ceil.f32(float %p)
declare float @llvm.floor.f32(float %p)
declare float @llvm.nearbyint.f32(float %p)
declare float @llvm.rint.f32(float %p)
declare float @llvm.trunc.f32(float %p)
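; Each function below is a hand-unrolled loop of scalar rounding calls on the
; @src64/@dst64 buffers; as an illustrative sketch (not part of the test), the
; 2-element ceil case corresponds roughly to:
;
;   for (int i = 0; i != 2; ++i)
;     dst64[i] = ceil(src64[i]);
;
; The CHECK lines expect the SLP vectorizer to merge these scalar calls into
; <2 x double>/<4 x double>/<8 x double> rounding intrinsics wherever the
; subtarget has a packed rounding instruction (SSE4.1 and later, AVX, AVX-512),
; while plain SSE2 is expected to keep the calls scalar.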
define void @ceil_2f64() #0 {
; SSE2-LABEL: @ceil_2f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
; SSE2-NEXT:    store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @ceil_2f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @ceil_2f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ceil0 = call double @llvm.ceil.f64(double %ld0)
  %ceil1 = call double @llvm.ceil.f64(double %ld1)
  store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  ret void
}
define void @ceil_4f64() #0 {
; SSE2-LABEL: @ceil_4f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
; SSE2-NEXT:    [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]])
; SSE2-NEXT:    [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
; SSE2-NEXT:    store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @ceil_4f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @ceil_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %ceil0 = call double @llvm.ceil.f64(double %ld0)
  %ceil1 = call double @llvm.ceil.f64(double %ld1)
  %ceil2 = call double @llvm.ceil.f64(double %ld2)
  %ceil3 = call double @llvm.ceil.f64(double %ld3)
  store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %ceil2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %ceil3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  ret void
}
define void @ceil_8f64() #0 {
; SSE2-LABEL: @ceil_8f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
; SSE2-NEXT:    [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
; SSE2-NEXT:    [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
; SSE2-NEXT:    [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]])
; SSE2-NEXT:    [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
; SSE2-NEXT:    [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]])
; SSE2-NEXT:    [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]])
; SSE2-NEXT:    [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]])
; SSE2-NEXT:    [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]])
; SSE2-NEXT:    store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
; SSE2-NEXT:    store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE2-NEXT:    store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
; SSE2-NEXT:    store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @ceil_8f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]])
; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]])
; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX1-LABEL: @ceil_8f64(
; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @ceil_8f64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @ceil_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
; AVX512-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
  %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
  %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
  %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
  %ceil0 = call double @llvm.ceil.f64(double %ld0)
  %ceil1 = call double @llvm.ceil.f64(double %ld1)
  %ceil2 = call double @llvm.ceil.f64(double %ld2)
  %ceil3 = call double @llvm.ceil.f64(double %ld3)
  %ceil4 = call double @llvm.ceil.f64(double %ld4)
  %ceil5 = call double @llvm.ceil.f64(double %ld5)
  %ceil6 = call double @llvm.ceil.f64(double %ld6)
  %ceil7 = call double @llvm.ceil.f64(double %ld7)
  store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %ceil2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %ceil3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  store double %ceil4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
  store double %ceil5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
  store double %ceil6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
  store double %ceil7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
  ret void
}
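; The floor, nearbyint, rint and trunc functions below repeat the same
; unrolled pattern; their CHECK lines mirror the ceil tests above for each
; subtarget configuration.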
define void @floor_2f64() #0 {
; SSE2-LABEL: @floor_2f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
; SSE2-NEXT:    store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @floor_2f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @floor_2f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %floor0 = call double @llvm.floor.f64(double %ld0)
  %floor1 = call double @llvm.floor.f64(double %ld1)
  store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  ret void
}
define void @floor_4f64() #0 {
; SSE2-LABEL: @floor_4f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
; SSE2-NEXT:    [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]])
; SSE2-NEXT:    [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
; SSE2-NEXT:    store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @floor_4f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @floor_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %floor0 = call double @llvm.floor.f64(double %ld0)
  %floor1 = call double @llvm.floor.f64(double %ld1)
  %floor2 = call double @llvm.floor.f64(double %ld2)
  %floor3 = call double @llvm.floor.f64(double %ld3)
  store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %floor2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %floor3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  ret void
}
define void @floor_8f64() #0 {
; SSE2-LABEL: @floor_8f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
; SSE2-NEXT:    [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
; SSE2-NEXT:    [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
; SSE2-NEXT:    [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]])
; SSE2-NEXT:    [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
; SSE2-NEXT:    [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]])
; SSE2-NEXT:    [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]])
; SSE2-NEXT:    [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]])
; SSE2-NEXT:    [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]])
; SSE2-NEXT:    store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
; SSE2-NEXT:    store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE2-NEXT:    store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
; SSE2-NEXT:    store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @floor_8f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]])
; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]])
; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX1-LABEL: @floor_8f64(
; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @floor_8f64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @floor_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
; AVX512-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
  %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
  %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
  %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
  %floor0 = call double @llvm.floor.f64(double %ld0)
  %floor1 = call double @llvm.floor.f64(double %ld1)
  %floor2 = call double @llvm.floor.f64(double %ld2)
  %floor3 = call double @llvm.floor.f64(double %ld3)
  %floor4 = call double @llvm.floor.f64(double %ld4)
  %floor5 = call double @llvm.floor.f64(double %ld5)
  %floor6 = call double @llvm.floor.f64(double %ld6)
  %floor7 = call double @llvm.floor.f64(double %ld7)
  store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %floor2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %floor3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  store double %floor4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
  store double %floor5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
  store double %floor6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
  store double %floor7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
  ret void
}
define void @nearbyint_2f64() #0 {
; SSE2-LABEL: @nearbyint_2f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
; SSE2-NEXT:    store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @nearbyint_2f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @nearbyint_2f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0)
  %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1)
  store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  ret void
}
define void @nearbyint_4f64() #0 {
; SSE2-LABEL: @nearbyint_4f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]])
; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
; SSE2-NEXT:    store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @nearbyint_4f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @nearbyint_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0)
  %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1)
  %nearbyint2 = call double @llvm.nearbyint.f64(double %ld2)
  %nearbyint3 = call double @llvm.nearbyint.f64(double %ld3)
  store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %nearbyint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %nearbyint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  ret void
}
define void @nearbyint_8f64() #0 {
; SSE2-LABEL: @nearbyint_8f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
; SSE2-NEXT:    [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
; SSE2-NEXT:    [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
; SSE2-NEXT:    [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]])
; SSE2-NEXT:    [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
; SSE2-NEXT:    [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]])
; SSE2-NEXT:    [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]])
; SSE2-NEXT:    [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]])
; SSE2-NEXT:    [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]])
; SSE2-NEXT:    store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
; SSE2-NEXT:    store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE2-NEXT:    store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
; SSE2-NEXT:    store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @nearbyint_8f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]])
; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]])
; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX1-LABEL: @nearbyint_8f64(
; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @nearbyint_8f64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @nearbyint_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
; AVX512-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
  %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
  %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
  %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
  %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0)
  %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1)
  %nearbyint2 = call double @llvm.nearbyint.f64(double %ld2)
  %nearbyint3 = call double @llvm.nearbyint.f64(double %ld3)
  %nearbyint4 = call double @llvm.nearbyint.f64(double %ld4)
  %nearbyint5 = call double @llvm.nearbyint.f64(double %ld5)
  %nearbyint6 = call double @llvm.nearbyint.f64(double %ld6)
  %nearbyint7 = call double @llvm.nearbyint.f64(double %ld7)
  store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %nearbyint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %nearbyint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  store double %nearbyint4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
  store double %nearbyint5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
  store double %nearbyint6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
  store double %nearbyint7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
  ret void
}

define void @rint_2f64() #0 {
; SSE2-LABEL: @rint_2f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
; SSE2-NEXT:    store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @rint_2f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @rint_2f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
; AVX-NEXT:    store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %rint0 = call double @llvm.rint.f64(double %ld0)
  %rint1 = call double @llvm.rint.f64(double %ld1)
  store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  ret void
}
define void @rint_4f64() #0 {
; SSE2-LABEL: @rint_4f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
; SSE2-NEXT:    [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]])
; SSE2-NEXT:    [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
; SSE2-NEXT:    store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @rint_4f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX-LABEL: @rint_4f64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
; AVX-NEXT:    store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %rint0 = call double @llvm.rint.f64(double %ld0)
  %rint1 = call double @llvm.rint.f64(double %ld1)
  %rint2 = call double @llvm.rint.f64(double %ld2)
  %rint3 = call double @llvm.rint.f64(double %ld3)
  store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
  store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
  store double %rint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
  store double %rint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
  ret void
}
define void @rint_8f64() #0 {
; SSE2-LABEL: @rint_8f64(
; SSE2-NEXT:    [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
; SSE2-NEXT:    [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
; SSE2-NEXT:    [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
; SSE2-NEXT:    [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
; SSE2-NEXT:    [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
; SSE2-NEXT:    [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
; SSE2-NEXT:    [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
; SSE2-NEXT:    [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
; SSE2-NEXT:    [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
; SSE2-NEXT:    [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
; SSE2-NEXT:    [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]])
; SSE2-NEXT:    [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
; SSE2-NEXT:    [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]])
; SSE2-NEXT:    [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]])
; SSE2-NEXT:    [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]])
; SSE2-NEXT:    [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]])
; SSE2-NEXT:    store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
; SSE2-NEXT:    store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
; SSE2-NEXT:    store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
; SSE2-NEXT:    store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
; SSE2-NEXT:    store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
; SSE2-NEXT:    store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
; SSE2-NEXT:    store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
; SSE2-NEXT:    store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
; SSE2-NEXT:    ret void
;
; SSE41-LABEL: @rint_8f64(
; SSE41-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
; SSE41-NEXT:    [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
; SSE41-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])
; SSE41-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]])
; SSE41-NEXT:    store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
; SSE41-NEXT:    store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
; SSE41-NEXT:    ret void
;
; AVX1-LABEL: @rint_8f64(
; AVX1-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
; AVX1-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
; AVX1-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX1-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @rint_8f64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
; AVX2-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
; AVX2-NEXT:    store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
; AVX2-NEXT:    store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @rint_8f64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
; AVX512-NEXT:    store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
; AVX512-NEXT:    ret void
;
  %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
  %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
  %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
  %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
  %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
  %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
  %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
  %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
@llvm.rint.f64(double %ld0) 693 %rint1 = call double @llvm.rint.f64(double %ld1) 694 %rint2 = call double @llvm.rint.f64(double %ld2) 695 %rint3 = call double @llvm.rint.f64(double %ld3) 696 %rint4 = call double @llvm.rint.f64(double %ld4) 697 %rint5 = call double @llvm.rint.f64(double %ld5) 698 %rint6 = call double @llvm.rint.f64(double %ld6) 699 %rint7 = call double @llvm.rint.f64(double %ld7) 700 store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 701 store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 702 store double %rint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 703 store double %rint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 704 store double %rint4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 705 store double %rint5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 706 store double %rint6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 707 store double %rint7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 708 ret void 709} 710 711define void @trunc_2f64() #0 { 712; SSE2-LABEL: @trunc_2f64( 713; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 714; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 715; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) 716; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) 717; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 718; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 719; SSE2-NEXT: ret void 720; 721; SSE41-LABEL: @trunc_2f64( 722; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 723; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 724; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 725; SSE41-NEXT: ret void 726; 727; AVX-LABEL: @trunc_2f64( 728; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 729; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 730; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 731; AVX-NEXT: ret void 732; 733 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 734 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 735 %trunc0 = call double @llvm.trunc.f64(double %ld0) 736 %trunc1 = call double @llvm.trunc.f64(double %ld1) 737 store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 738 store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 739 ret void 740} 741 742define void @trunc_4f64() #0 { 743; SSE2-LABEL: 
@trunc_4f64( 744; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 745; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 746; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 747; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 748; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) 749; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) 750; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) 751; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) 752; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 753; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 754; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 755; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 756; SSE2-NEXT: ret void 757; 758; SSE41-LABEL: @trunc_4f64( 759; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 760; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 761; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 762; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) 763; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 764; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 765; SSE41-NEXT: ret void 766; 767; AVX-LABEL: @trunc_4f64( 768; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 769; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) 770; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 771; AVX-NEXT: ret void 772; 773 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 774 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 775 %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 776 %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 777 %trunc0 = call double @llvm.trunc.f64(double %ld0) 778 %trunc1 = call double @llvm.trunc.f64(double %ld1) 779 %trunc2 = call double @llvm.trunc.f64(double %ld2) 780 %trunc3 = call double @llvm.trunc.f64(double %ld3) 781 store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 782 store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 
783 store double %trunc2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 784 store double %trunc3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 785 ret void 786} 787 788define void @trunc_8f64() #0 { 789; SSE2-LABEL: @trunc_8f64( 790; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 791; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 792; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 793; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 794; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 795; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 796; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 797; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 798; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]]) 799; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]]) 800; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]]) 801; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]]) 802; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]]) 803; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]]) 804; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]]) 805; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]]) 806; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8 807; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8 808; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8 809; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8 810; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8 811; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8 812; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8 813; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8 814; SSE2-NEXT: ret void 815; 816; SSE41-LABEL: @trunc_8f64( 817; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 818; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 819; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x 
double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 820; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 821; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) 822; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) 823; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) 824; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]]) 825; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 826; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 827; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 828; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 829; SSE41-NEXT: ret void 830; 831; AVX1-LABEL: @trunc_8f64( 832; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 833; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 834; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) 835; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) 836; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 837; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 838; AVX1-NEXT: ret void 839; 840; AVX2-LABEL: @trunc_8f64( 841; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 842; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 843; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) 844; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) 845; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 846; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 847; AVX2-NEXT: ret void 848; 849; AVX512-LABEL: @trunc_8f64( 850; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 851; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]]) 852; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8 853; AVX512-NEXT: ret void 854; 855 %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 856 %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* 
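; The f32 tests below mirror the f64 ones above: with SSE4.1 (ROUNDPS) or the wider
; AVX/AVX512 units the scalar llvm.ceil.f32 calls are expected to be merged into
; llvm.ceil.v4f32/v8f32/v16f32, while plain SSE2 keeps them scalar.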
define void @ceil_4f32() #0 {
; SSE2-LABEL: @ceil_4f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @ceil_4f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @ceil_4f32(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %ceil0 = call float @llvm.ceil.f32(float %ld0)
  %ceil1 = call float @llvm.ceil.f32(float %ld1)
  %ceil2 = call float @llvm.ceil.f32(float %ld2)
  %ceil3 = call float @llvm.ceil.f32(float %ld3)
  store float %ceil0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %ceil1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %ceil2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %ceil3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  ret void
}

define void @ceil_8f32() #0 {
; SSE2-LABEL: @ceil_8f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]])
; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]])
; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]])
; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]])
; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @ceil_8f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @ceil_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
  %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
  %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
  %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
  %ceil0 = call float @llvm.ceil.f32(float %ld0)
  %ceil1 = call float @llvm.ceil.f32(float %ld1)
  %ceil2 = call float @llvm.ceil.f32(float %ld2)
  %ceil3 = call float @llvm.ceil.f32(float %ld3)
  %ceil4 = call float @llvm.ceil.f32(float %ld4)
  %ceil5 = call float @llvm.ceil.f32(float %ld5)
  %ceil6 = call float @llvm.ceil.f32(float %ld6)
  %ceil7 = call float @llvm.ceil.f32(float %ld7)
  store float %ceil0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %ceil1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %ceil2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %ceil3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  store float %ceil4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
  store float %ceil5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
  store float %ceil6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
  store float %ceil7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
  ret void
}

define void @ceil_16f32() #0 {
; SSE2-LABEL: @ceil_16f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]])
; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]])
; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]])
; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]])
; SSE2-NEXT: [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[LD8]])
; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]])
; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]])
; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]])
; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]])
; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]])
; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]])
; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]])
; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @ceil_16f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]])
; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX1-LABEL: @ceil_16f32(
; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @ceil_16f32(
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @ceil_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]])
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; AVX512-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
  %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
  %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
  %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
  %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
  %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
  %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
  %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
  %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
  %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
  %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
  %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
  %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
  %ceil0 = call float @llvm.ceil.f32(float %ld0 )
  %ceil1 = call float @llvm.ceil.f32(float %ld1 )
  %ceil2 = call float @llvm.ceil.f32(float %ld2 )
  %ceil3 = call float @llvm.ceil.f32(float %ld3 )
  %ceil4 = call float @llvm.ceil.f32(float %ld4 )
  %ceil5 = call float @llvm.ceil.f32(float %ld5 )
  %ceil6 = call float @llvm.ceil.f32(float %ld6 )
  %ceil7 = call float @llvm.ceil.f32(float %ld7 )
  %ceil8 = call float @llvm.ceil.f32(float %ld8 )
  %ceil9 = call float @llvm.ceil.f32(float %ld9 )
  %ceil10 = call float @llvm.ceil.f32(float %ld10)
  %ceil11 = call float @llvm.ceil.f32(float %ld11)
  %ceil12 = call float @llvm.ceil.f32(float %ld12)
  %ceil13 = call float @llvm.ceil.f32(float %ld13)
  %ceil14 = call float @llvm.ceil.f32(float %ld14)
  %ceil15 = call float @llvm.ceil.f32(float %ld15)
  store float %ceil0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
  store float %ceil1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
  store float %ceil2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
  store float %ceil3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
  store float %ceil4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
  store float %ceil5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
  store float %ceil6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
  store float %ceil7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
  store float %ceil8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
  store float %ceil9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
  store float %ceil10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
  store float %ceil11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
  store float %ceil12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
  store float %ceil13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
  store float %ceil14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
  store float %ceil15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
  ret void
}

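; Same expectations for llvm.floor.* (and llvm.nearbyint.* further down):
; vectorized to v4f32/v8f32/v16f32 on SSE4.1 and newer, scalarized on SSE2.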
define void @floor_4f32() #0 {
; SSE2-LABEL: @floor_4f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @floor_4f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @floor_4f32(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %floor0 = call float @llvm.floor.f32(float %ld0)
  %floor1 = call float @llvm.floor.f32(float %ld1)
  %floor2 = call float @llvm.floor.f32(float %ld2)
  %floor3 = call float @llvm.floor.f32(float %ld3)
  store float %floor0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %floor1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %floor2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %floor3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  ret void
}

define void @floor_8f32() #0 {
; SSE2-LABEL: @floor_8f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]])
; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]])
; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]])
; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]])
; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @floor_8f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @floor_8f32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
  %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
  %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
  %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
  %floor0 = call float @llvm.floor.f32(float %ld0)
  %floor1 = call float @llvm.floor.f32(float %ld1)
  %floor2 = call float @llvm.floor.f32(float %ld2)
  %floor3 = call float @llvm.floor.f32(float %ld3)
  %floor4 = call float @llvm.floor.f32(float %ld4)
  %floor5 = call float @llvm.floor.f32(float %ld5)
  %floor6 = call float @llvm.floor.f32(float %ld6)
  %floor7 = call float @llvm.floor.f32(float %ld7)
  store float %floor0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %floor1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %floor2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %floor3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  store float %floor4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
  store float %floor5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
  store float %floor6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
  store float %floor7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
  ret void
}

define void @floor_16f32() #0 {
; SSE2-LABEL: @floor_16f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]])
; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]])
; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]])
; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]])
; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]])
; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]])
; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]])
; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]])
; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]])
; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]])
; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]])
; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]])
; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @floor_16f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]])
; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]])
; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX1-LABEL: @floor_16f32(
; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX1-NEXT: ret void
;
; AVX2-LABEL: @floor_16f32(
; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
; AVX2-NEXT: ret void
;
; AVX512-LABEL: @floor_16f32(
; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]])
; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
; AVX512-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
  %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
  %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
  %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
  %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
  %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
  %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
  %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
  %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
  %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
  %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
  %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
  %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
  %floor0 = call float @llvm.floor.f32(float %ld0 )
  %floor1 = call float @llvm.floor.f32(float %ld1 )
  %floor2 = call float @llvm.floor.f32(float %ld2 )
  %floor3 = call float @llvm.floor.f32(float %ld3 )
  %floor4 = call float @llvm.floor.f32(float %ld4 )
  %floor5 = call float @llvm.floor.f32(float %ld5 )
  %floor6 = call float @llvm.floor.f32(float %ld6 )
  %floor7 = call float @llvm.floor.f32(float %ld7 )
  %floor8 = call float @llvm.floor.f32(float %ld8 )
  %floor9 = call float @llvm.floor.f32(float %ld9 )
  %floor10 = call float @llvm.floor.f32(float %ld10)
  %floor11 = call float @llvm.floor.f32(float %ld11)
  %floor12 = call float @llvm.floor.f32(float %ld12)
  %floor13 = call float @llvm.floor.f32(float %ld13)
  %floor14 = call float @llvm.floor.f32(float %ld14)
  %floor15 = call float @llvm.floor.f32(float %ld15)
  store float %floor0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
  store float %floor1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
  store float %floor2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
  store float %floor3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
  store float %floor4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
  store float %floor5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
  store float %floor6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
  store float %floor7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
  store float %floor8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
  store float %floor9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
  store float %floor10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
  store float %floor11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
  store float %floor12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
  store float %floor13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
  store float %floor14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
  store float %floor15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
  ret void
}

define void @nearbyint_4f32() #0 {
; SSE2-LABEL: @nearbyint_4f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @nearbyint_4f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @nearbyint_4f32(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0)
  %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1)
  %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2)
  %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3)
  store float %nearbyint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %nearbyint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %nearbyint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %nearbyint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  ret void
}

define void @nearbyint_8f32() #0 {
; SSE2-LABEL: @nearbyint_8f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
;
SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1441; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1442; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1443; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1444; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1445; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) 1446; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) 1447; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) 1448; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) 1449; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) 1450; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) 1451; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) 1452; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) 1453; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1454; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1455; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1456; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1457; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1458; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1459; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1460; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1461; SSE2-NEXT: ret void 1462; 1463; SSE41-LABEL: @nearbyint_8f32( 1464; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1465; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1466; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) 1467; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) 1468; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1469; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1470; SSE41-NEXT: ret void 1471; 1472; AVX-LABEL: @nearbyint_8f32( 1473; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1474; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) 1475; AVX-NEXT: 
store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1476; AVX-NEXT: ret void 1477; 1478 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1479 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1480 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1481 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1482 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1483 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1484 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1485 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1486 %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0) 1487 %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1) 1488 %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2) 1489 %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3) 1490 %nearbyint4 = call float @llvm.nearbyint.f32(float %ld4) 1491 %nearbyint5 = call float @llvm.nearbyint.f32(float %ld5) 1492 %nearbyint6 = call float @llvm.nearbyint.f32(float %ld6) 1493 %nearbyint7 = call float @llvm.nearbyint.f32(float %ld7) 1494 store float %nearbyint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1495 store float %nearbyint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1496 store float %nearbyint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1497 store float %nearbyint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1498 store float %nearbyint4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1499 store float %nearbyint5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1500 store float %nearbyint6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1501 store float %nearbyint7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1502 ret void 1503} 1504 1505define void @nearbyint_16f32() #0 { 1506; SSE2-LABEL: @nearbyint_16f32( 1507; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1508; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1509; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1510; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1511; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1512; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1513; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1514; SSE2-NEXT: 
[[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1515; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 1516; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 1517; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1518; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1519; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1520; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1521; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1522; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1523; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]]) 1524; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]]) 1525; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]]) 1526; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]]) 1527; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]]) 1528; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]]) 1529; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]]) 1530; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]]) 1531; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]]) 1532; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]]) 1533; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]]) 1534; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]]) 1535; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]]) 1536; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]]) 1537; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]]) 1538; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]]) 1539; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1540; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1541; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1542; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1543; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1544; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1545; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1546; SSE2-NEXT: store float [[NEARBYINT7]], float* 
getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1547; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 1548; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 1549; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1550; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1551; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1552; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1553; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 1554; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1555; SSE2-NEXT: ret void 1556; 1557; SSE41-LABEL: @nearbyint_16f32( 1558; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1559; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1560; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 1561; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 1562; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) 1563; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) 1564; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) 1565; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]]) 1566; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1567; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1568; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 1569; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 1570; SSE41-NEXT: ret void 1571; 1572; AVX1-LABEL: @nearbyint_16f32( 1573; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1574; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1575; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) 1576; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) 1577; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x 
float]* @dst32 to <8 x float>*), align 4 1578; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1579; AVX1-NEXT: ret void 1580; 1581; AVX2-LABEL: @nearbyint_16f32( 1582; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1583; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1584; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) 1585; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) 1586; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1587; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1588; AVX2-NEXT: ret void 1589; 1590; AVX512-LABEL: @nearbyint_16f32( 1591; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 1592; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]]) 1593; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 1594; AVX512-NEXT: ret void 1595; 1596 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 1597 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 1598 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 1599 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 1600 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 1601 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 1602 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 1603 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 1604 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 1605 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 1606 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1607 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1608 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1609 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1610 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1611 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1612 %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0 ) 1613 %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1 ) 1614 %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2 ) 1615 %nearbyint3 = 
call float @llvm.nearbyint.f32(float %ld3 )
  %nearbyint4 = call float @llvm.nearbyint.f32(float %ld4 )
  %nearbyint5 = call float @llvm.nearbyint.f32(float %ld5 )
  %nearbyint6 = call float @llvm.nearbyint.f32(float %ld6 )
  %nearbyint7 = call float @llvm.nearbyint.f32(float %ld7 )
  %nearbyint8 = call float @llvm.nearbyint.f32(float %ld8 )
  %nearbyint9 = call float @llvm.nearbyint.f32(float %ld9 )
  %nearbyint10 = call float @llvm.nearbyint.f32(float %ld10)
  %nearbyint11 = call float @llvm.nearbyint.f32(float %ld11)
  %nearbyint12 = call float @llvm.nearbyint.f32(float %ld12)
  %nearbyint13 = call float @llvm.nearbyint.f32(float %ld13)
  %nearbyint14 = call float @llvm.nearbyint.f32(float %ld14)
  %nearbyint15 = call float @llvm.nearbyint.f32(float %ld15)
  store float %nearbyint0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
  store float %nearbyint1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
  store float %nearbyint2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
  store float %nearbyint3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
  store float %nearbyint4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
  store float %nearbyint5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
  store float %nearbyint6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
  store float %nearbyint7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
  store float %nearbyint8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
  store float %nearbyint9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
  store float %nearbyint10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
  store float %nearbyint11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
  store float %nearbyint12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
  store float %nearbyint13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
  store float %nearbyint14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
  store float %nearbyint15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
  ret void
}

; 4 x f32 rint: SSE41/AVX should vectorize the four scalar llvm.rint.f32 calls into a
; single llvm.rint.v4f32; SSE2 lacks the SSE4.1 rounding instructions, so the calls are
; expected to stay scalar.
define void @rint_4f32() #0 {
; SSE2-LABEL: @rint_4f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @rint_4f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @rint_4f32(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %rint0 = call float @llvm.rint.f32(float %ld0)
  %rint1 = call float @llvm.rint.f32(float %ld1)
  %rint2 = call float @llvm.rint.f32(float %ld2)
  %rint3 = call float @llvm.rint.f32(float %ld3)
  store float %rint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %rint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %rint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %rint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  ret void
}

; 8 x f32 rint: SSE41 should split the block into two llvm.rint.v4f32 calls, AVX should
; use a single llvm.rint.v8f32 call, and SSE2 is expected to keep the scalar calls.
define void @rint_8f32() #0 {
; SSE2-LABEL: @rint_8f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x
float]* @src32, i32 0, i64 6), align 4 1699; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1700; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) 1701; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) 1702; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) 1703; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) 1704; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) 1705; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) 1706; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) 1707; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) 1708; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1709; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1710; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1711; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1712; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1713; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1714; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1715; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1716; SSE2-NEXT: ret void 1717; 1718; SSE41-LABEL: @rint_8f32( 1719; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1720; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1721; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) 1722; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) 1723; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1724; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1725; SSE41-NEXT: ret void 1726; 1727; AVX-LABEL: @rint_8f32( 1728; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1729; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) 1730; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1731; AVX-NEXT: ret void 1732; 1733 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1734 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1735 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1736 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1737 %ld4 = load float, float* getelementptr 
inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1738 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1739 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1740 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1741 %rint0 = call float @llvm.rint.f32(float %ld0) 1742 %rint1 = call float @llvm.rint.f32(float %ld1) 1743 %rint2 = call float @llvm.rint.f32(float %ld2) 1744 %rint3 = call float @llvm.rint.f32(float %ld3) 1745 %rint4 = call float @llvm.rint.f32(float %ld4) 1746 %rint5 = call float @llvm.rint.f32(float %ld5) 1747 %rint6 = call float @llvm.rint.f32(float %ld6) 1748 %rint7 = call float @llvm.rint.f32(float %ld7) 1749 store float %rint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1750 store float %rint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1751 store float %rint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1752 store float %rint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1753 store float %rint4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1754 store float %rint5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1755 store float %rint6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1756 store float %rint7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1757 ret void 1758} 1759 1760define void @rint_16f32() #0 { 1761; SSE2-LABEL: @rint_16f32( 1762; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1763; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1764; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1765; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1766; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1767; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1768; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1769; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1770; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 1771; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 1772; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1773; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1774; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 
0, i64 12), align 4 1775; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1776; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1777; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1778; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]]) 1779; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]]) 1780; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]]) 1781; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]]) 1782; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]]) 1783; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]]) 1784; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]]) 1785; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]]) 1786; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]]) 1787; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]]) 1788; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]]) 1789; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]]) 1790; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]]) 1791; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]]) 1792; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]]) 1793; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]]) 1794; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 1795; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 1796; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 1797; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 1798; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1799; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1800; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1801; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1802; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 1803; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 1804; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 1805; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 1806; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 1807; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 1808; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* 
@dst32, i32 0, i64 14), align 4 1809; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 1810; SSE2-NEXT: ret void 1811; 1812; SSE41-LABEL: @rint_16f32( 1813; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1814; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1815; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 1816; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 1817; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) 1818; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) 1819; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) 1820; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]]) 1821; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1822; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1823; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 1824; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 1825; SSE41-NEXT: ret void 1826; 1827; AVX1-LABEL: @rint_16f32( 1828; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1829; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1830; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) 1831; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) 1832; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1833; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1834; AVX1-NEXT: ret void 1835; 1836; AVX2-LABEL: @rint_16f32( 1837; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1838; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 1839; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) 1840; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) 1841; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1842; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 1843; AVX2-NEXT: 
ret void 1844; 1845; AVX512-LABEL: @rint_16f32( 1846; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 1847; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]]) 1848; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 1849; AVX512-NEXT: ret void 1850; 1851 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 1852 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 1853 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 1854 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 1855 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 1856 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 1857 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 1858 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 1859 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 1860 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 1861 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 1862 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 1863 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 1864 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 1865 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 1866 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 1867 %rint0 = call float @llvm.rint.f32(float %ld0 ) 1868 %rint1 = call float @llvm.rint.f32(float %ld1 ) 1869 %rint2 = call float @llvm.rint.f32(float %ld2 ) 1870 %rint3 = call float @llvm.rint.f32(float %ld3 ) 1871 %rint4 = call float @llvm.rint.f32(float %ld4 ) 1872 %rint5 = call float @llvm.rint.f32(float %ld5 ) 1873 %rint6 = call float @llvm.rint.f32(float %ld6 ) 1874 %rint7 = call float @llvm.rint.f32(float %ld7 ) 1875 %rint8 = call float @llvm.rint.f32(float %ld8 ) 1876 %rint9 = call float @llvm.rint.f32(float %ld9 ) 1877 %rint10 = call float @llvm.rint.f32(float %ld10) 1878 %rint11 = call float @llvm.rint.f32(float %ld11) 1879 %rint12 = call float @llvm.rint.f32(float %ld12) 1880 %rint13 = call float @llvm.rint.f32(float %ld13) 1881 %rint14 = call float @llvm.rint.f32(float %ld14) 1882 %rint15 = call float @llvm.rint.f32(float %ld15) 1883 store float %rint0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 1884 store float %rint1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 1885 store float %rint2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 1886 store float %rint3 , float* getelementptr inbounds ([16 x float], [16 x float]* 
@dst32, i32 0, i64 3 ), align 4
  store float %rint4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
  store float %rint5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
  store float %rint6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
  store float %rint7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
  store float %rint8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
  store float %rint9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
  store float %rint10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
  store float %rint11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
  store float %rint12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
  store float %rint13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
  store float %rint14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
  store float %rint15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
  ret void
}

define void @trunc_4f32() #0 {
; SSE2-LABEL: @trunc_4f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
; SSE2-NEXT: ret void
;
; SSE41-LABEL: @trunc_4f32(
; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; SSE41-NEXT: ret void
;
; AVX-LABEL: @trunc_4f32(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
; AVX-NEXT: ret void
;
  %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
  %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
  %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
  %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
  %trunc0 = call float @llvm.trunc.f32(float %ld0)
  %trunc1 = call float @llvm.trunc.f32(float %ld1)
  %trunc2 = call float @llvm.trunc.f32(float %ld2)
  %trunc3 = call float @llvm.trunc.f32(float %ld3)
  store float %trunc0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
  store float %trunc1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
  store float %trunc2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
  store float %trunc3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
  ret void
}

define void @trunc_8f32() #0 {
; SSE2-LABEL: @trunc_8f32(
; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]])
; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]])
; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]])
; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]])
; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3),
align 4 1967; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 1968; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 1969; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 1970; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 1971; SSE2-NEXT: ret void 1972; 1973; SSE41-LABEL: @trunc_8f32( 1974; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 1975; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 1976; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) 1977; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) 1978; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 1979; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 1980; SSE41-NEXT: ret void 1981; 1982; AVX-LABEL: @trunc_8f32( 1983; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 1984; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) 1985; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 1986; AVX-NEXT: ret void 1987; 1988 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 1989 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 1990 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 1991 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 1992 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 1993 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 1994 %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 1995 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 1996 %trunc0 = call float @llvm.trunc.f32(float %ld0) 1997 %trunc1 = call float @llvm.trunc.f32(float %ld1) 1998 %trunc2 = call float @llvm.trunc.f32(float %ld2) 1999 %trunc3 = call float @llvm.trunc.f32(float %ld3) 2000 %trunc4 = call float @llvm.trunc.f32(float %ld4) 2001 %trunc5 = call float @llvm.trunc.f32(float %ld5) 2002 %trunc6 = call float @llvm.trunc.f32(float %ld6) 2003 %trunc7 = call float @llvm.trunc.f32(float %ld7) 2004 store float %trunc0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 2005 store float %trunc1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 2006 store float %trunc2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 2007 store float %trunc3, float* getelementptr inbounds ([16 x float], [16 x 
float]* @dst32, i32 0, i64 3), align 4 2008 store float %trunc4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 2009 store float %trunc5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 2010 store float %trunc6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 2011 store float %trunc7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 2012 ret void 2013} 2014 2015define void @trunc_16f32() #0 { 2016; SSE2-LABEL: @trunc_16f32( 2017; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 2018; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 2019; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 2020; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 2021; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 2022; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 2023; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 2024; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 2025; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4 2026; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4 2027; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 2028; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 2029; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 2030; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 2031; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 2032; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 2033; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]]) 2034; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]]) 2035; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]]) 2036; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]]) 2037; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]]) 2038; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]]) 2039; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]]) 2040; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]]) 2041; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]]) 2042; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]]) 2043; SSE2-NEXT: [[TRUNC10:%.*]] = call float 
@llvm.trunc.f32(float [[LD10]]) 2044; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]]) 2045; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]]) 2046; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]]) 2047; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]]) 2048; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]]) 2049; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4 2050; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 2051; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4 2052; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4 2053; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4 2054; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4 2055; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4 2056; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4 2057; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4 2058; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4 2059; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 2060; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 2061; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 2062; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 2063; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 2064; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 2065; SSE2-NEXT: ret void 2066; 2067; SSE41-LABEL: @trunc_16f32( 2068; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 2069; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 2070; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 2071; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 2072; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) 2073; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) 2074; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) 2075; SSE41-NEXT: 
[[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]]) 2076; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 2077; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 2078; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 2079; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 2080; SSE41-NEXT: ret void 2081; 2082; AVX1-LABEL: @trunc_16f32( 2083; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 2084; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 2085; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) 2086; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) 2087; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 2088; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 2089; AVX1-NEXT: ret void 2090; 2091; AVX2-LABEL: @trunc_16f32( 2092; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 2093; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 2094; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) 2095; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) 2096; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 2097; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 2098; AVX2-NEXT: ret void 2099; 2100; AVX512-LABEL: @trunc_16f32( 2101; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4 2102; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]]) 2103; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4 2104; AVX512-NEXT: ret void 2105; 2106 %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4 2107 %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4 2108 %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4 2109 %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4 2110 %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4 2111 %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4 2112 %ld6 = load float, float* getelementptr inbounds 
([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4 2113 %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4 2114 %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4 2115 %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4 2116 %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4 2117 %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4 2118 %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4 2119 %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4 2120 %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4 2121 %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4 2122 %trunc0 = call float @llvm.trunc.f32(float %ld0 ) 2123 %trunc1 = call float @llvm.trunc.f32(float %ld1 ) 2124 %trunc2 = call float @llvm.trunc.f32(float %ld2 ) 2125 %trunc3 = call float @llvm.trunc.f32(float %ld3 ) 2126 %trunc4 = call float @llvm.trunc.f32(float %ld4 ) 2127 %trunc5 = call float @llvm.trunc.f32(float %ld5 ) 2128 %trunc6 = call float @llvm.trunc.f32(float %ld6 ) 2129 %trunc7 = call float @llvm.trunc.f32(float %ld7 ) 2130 %trunc8 = call float @llvm.trunc.f32(float %ld8 ) 2131 %trunc9 = call float @llvm.trunc.f32(float %ld9 ) 2132 %trunc10 = call float @llvm.trunc.f32(float %ld10) 2133 %trunc11 = call float @llvm.trunc.f32(float %ld11) 2134 %trunc12 = call float @llvm.trunc.f32(float %ld12) 2135 %trunc13 = call float @llvm.trunc.f32(float %ld13) 2136 %trunc14 = call float @llvm.trunc.f32(float %ld14) 2137 %trunc15 = call float @llvm.trunc.f32(float %ld15) 2138 store float %trunc0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4 2139 store float %trunc1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4 2140 store float %trunc2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4 2141 store float %trunc3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4 2142 store float %trunc4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4 2143 store float %trunc5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4 2144 store float %trunc6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4 2145 store float %trunc7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4 2146 store float %trunc8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4 2147 store float %trunc9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4 2148 store float %trunc10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4 2149 store float %trunc11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4 2150 store float %trunc12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4 2151 store float %trunc13, 
float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4 2152 store float %trunc14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4 2153 store float %trunc15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4 2154 ret void 2155} 2156 2157attributes #0 = { nounwind } 2158 2159