; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-code \ ; RUN: -disable-output < %s | \ ; RUN: FileCheck -check-prefix=CODE %s ; RUN: opt %loadPolly -polly-codegen-ppcg -S < %s | \ ; RUN: FileCheck %s -check-prefix=IR ; ; REQUIRES: pollyacc ; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (16) * sizeof(float), cudaMemcpyHostToDevice)); ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (8) * sizeof(float), cudaMemcpyHostToDevice)); ; CODE: dim3 k0_dimBlock(8); ; CODE-NEXT: dim3 k0_dimGrid(1); ; CODE-NEXT: kernel0 <<>> (dev_MemRef_A); ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } ; CODE: { ; CODE-NEXT: dim3 k1_dimBlock(8); ; CODE-NEXT: dim3 k1_dimGrid(1); ; CODE-NEXT: kernel1 <<>> (dev_MemRef_B); ; CODE-NEXT: cudaCheckKernel(); ; CODE-NEXT: } ; CODE: cudaCheckReturn(cudaMemcpy(MemRef_B, dev_MemRef_B, (16) * sizeof(float), cudaMemcpyDeviceToHost)); ; CODE-NEXT: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (8) * sizeof(float), cudaMemcpyDeviceToHost)); ; CODE: # kernel0 ; CODE-NEXT: Stmt_bb11(t0); ; CODE: # kernel1 ; CODE-NEXT: Stmt_bb3(t0); ; IR: %p_dev_array_MemRef_B = call i8* @polly_allocateMemoryForDevice(i64 32) ; IR-NEXT: %p_dev_array_MemRef_A = call i8* @polly_allocateMemoryForDevice(i64 32) ; IR-NEXT: [[REG0:%.+]] = getelementptr float, float* %B, i64 8 ; IR-NEXT: [[REG1:%.+]] = bitcast float* [[REG0]] to i8* ; IR-NEXT: call void @polly_copyFromHostToDevice(i8* [[REG1]], i8* %p_dev_array_MemRef_B, i64 32) ; IR: [[REGA:%.+]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B) ; IR-NEXT: [[REGB:%.+]] = bitcast i8* [[REGA]] to float* ; IR-NEXT: [[REGC:%.+]] = getelementptr float, float* [[REGB]], i64 -8 ; IR-NEXT: [[REGD:%.+]] = bitcast float* [[REGC]] to i8* ; void foo(float A[], float B[]) { ; for (long i = 0; i < 8; i++) ; B[i + 8] *= 4; ; ; for (long i = 0; i < 8; i++) ; A[i] *= 12; ; } ; ; #ifdef OUTPUT ; int main() { ; float A[16]; ; ; for (long i = 0; i < 16; i++) { ; __sync_synchronize(); ; A[i] = i; ; } ; ; foo(A, A); ; ; float sum = 0; ; for (long i = 0; i < 16; i++) { ; __sync_synchronize(); ; sum += A[i]; ; } ; ; printf("%f\n", sum); ; } ; #endif ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @foo(float* %A, float* %B) { bb: br label %bb2 bb2: ; preds = %bb7, %bb %i.0 = phi i64 [ 0, %bb ], [ %tmp8, %bb7 ] %exitcond1 = icmp ne i64 %i.0, 8 br i1 %exitcond1, label %bb3, label %bb9 bb3: ; preds = %bb2 %tmp = add nuw nsw i64 %i.0, 8 %tmp4 = getelementptr inbounds float, float* %B, i64 %tmp %tmp5 = load float, float* %tmp4, align 4 %tmp6 = fmul float %tmp5, 4.000000e+00 store float %tmp6, float* %tmp4, align 4 br label %bb7 bb7: ; preds = %bb3 %tmp8 = add nuw nsw i64 %i.0, 1 br label %bb2 bb9: ; preds = %bb2 br label %bb10 bb10: ; preds = %bb15, %bb9 %i1.0 = phi i64 [ 0, %bb9 ], [ %tmp16, %bb15 ] %exitcond = icmp ne i64 %i1.0, 8 br i1 %exitcond, label %bb11, label %bb17 bb11: ; preds = %bb10 %tmp12 = getelementptr inbounds float, float* %A, i64 %i1.0 %tmp13 = load float, float* %tmp12, align 4 %tmp14 = fmul float %tmp13, 1.200000e+01 store float %tmp14, float* %tmp12, align 4 br label %bb15 bb15: ; preds = %bb11 %tmp16 = add nuw nsw i64 %i1.0, 1 br label %bb10 bb17: ; preds = %bb10 ret void }