#-----------------------------------------------------------------------
# matmul.s -- compiler output for matmul.c (see .file / .ident below).
#
# Syntax : AT&T / GNU as.
# Target : x86-64 ELF; calls follow the System V AMD64 convention
#          (args in rdi, rsi, rdx, rcx, r8, r9; al = vector-arg count
#          for variadic calls; rbx, rbp, r12-r15 callee-saved).
# Runtime: libgomp entry points (GOMP_*) plus fprintf, fputc, memset.
#
# Data   : A, B, C are 9437184-byte common objects, accessed below as
#          4-byte floats with a row stride of 6144 bytes, i.e. 1536
#          rows of 1536 floats each.
#
# This file was recovered from a numbered listing whose line breaks had
# been lost; it is restored to one statement per line. Instruction
# order and operands are unchanged, except that the fprintf call now
# uses @PLT like every other external call in the file.
#-----------------------------------------------------------------------
	.text
	.file	"matmul.c"

#-----------------------------------------------------------------------
# init_array
# In   : nothing.
# Out  : nothing (rax is not set to a meaningful value).
# Does : passes init_array_polly_subfn and a pointer to an 8-byte stack
#        slot to GOMP_parallel_loop_runtime_start together with the
#        values 0, 0, 1536, 1, then runs init_array_polly_subfn on the
#        calling thread as well and finishes with GOMP_parallel_end.
#        NOTE(review): per libgomp's prototype the four integers should
#        be num_threads, start, end, incr -- confirm against libgomp.
# Stack: rbp frame; the pushed rax only provides the 8-byte slot at
#        -16(%rbp) and keeps rsp 16-byte aligned at the calls.
#-----------------------------------------------------------------------
	.globl	init_array		# -- Begin function init_array
	.p2align	4, 0x90
	.type	init_array,@function
init_array:				# @init_array
	.cfi_startproc
# %bb.0:				# %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	pushq	%rax
	.cfi_offset %rbx, -24
	leaq	init_array_polly_subfn(%rip), %rdi
	leaq	-16(%rbp), %rbx		# rbx = &slot, kept live across the calls
	xorl	%edx, %edx
	xorl	%ecx, %ecx
	movl	$1536, %r8d		# imm = 0x600
	movl	$1, %r9d
	movq	%rbx, %rsi
	callq	GOMP_parallel_loop_runtime_start@PLT
	movq	%rbx, %rdi
	callq	init_array_polly_subfn	# calling thread takes part in the loop
	callq	GOMP_parallel_end@PLT
	addq	$8, %rsp
	popq	%rbx
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end0:
	.size	init_array, .Lfunc_end0-init_array
	.cfi_endproc
					# -- End function

#-----------------------------------------------------------------------
# print_array
# In   : nothing.
# Out  : nothing.
# Does : for each of the 1536 rows of C, and each of the 1536 floats in
#        the row, calls fprintf(stdout, "%lf ", (double)value). When
#        the column index j equals (j / 80) * 80 + 79, a '\n' is written
#        with fputc; another '\n' is written after every row.
# Regs : r13 = current row of C, rbx = column index j,
#        r12 = 0xCCCCCCCD (multiplier for the divide-by-80 below),
#        r14 = format string; the row counter lives in -48(%rbp).
#-----------------------------------------------------------------------
	.globl	print_array		# -- Begin function print_array
	.p2align	4, 0x90
	.type	print_array,@function
print_array:				# @print_array
	.cfi_startproc
# %bb.0:				# %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	pushq	%rax
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	leaq	C(%rip), %r13
	xorl	%eax, %eax
	movl	$3435973837, %r12d	# imm = 0xCCCCCCCD
	leaq	.L.str(%rip), %r14
	.p2align	4, 0x90
.LBB1_1:				# %for.cond1.preheader
					# row loop (depth 1), contains BB1_2
	movq	%rax, -48(%rbp)		# 8-byte Spill
	movq	stdout(%rip), %rsi
	xorl	%ebx, %ebx
	.p2align	4, 0x90
.LBB1_2:				# %for.body3
					# column loop (depth 2)
	movl	%ebx, %eax
	imulq	%r12, %rax
	shrq	$38, %rax		# rax = j / 80
	leal	(%rax,%rax,4), %r15d
	shll	$4, %r15d
	addl	$79, %r15d		# r15d = (j / 80) * 80 + 79
	movss	(%r13,%rbx,4), %xmm0	# xmm0 = mem[0],zero,zero,zero
	cvtss2sd	%xmm0, %xmm0
	movb	$1, %al			# one vector register carries a variadic arg
	movq	%rsi, %rdi
	movq	%r14, %rsi
	callq	fprintf@PLT
	cmpl	%ebx, %r15d
	jne	.LBB1_4
# %bb.3:				# %if.then
					# in Loop: Header=BB1_2 Depth=2
	movq	stdout(%rip), %rsi
	movl	$10, %edi
	callq	fputc@PLT
.LBB1_4:				# %for.inc
					# in Loop: Header=BB1_2 Depth=2
	addq	$1, %rbx
	movq	stdout(%rip), %rsi
	cmpq	$1536, %rbx		# imm = 0x600
	jne	.LBB1_2
# %bb.5:				# %for.end
					# in Loop: Header=BB1_1 Depth=1
	movl	$10, %edi
	callq	fputc@PLT
	movq	-48(%rbp), %rax		# 8-byte Reload
	addq	$1, %rax
	addq	$6144, %r13		# imm = 0x1800
	cmpq	$1536, %rax		# imm = 0x600
	jne	.LBB1_1
# %bb.6:				# %for.end12
	addq	$8, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end1:
	.size	print_array, .Lfunc_end1-print_array
	.cfi_endproc
					# -- End function

#-----------------------------------------------------------------------
# main
# Out  : eax = 0.
# Does : calls init_array, then runs two parallel loops through the same
#        start / run-on-this-thread / GOMP_parallel_end sequence used by
#        init_array:
#          1. main_polly_subfn    with the values 0, 0, 1536, 1
#          2. main_polly_subfn_1  with the values 0, 0, 1536, 64
#        Both receive a pointer to the 8-byte slot at -16(%rbp).
#        print_array is not called from here.
#-----------------------------------------------------------------------
	.globl	main			# -- Begin function main
	.p2align	4, 0x90
	.type	main,@function
main:					# @main
	.cfi_startproc
# %bb.0:				# %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%rbx
	pushq	%rax
	.cfi_offset %rbx, -24
	callq	init_array
	leaq	main_polly_subfn(%rip), %rdi
	leaq	-16(%rbp), %rbx
	xorl	%edx, %edx
	xorl	%ecx, %ecx
	movl	$1536, %r8d		# imm = 0x600
	movl	$1, %r9d
	movq	%rbx, %rsi
	callq	GOMP_parallel_loop_runtime_start@PLT
	movq	%rbx, %rdi
	callq	main_polly_subfn
	callq	GOMP_parallel_end@PLT
	leaq	main_polly_subfn_1(%rip), %rdi
	xorl	%edx, %edx
	xorl	%ecx, %ecx
	movl	$1536, %r8d		# imm = 0x600
	movl	$64, %r9d
	movq	%rbx, %rsi
	callq	GOMP_parallel_loop_runtime_start@PLT
	movq	%rbx, %rdi
	callq	main_polly_subfn_1
	callq	GOMP_parallel_end@PLT
	xorl	%eax, %eax
	addq	$8, %rsp
	popq	%rbx
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end2:
	.size	main, .Lfunc_end2-main
	.cfi_endproc
					# -- End function

	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3		# -- Begin function init_array_polly_subfn
.LCPI3_0:
	.quad	4602678819172646912	# double 0.5
	.text

#-----------------------------------------------------------------------
# init_array_polly_subfn  (file-local worker)
# In   : rdi -- overwritten immediately; the incoming value is not read.
# Does : repeatedly calls GOMP_loop_runtime_next(&first, &second) with
#        first at 8(%rsp) and second at (%rsp). While it returns
#        non-zero, rows i = first .. second-1 are filled: for j = 0..1535
#          v = (float)(0.5 * (double)(((i * j) & 1023) + 1))
#          A[i][j] = v ; B[i][j] = v
#        where i * j is formed in 32 bits by repeated addition.
#        Ends with GOMP_loop_end_nowait.
# Regs : r15 = B, r12 = A, r14 / r13 = addresses of the two bound slots,
#        rax = i, r8 = second - 1, rdx / rsi = one-past-current row of
#        A / B biased by rdi (which runs from -6144 up to 0),
#        ecx = i * j, xmm1 = 0.5 (reloaded after each call).
#-----------------------------------------------------------------------
	.p2align	4, 0x90
	.type	init_array_polly_subfn,@function
init_array_polly_subfn:			# @init_array_polly_subfn
	.cfi_startproc
# %bb.0:				# %polly.par.setup
	pushq	%r15
	.cfi_def_cfa_offset 16
	pushq	%r14
	.cfi_def_cfa_offset 24
	pushq	%r13
	.cfi_def_cfa_offset 32
	pushq	%r12
	.cfi_def_cfa_offset 40
	pushq	%rbx
	.cfi_def_cfa_offset 48
	subq	$16, %rsp
	.cfi_def_cfa_offset 64
	.cfi_offset %rbx, -48
	.cfi_offset %r12, -40
	.cfi_offset %r13, -32
	.cfi_offset %r14, -24
	.cfi_offset %r15, -16
	leaq	8(%rsp), %rdi
	movq	%rsp, %rsi
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	je	.LBB3_2
# %bb.1:
	leaq	B(%rip), %r15
	leaq	A(%rip), %r12
	movsd	.LCPI3_0(%rip), %xmm1	# xmm1 = mem[0],zero
	leaq	8(%rsp), %r14
	movq	%rsp, %r13
	.p2align	4, 0x90
.LBB3_4:				# %polly.par.loadIVBounds
					# chunk loop (depth 1), contains BB3_5, BB3_6
	movq	8(%rsp), %rax
	movq	(%rsp), %r8
	decq	%r8
	movq	%rax, %rdx
	shlq	$11, %rdx
	leaq	(%rdx,%rdx,2), %rdx	# rdx = i * 6144 (byte offset of row i)
	leaq	(%r15,%rdx), %rsi
	addq	%r12, %rdx
	.p2align	4, 0x90
.LBB3_5:				# %polly.loop_header
					# row loop (depth 2), contains BB3_6
	movq	$-6144, %rdi		# -0x1800: negative byte index, counts up to 0
	xorl	%ecx, %ecx
	.p2align	4, 0x90
.LBB3_6:				# %polly.loop_header2
					# column loop (depth 3)
	movl	%ecx, %ebx
	andl	$1023, %ebx		# imm = 0x3FF
	incl	%ebx
	xorps	%xmm0, %xmm0		# break the dependency on the old xmm0
	cvtsi2sdl	%ebx, %xmm0
	mulsd	%xmm1, %xmm0
	cvtsd2ss	%xmm0, %xmm0
	movss	%xmm0, 6144(%rdx,%rdi)
	movss	%xmm0, 6144(%rsi,%rdi)
	addl	%eax, %ecx
	addq	$4, %rdi
	jne	.LBB3_6
# %bb.7:				# %polly.loop_exit4
					# in Loop: Header=BB3_5 Depth=2
	addq	$6144, %rsi		# imm = 0x1800
	addq	$6144, %rdx		# imm = 0x1800
	cmpq	%r8, %rax
	leaq	1(%rax), %rax		# lea keeps the flags from cmpq intact
	jl	.LBB3_5
# %bb.3:				# %polly.par.checkNext.loopexit
					# in Loop: Header=BB3_4 Depth=1
	movq	%r14, %rdi
	movq	%r13, %rsi
	callq	GOMP_loop_runtime_next@PLT
	movsd	.LCPI3_0(%rip), %xmm1	# xmm1 = mem[0],zero
	testb	%al, %al
	jne	.LBB3_4
.LBB3_2:				# %polly.par.exit
	callq	GOMP_loop_end_nowait@PLT
	addq	$16, %rsp
	.cfi_def_cfa_offset 48
	popq	%rbx
	.cfi_def_cfa_offset 40
	popq	%r12
	.cfi_def_cfa_offset 32
	popq	%r13
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end3:
	.size	init_array_polly_subfn, .Lfunc_end3-init_array_polly_subfn
	.cfi_endproc
					# -- End function

#-----------------------------------------------------------------------
# main_polly_subfn  (file-local worker)
# In   : rdi -- overwritten immediately; the incoming value is not read.
# Does : for every chunk (first, second) handed out by
#        GOMP_loop_runtime_next, calls
#          memset(C + first * 6144, 0,
#                 (max(second - 1, first) + 1 - first) * 6144)
#        i.e. zeroes whole rows of C. Ends with GOMP_loop_end_nowait.
# Regs : r15 = C, r14 / rbx = addresses of the two bound slots.
#-----------------------------------------------------------------------
	.p2align	4, 0x90		# -- Begin function main_polly_subfn
	.type	main_polly_subfn,@function
main_polly_subfn:			# @main_polly_subfn
	.cfi_startproc
# %bb.0:				# %polly.par.setup
	pushq	%r15
	.cfi_def_cfa_offset 16
	pushq	%r14
	.cfi_def_cfa_offset 24
	pushq	%rbx
	.cfi_def_cfa_offset 32
	subq	$16, %rsp
	.cfi_def_cfa_offset 48
	.cfi_offset %rbx, -32
	.cfi_offset %r14, -24
	.cfi_offset %r15, -16
	leaq	8(%rsp), %rdi
	movq	%rsp, %rsi
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	je	.LBB4_3
# %bb.1:
	leaq	C(%rip), %r15
	leaq	8(%rsp), %r14
	movq	%rsp, %rbx
	.p2align	4, 0x90
.LBB4_2:				# %polly.par.loadIVBounds
					# chunk loop (depth 1)
	movq	8(%rsp), %rax
	movq	(%rsp), %rcx
	decq	%rcx
	leaq	(%rax,%rax,2), %rdi
	shlq	$11, %rdi
	addq	%r15, %rdi		# rdi = C + first * 6144
	cmpq	%rcx, %rax
	cmovgeq	%rax, %rcx		# rcx = max(second - 1, first)
	incq	%rcx
	subq	%rax, %rcx
	shlq	$11, %rcx
	leaq	(%rcx,%rcx,2), %rdx	# rdx = row count * 6144 bytes
	xorl	%esi, %esi
	callq	memset@PLT
	movq	%r14, %rdi
	movq	%rbx, %rsi
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	jne	.LBB4_2
.LBB4_3:				# %polly.par.exit
	callq	GOMP_loop_end_nowait@PLT
	addq	$16, %rsp
	.cfi_def_cfa_offset 32
	popq	%rbx
	.cfi_def_cfa_offset 24
	popq	%r14
	.cfi_def_cfa_offset 16
	popq	%r15
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end4:
	.size	main_polly_subfn, .Lfunc_end4-main_polly_subfn
	.cfi_endproc
					# -- End function

#-----------------------------------------------------------------------
# main_polly_subfn_1  (file-local worker: tiled multiply kernel)
# In   : rdi -- overwritten before use; the incoming value is not read.
# Does : for every chunk (first, second) handed out by
#        GOMP_loop_runtime_next (slots 40(%rsp) and 32(%rsp)), walks a
#        six-deep loop nest:
#          BB5_3  row-tile base, starting at first, step 64, while the
#                 base is <= second - 1
#          BB5_4  column tile of C / B: 24 tiles of 64 floats
#          BB5_5  inner-dimension tile: 24 tiles of 64
#          BB5_6  rows base .. base + 63 of the current row tile
#          BB5_7  64 steps along the inner dimension
#        BB5_6 loads 64 consecutive floats of one C row (16 vectors of
#        4), BB5_7 broadcasts one float of A per step and adds
#        A-value * (64 floats of one B row) into them, and %bb.8 stores
#        the 64 floats back to C. Ends with GOMP_loop_end_nowait.
#        NOTE(review): the unpcklps/unpckhps/shufps sequences around the
#        mulps/addps are compiler-generated lane permutations; they are
#        kept verbatim and were not re-derived in this review.
# Stack: 296 bytes of locals. Every movaps slot is a multiple of 16
#        from rsp, and rsp is 16-byte aligned after the prologue
#        (8 + 6 * 8 + 296 = 352).
# Regs : r9 = current position in A, r14 / r11 = current position in B
#        (biased by +192 bytes), rbx = current row of C, rcx / rdi /
#        rbp / r15 = float offsets of the four 16-float groups inside
#        the C row, r8 / rdx / r13 / r10 = their addresses,
#        r12 = negative byte index -256 .. 0 for the innermost loop.
#-----------------------------------------------------------------------
	.p2align	4, 0x90		# -- Begin function main_polly_subfn_1
	.type	main_polly_subfn_1,@function
main_polly_subfn_1:			# @main_polly_subfn_1
	.cfi_startproc
# %bb.0:				# %polly.par.setup
	pushq	%rbp
	.cfi_def_cfa_offset 16
	pushq	%r15
	.cfi_def_cfa_offset 24
	pushq	%r14
	.cfi_def_cfa_offset 32
	pushq	%r13
	.cfi_def_cfa_offset 40
	pushq	%r12
	.cfi_def_cfa_offset 48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	subq	$296, %rsp		# imm = 0x128
	.cfi_def_cfa_offset 352
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	.cfi_offset %rbp, -16
	jmp	.LBB5_1
	.p2align	4, 0x90
.LBB5_2:				# %polly.par.loadIVBounds
					# in Loop: Header=BB5_1 Depth=1
	movq	40(%rsp), %rdx
	movq	32(%rsp), %rax
	decq	%rax
	movq	%rax, 136(%rsp)		# 8-byte Spill
	leaq	(%rdx,%rdx,2), %rcx
	shlq	$11, %rcx
	leaq	A(%rip), %rax
	addq	%rax, %rcx		# rcx = A + first * 6144
	movq	%rcx, 24(%rsp)		# 8-byte Spill
	.p2align	4, 0x90
.LBB5_3:				# %polly.loop_header
					# row-tile loop (depth 2), contains BB5_4..BB5_7
	leaq	63(%rdx), %rsi		# rsi = last row of this tile
	leaq	B+192(%rip), %r14
	xorl	%ecx, %ecx
	xorl	%eax, %eax
	movq	%rdx, 168(%rsp)		# 8-byte Spill
	.p2align	4, 0x90
.LBB5_4:				# %polly.loop_header2
					# column-tile loop (depth 3), contains BB5_5..BB5_7
	movq	%rax, 144(%rsp)		# 8-byte Spill
	movq	%rcx, 152(%rsp)		# 8-byte Spill
	shlq	$6, %rcx		# rcx = tile index * 64 = first column
	leaq	16(%rcx), %rdi
	leaq	32(%rcx), %rbp
	leaq	48(%rcx), %r15
	movq	24(%rsp), %r9		# 8-byte Reload
	movq	%r14, 160(%rsp)		# 8-byte Spill
	xorl	%eax, %eax
	.p2align	4, 0x90
.LBB5_5:				# %polly.loop_header8
					# inner-dimension tile loop (depth 4), contains BB5_6, BB5_7
	movq	%rax, 176(%rsp)		# 8-byte Spill
	movq	%r9, 184(%rsp)		# 8-byte Spill
	movq	%rdx, %rax
	.p2align	4, 0x90
.LBB5_6:				# %polly.loop_header14
					# row loop inside the tile (depth 5), contains BB5_7
	leaq	(%rax,%rax,2), %rbx
	shlq	$11, %rbx
	leaq	C(%rip), %rdx
	addq	%rdx, %rbx		# rbx = C + row * 6144
	leaq	(%rbx,%rcx,4), %r8
	leaq	(%rbx,%rdi,4), %rdx
	leaq	(%rbx,%rbp,4), %r13
	leaq	(%rbx,%r15,4), %r10
	movups	(%rbx,%rcx,4), %xmm8
	movups	16(%rbx,%rcx,4), %xmm0
	movaps	%xmm0, 96(%rsp)		# 16-byte Spill
	movups	32(%rbx,%rcx,4), %xmm6
	movups	48(%rbx,%rcx,4), %xmm1
	movups	(%rbx,%rdi,4), %xmm15
	movups	16(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, (%rsp)		# 16-byte Spill
	movups	32(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, 48(%rsp)		# 16-byte Spill
	movups	48(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, 64(%rsp)		# 16-byte Spill
	movups	(%rbx,%rbp,4), %xmm11
	movups	16(%rbx,%rbp,4), %xmm0
	movaps	%xmm0, 112(%rsp)	# 16-byte Spill
	movups	32(%rbx,%rbp,4), %xmm12
	movups	48(%rbx,%rbp,4), %xmm0
	movaps	%xmm0, 80(%rsp)		# 16-byte Spill
	movups	(%rbx,%r15,4), %xmm9
	movups	16(%rbx,%r15,4), %xmm13
	movups	32(%rbx,%r15,4), %xmm2
	movups	48(%rbx,%r15,4), %xmm3
	movq	$-256, %r12		# negative byte index, counts up to 0
	movq	%r14, %r11
	.p2align	4, 0x90
.LBB5_7:				# %vector.ph
					# innermost loop (depth 6)
	movaps	%xmm12, 208(%rsp)	# 16-byte Spill
	movaps	%xmm2, 224(%rsp)	# 16-byte Spill
	movaps	%xmm3, 240(%rsp)	# 16-byte Spill
	movaps	%xmm8, %xmm10
	movaps	96(%rsp), %xmm7		# 16-byte Reload
	unpcklps	%xmm7, %xmm10	# xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
	movaps	%xmm1, %xmm4
	shufps	$0, %xmm6, %xmm4	# xmm4 = xmm4[0,0],xmm6[0,0]
	shufps	$36, %xmm4, %xmm10	# xmm10 = xmm10[0,1],xmm4[2,0]
	movaps	%xmm7, %xmm5
	shufps	$17, %xmm8, %xmm5	# xmm5 = xmm5[1,0],xmm8[1,0]
	movaps	%xmm6, %xmm4
	unpcklps	%xmm1, %xmm4	# xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	shufps	$226, %xmm4, %xmm5	# xmm5 = xmm5[2,0],xmm4[2,3]
	movaps	%xmm8, %xmm12
	unpckhps	%xmm7, %xmm12	# xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
	movaps	%xmm1, %xmm4
	shufps	$34, %xmm6, %xmm4	# xmm4 = xmm4[2,0],xmm6[2,0]
	shufps	$36, %xmm4, %xmm12	# xmm12 = xmm12[0,1],xmm4[2,0]
	shufps	$51, %xmm8, %xmm7	# xmm7 = xmm7[3,0],xmm8[3,0]
	unpckhps	%xmm1, %xmm6	# xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
	shufps	$226, %xmm6, %xmm7	# xmm7 = xmm7[2,0],xmm6[2,3]
	movaps	-160(%r11), %xmm0
	movaps	-144(%r11), %xmm1
	movaps	%xmm1, %xmm6
	shufps	$0, %xmm0, %xmm6	# xmm6 = xmm6[0,0],xmm0[0,0]
	movaps	-192(%r11), %xmm3
	movaps	-176(%r11), %xmm4
	movaps	%xmm3, %xmm8
	unpcklps	%xmm4, %xmm8	# xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
	shufps	$36, %xmm6, %xmm8	# xmm8 = xmm8[0,1],xmm6[2,0]
	movaps	%xmm0, %xmm2
	unpcklps	%xmm1, %xmm2	# xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	movaps	%xmm4, %xmm6
	shufps	$17, %xmm3, %xmm6	# xmm6 = xmm6[1,0],xmm3[1,0]
	shufps	$226, %xmm2, %xmm6	# xmm6 = xmm6[2,0],xmm2[2,3]
	movaps	%xmm1, %xmm2
	shufps	$34, %xmm0, %xmm2	# xmm2 = xmm2[2,0],xmm0[2,0]
	movaps	%xmm3, %xmm14
	unpckhps	%xmm4, %xmm14	# xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
	shufps	$36, %xmm2, %xmm14	# xmm14 = xmm14[0,1],xmm2[2,0]
	unpckhps	%xmm1, %xmm0	# xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	shufps	$51, %xmm3, %xmm4	# xmm4 = xmm4[3,0],xmm3[3,0]
	shufps	$226, %xmm0, %xmm4	# xmm4 = xmm4[2,0],xmm0[2,3]
	movss	256(%r9,%r12), %xmm0	# xmm0 = mem[0],zero,zero,zero
	shufps	$0, %xmm0, %xmm0	# xmm0 = xmm0[0,0,0,0]
	mulps	%xmm0, %xmm8
	addps	%xmm10, %xmm8
	mulps	%xmm0, %xmm6
	addps	%xmm5, %xmm6
	mulps	%xmm0, %xmm14
	addps	%xmm12, %xmm14
	mulps	%xmm0, %xmm4
	movaps	%xmm0, %xmm5
	addps	%xmm7, %xmm4
	movaps	%xmm14, %xmm0
	unpckhps	%xmm4, %xmm0	# xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm6, %xmm1
	shufps	$51, %xmm8, %xmm1	# xmm1 = xmm1[3,0],xmm8[3,0]
	shufps	$226, %xmm0, %xmm1	# xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 272(%rsp)	# 16-byte Spill
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm14, %xmm0	# xmm0 = xmm0[2,0],xmm14[2,0]
	movaps	%xmm8, %xmm1
	unpckhps	%xmm6, %xmm1	# xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
	shufps	$36, %xmm0, %xmm1	# xmm1 = xmm1[0,1],xmm0[2,0]
	movaps	%xmm1, 256(%rsp)	# 16-byte Spill
	movaps	%xmm14, %xmm0
	unpcklps	%xmm4, %xmm0	# xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm6, %xmm1
	shufps	$17, %xmm8, %xmm1	# xmm1 = xmm1[1,0],xmm8[1,0]
	shufps	$226, %xmm0, %xmm1	# xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 96(%rsp)		# 16-byte Spill
	shufps	$0, %xmm14, %xmm4	# xmm4 = xmm4[0,0],xmm14[0,0]
	unpcklps	%xmm6, %xmm8	# xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
	shufps	$36, %xmm4, %xmm8	# xmm8 = xmm8[0,1],xmm4[2,0]
	movaps	%xmm15, %xmm14
	movaps	(%rsp), %xmm4		# 16-byte Reload
	unpcklps	%xmm4, %xmm14	# xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
	movaps	64(%rsp), %xmm1		# 16-byte Reload
	movaps	%xmm1, %xmm0
	movaps	48(%rsp), %xmm3		# 16-byte Reload
	shufps	$0, %xmm3, %xmm0	# xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm14	# xmm14 = xmm14[0,1],xmm0[2,0]
	movaps	%xmm4, %xmm12
	shufps	$17, %xmm15, %xmm12	# xmm12 = xmm12[1,0],xmm15[1,0]
	movaps	%xmm3, %xmm2
	unpcklps	%xmm1, %xmm2	# xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	shufps	$226, %xmm2, %xmm12	# xmm12 = xmm12[2,0],xmm2[2,3]
	movaps	%xmm15, %xmm7
	unpckhps	%xmm4, %xmm7	# xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
	movaps	%xmm1, %xmm2
	shufps	$34, %xmm3, %xmm2	# xmm2 = xmm2[2,0],xmm3[2,0]
	shufps	$36, %xmm2, %xmm7	# xmm7 = xmm7[0,1],xmm2[2,0]
	shufps	$51, %xmm15, %xmm4	# xmm4 = xmm4[3,0],xmm15[3,0]
	unpckhps	%xmm1, %xmm3	# xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
	shufps	$226, %xmm3, %xmm4	# xmm4 = xmm4[2,0],xmm3[2,3]
	movaps	%xmm4, (%rsp)		# 16-byte Spill
	movaps	-96(%r11), %xmm2
	movaps	-80(%r11), %xmm1
	movaps	%xmm1, %xmm4
	shufps	$0, %xmm2, %xmm4	# xmm4 = xmm4[0,0],xmm2[0,0]
	movaps	-112(%r11), %xmm10
	movaps	-128(%r11), %xmm0
	movaps	%xmm0, %xmm15
	unpcklps	%xmm10, %xmm15	# xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
	shufps	$36, %xmm4, %xmm15	# xmm15 = xmm15[0,1],xmm4[2,0]
	movaps	%xmm2, %xmm4
	unpcklps	%xmm1, %xmm4	# xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	movaps	%xmm10, %xmm6
	shufps	$17, %xmm0, %xmm6	# xmm6 = xmm6[1,0],xmm0[1,0]
	shufps	$226, %xmm4, %xmm6	# xmm6 = xmm6[2,0],xmm4[2,3]
	movaps	%xmm1, %xmm3
	shufps	$34, %xmm2, %xmm3	# xmm3 = xmm3[2,0],xmm2[2,0]
	movaps	%xmm0, %xmm4
	unpckhps	%xmm10, %xmm4	# xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
	shufps	$36, %xmm3, %xmm4	# xmm4 = xmm4[0,1],xmm3[2,0]
	unpckhps	%xmm1, %xmm2	# xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
	shufps	$51, %xmm0, %xmm10	# xmm10 = xmm10[3,0],xmm0[3,0]
	shufps	$226, %xmm2, %xmm10	# xmm10 = xmm10[2,0],xmm2[2,3]
	movaps	%xmm5, 192(%rsp)	# 16-byte Spill
	mulps	%xmm5, %xmm15
	addps	%xmm14, %xmm15
	mulps	%xmm5, %xmm6
	addps	%xmm12, %xmm6
	mulps	%xmm5, %xmm4
	addps	%xmm7, %xmm4
	mulps	%xmm5, %xmm10
	addps	(%rsp), %xmm10		# 16-byte Folded Reload
	movaps	%xmm4, %xmm0
	unpckhps	%xmm10, %xmm0	# xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
	movaps	%xmm6, %xmm1
	shufps	$51, %xmm15, %xmm1	# xmm1 = xmm1[3,0],xmm15[3,0]
	shufps	$226, %xmm0, %xmm1	# xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 64(%rsp)		# 16-byte Spill
	movaps	%xmm10, %xmm0
	shufps	$34, %xmm4, %xmm0	# xmm0 = xmm0[2,0],xmm4[2,0]
	movaps	%xmm15, %xmm1
	unpckhps	%xmm6, %xmm1	# xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
	shufps	$36, %xmm0, %xmm1	# xmm1 = xmm1[0,1],xmm0[2,0]
	movaps	%xmm1, 48(%rsp)		# 16-byte Spill
	movaps	%xmm4, %xmm0
	unpcklps	%xmm10, %xmm0	# xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
	movaps	%xmm6, %xmm1
	shufps	$17, %xmm15, %xmm1	# xmm1 = xmm1[1,0],xmm15[1,0]
	shufps	$226, %xmm0, %xmm1	# xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, (%rsp)		# 16-byte Spill
	shufps	$0, %xmm4, %xmm10	# xmm10 = xmm10[0,0],xmm4[0,0]
	unpcklps	%xmm6, %xmm15	# xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
	shufps	$36, %xmm10, %xmm15	# xmm15 = xmm15[0,1],xmm10[2,0]
	movaps	%xmm11, %xmm10
	movaps	112(%rsp), %xmm14	# 16-byte Reload
	unpcklps	%xmm14, %xmm10	# xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
	movaps	80(%rsp), %xmm2		# 16-byte Reload
	movaps	%xmm2, %xmm0
	movaps	208(%rsp), %xmm3	# 16-byte Reload
	shufps	$0, %xmm3, %xmm0	# xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm10	# xmm10 = xmm10[0,1],xmm0[2,0]
	movaps	%xmm14, %xmm12
	shufps	$17, %xmm11, %xmm12	# xmm12 = xmm12[1,0],xmm11[1,0]
	movaps	%xmm3, %xmm0
	unpcklps	%xmm2, %xmm0	# xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	shufps	$226, %xmm0, %xmm12	# xmm12 = xmm12[2,0],xmm0[2,3]
	movaps	%xmm11, %xmm0
	unpckhps	%xmm14, %xmm0	# xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
	movaps	%xmm2, %xmm1
	shufps	$34, %xmm3, %xmm1	# xmm1 = xmm1[2,0],xmm3[2,0]
	shufps	$36, %xmm1, %xmm0	# xmm0 = xmm0[0,1],xmm1[2,0]
	shufps	$51, %xmm11, %xmm14	# xmm14 = xmm14[3,0],xmm11[3,0]
	unpckhps	%xmm2, %xmm3	# xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
	shufps	$226, %xmm3, %xmm14	# xmm14 = xmm14[2,0],xmm3[2,3]
	movaps	-32(%r11), %xmm1
	movaps	-16(%r11), %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0, %xmm1, %xmm3	# xmm3 = xmm3[0,0],xmm1[0,0]
	movaps	-48(%r11), %xmm4
	movaps	-64(%r11), %xmm5
	movaps	%xmm5, %xmm11
	unpcklps	%xmm4, %xmm11	# xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
	shufps	$36, %xmm3, %xmm11	# xmm11 = xmm11[0,1],xmm3[2,0]
	movaps	%xmm1, %xmm3
	unpcklps	%xmm2, %xmm3	# xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movaps	%xmm4, %xmm7
	shufps	$17, %xmm5, %xmm7	# xmm7 = xmm7[1,0],xmm5[1,0]
	shufps	$226, %xmm3, %xmm7	# xmm7 = xmm7[2,0],xmm3[2,3]
	movaps	%xmm2, %xmm3
	shufps	$34, %xmm1, %xmm3	# xmm3 = xmm3[2,0],xmm1[2,0]
	movaps	%xmm5, %xmm6
	unpckhps	%xmm4, %xmm6	# xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
	shufps	$36, %xmm3, %xmm6	# xmm6 = xmm6[0,1],xmm3[2,0]
	unpckhps	%xmm2, %xmm1	# xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
	shufps	$51, %xmm5, %xmm4	# xmm4 = xmm4[3,0],xmm5[3,0]
	shufps	$226, %xmm1, %xmm4	# xmm4 = xmm4[2,0],xmm1[2,3]
	movaps	192(%rsp), %xmm1	# 16-byte Reload
	mulps	%xmm1, %xmm11
	addps	%xmm10, %xmm11
	mulps	%xmm1, %xmm7
	addps	%xmm12, %xmm7
	mulps	%xmm1, %xmm6
	addps	%xmm0, %xmm6
	mulps	%xmm1, %xmm4
	addps	%xmm14, %xmm4
	movaps	%xmm6, %xmm0
	unpckhps	%xmm4, %xmm0	# xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm7, %xmm1
	shufps	$51, %xmm11, %xmm1	# xmm1 = xmm1[3,0],xmm11[3,0]
	shufps	$226, %xmm0, %xmm1	# xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 80(%rsp)		# 16-byte Spill
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm6, %xmm0	# xmm0 = xmm0[2,0],xmm6[2,0]
	movaps	%xmm11, %xmm12
	unpckhps	%xmm7, %xmm12	# xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
	shufps	$36, %xmm0, %xmm12	# xmm12 = xmm12[0,1],xmm0[2,0]
	movaps	%xmm6, %xmm0
	unpcklps	%xmm4, %xmm0	# xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm7, %xmm1
	shufps	$17, %xmm11, %xmm1	# xmm1 = xmm1[1,0],xmm11[1,0]
	shufps	$226, %xmm0, %xmm1	# xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, 112(%rsp)	# 16-byte Spill
	shufps	$0, %xmm6, %xmm4	# xmm4 = xmm4[0,0],xmm6[0,0]
	unpcklps	%xmm7, %xmm11	# xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
	shufps	$36, %xmm4, %xmm11	# xmm11 = xmm11[0,1],xmm4[2,0]
	movaps	%xmm9, %xmm10
	unpcklps	%xmm13, %xmm10	# xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
	movaps	240(%rsp), %xmm2	# 16-byte Reload
	movaps	%xmm2, %xmm0
	movaps	224(%rsp), %xmm3	# 16-byte Reload
	shufps	$0, %xmm3, %xmm0	# xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm10	# xmm10 = xmm10[0,1],xmm0[2,0]
	movaps	%xmm13, %xmm14
	shufps	$17, %xmm9, %xmm14	# xmm14 = xmm14[1,0],xmm9[1,0]
	movaps	%xmm3, %xmm0
	unpcklps	%xmm2, %xmm0	# xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	shufps	$226, %xmm0, %xmm14	# xmm14 = xmm14[2,0],xmm0[2,3]
	movaps	%xmm9, %xmm0
	unpckhps	%xmm13, %xmm0	# xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
	movaps	%xmm2, %xmm1
	shufps	$34, %xmm3, %xmm1	# xmm1 = xmm1[2,0],xmm3[2,0]
	shufps	$36, %xmm1, %xmm0	# xmm0 = xmm0[0,1],xmm1[2,0]
	shufps	$51, %xmm9, %xmm13	# xmm13 = xmm13[3,0],xmm9[3,0]
	unpckhps	%xmm2, %xmm3	# xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
	shufps	$226, %xmm3, %xmm13	# xmm13 = xmm13[2,0],xmm3[2,3]
	movaps	32(%r11), %xmm1
	movaps	48(%r11), %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0, %xmm1, %xmm3	# xmm3 = xmm3[0,0],xmm1[0,0]
	movaps	16(%r11), %xmm4
	movaps	(%r11), %xmm5
	movaps	%xmm5, %xmm9
	unpcklps	%xmm4, %xmm9	# xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
	shufps	$36, %xmm3, %xmm9	# xmm9 = xmm9[0,1],xmm3[2,0]
	movaps	%xmm1, %xmm3
	unpcklps	%xmm2, %xmm3	# xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movaps	%xmm4, %xmm7
	shufps	$17, %xmm5, %xmm7	# xmm7 = xmm7[1,0],xmm5[1,0]
	shufps	$226, %xmm3, %xmm7	# xmm7 = xmm7[2,0],xmm3[2,3]
	movaps	%xmm2, %xmm3
	shufps	$34, %xmm1, %xmm3	# xmm3 = xmm3[2,0],xmm1[2,0]
	movaps	%xmm5, %xmm6
	unpckhps	%xmm4, %xmm6	# xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
	shufps	$36, %xmm3, %xmm6	# xmm6 = xmm6[0,1],xmm3[2,0]
	unpckhps	%xmm2, %xmm1	# xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
	shufps	$51, %xmm5, %xmm4	# xmm4 = xmm4[3,0],xmm5[3,0]
	shufps	$226, %xmm1, %xmm4	# xmm4 = xmm4[2,0],xmm1[2,3]
	movaps	192(%rsp), %xmm1	# 16-byte Reload
	mulps	%xmm1, %xmm9
	addps	%xmm10, %xmm9
	mulps	%xmm1, %xmm7
	addps	%xmm14, %xmm7
	mulps	%xmm1, %xmm6
	addps	%xmm0, %xmm6
	mulps	%xmm1, %xmm4
	addps	%xmm13, %xmm4
	movaps	%xmm6, %xmm0
	unpckhps	%xmm4, %xmm0	# xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm7, %xmm3
	shufps	$51, %xmm9, %xmm3	# xmm3 = xmm3[3,0],xmm9[3,0]
	shufps	$226, %xmm0, %xmm3	# xmm3 = xmm3[2,0],xmm0[2,3]
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm6, %xmm0	# xmm0 = xmm0[2,0],xmm6[2,0]
	movaps	%xmm9, %xmm2
	unpckhps	%xmm7, %xmm2	# xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
	shufps	$36, %xmm0, %xmm2	# xmm2 = xmm2[0,1],xmm0[2,0]
	movaps	%xmm6, %xmm0
	unpcklps	%xmm4, %xmm0	# xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm7, %xmm13
	shufps	$17, %xmm9, %xmm13	# xmm13 = xmm13[1,0],xmm9[1,0]
	shufps	$226, %xmm0, %xmm13	# xmm13 = xmm13[2,0],xmm0[2,3]
	shufps	$0, %xmm6, %xmm4	# xmm4 = xmm4[0,0],xmm6[0,0]
	movaps	256(%rsp), %xmm6	# 16-byte Reload
	movaps	272(%rsp), %xmm1	# 16-byte Reload
	unpcklps	%xmm7, %xmm9	# xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
	shufps	$36, %xmm4, %xmm9	# xmm9 = xmm9[0,1],xmm4[2,0]
	addq	$6144, %r11		# imm = 0x1800
	addq	$4, %r12
	jne	.LBB5_7
# %bb.8:				# %polly.loop_exit22
					# in Loop: Header=BB5_6 Depth=5
	movups	%xmm8, (%r8)
	movaps	96(%rsp), %xmm0		# 16-byte Reload
	movups	%xmm0, 16(%r8)
	movups	%xmm6, 32(%r8)
	movups	%xmm1, 48(%r8)
	movaps	64(%rsp), %xmm0		# 16-byte Reload
	movups	%xmm0, 48(%rdx)
	movaps	48(%rsp), %xmm0		# 16-byte Reload
	movups	%xmm0, 32(%rdx)
	movaps	(%rsp), %xmm0		# 16-byte Reload
	movups	%xmm0, 16(%rdx)
	movups	%xmm15, (%rdx)
	movaps	80(%rsp), %xmm0		# 16-byte Reload
	movups	%xmm0, 48(%r13)
	movaps	112(%rsp), %xmm0	# 16-byte Reload
	movups	%xmm0, 16(%r13)
	movups	%xmm11, (%r13)
	movups	%xmm12, 32(%r13)
	movups	%xmm3, 48(%r10)
	movups	%xmm13, 16(%r10)
	movups	%xmm9, (%r10)
	movups	%xmm2, 32(%r10)
	addq	$6144, %r9		# imm = 0x1800
	cmpq	%rsi, %rax
	leaq	1(%rax), %rax		# lea keeps the flags from cmpq intact
	jl	.LBB5_6
# %bb.9:				# %polly.loop_exit16
					# in Loop: Header=BB5_5 Depth=4
	movq	176(%rsp), %rax		# 8-byte Reload
	addq	$64, %rax
	addq	$393216, %r14		# imm = 0x60000
	movq	184(%rsp), %r9		# 8-byte Reload
	addq	$256, %r9		# imm = 0x100
	cmpq	$1536, %rax		# imm = 0x600
	movq	168(%rsp), %rdx		# 8-byte Reload
	jb	.LBB5_5
# %bb.10:				# %polly.loop_exit10
					# in Loop: Header=BB5_4 Depth=3
	movq	144(%rsp), %rax		# 8-byte Reload
	addq	$64, %rax
	movq	152(%rsp), %rcx		# 8-byte Reload
	incq	%rcx
	movq	160(%rsp), %r14		# 8-byte Reload
	addq	$256, %r14		# imm = 0x100
	cmpq	$1536, %rax		# imm = 0x600
	jb	.LBB5_4
# %bb.11:				# %polly.loop_exit4
					# in Loop: Header=BB5_3 Depth=2
	addq	$64, %rdx
	addq	$393216, 24(%rsp)	# 8-byte Folded Spill
					# imm = 0x60000
	cmpq	136(%rsp), %rdx		# 8-byte Folded Reload
	jle	.LBB5_3
.LBB5_1:				# %polly.par.setup
					# chunk loop (depth 1), contains BB5_3..BB5_7
	leaq	40(%rsp), %rdi
	leaq	32(%rsp), %rsi
	callq	GOMP_loop_runtime_next@PLT
	testb	%al, %al
	jne	.LBB5_2
# %bb.12:				# %polly.par.exit
	callq	GOMP_loop_end_nowait@PLT
	addq	$296, %rsp		# imm = 0x128
	.cfi_def_cfa_offset 56
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%r12
	.cfi_def_cfa_offset 40
	popq	%r13
	.cfi_def_cfa_offset 32
	popq	%r14
	.cfi_def_cfa_offset 24
	popq	%r15
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end5:
	.size	main_polly_subfn_1, .Lfunc_end5-main_polly_subfn_1
	.cfi_endproc
					# -- End function

#-----------------------------------------------------------------------
# Data objects.
# A, B, C: 9437184 bytes each (1536 * 1536 * 4), 16-byte aligned common
# symbols; the 16-byte alignment is what allows the movaps loads from B
# in main_polly_subfn_1.
# .L.str : format string passed to fprintf by print_array.
#-----------------------------------------------------------------------
	.type	A,@object		# @A
	.comm	A,9437184,16
	.type	B,@object		# @B
	.comm	B,9437184,16
	.type	.L.str,@object		# @.str
	.section	.rodata.str1.1,"aMS",@progbits,1
.L.str:
	.asciz	"%lf "
	.size	.L.str, 5

	.type	C,@object		# @C
	.comm	C,9437184,16

	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
	.section	".note.GNU-stack","",@progbits