#-----------------------------------------------------------------------------
# matmul.c -- x86-64 assembly, AT&T syntax, GNU as.
# Emitted by the clang build named in the .ident directive at the end of the
# file; the %polly.* block names are the compiler's own.
#
# ABI: System V AMD64 (integer args in %rdi/%rsi/%rdx, %al = number of vector
# registers used by a variadic call, %rbx/%rbp/%r12-%r15 callee-saved).
#
# Data: A, B and C are common symbols of 9437184 bytes each, aligned to 16.
# All code below addresses them as rows of 6144 bytes, i.e. a row holds
# 6144/4 four-byte floats and each array holds 9437184/6144 rows (both 0x600).
#
# Exported functions: init_array, print_array, main.
#-----------------------------------------------------------------------------
	.text
	.file	"matmul.c"
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3               # -- Begin function init_array
.LCPI0_0:
	.quad	4602678819172646912     # double 0.5
	.text
#-----------------------------------------------------------------------------
# void init_array(void)
# Purpose: for every row i and column j (both 0 .. 0x5FF) store
#            (float)((((i * j) & 1023) + 1) * 0.5)
#          into both A[i][j] and B[i][j].
# In:      none
# Out:     none (writes A and B)
# Clobb:   rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1, flags
# Stack:   frame pointer only; leaf function, no calls.
#
# Register roles:
#   rax = &B[i][0]          rcx = &A[i][0]
#   r9  = i                 r8d = 2*i (step of the running product)
#   rdi = j + 1, j even     edx = i*j for the even j (low 32 bits)
#   xmm0 = 0.5
# The inner loop is unrolled by two: one even and one odd column per pass.
#-----------------------------------------------------------------------------
	.globl	init_array
	.p2align	4, 0x90
	.type	init_array,@function
init_array:                             # @init_array
	.cfi_startproc
# %bb.0:                                # %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	leaq	B(%rip), %rax
	leaq	A(%rip), %rcx
	xorl	%r8d, %r8d
	movsd	.LCPI0_0(%rip), %xmm0   # xmm0 = mem[0],zero
	xorl	%r9d, %r9d
	.p2align	4, 0x90
.LBB0_1:                                # %polly.loop_header
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
	movl	$1, %edi                # rdi = j + 1 with j = 0
	xorl	%edx, %edx              # edx = i * 0
	.p2align	4, 0x90
.LBB0_2:                                # %polly.loop_header1
                                        #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	# Even column j: i*j is even, so (x & 1022) | 1 equals (x & 1023) + 1.
	movl	%edx, %esi
	andl	$1022, %esi             # imm = 0x3FE
	orl	$1, %esi
	xorps	%xmm1, %xmm1            # break the dependency on old xmm1
	cvtsi2sdl	%esi, %xmm1
	mulsd	%xmm0, %xmm1
	cvtsd2ss	%xmm1, %xmm1
	movss	%xmm1, -4(%rcx,%rdi,4)  # A[i][j]
	movss	%xmm1, -4(%rax,%rdi,4)  # B[i][j]
	# Odd column j+1: i*(j+1) = i + i*j.
	leal	(%r9,%rdx), %esi
	andl	$1023, %esi             # imm = 0x3FF
	addl	$1, %esi
	xorps	%xmm1, %xmm1
	cvtsi2sdl	%esi, %xmm1
	mulsd	%xmm0, %xmm1
	cvtsd2ss	%xmm1, %xmm1
	movss	%xmm1, (%rcx,%rdi,4)    # A[i][j+1]
	movss	%xmm1, (%rax,%rdi,4)    # B[i][j+1]
	addq	$2, %rdi
	addl	%r8d, %edx              # i*j advances by 2*i
	cmpq	$1537, %rdi             # imm = 0x601
	jne	.LBB0_2
# %bb.3:                                # %polly.loop_exit3
                                        #   in Loop: Header=BB0_1 Depth=1
	addq	$1, %r9
	addq	$6144, %rax             # imm = 0x1800
	addq	$6144, %rcx             # imm = 0x1800
	addl	$2, %r8d
	cmpq	$1536, %r9              # imm = 0x600
	jne	.LBB0_1
# %bb.4:                                # %polly.exiting
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end0:
	.size	init_array, .Lfunc_end0-init_array
	.cfi_endproc
                                        # -- End function
#-----------------------------------------------------------------------------
# void print_array(void)
# Purpose: write every element of C to stdout with fprintf and the format
#          string .L.str ("%lf "), row by row. A '\n' is written with fputc
#          after each column j for which j == 80*(j/80) + 79, and again after
#          the last column of every row.
# In:      none
# Out:     none (I/O only; return values of fprintf/fputc are ignored)
# Clobb:   all caller-saved registers (calls fprintf and fputc)
# Stack:   rbp frame + five callee-saved pushes + one 8-byte slot, which
#          keeps %rsp 16-byte aligned at every call.
#
# Register roles:
#   r13 = &C[i][0]          rbx = j
#   r12 = 0xCCCCCCCD        (j * r12) >> 38 is j / 80 for the j used here
#   r14 = &.L.str           r15d = 80*(j/80) + 79
#   -48(%rbp) = i           (kept on the stack across the calls)
#-----------------------------------------------------------------------------
	.globl	print_array             # -- Begin function print_array
	.p2align	4, 0x90
	.type	print_array,@function
print_array:                            # @print_array
	.cfi_startproc
# %bb.0:                                # %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	pushq	%rax                    # reserves the -48(%rbp) slot
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	leaq	C(%rip), %r13
	xorl	%eax, %eax
	movl	$3435973837, %r12d      # imm = 0xCCCCCCCD
	leaq	.L.str(%rip), %r14
	.p2align	4, 0x90
.LBB1_1:                                # %for.cond1.preheader
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB1_2 Depth 2
	movq	%rax, -48(%rbp)         # 8-byte Spill
	movq	stdout(%rip), %rsi
	xorl	%ebx, %ebx
	.p2align	4, 0x90
.LBB1_2:                                # %for.body3
                                        #   Parent Loop BB1_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	movl	%ebx, %eax
	imulq	%r12, %rax
	shrq	$38, %rax               # rax = j / 80
	leal	(%rax,%rax,4), %r15d
	shll	$4, %r15d               # r15d = 80 * (j / 80)
	addl	$79, %r15d
	movss	(%r13,%rbx,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
	cvtss2sd	%xmm0, %xmm0
	movb	$1, %al                 # one vector register carries an argument
	movq	%rsi, %rdi              # stream = stdout
	movq	%r14, %rsi              # format = .L.str
	callq	fprintf@PLT
	cmpl	%ebx, %r15d
	jne	.LBB1_4
# %bb.3:                                # %if.then
                                        #   in Loop: Header=BB1_2 Depth=2
	movq	stdout(%rip), %rsi
	movl	$10, %edi               # '\n'
	callq	fputc@PLT
.LBB1_4:                                # %for.inc
                                        #   in Loop: Header=BB1_2 Depth=2
	addq	$1, %rbx
	movq	stdout(%rip), %rsi
	cmpq	$1536, %rbx             # imm = 0x600
	jne	.LBB1_2
# %bb.5:                                # %for.end
                                        #   in Loop: Header=BB1_1 Depth=1
	movl	$10, %edi               # '\n' at the end of the row
	callq	fputc@PLT
	movq	-48(%rbp), %rax         # 8-byte Reload
	addq	$1, %rax
	addq	$6144, %r13             # imm = 0x1800
	cmpq	$1536, %rax             # imm = 0x600
	jne	.LBB1_1
# %bb.6:                                # %for.end12
	addq	$8, %rsp
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end1:
	.size	print_array, .Lfunc_end1-print_array
	.cfi_endproc
                                        # -- End function
#-----------------------------------------------------------------------------
# int main(void)
# Purpose: call init_array, zero C with memset, then accumulate
#            C[i][j] += A[i][k] * B[k][j]
#          over all i, j, k in 0 .. 0x5FF, processed in 64x64x64 tiles.
# In:      none (argc/argv are not read)
# Out:     eax = 0
# Clobb:   all caller-saved registers, xmm0-xmm15
# Stack:   rbp frame + five callee-saved pushes + 264 bytes of locals; %rbp
#          is 16-byte aligned, so every 16-byte slot used with movaps is too.
#
# Loop nest (outer to inner):
#   .LBB2_1  row tile      i0 = 0, 64, ...   (-48(%rbp))
#   .LBB2_2  column tile   j0 = 0, 64, ...   (rdi after the shlq)
#   .LBB2_3  k tile        k0 = 0, 64, ...   (-192(%rbp))
#   .LBB2_4  row i in [i0, i0+64)            (r14)
#   .LBB2_5  k in [0, 64)                    (r13)
#
# 8-byte stack slots:
#   -48  i0                      -80  i0 + 64 (bound for r14)
#   -72  &A[i0][0]              -168  j0 counter of the column-tile loop
#  -176  column-tile index      -184  B pointer of the current column tile
#  -192  k0                     -200  A pointer at entry to the k tile
#
# Registers inside .LBB2_4 / .LBB2_5:
#   r12 = &A[i][k0]     r13 = k       rbx = &B[k0+k][j0] + 192 bytes
#   rdi/rdx/rsi/rcx = j0, j0+16, j0+32, j0+48 (float indices)
#   r8/r15/r10/r11  = &C[i][j0], &C[i][j0+16], &C[i][j0+32], &C[i][j0+48]
#
# The 64 floats C[i][j0 .. j0+63] live in 16 accumulators across .LBB2_5:
#   j0+ 0..15 : xmm8,  -144(%rbp), xmm6,       xmm1
#   j0+16..31 : xmm15, -64(%rbp),  -96(%rbp),  -112(%rbp)
#   j0+32..47 : xmm11, -160(%rbp), xmm12,      -128(%rbp)
#   j0+48..63 : xmm9,  xmm13,      xmm2,       xmm3
# Other 16-byte slots: -224 = broadcast A[i][k0+k]; -240/-256/-272 = copies
# of xmm12/xmm2/xmm3 taken at the loop top; -288/-304 = staging for the new
# xmm6/xmm1.
#-----------------------------------------------------------------------------
	.globl	main                    # -- Begin function main
	.p2align	4, 0x90
	.type	main,@function
main:                                   # @main
	.cfi_startproc
# %bb.0:                                # %entry
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx
	subq	$264, %rsp              # imm = 0x108
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	callq	init_array
	leaq	C(%rip), %rdi
	xorl	%eax, %eax
	movq	%rax, -48(%rbp)         # 8-byte Spill
	xorl	%esi, %esi
	movl	$9437184, %edx          # imm = 0x900000
	callq	memset@PLT
	movl	$64, %eax
	movq	%rax, -80(%rbp)         # 8-byte Spill
	leaq	A(%rip), %rax
	movq	%rax, -72(%rbp)         # 8-byte Spill
	.p2align	4, 0x90
.LBB2_1:                                # %polly.loop_header8
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB2_2 Depth 2
                                        #       Child Loop BB2_3 Depth 3
                                        #         Child Loop BB2_4 Depth 4
                                        #           Child Loop BB2_5 Depth 5
	leaq	B+192(%rip), %r9        # biased so the B loads use -192 .. +48
	xorl	%edi, %edi
	xorl	%eax, %eax
	.p2align	4, 0x90
.LBB2_2:                                # %polly.loop_header14
                                        #   Parent Loop BB2_1 Depth=1
                                        # =>  This Loop Header: Depth=2
                                        #       Child Loop BB2_3 Depth 3
                                        #         Child Loop BB2_4 Depth 4
                                        #           Child Loop BB2_5 Depth 5
	movq	%rax, -168(%rbp)        # 8-byte Spill
	movq	%rdi, -176(%rbp)        # 8-byte Spill
	shlq	$6, %rdi                # rdi = j0 = 64 * tile index
	leaq	16(%rdi), %rdx
	leaq	32(%rdi), %rsi
	leaq	48(%rdi), %rcx
	movq	-72(%rbp), %r12         # 8-byte Reload
	movq	%r9, -184(%rbp)         # 8-byte Spill
	xorl	%eax, %eax
	.p2align	4, 0x90
.LBB2_3:                                # %polly.loop_header20
                                        #   Parent Loop BB2_1 Depth=1
                                        #     Parent Loop BB2_2 Depth=2
                                        # =>    This Loop Header: Depth=3
                                        #         Child Loop BB2_4 Depth 4
                                        #           Child Loop BB2_5 Depth 5
	movq	%rax, -192(%rbp)        # 8-byte Spill
	movq	%r12, -200(%rbp)        # 8-byte Spill
	movq	-48(%rbp), %r14         # 8-byte Reload
	.p2align	4, 0x90
.LBB2_4:                                # %polly.loop_header26
                                        #   Parent Loop BB2_1 Depth=1
                                        #     Parent Loop BB2_2 Depth=2
                                        #       Parent Loop BB2_3 Depth=3
                                        # =>      This Loop Header: Depth=4
                                        #           Child Loop BB2_5 Depth 5
	leaq	(%r14,%r14,2), %rbx
	shlq	$11, %rbx               # rbx = i * 6144 (byte offset of row i)
	leaq	C(%rip), %rax
	addq	%rax, %rbx              # rbx = &C[i][0]
	leaq	(%rbx,%rdi,4), %r8
	leaq	(%rbx,%rdx,4), %r15
	leaq	(%rbx,%rsi,4), %r10
	leaq	(%rbx,%rcx,4), %r11
	# Load the 16 accumulator vectors C[i][j0 .. j0+63].
	movups	(%rbx,%rdi,4), %xmm8
	movups	16(%rbx,%rdi,4), %xmm0
	movaps	%xmm0, -144(%rbp)       # 16-byte Spill
	movups	32(%rbx,%rdi,4), %xmm6
	movups	48(%rbx,%rdi,4), %xmm1
	movups	(%rbx,%rdx,4), %xmm15
	movups	16(%rbx,%rdx,4), %xmm0
	movaps	%xmm0, -64(%rbp)        # 16-byte Spill
	movups	32(%rbx,%rdx,4), %xmm0
	movaps	%xmm0, -96(%rbp)        # 16-byte Spill
	movups	48(%rbx,%rdx,4), %xmm0
	movaps	%xmm0, -112(%rbp)       # 16-byte Spill
	movups	(%rbx,%rsi,4), %xmm11
	movups	16(%rbx,%rsi,4), %xmm0
	movaps	%xmm0, -160(%rbp)       # 16-byte Spill
	movups	32(%rbx,%rsi,4), %xmm12
	movups	48(%rbx,%rsi,4), %xmm0
	movaps	%xmm0, -128(%rbp)       # 16-byte Spill
	movups	(%rbx,%rcx,4), %xmm9
	movups	16(%rbx,%rcx,4), %xmm13
	movups	32(%rbx,%rcx,4), %xmm2
	movups	48(%rbx,%rcx,4), %xmm3
	movq	%r9, %rbx               # rbx = B row pointer for k = 0
	movl	$0, %r13d               # k = 0
	.p2align	4, 0x90
.LBB2_5:                                # %vector.ph
                                        #   Parent Loop BB2_1 Depth=1
                                        #     Parent Loop BB2_2 Depth=2
                                        #       Parent Loop BB2_3 Depth=3
                                        #         Parent Loop BB2_4 Depth=4
                                        # =>        This Inner Loop Header: Depth=5
	# Each group of four vectors goes through the same unpck/shufps lane
	# transpose on both the accumulators and the B row, is multiplied by
	# the broadcast A element and added, then is transposed back.
	movaps	%xmm12, -240(%rbp)      # 16-byte Spill
	movaps	%xmm2, -256(%rbp)       # 16-byte Spill
	movaps	%xmm3, -272(%rbp)       # 16-byte Spill
	# ---- group 0: accumulators for columns j0+0 .. j0+15 ----
	movaps	%xmm8, %xmm10
	movaps	-144(%rbp), %xmm7       # 16-byte Reload
	unpcklps	%xmm7, %xmm10   # xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
	movaps	%xmm1, %xmm4
	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
	shufps	$36, %xmm4, %xmm10      # xmm10 = xmm10[0,1],xmm4[2,0]
	movaps	%xmm7, %xmm5
	shufps	$17, %xmm8, %xmm5       # xmm5 = xmm5[1,0],xmm8[1,0]
	movaps	%xmm6, %xmm4
	unpcklps	%xmm1, %xmm4    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	shufps	$226, %xmm4, %xmm5      # xmm5 = xmm5[2,0],xmm4[2,3]
	movaps	%xmm8, %xmm12
	unpckhps	%xmm7, %xmm12   # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
	movaps	%xmm1, %xmm4
	shufps	$34, %xmm6, %xmm4       # xmm4 = xmm4[2,0],xmm6[2,0]
	shufps	$36, %xmm4, %xmm12      # xmm12 = xmm12[0,1],xmm4[2,0]
	shufps	$51, %xmm8, %xmm7       # xmm7 = xmm7[3,0],xmm8[3,0]
	unpckhps	%xmm1, %xmm6    # xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
	shufps	$226, %xmm6, %xmm7      # xmm7 = xmm7[2,0],xmm6[2,3]
	# B[k0+k][j0+0 .. j0+15]
	movaps	-160(%rbx), %xmm0
	movaps	-144(%rbx), %xmm1
	movaps	%xmm1, %xmm6
	shufps	$0, %xmm0, %xmm6        # xmm6 = xmm6[0,0],xmm0[0,0]
	movaps	-192(%rbx), %xmm3
	movaps	-176(%rbx), %xmm4
	movaps	%xmm3, %xmm8
	unpcklps	%xmm4, %xmm8    # xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
	shufps	$36, %xmm6, %xmm8       # xmm8 = xmm8[0,1],xmm6[2,0]
	movaps	%xmm0, %xmm2
	unpcklps	%xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	movaps	%xmm4, %xmm6
	shufps	$17, %xmm3, %xmm6       # xmm6 = xmm6[1,0],xmm3[1,0]
	shufps	$226, %xmm2, %xmm6      # xmm6 = xmm6[2,0],xmm2[2,3]
	movaps	%xmm1, %xmm2
	shufps	$34, %xmm0, %xmm2       # xmm2 = xmm2[2,0],xmm0[2,0]
	movaps	%xmm3, %xmm14
	unpckhps	%xmm4, %xmm14   # xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
	shufps	$36, %xmm2, %xmm14      # xmm14 = xmm14[0,1],xmm2[2,0]
	unpckhps	%xmm1, %xmm0    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
	shufps	$51, %xmm3, %xmm4       # xmm4 = xmm4[3,0],xmm3[3,0]
	shufps	$226, %xmm0, %xmm4      # xmm4 = xmm4[2,0],xmm0[2,3]
	# Broadcast A[i][k0+k] to all four lanes.
	movss	(%r12,%r13,4), %xmm0    # xmm0 = mem[0],zero,zero,zero
	shufps	$0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
	mulps	%xmm0, %xmm8
	addps	%xmm10, %xmm8
	mulps	%xmm0, %xmm6
	addps	%xmm5, %xmm6
	mulps	%xmm0, %xmm14
	addps	%xmm12, %xmm14
	mulps	%xmm0, %xmm4
	movaps	%xmm0, %xmm5            # keep the broadcast value in xmm5
	addps	%xmm7, %xmm4
	# Transpose group 0 back.
	movaps	%xmm14, %xmm0
	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm6, %xmm1
	shufps	$51, %xmm8, %xmm1       # xmm1 = xmm1[3,0],xmm8[3,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, -304(%rbp)       # 16-byte Spill
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm14, %xmm0      # xmm0 = xmm0[2,0],xmm14[2,0]
	movaps	%xmm8, %xmm1
	unpckhps	%xmm6, %xmm1    # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
	shufps	$36, %xmm0, %xmm1       # xmm1 = xmm1[0,1],xmm0[2,0]
	movaps	%xmm1, -288(%rbp)       # 16-byte Spill
	movaps	%xmm14, %xmm0
	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm6, %xmm1
	shufps	$17, %xmm8, %xmm1       # xmm1 = xmm1[1,0],xmm8[1,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, -144(%rbp)       # 16-byte Spill
	shufps	$0, %xmm14, %xmm4       # xmm4 = xmm4[0,0],xmm14[0,0]
	unpcklps	%xmm6, %xmm8    # xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
	shufps	$36, %xmm4, %xmm8       # xmm8 = xmm8[0,1],xmm4[2,0]
	# ---- group 1: accumulators for columns j0+16 .. j0+31 ----
	movaps	%xmm15, %xmm14
	movaps	-64(%rbp), %xmm4        # 16-byte Reload
	unpcklps	%xmm4, %xmm14   # xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
	movaps	-112(%rbp), %xmm1       # 16-byte Reload
	movaps	%xmm1, %xmm0
	movaps	-96(%rbp), %xmm3        # 16-byte Reload
	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm14      # xmm14 = xmm14[0,1],xmm0[2,0]
	movaps	%xmm4, %xmm12
	shufps	$17, %xmm15, %xmm12     # xmm12 = xmm12[1,0],xmm15[1,0]
	movaps	%xmm3, %xmm2
	unpcklps	%xmm1, %xmm2    # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
	shufps	$226, %xmm2, %xmm12     # xmm12 = xmm12[2,0],xmm2[2,3]
	movaps	%xmm15, %xmm7
	unpckhps	%xmm4, %xmm7    # xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
	movaps	%xmm1, %xmm2
	shufps	$34, %xmm3, %xmm2       # xmm2 = xmm2[2,0],xmm3[2,0]
	shufps	$36, %xmm2, %xmm7       # xmm7 = xmm7[0,1],xmm2[2,0]
	shufps	$51, %xmm15, %xmm4      # xmm4 = xmm4[3,0],xmm15[3,0]
	unpckhps	%xmm1, %xmm3    # xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
	shufps	$226, %xmm3, %xmm4      # xmm4 = xmm4[2,0],xmm3[2,3]
	movaps	%xmm4, -64(%rbp)        # 16-byte Spill
	# B[k0+k][j0+16 .. j0+31]
	movaps	-96(%rbx), %xmm2
	movaps	-80(%rbx), %xmm1
	movaps	%xmm1, %xmm4
	shufps	$0, %xmm2, %xmm4        # xmm4 = xmm4[0,0],xmm2[0,0]
	movaps	-112(%rbx), %xmm10
	movaps	-128(%rbx), %xmm0
	movaps	%xmm0, %xmm15
	unpcklps	%xmm10, %xmm15  # xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
	shufps	$36, %xmm4, %xmm15      # xmm15 = xmm15[0,1],xmm4[2,0]
	movaps	%xmm2, %xmm4
	unpcklps	%xmm1, %xmm4    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
	movaps	%xmm10, %xmm6
	shufps	$17, %xmm0, %xmm6       # xmm6 = xmm6[1,0],xmm0[1,0]
	shufps	$226, %xmm4, %xmm6      # xmm6 = xmm6[2,0],xmm4[2,3]
	movaps	%xmm1, %xmm3
	shufps	$34, %xmm2, %xmm3       # xmm3 = xmm3[2,0],xmm2[2,0]
	movaps	%xmm0, %xmm4
	unpckhps	%xmm10, %xmm4   # xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
	shufps	$36, %xmm3, %xmm4       # xmm4 = xmm4[0,1],xmm3[2,0]
	unpckhps	%xmm1, %xmm2    # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
	shufps	$51, %xmm0, %xmm10      # xmm10 = xmm10[3,0],xmm0[3,0]
	shufps	$226, %xmm2, %xmm10     # xmm10 = xmm10[2,0],xmm2[2,3]
	movaps	%xmm5, -224(%rbp)       # 16-byte Spill
	mulps	%xmm5, %xmm15
	addps	%xmm14, %xmm15
	mulps	%xmm5, %xmm6
	addps	%xmm12, %xmm6
	mulps	%xmm5, %xmm4
	addps	%xmm7, %xmm4
	mulps	%xmm5, %xmm10
	addps	-64(%rbp), %xmm10       # 16-byte Folded Reload
	# Transpose group 1 back.
	movaps	%xmm4, %xmm0
	unpckhps	%xmm10, %xmm0   # xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
	movaps	%xmm6, %xmm1
	shufps	$51, %xmm15, %xmm1      # xmm1 = xmm1[3,0],xmm15[3,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, -112(%rbp)       # 16-byte Spill
	movaps	%xmm10, %xmm0
	shufps	$34, %xmm4, %xmm0       # xmm0 = xmm0[2,0],xmm4[2,0]
	movaps	%xmm15, %xmm1
	unpckhps	%xmm6, %xmm1    # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
	shufps	$36, %xmm0, %xmm1       # xmm1 = xmm1[0,1],xmm0[2,0]
	movaps	%xmm1, -96(%rbp)        # 16-byte Spill
	movaps	%xmm4, %xmm0
	unpcklps	%xmm10, %xmm0   # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
	movaps	%xmm6, %xmm1
	shufps	$17, %xmm15, %xmm1      # xmm1 = xmm1[1,0],xmm15[1,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, -64(%rbp)        # 16-byte Spill
	shufps	$0, %xmm4, %xmm10       # xmm10 = xmm10[0,0],xmm4[0,0]
	unpcklps	%xmm6, %xmm15   # xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
	shufps	$36, %xmm10, %xmm15     # xmm15 = xmm15[0,1],xmm10[2,0]
	# ---- group 2: accumulators for columns j0+32 .. j0+47 ----
	movaps	%xmm11, %xmm10
	movaps	-160(%rbp), %xmm14      # 16-byte Reload
	unpcklps	%xmm14, %xmm10  # xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
	movaps	-128(%rbp), %xmm2       # 16-byte Reload
	movaps	%xmm2, %xmm0
	movaps	-240(%rbp), %xmm3       # 16-byte Reload
	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm10      # xmm10 = xmm10[0,1],xmm0[2,0]
	movaps	%xmm14, %xmm12
	shufps	$17, %xmm11, %xmm12     # xmm12 = xmm12[1,0],xmm11[1,0]
	movaps	%xmm3, %xmm0
	unpcklps	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	shufps	$226, %xmm0, %xmm12     # xmm12 = xmm12[2,0],xmm0[2,3]
	movaps	%xmm11, %xmm0
	unpckhps	%xmm14, %xmm0   # xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
	movaps	%xmm2, %xmm1
	shufps	$34, %xmm3, %xmm1       # xmm1 = xmm1[2,0],xmm3[2,0]
	shufps	$36, %xmm1, %xmm0       # xmm0 = xmm0[0,1],xmm1[2,0]
	shufps	$51, %xmm11, %xmm14     # xmm14 = xmm14[3,0],xmm11[3,0]
	unpckhps	%xmm2, %xmm3    # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
	shufps	$226, %xmm3, %xmm14     # xmm14 = xmm14[2,0],xmm3[2,3]
	# B[k0+k][j0+32 .. j0+47]
	movaps	-32(%rbx), %xmm1
	movaps	-16(%rbx), %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0, %xmm1, %xmm3        # xmm3 = xmm3[0,0],xmm1[0,0]
	movaps	-48(%rbx), %xmm4
	movaps	-64(%rbx), %xmm5
	movaps	%xmm5, %xmm11
	unpcklps	%xmm4, %xmm11   # xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
	shufps	$36, %xmm3, %xmm11      # xmm11 = xmm11[0,1],xmm3[2,0]
	movaps	%xmm1, %xmm3
	unpcklps	%xmm2, %xmm3    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movaps	%xmm4, %xmm7
	shufps	$17, %xmm5, %xmm7       # xmm7 = xmm7[1,0],xmm5[1,0]
	shufps	$226, %xmm3, %xmm7      # xmm7 = xmm7[2,0],xmm3[2,3]
	movaps	%xmm2, %xmm3
	shufps	$34, %xmm1, %xmm3       # xmm3 = xmm3[2,0],xmm1[2,0]
	movaps	%xmm5, %xmm6
	unpckhps	%xmm4, %xmm6    # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
	shufps	$36, %xmm3, %xmm6       # xmm6 = xmm6[0,1],xmm3[2,0]
	unpckhps	%xmm2, %xmm1    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
	shufps	$51, %xmm5, %xmm4       # xmm4 = xmm4[3,0],xmm5[3,0]
	shufps	$226, %xmm1, %xmm4      # xmm4 = xmm4[2,0],xmm1[2,3]
	movaps	-224(%rbp), %xmm1       # 16-byte Reload
	mulps	%xmm1, %xmm11
	addps	%xmm10, %xmm11
	mulps	%xmm1, %xmm7
	addps	%xmm12, %xmm7
	mulps	%xmm1, %xmm6
	addps	%xmm0, %xmm6
	mulps	%xmm1, %xmm4
	addps	%xmm14, %xmm4
	# Transpose group 2 back.
	movaps	%xmm6, %xmm0
	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm7, %xmm1
	shufps	$51, %xmm11, %xmm1      # xmm1 = xmm1[3,0],xmm11[3,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, -128(%rbp)       # 16-byte Spill
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm6, %xmm0       # xmm0 = xmm0[2,0],xmm6[2,0]
	movaps	%xmm11, %xmm12
	unpckhps	%xmm7, %xmm12   # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
	shufps	$36, %xmm0, %xmm12      # xmm12 = xmm12[0,1],xmm0[2,0]
	movaps	%xmm6, %xmm0
	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm7, %xmm1
	shufps	$17, %xmm11, %xmm1      # xmm1 = xmm1[1,0],xmm11[1,0]
	shufps	$226, %xmm0, %xmm1      # xmm1 = xmm1[2,0],xmm0[2,3]
	movaps	%xmm1, -160(%rbp)       # 16-byte Spill
	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
	unpcklps	%xmm7, %xmm11   # xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
	shufps	$36, %xmm4, %xmm11      # xmm11 = xmm11[0,1],xmm4[2,0]
	# ---- group 3: accumulators for columns j0+48 .. j0+63 ----
	movaps	%xmm9, %xmm10
	unpcklps	%xmm13, %xmm10  # xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
	movaps	-272(%rbp), %xmm2       # 16-byte Reload
	movaps	%xmm2, %xmm0
	movaps	-256(%rbp), %xmm3       # 16-byte Reload
	shufps	$0, %xmm3, %xmm0        # xmm0 = xmm0[0,0],xmm3[0,0]
	shufps	$36, %xmm0, %xmm10      # xmm10 = xmm10[0,1],xmm0[2,0]
	movaps	%xmm13, %xmm14
	shufps	$17, %xmm9, %xmm14      # xmm14 = xmm14[1,0],xmm9[1,0]
	movaps	%xmm3, %xmm0
	unpcklps	%xmm2, %xmm0    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
	shufps	$226, %xmm0, %xmm14     # xmm14 = xmm14[2,0],xmm0[2,3]
	movaps	%xmm9, %xmm0
	unpckhps	%xmm13, %xmm0   # xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
	movaps	%xmm2, %xmm1
	shufps	$34, %xmm3, %xmm1       # xmm1 = xmm1[2,0],xmm3[2,0]
	shufps	$36, %xmm1, %xmm0       # xmm0 = xmm0[0,1],xmm1[2,0]
	shufps	$51, %xmm9, %xmm13      # xmm13 = xmm13[3,0],xmm9[3,0]
	unpckhps	%xmm2, %xmm3    # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
	shufps	$226, %xmm3, %xmm13     # xmm13 = xmm13[2,0],xmm3[2,3]
	# B[k0+k][j0+48 .. j0+63]
	movaps	32(%rbx), %xmm1
	movaps	48(%rbx), %xmm2
	movaps	%xmm2, %xmm3
	shufps	$0, %xmm1, %xmm3        # xmm3 = xmm3[0,0],xmm1[0,0]
	movaps	16(%rbx), %xmm4
	movaps	(%rbx), %xmm5
	movaps	%xmm5, %xmm9
	unpcklps	%xmm4, %xmm9    # xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
	shufps	$36, %xmm3, %xmm9       # xmm9 = xmm9[0,1],xmm3[2,0]
	movaps	%xmm1, %xmm3
	unpcklps	%xmm2, %xmm3    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
	movaps	%xmm4, %xmm7
	shufps	$17, %xmm5, %xmm7       # xmm7 = xmm7[1,0],xmm5[1,0]
	shufps	$226, %xmm3, %xmm7      # xmm7 = xmm7[2,0],xmm3[2,3]
	movaps	%xmm2, %xmm3
	shufps	$34, %xmm1, %xmm3       # xmm3 = xmm3[2,0],xmm1[2,0]
	movaps	%xmm5, %xmm6
	unpckhps	%xmm4, %xmm6    # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
	shufps	$36, %xmm3, %xmm6       # xmm6 = xmm6[0,1],xmm3[2,0]
	unpckhps	%xmm2, %xmm1    # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
	shufps	$51, %xmm5, %xmm4       # xmm4 = xmm4[3,0],xmm5[3,0]
	shufps	$226, %xmm1, %xmm4      # xmm4 = xmm4[2,0],xmm1[2,3]
	movaps	-224(%rbp), %xmm1       # 16-byte Reload
	mulps	%xmm1, %xmm9
	addps	%xmm10, %xmm9
	mulps	%xmm1, %xmm7
	addps	%xmm14, %xmm7
	mulps	%xmm1, %xmm6
	addps	%xmm0, %xmm6
	mulps	%xmm1, %xmm4
	addps	%xmm13, %xmm4
	# Transpose group 3 back.
	movaps	%xmm6, %xmm0
	unpckhps	%xmm4, %xmm0    # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
	movaps	%xmm7, %xmm3
	shufps	$51, %xmm9, %xmm3       # xmm3 = xmm3[3,0],xmm9[3,0]
	shufps	$226, %xmm0, %xmm3      # xmm3 = xmm3[2,0],xmm0[2,3]
	movaps	%xmm4, %xmm0
	shufps	$34, %xmm6, %xmm0       # xmm0 = xmm0[2,0],xmm6[2,0]
	movaps	%xmm9, %xmm2
	unpckhps	%xmm7, %xmm2    # xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
	shufps	$36, %xmm0, %xmm2       # xmm2 = xmm2[0,1],xmm0[2,0]
	movaps	%xmm6, %xmm0
	unpcklps	%xmm4, %xmm0    # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
	movaps	%xmm7, %xmm13
	shufps	$17, %xmm9, %xmm13      # xmm13 = xmm13[1,0],xmm9[1,0]
	shufps	$226, %xmm0, %xmm13     # xmm13 = xmm13[2,0],xmm0[2,3]
	shufps	$0, %xmm6, %xmm4        # xmm4 = xmm4[0,0],xmm6[0,0]
	# Bring the staged group-0 results back into xmm6/xmm1 for the next pass.
	movaps	-288(%rbp), %xmm6       # 16-byte Reload
	movaps	-304(%rbp), %xmm1       # 16-byte Reload
	unpcklps	%xmm7, %xmm9    # xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
	shufps	$36, %xmm4, %xmm9       # xmm9 = xmm9[0,1],xmm4[2,0]
	addq	$1, %r13
	addq	$6144, %rbx             # imm = 0x1800
	cmpq	$64, %r13
	jne	.LBB2_5
# %bb.6:                                # %polly.loop_exit34
                                        #   in Loop: Header=BB2_4 Depth=4
	# Write the 16 accumulators back to C[i][j0 .. j0+63].
	movups	%xmm8, (%r8)
	movaps	-144(%rbp), %xmm0       # 16-byte Reload
	movups	%xmm0, 16(%r8)
	movups	%xmm6, 32(%r8)
	movups	%xmm1, 48(%r8)
	movaps	-112(%rbp), %xmm0       # 16-byte Reload
	movups	%xmm0, 48(%r15)
	movaps	-96(%rbp), %xmm0        # 16-byte Reload
	movups	%xmm0, 32(%r15)
	movaps	-64(%rbp), %xmm0        # 16-byte Reload
	movups	%xmm0, 16(%r15)
	movups	%xmm15, (%r15)
	movaps	-128(%rbp), %xmm0       # 16-byte Reload
	movups	%xmm0, 48(%r10)
	movaps	-160(%rbp), %xmm0       # 16-byte Reload
	movups	%xmm0, 16(%r10)
	movups	%xmm11, (%r10)
	movups	%xmm12, 32(%r10)
	movups	%xmm3, 48(%r11)
	movups	%xmm13, 16(%r11)
	movups	%xmm9, (%r11)
	movups	%xmm2, 32(%r11)
	addq	$1, %r14                # next row i
	addq	$6144, %r12             # imm = 0x1800
	cmpq	-80(%rbp), %r14         # 8-byte Folded Reload
	jne	.LBB2_4
# %bb.7:                                # %polly.loop_exit28
                                        #   in Loop: Header=BB2_3 Depth=3
	movq	-192(%rbp), %rax        # 8-byte Reload
	addq	$64, %rax               # next k tile
	addq	$393216, %r9            # imm = 0x60000
	movq	-200(%rbp), %r12        # 8-byte Reload
	addq	$256, %r12              # imm = 0x100
	cmpq	$1536, %rax             # imm = 0x600
	jb	.LBB2_3
# %bb.8:                                # %polly.loop_exit22
                                        #   in Loop: Header=BB2_2 Depth=2
	movq	-168(%rbp), %rax        # 8-byte Reload
	addq	$64, %rax               # next column tile
	movq	-176(%rbp), %rdi        # 8-byte Reload
	addq	$1, %rdi
	movq	-184(%rbp), %r9         # 8-byte Reload
	addq	$256, %r9               # imm = 0x100
	cmpq	$1536, %rax             # imm = 0x600
	jb	.LBB2_2
# %bb.9:                                # %polly.loop_exit16
                                        #   in Loop: Header=BB2_1 Depth=1
	movq	-48(%rbp), %rax         # 8-byte Reload
	movq	%rax, %rcx
	addq	$64, %rcx               # next row tile
	addq	$64, -80(%rbp)          # 8-byte Folded Spill
	addq	$393216, -72(%rbp)      # 8-byte Folded Spill
                                        # imm = 0x60000
	movq	%rcx, %rax
	movq	%rcx, -48(%rbp)         # 8-byte Spill
	cmpq	$1536, %rcx             # imm = 0x600
	jb	.LBB2_1
# %bb.10:                               # %polly.exiting
	xorl	%eax, %eax              # return 0
	addq	$264, %rsp              # imm = 0x108
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15
	popq	%rbp
	.cfi_def_cfa %rsp, 8
	retq
.Lfunc_end2:
	.size	main, .Lfunc_end2-main
	.cfi_endproc
                                        # -- End function
	.type	A,@object               # @A
	.comm	A,9437184,16
	.type	B,@object               # @B
	.comm	B,9437184,16
	.type	.L.str,@object          # @.str
	.section	.rodata.str1.1,"aMS",@progbits,1
.L.str:
	.asciz	"%lf "
	.size	.L.str, 5

	.type	C,@object               # @C
	.comm	C,9437184,16

	.ident	"clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
	.section	".note.GNU-stack","",@progbits