/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Contributed by: Intel Corporation
 */

/* Every helper below is only defined when the including file has not
   already supplied its own version.  */

/* File-local label: the .L prefix keeps it out of the symbol table.  */
#if !defined(L)
# define L(label)	.L##label
#endif

/* Align to a 2**n byte boundary.  */
#if !defined(ALIGN)
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the assembler's call-frame-information directives.  */
#if !defined(cfi_startproc)
# define cfi_startproc			.cfi_startproc
#endif

#if !defined(cfi_endproc)
# define cfi_endproc			.cfi_endproc
#endif

#if !defined(cfi_rel_offset)
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#if !defined(cfi_restore)
# define cfi_restore(reg)		.cfi_restore reg
#endif

#if !defined(cfi_adjust_cfa_offset)
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* Open a global function symbol: 16-byte aligned, typed as a function,
   with CFI started.  */
#if !defined(ENTRY)
# define ENTRY(name)			\
	.globl name;			\
	.type name, @function;		\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* Close a function opened with ENTRY and record its size.  */
#if !defined(END)
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* CFI bookkeeping for a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop REG and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Offsets of the stack arguments, for use as OFFSET(%esp).  PARMS is not
   defined here; it must be defined before any of these is expanded.
   The destination always comes first.  With USE_AS_BZERO32 the length
   follows it directly; otherwise the fill value (DWDS) sits between the
   destination and the length.  */
#define DEST		PARMS
#ifdef USE_AS_BZERO32
# define LEN		DEST+4
#else
# define DWDS		DEST+4
# define LEN		DWDS+4
#endif

/* With USE_AS_WMEMSET32, SETRTNVAL reloads the destination argument into
   EAX; in every other build it expands to nothing.  */
#ifdef USE_AS_WMEMSET32
# define SETRTNVAL	movl DEST(%esp), %eax
#else
# define SETRTNVAL
#endif

/* Build-mode configuration.

   SHARED (position-independent) builds use EBX to locate the jump
   tables and the cache-size variables, so EBX is saved on entry (which
   moves the first argument to 8(%esp)) and restored by RETURN.  Jump
   tables then hold offsets relative to the table itself.

   Non-SHARED builds leave EBX alone on entry (first argument at
   4(%esp)) and use jump tables holding absolute addresses.  */
#ifdef SHARED
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* The CFI_PUSH after the ret re-establishes the "EBX is pushed" unwind
   state for whatever code follows the return.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the entry index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    /* We loaded the jump table and adjusted EDX. Go.  */	\
    jmp		*%ebx

/* Returns the caller's return address (i.e. the address of the
   instruction after the call) in EBX.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the entry index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    jmp		*TABLE(,%ecx,4)
#endif

/* In SHARED builds SHARED_CACHE_SIZE selects whether EBX is pushed at
   L(128bytesormore), while DATA_CACHE_SIZE selects whether it is popped
   again (on both the normal and the non-temporal path).  Defining only
   one of them therefore leaves the stack unbalanced and the final ret
   would pop the wrong word.  Reject that configuration at build time.  */
#if defined SHARED \
    && ((defined SHARED_CACHE_SIZE && !defined DATA_CACHE_SIZE) \
	|| (!defined SHARED_CACHE_SIZE && defined DATA_CACHE_SIZE))
# error "SHARED builds need SHARED_CACHE_SIZE and DATA_CACHE_SIZE both defined or both undefined"
#endif

/*
 * sse2_memset32_atom: fill memory with a 32-bit pattern using SSE2 stores.
 *
 * Target:   32-bit x86, AT&T syntax, arguments passed on the stack.
 * In:       DEST(%esp)  destination address
 *           DWDS(%esp)  32-bit fill value (not read when USE_AS_BZERO32 is
 *                       defined; the fill value is then 0)
 *           LEN(%esp)   number of 32-bit words to write; when
 *                       USE_AS_ANDROID is defined the value is shifted
 *                       right by 2 first, i.e. it is taken as a byte count
 * Out:      EAX = destination address when USE_AS_WMEMSET32 is defined;
 *           otherwise no return value is set up.
 * Clobbers: EAX, ECX, EDX, XMM0, flags.  EBX is saved and restored
 *           wherever it is used.
 *
 * Register roles in the body:
 *   EAX  = fill pattern (rotated if DEST is not 4-byte aligned)
 *   ECX  = remaining count: dwords until L(aligned4bytes), bytes after
 *          it, dwords again right before each jump-table dispatch
 *   EDX  = current destination; before a jump-table dispatch it is moved
 *          to one past the END of the region, and the tail code stores
 *          at negative offsets from it
 *   XMM0 = fill pattern replicated into all four dwords
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (sse2_memset32_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx
#ifdef USE_AS_ANDROID
	shr	$2, %ecx
#endif
#ifdef USE_AS_BZERO32
	xor	%eax, %eax
#else
	mov	DWDS(%esp), %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$16, %ecx
	jae	L(16dbwordsormore)

/* Fewer than 16 dwords: point EDX past the end and jump into the
   straight-line store sequence at the entry matching the count.  */
L(write_less16dbwords):
	lea	(%edx, %ecx, 4), %edx
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords))

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

/* Fall-through chain: entering at write_Ndbwords stores the last N
   dwords before EDX.  */
	ALIGN (4)
L(write_15dbwords):
	movl	%eax, -60(%edx)
L(write_14dbwords):
	movl	%eax, -56(%edx)
L(write_13dbwords):
	movl	%eax, -52(%edx)
L(write_12dbwords):
	movl	%eax, -48(%edx)
L(write_11dbwords):
	movl	%eax, -44(%edx)
L(write_10dbwords):
	movl	%eax, -40(%edx)
L(write_9dbwords):
	movl	%eax, -36(%edx)
L(write_8dbwords):
	movl	%eax, -32(%edx)
L(write_7dbwords):
	movl	%eax, -28(%edx)
L(write_6dbwords):
	movl	%eax, -24(%edx)
L(write_5dbwords):
	movl	%eax, -20(%edx)
L(write_4dbwords):
	movl	%eax, -16(%edx)
L(write_3dbwords):
	movl	%eax, -12(%edx)
L(write_2dbwords):
	movl	%eax, -8(%edx)
L(write_1dbwords):
	movl	%eax, -4(%edx)
L(write_0dbwords):
	SETRTNVAL
	RETURN

/* 16 or more dwords.  If EDX is not 4-byte aligned, store the first and
   the last dword unaligned, drop one dword from the count, then step EDX
   one byte at a time up to the next 4-byte boundary.  EAX is rotated by
   one byte per step (rol 24 == ror 8) so that the aligned stores that
   follow continue the same byte sequence in memory.  */
	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edx
	jz	L(aligned4bytes)
	mov	%eax, (%edx)
	mov	%eax, -4(%edx, %ecx, 4)
	sub	$1, %ecx
	rol	$24, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
L(aligned4bytes):
	/* From here on ECX counts bytes.  */
	shl	$2, %ecx

#ifdef USE_AS_BZERO32
	pxor	%xmm0, %xmm0
#else
	/* Replicate the (possibly rotated) pattern into all of XMM0.  */
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX > 32 and EDX is not 16 byte aligned.  */
L(not_aligned_16):
	/* Cover the unaligned head with one 16-byte store, round EDX up to
	   the next 16-byte boundary and shrink ECX by the bytes skipped
	   (EAX = old EDX - new EDX, a negative value).  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax
	add	%eax, %ecx
	/* EAX was used as scratch; reload the pattern from XMM0.  */
	movd	%xmm0, %eax
	ALIGN (4)
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	/* EDX -> end of region, ECX -> remaining dwords (table index).  */
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* At least 128 bytes, EDX 16-byte aligned.  Load the shared cache size
   into EBX and choose a strategy by comparing ECX against it and then
   against the data cache size.  */
	ALIGN (4)
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# ifdef SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

/* RESTORE_EBX_STATE is the CFI adjustment needed at
   L(128bytesormore_nt_start), which is reached with EBX still pushed
   whenever the code below pops it.  */
#ifdef DATA_CACHE_SIZE
	POP (%ebx)
# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# ifdef SHARED
#  define RESTORE_EBX_STATE
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	POP (%ebx)
#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	jae	L(128bytes_L2_normal)
	/* Below the data cache size.  ECX is biased by -128 so that a borrow
	   from the sub inside the loop means fewer than 128 bytes remain
	   after the block being stored; lea is used to advance EDX because
	   it leaves that carry flag intact for the branch.  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Undo the -128 bias, then dispatch on the remaining dwords.  */
	lea	128(%ecx), %ecx
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* At or above the data cache size but below the shared cache size:
   128 bytes per iteration with prefetches ahead of the stores.  */
	ALIGN (4)
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* At or above the shared cache size.  The first loop below writes EBX
   rounded down to a multiple of 128 bytes with ordinary stores, so ECX
   is reduced by EBX and the remainder (EBX & 0x7f) is added back.  What
   is left is written with non-temporal stores.  */
	RESTORE_EBX_STATE
L(128bytesormore_nt_start):
	sub	%ebx, %ecx
	mov	%ebx, %eax
	and	$0x7f, %eax
	add	%eax, %ecx
	/* EAX was used as scratch; reload the pattern from XMM0.  */
	movd	%xmm0, %eax
	ALIGN (4)
/* NOTE(review): the body runs once before EBX is compared, and EBX is
   decremented unconditionally, so this appears to assume the shared
   cache size is at least 128 - confirm for run-time supplied values.  */
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)

	ALIGN (4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	/* Order the non-temporal stores before anything that follows.  */
	sfence
L(shared_cache_loop_end):
#if defined DATA_CACHE_SIZE || !defined SHARED
	POP (%ebx)
#endif
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* Tail dispatch for 0..124 remaining bytes, indexed by remaining dwords.
   EDX points one past the end of the region.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

/* Four fall-through chains, one per value of (remaining bytes mod 16).
   Each chain issues 16-byte stores from the start of the remainder and
   finishes the last 0, 4, 8 or 12 bytes with narrower stores.  */

	/* Remainder is a multiple of 16.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	/* Remainder is 16k + 4: last dword written from EAX.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* Remainder is 16k + 8: last 8 bytes written from the low half of
	   XMM0.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	/* Remainder is 16k + 12: 8 bytes from XMM0, then 4 from EAX.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

END (sse2_memset32_atom)