/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Contributed by: Intel Corporation
 */

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), preprocessed with cpp.
 * Target: x86-64 with SSE2.
 */

#include "cache.h"

#ifndef L
# define L(label)       .L##label
#endif

#ifndef ALIGN
# define ALIGN(n)       .p2align n
#endif

#ifndef cfi_startproc
# define cfi_startproc  .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc    .cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)            \
        .type name, @function;  \
        .globl name;            \
        .p2align 4;             \
name:                           \
        cfi_startproc
#endif

#ifndef END
# define END(name)              \
        cfi_endproc;            \
        .size name, .-name
#endif

/* A jump-table entry: the offset of label I relative to table base B.  */
#define JMPTBL(I, B)    I - B

/* Branch to an entry in a jump table.  TABLE is a jump table with
   relative offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.
   Clobbers %r11 (table base) and INDEX (becomes the branch target).  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)     \
        lea     TABLE(%rip), %r11;                      \
        movslq  (%r11, INDEX, SCALE), INDEX;            \
        lea     (%r11, INDEX), INDEX;                   \
        jmp     *INDEX

/*
 * android_memset32 -- fill memory with a repeated 32-bit pattern.
 *
 * NOTE(review): the register usage matches the first three System V
 * AMD64 integer arguments, so this is presumably called from C as
 * android_memset32(dst, value, size_in_bytes) -- confirm against the
 * C declaration.
 *
 * In:       %rdi = destination address
 *           %esi = 32-bit pattern
 *           %rdx = size; shifted right by 2 on entry to obtain the
 *                  number of dwords, so its low two bits are ignored
 * Out:      nothing; %rax is not written
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r11, %xmm0, flags
 * Stack:    not used
 *
 * Strategy:
 *   - fewer than 16 dwords: jump into a chain of 4-byte stores;
 *   - otherwise align the destination to 4 and then to 16 bytes,
 *     store 128-byte chunks with movdqa (or movntdq when the remaining
 *     size exceeds SHARED_CACHE_SIZE), and finish the last 0..124
 *     bytes through a second jump table.
 *
 * Sizes are unsigned, so every size comparison uses the unsigned
 * condition codes (jae / jb / ja).
 */
        .section .text.sse2,"ax",@progbits
        ALIGN (4)
ENTRY (android_memset32)                // Address in rdi
        shr     $2, %rdx                // Count in rdx
        movl    %esi, %ecx              // Pattern in ecx

        cmp     $16, %rdx
        jae     L(16dbwordsormore)

L(write_less16dbwords):
        /* rdi = one past the last dword; the stores below index backwards
           from it.  rdx (0..15) selects how many stores are executed.  */
        lea     (%rdi, %rdx, 4), %rdi
        BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)

        .pushsection .rodata.sse2,"a",@progbits
        ALIGN (2)
L(table_less16dbwords):
        .int    JMPTBL (L(write_0dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_1dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_2dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_3dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_4dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_5dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_6dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_7dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_8dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_9dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_10dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_11dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_12dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_13dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_14dbwords), L(table_less16dbwords))
        .int    JMPTBL (L(write_15dbwords), L(table_less16dbwords))
        .popsection

        /* Fall-through chain: entering at write_Ndbwords stores the last
           N dwords before rdi.  */
        ALIGN (4)
L(write_15dbwords):
        movl    %ecx, -60(%rdi)
L(write_14dbwords):
        movl    %ecx, -56(%rdi)
L(write_13dbwords):
        movl    %ecx, -52(%rdi)
L(write_12dbwords):
        movl    %ecx, -48(%rdi)
L(write_11dbwords):
        movl    %ecx, -44(%rdi)
L(write_10dbwords):
        movl    %ecx, -40(%rdi)
L(write_9dbwords):
        movl    %ecx, -36(%rdi)
L(write_8dbwords):
        movl    %ecx, -32(%rdi)
L(write_7dbwords):
        movl    %ecx, -28(%rdi)
L(write_6dbwords):
        movl    %ecx, -24(%rdi)
L(write_5dbwords):
        movl    %ecx, -20(%rdi)
L(write_4dbwords):
        movl    %ecx, -16(%rdi)
L(write_3dbwords):
        movl    %ecx, -12(%rdi)
L(write_2dbwords):
        movl    %ecx, -8(%rdi)
L(write_1dbwords):
        movl    %ecx, -4(%rdi)
L(write_0dbwords):
        ret

        ALIGN (4)
L(16dbwordsormore):
        test    $3, %edi
        jz      L(aligned4bytes)
        /* Destination is not 4-byte aligned: store the first and the last
           dword unaligned, drop one dword from the count, then advance
           rdi one byte at a time to the next 4-byte boundary.  Each byte
           of advance rotates the pattern right by 8 bits (rol 24 is the
           same rotation as ror 8), so the aligned stores that follow
           continue the same byte sequence.  */
        mov     %ecx, (%rdi)
        mov     %ecx, -4(%rdi, %rdx, 4)
        sub     $1, %rdx
        rol     $24, %ecx
        add     $1, %rdi
        test    $3, %edi
        jz      L(aligned4bytes)
        ror     $8, %ecx
        add     $1, %rdi
        test    $3, %edi
        jz      L(aligned4bytes)
        ror     $8, %ecx
        add     $1, %rdi
L(aligned4bytes):
        shl     $2, %rdx                /* rdx = remaining size in bytes */

        /* Fill xmm0 with the pattern.  */
        movd    %ecx, %xmm0
        pshufd  $0, %xmm0, %xmm0

        testl   $0xf, %edi
        jz      L(aligned_16)
/* RDX > 32 and RDI is not 16 byte aligned.  */
        movdqu  %xmm0, (%rdi)           /* cover the unaligned head */
        mov     %rdi, %rsi
        and     $-16, %rdi
        add     $16, %rdi               /* rdi = next 16-byte boundary */
        sub     %rdi, %rsi              /* rsi = -(bytes skipped) */
        add     %rsi, %rdx              /* rdx -= bytes skipped */

        ALIGN (4)
L(aligned_16):
        cmp     $128, %rdx
        jae     L(128bytesormore)

L(aligned_16_less128bytes):
        /* rdi = end of buffer; rdx / 4 indexes the tail table.  */
        add     %rdx, %rdi
        shr     $2, %rdx
        BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

        ALIGN (4)
L(128bytesormore):
        /* SHARED_CACHE_SIZE is not defined in this file (presumably it
           comes from cache.h).  Larger fills take the non-temporal path.  */
        cmp     $SHARED_CACHE_SIZE, %rdx
        ja      L(128bytesormore_nt)

        /* Main loop, unrolled four times: each step stores 128 bytes with
           aligned 16-byte stores and leaves rdx = bytes still to write.  */
L(128bytesormore_normal):
        sub     $128, %rdx
        movdqa  %xmm0, (%rdi)
        movdqa  %xmm0, 0x10(%rdi)
        movdqa  %xmm0, 0x20(%rdi)
        movdqa  %xmm0, 0x30(%rdi)
        movdqa  %xmm0, 0x40(%rdi)
        movdqa  %xmm0, 0x50(%rdi)
        movdqa  %xmm0, 0x60(%rdi)
        movdqa  %xmm0, 0x70(%rdi)
        lea     128(%rdi), %rdi
        cmp     $128, %rdx
        jb      L(128bytesless_normal)

        sub     $128, %rdx
        movdqa  %xmm0, (%rdi)
        movdqa  %xmm0, 0x10(%rdi)
        movdqa  %xmm0, 0x20(%rdi)
        movdqa  %xmm0, 0x30(%rdi)
        movdqa  %xmm0, 0x40(%rdi)
        movdqa  %xmm0, 0x50(%rdi)
        movdqa  %xmm0, 0x60(%rdi)
        movdqa  %xmm0, 0x70(%rdi)
        lea     128(%rdi), %rdi
        cmp     $128, %rdx
        jb      L(128bytesless_normal)

        sub     $128, %rdx
        movdqa  %xmm0, (%rdi)
        movdqa  %xmm0, 0x10(%rdi)
        movdqa  %xmm0, 0x20(%rdi)
        movdqa  %xmm0, 0x30(%rdi)
        movdqa  %xmm0, 0x40(%rdi)
        movdqa  %xmm0, 0x50(%rdi)
        movdqa  %xmm0, 0x60(%rdi)
        movdqa  %xmm0, 0x70(%rdi)
        lea     128(%rdi), %rdi
        cmp     $128, %rdx
        jb      L(128bytesless_normal)

        sub     $128, %rdx
        movdqa  %xmm0, (%rdi)
        movdqa  %xmm0, 0x10(%rdi)
        movdqa  %xmm0, 0x20(%rdi)
        movdqa  %xmm0, 0x30(%rdi)
        movdqa  %xmm0, 0x40(%rdi)
        movdqa  %xmm0, 0x50(%rdi)
        movdqa  %xmm0, 0x60(%rdi)
        movdqa  %xmm0, 0x70(%rdi)
        lea     128(%rdi), %rdi
        cmp     $128, %rdx
        jae     L(128bytesormore_normal)

L(128bytesless_normal):
        add     %rdx, %rdi
        shr     $2, %rdx
        BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

        /* Same 128-byte step using non-temporal stores.  */
        ALIGN (4)
L(128bytesormore_nt):
        sub     $128, %rdx
        movntdq %xmm0, (%rdi)
        movntdq %xmm0, 0x10(%rdi)
        movntdq %xmm0, 0x20(%rdi)
        movntdq %xmm0, 0x30(%rdi)
        movntdq %xmm0, 0x40(%rdi)
        movntdq %xmm0, 0x50(%rdi)
        movntdq %xmm0, 0x60(%rdi)
        movntdq %xmm0, 0x70(%rdi)
        lea     128(%rdi), %rdi
        cmp     $128, %rdx
        jae     L(128bytesormore_nt)

        sfence                          /* order the non-temporal stores before continuing */
        add     %rdx, %rdi
        shr     $2, %rdx
        BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

        /* Tail dispatch: entry k handles 4*k remaining bytes (0..124).  */
        .pushsection .rodata.sse2,"a",@progbits
        ALIGN (2)
L(table_16_128bytes):
        .int    JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
        .int    JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
        .popsection

        /* Tails are grouped by (remaining bytes mod 16).  rdi points one
           past the end of the buffer; each chain issues 16-byte stores at
           negative offsets and finishes with a 0/4/8/12-byte remainder.  */

        /* Remainder 0.  */
        ALIGN (4)
L(aligned_16_112bytes):
        movdqa  %xmm0, -112(%rdi)
L(aligned_16_96bytes):
        movdqa  %xmm0, -96(%rdi)
L(aligned_16_80bytes):
        movdqa  %xmm0, -80(%rdi)
L(aligned_16_64bytes):
        movdqa  %xmm0, -64(%rdi)
L(aligned_16_48bytes):
        movdqa  %xmm0, -48(%rdi)
L(aligned_16_32bytes):
        movdqa  %xmm0, -32(%rdi)
L(aligned_16_16bytes):
        movdqa  %xmm0, -16(%rdi)
L(aligned_16_0bytes):
        ret

        /* Remainder 4: one trailing dword.  */
        ALIGN (4)
L(aligned_16_116bytes):
        movdqa  %xmm0, -116(%rdi)
L(aligned_16_100bytes):
        movdqa  %xmm0, -100(%rdi)
L(aligned_16_84bytes):
        movdqa  %xmm0, -84(%rdi)
L(aligned_16_68bytes):
        movdqa  %xmm0, -68(%rdi)
L(aligned_16_52bytes):
        movdqa  %xmm0, -52(%rdi)
L(aligned_16_36bytes):
        movdqa  %xmm0, -36(%rdi)
L(aligned_16_20bytes):
        movdqa  %xmm0, -20(%rdi)
L(aligned_16_4bytes):
        movl    %ecx, -4(%rdi)
        ret

        /* Remainder 8: one trailing qword (low half of xmm0).  */
        ALIGN (4)
L(aligned_16_120bytes):
        movdqa  %xmm0, -120(%rdi)
L(aligned_16_104bytes):
        movdqa  %xmm0, -104(%rdi)
L(aligned_16_88bytes):
        movdqa  %xmm0, -88(%rdi)
L(aligned_16_72bytes):
        movdqa  %xmm0, -72(%rdi)
L(aligned_16_56bytes):
        movdqa  %xmm0, -56(%rdi)
L(aligned_16_40bytes):
        movdqa  %xmm0, -40(%rdi)
L(aligned_16_24bytes):
        movdqa  %xmm0, -24(%rdi)
L(aligned_16_8bytes):
        movq    %xmm0, -8(%rdi)
        ret

        /* Remainder 12: one trailing qword followed by one dword.  */
        ALIGN (4)
L(aligned_16_124bytes):
        movdqa  %xmm0, -124(%rdi)
L(aligned_16_108bytes):
        movdqa  %xmm0, -108(%rdi)
L(aligned_16_92bytes):
        movdqa  %xmm0, -92(%rdi)
L(aligned_16_76bytes):
        movdqa  %xmm0, -76(%rdi)
L(aligned_16_60bytes):
        movdqa  %xmm0, -60(%rdi)
L(aligned_16_44bytes):
        movdqa  %xmm0, -44(%rdi)
L(aligned_16_28bytes):
        movdqa  %xmm0, -28(%rdi)
L(aligned_16_12bytes):
        movq    %xmm0, -12(%rdi)
        movl    %ecx, -4(%rdi)
        ret

END (android_memset32)