/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove_generic
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Stack offsets of the three arguments, valid once ENTRANCE has pushed
   %ebx (PARMS accounts for the return address plus that one push).  */
#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

/* Unwind annotations matching one 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
/* RETURN re-establishes the "%ebx pushed" unwind state for whatever code
   follows the ret in the file.  */
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

/* MEMMOVE (dst, src, len) -- overlap-safe copy using SSE2.

   C-level view, as implied by the DEST/SRC/LEN stack slots:
	void *MEMMOVE (void *dst, const void *src, size_t len);

   In:   DEST(%esp), SRC(%esp), LEN(%esp) (32-bit stack arguments).
   Out:  %eax = dst.
   Saved and restored here: %ebx always, %esi/%edi on the paths that
   use them.  Overwritten: %eax, %ecx, %edx, %xmm0-%xmm7, flags.

   Register roles:
	%edx	dst, never modified (it becomes the return value)
	%ecx	length, later the number of bytes still to copy
	%eax	src; in the forward bulk path it is turned into src - dst
	%esi	backward bulk path: src - dst; forward tail: src cursor
	%edi	64-byte aligned dst cursor of the bulk loops
	%ebx	bulk loop bound (64-byte aligned dst address), else scratch

   Strategy: dst == src returns at once.  dst below src copies forward,
   dst above src copies backward.  Up to 128 bytes are handled by loading
   everything into registers before the first store, which makes those
   paths independent of direction.  Longer copies move an unaligned
   head/tail block, then run a 64-byte loop with aligned stores, and use
   non-temporal stores when len >= SHARED_CACHE_SIZE_HALF.  */

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  Addresses are
   unsigned quantities, so the unsigned condition is required: with a
   signed test, buffers lying on either side of 0x80000000 would be
   copied in the wrong direction.  */
	cmp	%eax, %edx
	je	L(mm_return)
	ja	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
	separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return (17..32 bytes reach this point): first and
   last 16 bytes, both loaded before anything is stored.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return (33..64 bytes reach this point).  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return (65..128 bytes reach this point).  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination.  The first 64 source bytes and
   the 64 bytes that map onto the first 64-byte aligned destination
   block are all loaded before the first store.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi		/* %edi = first aligned dst above %edx.  */
	subl	%edx, %eax		/* %eax = src - dst from here on.  */

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

/* %ebx = end of the destination rounded down to 64: the loop bound.  */
	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* We copied all up till %edi position in the dst.
	In %ecx now is how many bytes are left to copy.
	Now we need to advance %esi. */
	leal	(%edi, %eax), %esi

/* Tail of the forward bulk copy: %esi = src cursor, %edi = dst cursor,
   %ecx = bytes left.  %eax and %ebx are free to use as scratch here.  */
L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
/* 1 or 2 bytes left: copy the last and the first byte.  */
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

/* Unwind info only: everything from here up to the backward bulk copy
   is entered with just %ebx on the stack.  */
	CFI_POP (%edi)
	CFI_POP (%esi)

/* Forward copy of at most 16 bytes, dispatched on the bits of %cl.
   Bit 3 or 4 set means 8..16 bytes, bit 2 means 4..7, bit 1 means
   2..3; otherwise the length is 0 or 1.  */
L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
/* Exactly one byte: both loads address the same byte.  */
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  %ebx is the lowest destination address the
	backward loop wrote, so [%edx, %ebx) is still outstanding; it is
	copied by re-entering the backward length dispatch below.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  The length is unsigned, so the unsigned condition (ja)
	is used, exactly as in the forward path.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return (17..32 bytes reach this point).  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return (33..64 bytes reach this point).  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return (65..128 bytes reach this point).  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination. We need to save the last
	64 bytes of the source in order not to overwrite them.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi		/* %edi = dst end rounded down to 64.  */

	movl	%eax, %esi
	subl	%edx, %esi		/* %esi = src - dst; %eax stays src.  */

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

/* %ebx = first 64-byte aligned address above dst: the loop bound.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  Same bit tests on %cl as the forward
   variant, but the 8..16 case copies the top 8 bytes and loops.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
/* Exactly one byte: both loads address the same byte.  */
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

/* 8..16 bytes: move the top 8 bytes (loaded before being stored), drop
   them from the length and dispatch again on what is left (0..8).  */
L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

/* Common exit for paths that only pushed %ebx: return dst.  */
L(mm_return):
	movl	%edx, %eax
	RETURN

/* Unwind info only: the exit below is reached with %esi and %edi
   still on the stack.  */
	CFI_PUSH (%esi)
	CFI_PUSH (%edi)

/* Common exit for the forward bulk path: return dst.  */
L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Unwind info only: both large loops below are entered from the bulk
   paths with %esi and %edi still on the stack.  */
	CFI_PUSH (%esi)
	CFI_PUSH (%edi)

/* Big length copy forward part.  Same loop as mm_main_loop_forward but
   with non-temporal stores and no prefetch; the sfence runs before the
   tail is copied with ordinary stores.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  Non-temporal counterpart of
   mm_main_loop_backward.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)