/*
 * Copyright (C) 2017 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

#define PLDOFFS (16)
#define PLDSIZE (128)   /* L2 cache line size */

        .syntax unified

        // To avoid warning about deprecated instructions, add an explicit
        // arch. The code generated is exactly the same.
40 .arch armv7-a 41 42 .code 32 43ENTRY(__memcpy_kryo) 44 push {r0} 45 .cfi_def_cfa_offset 4 46 .cfi_rel_offset r0, 0 47 cmp r2, #4 48 blt .Lneon_lt4 49 cmp r2, #16 50 blt .Lneon_lt16 51 cmp r2, #32 52 blt .Lneon_16 53 cmp r2, #128 54 blt .Lneon_copy_32_a 55 /* Copy blocks of 128-bytes (word-aligned) at a time*/ 56 /* Code below is optimized for PLDSIZE=128 only */ 57 mov r12, r2, lsr #7 58 cmp r12, #PLDOFFS 59 ble .Lneon_copy_128_loop_nopld 60 sub r12, #PLDOFFS 61 pld [r1, #(PLDOFFS-1)*PLDSIZE] 62.Lneon_copy_128_loop_outer: 63 pld [r1, #(PLDOFFS*PLDSIZE)] 64 pld [r1, #(PLDOFFS)*(PLDSIZE)+64] 65 vld1.32 {q0, q1}, [r1]! 66 vld1.32 {q2, q3}, [r1]! 67 vld1.32 {q8, q9}, [r1]! 68 vld1.32 {q10, q11}, [r1]! 69 subs r12, r12, #1 70 vst1.32 {q0, q1}, [r0]! 71 vst1.32 {q2, q3}, [r0]! 72 vst1.32 {q8, q9}, [r0]! 73 vst1.32 {q10, q11}, [r0]! 74 bne .Lneon_copy_128_loop_outer 75 mov r12, #PLDOFFS 76.Lneon_copy_128_loop_nopld: 77 vld1.32 {q0, q1}, [r1]! 78 vld1.32 {q2, q3}, [r1]! 79 vld1.32 {q8, q9}, [r1]! 80 vld1.32 {q10, q11}, [r1]! 81 subs r12, r12, #1 82 vst1.32 {q0, q1}, [r0]! 83 vst1.32 {q2, q3}, [r0]! 84 vst1.32 {q8, q9}, [r0]! 85 vst1.32 {q10, q11}, [r0]! 86 bne .Lneon_copy_128_loop_nopld 87 ands r2, r2, #0x7f 88 beq .Lneon_exit 89 cmp r2, #32 90 blt .Lneon_16 91 nop 92 /* Copy blocks of 32-bytes (word aligned) at a time*/ 93.Lneon_copy_32_a: 94 mov r12, r2, lsr #5 95.Lneon_copy_32_loop_a: 96 vld1.32 {q0,q1}, [r1]! 97 subs r12, r12, #1 98 vst1.32 {q0,q1}, [r0]! 99 bne .Lneon_copy_32_loop_a 100 ands r2, r2, #0x1f 101 beq .Lneon_exit 102.Lneon_16: 103 subs r2, r2, #16 104 blt .Lneon_lt16 105 vld1.32 {q8}, [r1]! 106 vst1.32 {q8}, [r0]! 
107 beq .Lneon_exit 108.Lneon_lt16: 109 movs r12, r2, lsl #29 110 bcc .Lneon_skip8 111 ldr r3, [r1], #4 112 ldr r12, [r1], #4 113 str r3, [r0], #4 114 str r12, [r0], #4 115.Lneon_skip8: 116 bpl .Lneon_lt4 117 ldr r3, [r1], #4 118 str r3, [r0], #4 119.Lneon_lt4: 120 movs r2, r2, lsl #31 121 bcc .Lneon_lt2 122 ldrh r3, [r1], #2 123 strh r3, [r0], #2 124.Lneon_lt2: 125 bpl .Lneon_exit 126 ldrb r12, [r1] 127 strb r12, [r0] 128.Lneon_exit: 129 pop {r0} 130 bx lr 131 132END(__memcpy_kryo) 133