/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
28 */ 29 30#include <private/bionic_asm.h> 31 32 .text 33 .syntax unified 34 .fpu neon 35 36#define CACHE_LINE_SIZE (64) 37#define MEMCPY_BLOCK_SIZE_SMALL (32768) 38#define MEMCPY_BLOCK_SIZE_MID (1048576) 39#define PREFETCH_DISTANCE_NEAR (CACHE_LINE_SIZE*4) 40#define PREFETCH_DISTANCE_MID (CACHE_LINE_SIZE*4) 41#define PREFETCH_DISTANCE_FAR (CACHE_LINE_SIZE*16) 42 43ENTRY(memmove_a15) 44 cmp r2, #0 45 cmpne r0, r1 46 bxeq lr 47 subs r3, r0, r1 48 bls .L_jump_to_memcpy 49 cmp r2, r3 50 bhi .L_reversed_memcpy 51 52.L_jump_to_memcpy: 53 b __memcpy 54 55.L_reversed_memcpy: 56 push {r0, lr} 57 .cfi_def_cfa_offset 8 58 .cfi_rel_offset r0, 0 59 .cfi_rel_offset lr, 4 60 61 add r0, r0, r2 62 add r1, r1, r2 63 64 /* preload next cache line */ 65 pld [r1, #-CACHE_LINE_SIZE] 66 pld [r1, #-CACHE_LINE_SIZE*2] 67 68.L_reversed_memcpy_align_dest: 69 /* Deal with very small blocks (< 32bytes) asap */ 70 cmp r2, #32 71 blo .L_reversed_memcpy_lt_32bytes 72 /* no need to align if len < 128 bytes */ 73 cmp r2, #128 74 blo .L_reversed_memcpy_lt_128bytes 75 /* align destination to 64 bytes (1 cache line) */ 76 ands r3, r0, #0x3f 77 beq .L_reversed_memcpy_dispatch 78 sub r2, r2, r3 790: /* copy 1 byte */ 80 movs ip, r3, lsl #31 81 ldrbmi ip, [r1, #-1]! 82 strbmi ip, [r0, #-1]! 831: /* copy 2 bytes */ 84 ldrbcs ip, [r1, #-1]! 85 strbcs ip, [r0, #-1]! 86 ldrbcs ip, [r1, #-1]! 87 strbcs ip, [r0, #-1]! 
882: /* copy 4 bytes */ 89 movs ip, r3, lsl #29 90 bpl 3f 91 sub r1, r1, #4 92 sub r0, r0, #4 93 vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1] 94 vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32] 953: /* copy 8 bytes */ 96 bcc 4f 97 sub r1, r1, #8 98 sub r0, r0, #8 99 vld1.8 {d0}, [r1] 100 vst1.8 {d0}, [r0, :64] 1014: /* copy 16 bytes */ 102 movs ip, r3, lsl #27 103 bpl 5f 104 sub r1, r1, #16 105 sub r0, r0, #16 106 vld1.8 {q0}, [r1] 107 vst1.8 {q0}, [r0, :128] 1085: /* copy 32 bytes */ 109 bcc .L_reversed_memcpy_dispatch 110 sub r1, r1, #32 111 sub r0, r0, #32 112 vld1.8 {q0, q1}, [r1] 113 vst1.8 {q0, q1}, [r0, :256] 114 115.L_reversed_memcpy_dispatch: 116 /* preload more cache lines */ 117 pld [r1, #-CACHE_LINE_SIZE*3] 118 pld [r1, #-CACHE_LINE_SIZE*4] 119 120 cmp r2, #MEMCPY_BLOCK_SIZE_SMALL 121 blo .L_reversed_memcpy_neon_pld_near 122 cmp r2, #MEMCPY_BLOCK_SIZE_MID 123 blo .L_reversed_memcpy_neon_pld_mid 124 b .L_reversed_memcpy_neon_pld_far 125 126.L_reversed_memcpy_neon_pld_near: 127 /* less than 128 bytes? 
*/ 128 subs r2, r2, #128 129 blo 1f 130 sub r1, r1, #32 131 sub r0, r0, #32 132 mov r3, #-32 133 .align 4 1340: 135 /* copy 128 bytes in each loop */ 136 subs r2, r2, #128 137 138 /* preload to cache */ 139 pld [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32] 140 /* copy a cache line */ 141 vld1.8 {q0, q1}, [r1], r3 142 vst1.8 {q0, q1}, [r0, :256], r3 143 vld1.8 {q0, q1}, [r1], r3 144 vst1.8 {q0, q1}, [r0, :256], r3 145 146 /* preload to cache */ 147 pld [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32] 148 /* copy a cache line */ 149 vld1.8 {q0, q1}, [r1], r3 150 vst1.8 {q0, q1}, [r0, :256], r3 151 vld1.8 {q0, q1}, [r1], r3 152 vst1.8 {q0, q1}, [r0, :256], r3 153 154 bhs 0b 155 add r1, r1, #32 156 add r0, r0, #32 1571: 158 adds r2, r2, #128 159 bne .L_reversed_memcpy_lt_128bytes 160 pop {r0, pc} 161 162.L_reversed_memcpy_neon_pld_mid: 163 subs r2, r2, #128 164 sub r1, r1, #32 165 sub r0, r0, #32 166 mov r3, #-32 167 .align 4 1680: 169 /* copy 128 bytes in each loop */ 170 subs r2, r2, #128 171 172 /* preload to cache */ 173 pld [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32] 174 /* copy a cache line */ 175 vld1.8 {q0, q1}, [r1], r3 176 vst1.8 {q0, q1}, [r0, :256], r3 177 vld1.8 {q0, q1}, [r1], r3 178 vst1.8 {q0, q1}, [r0, :256], r3 179 180 /* preload to cache */ 181 pld [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32] 182 /* copy a cache line */ 183 vld1.8 {q0, q1}, [r1], r3 184 vst1.8 {q0, q1}, [r0, :256], r3 185 vld1.8 {q0, q1}, [r1], r3 186 vst1.8 {q0, q1}, [r0, :256], r3 187 188 bhs 0b 189 add r1, r1, #32 190 add r0, r0, #32 1911: 192 adds r2, r2, #128 193 bne .L_reversed_memcpy_lt_128bytes 194 pop {r0, pc} 195 196.L_reversed_memcpy_neon_pld_far: 197 sub r2, r2, #128 198 sub r0, r0, #128 199 sub r1, r1, #128 200 .align 4 2010: 202 /* copy 128 bytes in each loop */ 203 subs r2, r2, #128 204 205 /* preload to cache */ 206 pld [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128] 207 pld [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128] 208 /* 
read */ 209 vld1.8 {q0, q1}, [r1]! 210 vld1.8 {q2, q3}, [r1]! 211 vld1.8 {q8, q9}, [r1]! 212 vld1.8 {q10, q11}, [r1]! 213 /* write */ 214 vst1.8 {q0, q1}, [r0, :256]! 215 vst1.8 {q2, q3}, [r0, :256]! 216 vst1.8 {q8, q9}, [r0, :256]! 217 vst1.8 {q10, q11}, [r0, :256]! 218 219 sub r0, r0, #256 220 sub r1, r1, #256 221 bhs 0b 222 add r0, r0, #128 223 add r1, r1, #128 2241: 225 adds r2, r2, #128 226 bne .L_reversed_memcpy_lt_128bytes 227 pop {r0, pc} 228 229.L_reversed_memcpy_lt_128bytes: 2306: /* copy 64 bytes */ 231 movs ip, r2, lsl #26 232 bcc 5f 233 sub r1, r1, #32 234 sub r0, r0, #32 235 vld1.8 {q0, q1}, [r1] 236 vst1.8 {q0, q1}, [r0] 237 sub r1, r1, #32 238 sub r0, r0, #32 239 vld1.8 {q0, q1}, [r1] 240 vst1.8 {q0, q1}, [r0] 2415: /* copy 32 bytes */ 242 bpl 4f 243 sub r1, r1, #32 244 sub r0, r0, #32 245 vld1.8 {q0, q1}, [r1] 246 vst1.8 {q0, q1}, [r0] 247.L_reversed_memcpy_lt_32bytes: 2484: /* copy 16 bytes */ 249 movs ip, r2, lsl #28 250 bcc 3f 251 sub r1, r1, #16 252 sub r0, r0, #16 253 vld1.8 {q0}, [r1] 254 vst1.8 {q0}, [r0] 2553: /* copy 8 bytes */ 256 bpl 2f 257 sub r1, r1, #8 258 sub r0, r0, #8 259 vld1.8 {d0}, [r1] 260 vst1.8 {d0}, [r0] 2612: /* copy 4 bytes */ 262 ands ip, r2, #0x4 263 beq 1f 264 sub r1, r1, #4 265 sub r0, r0, #4 266 vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1] 267 vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 2681: /* copy 2 bytes */ 269 movs ip, r2, lsl #31 270 ldrbcs ip, [r1, #-1]! 271 strbcs ip, [r0, #-1]! 272 ldrbcs ip, [r1, #-1]! 273 strbcs ip, [r0, #-1]! 2740: /* copy 1 byte */ 275 ldrbmi ip, [r1, #-1]! 276 strbmi ip, [r0, #-1]! 277 278 pop {r0, pc} 279 280END(memmove_a15) 281