/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
   unaligned access.

   If compiled with GCC, this file should be enclosed within the following
   pre-processing check:
   #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)

   Prototype: void *memcpy (void *dst, const void *src, size_t count);

   The job is done in 5 steps.
   Step 1: Align the src/dst pointers; fall back to a misaligned copy if
           both cannot be aligned.
   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes.
   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes.
   Step 4: Copy word by word.
   Step 5: Copy byte by byte.

   Tunable options:
     __OPT_BIG_BLOCK_SIZE: Size of the big block in bytes.  Defaults to 64
                           (16 words).
     __OPT_MID_BLOCK_SIZE: Size of the mid block in bytes.  Defaults to 16
                           (4 words).  */

#ifndef __OPT_BIG_BLOCK_SIZE
#define __OPT_BIG_BLOCK_SIZE (4 * 16)
#endif

#ifndef __OPT_MID_BLOCK_SIZE
#define __OPT_MID_BLOCK_SIZE (4 * 4)
#endif

#if __OPT_BIG_BLOCK_SIZE == 16
#define BEGIN_UNROLL_BIG_BLOCK \
        .irp offset, 0,4,8,12
#elif __OPT_BIG_BLOCK_SIZE == 32
#define BEGIN_UNROLL_BIG_BLOCK \
        .irp offset, 0,4,8,12,16,20,24,28
#elif __OPT_BIG_BLOCK_SIZE == 64
#define BEGIN_UNROLL_BIG_BLOCK \
        .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
#else
#error "Illegal __OPT_BIG_BLOCK_SIZE"
#endif

#if __OPT_MID_BLOCK_SIZE == 8
#define BEGIN_UNROLL_MID_BLOCK \
        .irp offset, 0,4
#elif __OPT_MID_BLOCK_SIZE == 16
#define BEGIN_UNROLL_MID_BLOCK \
        .irp offset, 0,4,8,12
#else
#error "Illegal __OPT_MID_BLOCK_SIZE"
#endif

#define END_UNROLL .endr
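
/* For illustration (this note is not in the original source): the unroll
   macros expand via the assembler's .irp directive, so on __ARM_ARCH_7M__
   with __OPT_MID_BLOCK_SIZE == 16 the sequence

       BEGIN_UNROLL_MID_BLOCK
       ldr     r3, [r1, \offset]
       str     r3, [r0, \offset]
       END_UNROLL

   assembles the ldr/str pair four times, with \offset substituted by
   0, 4, 8 and 12 in turn.  */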

        .syntax unified
        .text
        .align  2
        .global memcpy
        .thumb
        .thumb_func
        .type   memcpy, %function
memcpy:
        @ r0: dst
        @ r1: src
        @ r2: len
#ifdef __ARM_FEATURE_UNALIGNED
        /* When unaligned access is supported, ip is not needed in the
           function body, so use it to save the return value (dst).  */
        mov     ip, r0
#else
        push    {r0}
#endif
        orr     r3, r1, r0
        ands    r3, r3, #3
        bne     .Lmisaligned_copy

.Lbig_block:
        subs    r2, __OPT_BIG_BLOCK_SIZE
        blo     .Lmid_block

        /* Kernel loop for big block copy */
        .align 2
.Lbig_block_loop:
        BEGIN_UNROLL_BIG_BLOCK
#ifdef __ARM_ARCH_7EM__
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        END_UNROLL
#else /* __ARM_ARCH_7M__ */
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        END_UNROLL
        adds    r0, __OPT_BIG_BLOCK_SIZE
        adds    r1, __OPT_BIG_BLOCK_SIZE
#endif
        subs    r2, __OPT_BIG_BLOCK_SIZE
        bhs     .Lbig_block_loop

.Lmid_block:
        adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
        blo     .Lcopy_word_by_word

        /* Kernel loop for mid-block copy */
        .align 2
.Lmid_block_loop:
        BEGIN_UNROLL_MID_BLOCK
#ifdef __ARM_ARCH_7EM__
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        END_UNROLL
#else /* __ARM_ARCH_7M__ */
        ldr     r3, [r1, \offset]
        str     r3, [r0, \offset]
        END_UNROLL
        adds    r0, __OPT_MID_BLOCK_SIZE
        adds    r1, __OPT_MID_BLOCK_SIZE
#endif
        subs    r2, __OPT_MID_BLOCK_SIZE
        bhs     .Lmid_block_loop

.Lcopy_word_by_word:
        adds    r2, __OPT_MID_BLOCK_SIZE - 4
        blo     .Lcopy_less_than_4

        /* Kernel loop for small block copy */
        .align 2
.Lcopy_word_by_word_loop:
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        subs    r2, #4
        bhs     .Lcopy_word_by_word_loop

.Lcopy_less_than_4:
        adds    r2, #4
        beq     .Ldone

        /* Bit 0 of r2 set means one trailing byte to copy.  */
        lsls    r2, r2, #31
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        /* Carry clear (bit 1 of r2 was zero) means no trailing halfword.  */
        bcc     .Ldone
#ifdef __ARM_FEATURE_UNALIGNED
        ldrh    r3, [r1]
        strh    r3, [r0]
#else
        ldrb    r3, [r1]
        strb    r3, [r0]
        ldrb    r3, [r1, #1]
        strb    r3, [r0, #1]
#endif /* __ARM_FEATURE_UNALIGNED */

.Ldone:
#ifdef __ARM_FEATURE_UNALIGNED
        mov     r0, ip
#else
        pop     {r0}
#endif
        bx      lr

        .align 2
.Lmisaligned_copy:
#ifdef __ARM_FEATURE_UNALIGNED
        /* Alias the label .Ldst_aligned to .Lbig_block: once the destination
           has been aligned, we can go straight to the aligned copy.  */
#define Ldst_aligned Lbig_block

        /* Copy word by word using LDR when the hardware handles alignment
           for us, i.e. SCTLR.A is clear, so LDR and STR accept unaligned
           addresses.  */

        cmp     r2, #8
        blo     .Lbyte_copy

        /* If src is aligned, just go to the big block loop.  */
        lsls    r3, r1, #30
        beq     .Ldst_aligned
#else
        /* If len < 12, the misalignment adjustment has more overhead than
           a plain byte-to-byte copy.  Also, len must be >= 8 to guarantee
           that the code below works correctly.  */
        cmp     r2, #12
        blo     .Lbyte_copy
#endif /* __ARM_FEATURE_UNALIGNED */

        /* Align dst only; do not try to align src as well.  Handling an
           aligned src with a misaligned dst costs more than the other way
           round.  The worst case is an initially aligned src: up to 4
           additional bytes are copied one by one, which is acceptable.  */

        ands    r3, r0, #3
        beq     .Ldst_aligned

        rsb     r3, #4
        subs    r2, r3

        lsls    r3, r3, #31
        itt     ne
        ldrbne  r3, [r1], #1
        strbne  r3, [r0], #1

        bcc     .Ldst_aligned

#ifdef __ARM_FEATURE_UNALIGNED
        ldrh    r3, [r1], #2
        strh    r3, [r0], #2
        b       .Ldst_aligned
#else
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        /* Now dst is aligned.  */
.Ldst_aligned:
        /* If r1 is aligned now as well, then r0 and r1 had the same
           misalignment and both are aligned; go to the aligned copy.  */
        ands    r3, r1, #3
        beq     .Lbig_block
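
        /* Sketch of the shifted-word technique used below (illustrative C
           for the little-endian case; this note is not part of the original
           source).  With src misaligned by r3 bytes, every destination word
           is built from two word-aligned loads:

               lo     = aligned_src[i]     >> (8 * r3);
               hi     = aligned_src[i + 1] << (32 - 8 * r3);
               dst[i] = lo | hi;

           The shift argument of mis_src_copy is therefore 8 * r3.  */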

        /* dst is aligned, but src isn't.  Misaligned copy.  */

        push    {r4, r5}
        subs    r2, #4

        /* Move r1 back by the misaligned bytes so that it is word-aligned.
           Since r1 has to be restored to its unaligned position after the
           loop, keep the compensating offset (4 - r3) in ip and subtract it
           from r1 afterwards.  */
        subs    r1, r3
        rsb     ip, r3, #4

        /* Pre-load one word.  */
        ldr     r4, [r1], #4

        cmp     r3, #2
        beq     .Lmisaligned_copy_2_2
        cmp     r3, #3
        beq     .Lmisaligned_copy_3_1

        .macro mis_src_copy shift
1:
        lsrs    r4, r4, \shift          @ bytes still needed from the previous word
        ldr     r3, [r1], #4            @ fetch the next aligned word
        lsls    r5, r3, 32-\shift       @ bytes contributed by the new word
        orr     r4, r4, r5
        str     r4, [r0], #4
        mov     r4, r3
        subs    r2, #4
        bhs     1b
        .endm

.Lmisaligned_copy_1_3:
        mis_src_copy shift=8
        b       .Lsrc_misaligned_tail

.Lmisaligned_copy_3_1:
        mis_src_copy shift=24
        b       .Lsrc_misaligned_tail

.Lmisaligned_copy_2_2:
        /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
        mis_src_copy shift=16

.Lsrc_misaligned_tail:
        adds    r2, #4
        subs    r1, ip
        pop     {r4, r5}

#endif /* __ARM_FEATURE_UNALIGNED */

.Lbyte_copy:
        subs    r2, #4
        blo     .Lcopy_less_than_4

.Lbyte_copy_loop:
        subs    r2, #1
        ldrb    r3, [r1], #1
        strb    r3, [r0], #1
        bhs     .Lbyte_copy_loop

        /* The loop above exits with exactly 3 bytes left; copy them here.  */
        ldrb    r3, [r1]
        strb    r3, [r0]
        ldrb    r3, [r1, #1]
        strb    r3, [r0, #1]
        ldrb    r3, [r1, #2]
        strb    r3, [r0, #2]

#ifdef __ARM_FEATURE_UNALIGNED
        mov     r0, ip
#else
        pop     {r0}
#endif
        bx      lr

        .size   memcpy, .-memcpy
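
/* Illustrative build note (not part of the original source): assembled with
   GCC for a Cortex-M4, the tunables can be overridden on the command line,
   e.g.

       arm-none-eabi-gcc -mcpu=cortex-m4 -mthumb -c \
           -D__OPT_MID_BLOCK_SIZE=8 memcpy-armv7m.S

   where the file name is assumed.  __ARM_ARCH_7EM__ and
   __ARM_FEATURE_UNALIGNED are then predefined by the compiler according to
   -mcpu and -munaligned-access.  */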