/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         */

        .fpu        neon
        .syntax     unified

        // To avoid warning about deprecated instructions, add an explicit
        // arch. The code generated is exactly the same.
        .arch armv7-a

/*
 * __memset_chk_a15: checked entry point placed in front of memset.
 *
 * In:
 *   r0, r1  not touched here; forwarded as-is to memset
 *   r2      byte count forwarded to memset
 *   r3      limit that r2 is compared against (presumably the size of
 *           the destination object -- TODO confirm against the caller)
 *
 * When r2 <= r3 (unsigned compare) control is transferred to memset with
 * a plain branch, so memset returns directly to the original caller.
 * Otherwise lr is pushed and __memset_chk_fail is called. No instruction
 * follows that call, so the helper is expected not to return.
 */
ENTRY(__memset_chk_a15)
        cmp         r2, r3
        bls         memset

        // Preserve lr for backtrace.
        push        {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        bl          __memset_chk_fail
END(__memset_chk_a15)

/*
 * memset_a15
 *
 * In:
 *   r0  destination pointer
 *   r1  fill value (only the low byte is used)
 *   r2  number of bytes to set (unsigned)
 * Out:
 *   r0  the original destination pointer; it is saved on the stack at
 *       entry and reloaded before each return
 * Scratch:
 *   r1, r2, r3, ip, d0 and the condition flags
 *
 * Counts below 16 use the NEON/byte-store path at
 * .L_set_less_than_16_unknown_align. Larger counts first align the
 * destination to 8 bytes and then store with strd, 64 bytes per loop
 * iteration, followed by a 32/16/8/4/2/1 byte tail selected from the
 * low bits of the remaining count.
 */
ENTRY(memset_a15)
        stmfd       sp!, {r0}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset r0, 0

        // The new algorithm is slower for copies < 16 so use the old
        // neon code in that case.
        cmp         r2, #16
        blo         .L_set_less_than_16_unknown_align

        // Use strd which requires an even and odd register so move the
        // values so that:
        //   r0 and r1 contain the memset value
        //   r2 is the number of bytes to set
        //   r3 is the destination pointer
        mov         r3, r0

        // Copy the byte value in every byte of r1.
        // The shift left by 24 drops everything but the low byte, the
        // two orr steps then spread it to 2 and finally 4 bytes.
        mov         r1, r1, lsl #24
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

.L_check_alignment:
        // Align destination to a double word to avoid the strd crossing
        // a cache line boundary.
        // ip = dst & 7; zero means the pointer is already aligned.
        ands        ip, r3, #7
        bne         .L_do_double_word_align

.L_double_word_aligned:
        // r0 may have been used as scratch by the alignment code, so the
        // r0/r1 value pair for strd is (re)built here.
        mov         r0, r1

        // r2 is biased by -64 while in the main loop. A borrow here
        // means fewer than 64 bytes are left.
        subs        r2, #64
        blo         .L_set_less_than_64

1:      // Main loop sets 64 bytes at a time.
        .irp        offset, #0, #8, #16, #24, #32, #40, #48, #56
        strd        r0, r1, [r3, \offset]
        .endr

        add         r3, #64
        subs        r2, #64
        // The count is unsigned, so keep looping while the subtraction
        // did not borrow (carry set). A signed test would leave the loop
        // after one iteration when the remaining count has bit 31 set,
        // and would disagree with the unsigned blo used at loop entry.
        bhs         1b

.L_set_less_than_64:
        // Restore r2 to the count of bytes left to set.
        add         r2, #64
        // Shift bit 5 (32) of the count into C and bit 4 (16) into N.
        lsls        ip, r2, #27
        bcc         .L_set_less_than_32
        // Set 32 bytes.
        .irp        offset, #0, #8, #16, #24
        strd        r0, r1, [r3, \offset]
        .endr
        // add without the s suffix leaves the flags alone, so N from the
        // lsls above is still valid at the bpl below.
        add         r3, #32

.L_set_less_than_32:
        bpl         .L_set_less_than_16
        // Set 16 bytes.
        .irp        offset, #0, #8
        strd        r0, r1, [r3, \offset]
        .endr
        add         r3, #16

.L_set_less_than_16:
        // Less than 16 bytes to set.
        // Shift bit 3 (8) of the count into C and bit 2 (4) into N.
        lsls        ip, r2, #29
        bcc         .L_set_less_than_8

        // Set 8 bytes.
        strd        r0, r1, [r3], #8

.L_set_less_than_8:
        bpl         .L_set_less_than_4
        // Set 4 bytes
        str         r1, [r3], #4

.L_set_less_than_4:
        // Shift bit 1 (2) of the count into C; the result is non-zero
        // only when bit 0 (1) is set.
        lsls        ip, r2, #31
        it          ne
        strbne      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]

        // Reload the original destination pointer as the return value.
        ldmfd       sp!, {r0}
        bx          lr

.L_do_double_word_align:
        // ip = number of bytes (1..7) needed to reach the next 8 byte
        // boundary; take them off the count up front.
        rsb         ip, ip, #8
        sub         r2, r2, ip
        // Bit 0 of ip goes to N (one byte), bit 1 to C (two bytes).
        // r0 is only a throwaway destination here; it is reloaded from
        // r1 at .L_double_word_aligned.
        movs        r0, ip, lsl #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3], #1

        // Dst is at least word aligned by this point.
        cmp         ip, #4
        blo         .L_double_word_aligned
        str         r1, [r3], #4
        b           .L_double_word_aligned

.L_set_less_than_16_unknown_align:
        // Set up to 15 bytes.
        // r0 is still the destination pointer on this path and is
        // advanced by the stores; d0 holds the fill byte in all 8 lanes.
        vdup.8      d0, r1
        // Bit 3 (8) of the count goes to C, bit 2 (4) to N.
        movs        ip, r2, lsl #29
        bcc         1f
        vst1.8      {d0}, [r0]!
        // V is still clear from the cmp r2, #16 at function entry (a
        // count below 16 minus 16 cannot overflow) and the shift does
        // not change V, so bge is taken exactly when N is clear.
1:      bge         2f
        vst1.32     {d0[0]}, [r0]!
        // Bit 0 (1) of the count goes to N, bit 1 (2) to C.
2:      movs        ip, r2, lsl #31
        it          mi
        strbmi      r1, [r0], #1
        itt         cs
        strbcs      r1, [r0], #1
        strbcs      r1, [r0], #1
        // Reload the original destination pointer as the return value.
        ldmfd       sp!, {r0}
        bx          lr
END(memset_a15)