/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE 32
#else
#define CACHE_LINE_SIZE 64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */

.syntax unified

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix-up the remaining count */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f
#endif

        /* save registers */
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eorseq      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eorseq      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, pc}

10:     /* process less than 12 bytes */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fall through... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right shift amount
        // r6 = left shift amount
        // r7 = scratch

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eorseq      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fall through... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)