/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

#ifdef __ARMEB__
#define S2LO            lsl
#define S2LOEQ          lsleq
#define S2HI            lsr
#define MSB             0x000000ff
#define LSB             0xff000000
#define BYTE0_OFFSET    24
#define BYTE1_OFFSET    16
#define BYTE2_OFFSET    8
#define BYTE3_OFFSET    0
#else /* not __ARMEB__ */
#define S2LO            lsr
#define S2LOEQ          lsreq
#define S2HI            lsl
#define BYTE0_OFFSET    0
#define BYTE1_OFFSET    8
#define BYTE2_OFFSET    16
#define BYTE3_OFFSET    24
#define MSB             0xff000000
#define LSB             0x000000ff
#endif /* not __ARMEB__ */

/* Parameters and result.  */
#define src1            r0
#define src2            r1
#define result          r0      /* Overlaps src1.  */

/* Internal variables.  */
#define tmp1            r4
#define tmp2            r5
#define const_m1        r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a          r2
#define data1b          r3
#define data2a          r6
#define data2b          r7
#define syndrome_a      tmp1
#define syndrome_b      tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1           r2
#define data2           r3
#define syndrome        tmp2


        /* Implementation of strcmp for ARMv7 when DSP instructions are
           available.  Use ldrd to support wider loads, provided the data
           is sufficiently aligned.  Use saturating arithmetic to optimize
           the compares.  */

        /* Build Options:
           STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
           byte in the string.  If comparing completely random strings
           the pre-check will save time, since there is a very high
           probability of a mismatch in the first character: we save
           significant overhead if this is the common case.  However,
           if strings are likely to be identical (eg because we're
           verifying a hit in a hash table), then this check is largely
           redundant.  */


.syntax unified
.thumb

        // To avoid warning about deprecated instructions, add an explicit
        // arch. The code generated is exactly the same.
        .arch armv7-a

        /* Macro to compute and return the result value for word-aligned
           cases.  */
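        /* For the little-endian path, the macro below computes roughly the
           following (an illustrative C sketch only, not assembled; d1, d2
           and synd stand for the macro arguments, taken as uint32_t, with
           synd known to be non-zero; the register restores are omitted):

               synd  = __builtin_bswap32(synd);     // rev
               shift = __builtin_clz(synd) & ~7u;   // clz, then bic ..., #7
               return (int)((d1 >> shift) & 0xff)
                    - (int)((d2 >> shift) & 0xff);

           i.e. locate the first marked byte in the syndrome and return the
           difference of the corresponding bytes of data1 and data2.  */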
        .macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
        /* If data1 contains a zero byte, then syndrome will contain a 1 in
           bit 7 of that byte.  Otherwise, the highest set bit in the
           syndrome will highlight the first different bit.  It is therefore
           sufficient to extract the eight bits starting with the syndrome
           bit.  */
        clz     tmp1, \synd
        lsl     r1, \d2, tmp1
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsl     \d1, \d1, tmp1
        .cfi_remember_state
        lsr     result, \d1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1, lsr #24
        bx      lr
#else
        /* To use the big-endian trick we'd have to reverse all three words.
           That's slower than this approach.  */
        rev     \synd, \synd
        clz     tmp1, \synd
        bic     tmp1, tmp1, #7
        lsr     r1, \d2, tmp1
        .cfi_remember_state
        .if \restore_r6
        ldrd    r6, r7, [sp, #8]
        .endif
        .cfi_restore 6
        .cfi_restore 7
        lsr     \d1, \d1, tmp1
        and     result, \d1, #255
        and     r1, r1, #255
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        sub     result, result, r1

        bx      lr
#endif
        .endm

        .text
        .p2align        5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
        sub     r0, r2, r3
        bx      lr
        nop
#endif

ENTRY(strcmp_a15)
#ifndef STRCMP_NO_PRECHECK
        ldrb    r2, [src1]
        ldrb    r3, [src2]
        cmp     r2, #1
        it      cs
        cmpcs   r2, r3
        bne     .Lfastpath_exit
#endif
        .cfi_sections .debug_frame
        strd    r4, r5, [sp, #-16]!
        .cfi_def_cfa_offset 16
        .cfi_offset 4, -16
        .cfi_offset 5, -12
        orr     tmp1, src1, src2
        strd    r6, r7, [sp, #8]
        .cfi_offset 6, -8
        .cfi_offset 7, -4
        mvn     const_m1, #0
        lsl     r2, tmp1, #29
        cbz     r2, .Lloop_aligned8

.Lnot_aligned:
        eor     tmp1, src1, src2
        tst     tmp1, #7
        bne     .Lmisaligned8

        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        and     tmp1, src1, #7
        bic     src1, src1, #7
        and     tmp2, tmp1, #3
        bic     src2, src2, #7
        lsl     tmp2, tmp2, #3  /* Bytes -> bits.  */
        ldrd    data1a, data1b, [src1], #16
        tst     tmp1, #4
        ldrd    data2a, data2b, [src2], #16
        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp2
        orn     data1a, data1a, tmp1
        orn     data2a, data2a, tmp1
        beq     .Lstart_realigned8
        orn     data1b, data1b, tmp1
        mov     data1a, const_m1
        orn     data2b, data2b, tmp1
        mov     data2a, const_m1
        b       .Lstart_realigned8

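        /* Zero/difference detection used by the aligned loops below: uadd8
           adds 0xff to each byte of data1, so the per-byte GE flags end up
           set exactly for the non-zero bytes of data1 (their additions
           carry out).  sel then merges the eor of the two data words with
           0xffffffff under those flags: lanes where data1 is non-zero keep
           their difference bits, and any lane holding a NUL is forced to
           0xff.  The syndrome is therefore zero if and only if the words
           are identical and contain no terminator, so one cbnz (or
           orrs/beq) per word covers both exit conditions.  A NUL in data2
           alone still shows up, since that lane's eor is then non-zero.
           A rough C model of one word, for illustration only (not
           assembled; data1/data2 stand for either the a or b halves):

               uint32_t ge = 0;                    // per-byte GE flags as a mask
               for (int i = 0; i < 4; i++)
                   if ((data1 >> (8 * i)) & 0xff)  // byte of data1 is non-zero
                       ge |= 0xffu << (8 * i);
               uint32_t synd = ((data1 ^ data2) & ge) | ~ge;
               // synd == 0  <=>  keep looping
        */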
        /* Unroll the inner loop by a factor of 2, giving 16 bytes per
           pass.  */
        .p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
        .p2align 2      /* Always word aligned.  */
.Lloop_aligned8:
        ldrd    data1a, data1b, [src1], #16
        ldrd    data2a, data2b, [src2], #16
.Lstart_realigned8:
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        cbnz    syndrome_a, .Ldiff_in_a
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        cbnz    syndrome_b, .Ldiff_in_b

        ldrd    data1a, data1b, [src1, #-8]
        ldrd    data2a, data2b, [src2, #-8]
        uadd8   syndrome_b, data1a, const_m1    /* Only want GE bits.  */
        eor     syndrome_a, data1a, data2a
        sel     syndrome_a, syndrome_a, const_m1
        uadd8   syndrome_b, data1b, const_m1    /* Only want GE bits.  */
        eor     syndrome_b, data1b, data2b
        sel     syndrome_b, syndrome_b, const_m1
        /* Can't use CBZ for backwards branch.  */
        orrs    syndrome_b, syndrome_b, syndrome_a  /* Only need if s_a == 0 */
        beq     .Lloop_aligned8

.Ldiff_found:
        cbnz    syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
        strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
        .cfi_restore_state
        strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

        .cfi_restore_state
.Lmisaligned8:
        tst     tmp1, #3
        bne     .Lmisaligned4
        ands    tmp1, src1, #3
        bne     .Lmutual_align4

        /* Unrolled by a factor of 2, to reduce the number of post-increment
           operations.  */
.Lloop_aligned4:
        ldr     data1, [src1], #8
        ldr     data2, [src2], #8
.Lstart_realigned4:
        uadd8   syndrome, data1, const_m1       /* Only need GE bits.  */
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cbnz    syndrome, .Laligned4_done
        ldr     data1, [src1, #-4]
        ldr     data2, [src2, #-4]
        uadd8   syndrome, data1, const_m1
        eor     syndrome, data1, data2
        sel     syndrome, syndrome, const_m1
        cmp     syndrome, #0
        beq     .Lloop_aligned4

.Laligned4_done:
        strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
        .cfi_restore_state
        /* Deal with mutual misalignment by aligning downwards and then
           masking off the unwanted loaded data to prevent a difference.  */
        lsl     tmp1, tmp1, #3  /* Bytes -> bits.  */
        bic     src1, src1, #3
        ldr     data1, [src1], #8
        bic     src2, src2, #3
        ldr     data2, [src2], #8

        /* In thumb code we can't use MVN with a register shift, but
           we do have ORN.  */
        S2HI    tmp1, const_m1, tmp1
        orn     data1, data1, tmp1
        orn     data2, data2, tmp1
        b       .Lstart_realigned4

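        /* src1 and src2 have no common word alignment.  Compare the
           remaining bytes of src1's current word individually (watching
           for a NUL or a mismatch) until src1 reaches a word boundary,
           then drop into .Lsrc1_aligned, which compares whole words of
           src1 against the matching bytes drawn from two consecutive
           aligned loads of src2 (.Loverlap1/2/3, selected by src2's low
           two bits).  */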
.Lmisaligned4:
        ands    tmp1, src1, #3
        beq     .Lsrc1_aligned
        sub     src2, src2, tmp1
        bic     src1, src1, #3
        lsls    tmp1, tmp1, #31
        ldr     data1, [src1], #4
        beq     .Laligned_m2
        bcs     .Laligned_m1

#ifdef STRCMP_NO_PRECHECK
        ldrb    data2, [src2, #1]
        uxtb    tmp1, data1, ror #BYTE1_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m1:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        add     src2, src2, #4
        cbnz    data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
        /* If we've done the pre-check, then we don't need to check the
           first byte again here.  */
        ldrb    data2, [src2, #2]
        uxtb    tmp1, data1, ror #BYTE2_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbz     data2, .Lmisaligned_exit

.Laligned_m2:
        ldrb    data2, [src2, #3]
        uxtb    tmp1, data1, ror #BYTE3_OFFSET
        subs    tmp1, tmp1, data2
        bne     .Lmisaligned_exit
        cbnz    data2, .Laligned_m1
#endif

.Lmisaligned_exit:
        .cfi_remember_state
        mov     result, tmp1
        ldr     r4, [sp], #16
        .cfi_restore 4
        bx      lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
        add     src2, src2, #4
#endif
.Lsrc1_aligned:
        .cfi_restore_state
        /* src1 is word aligned, but src2 has no common alignment
           with it.  */
        ldr     data1, [src1], #4
        lsls    tmp1, src2, #31         /* C=src2[1], Z=src2[0].  */

        bic     src2, src2, #3
        ldr     data2, [src2], #4
        bhi     .Loverlap1              /* C=1, Z=0 => src2[1:0] = 0b11.  */
        bcs     .Loverlap2              /* C=1, Z=1 => src2[1:0] = 0b10.  */

        /* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
        bic     tmp1, data1, #MSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #8
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #24
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap3
4:
        S2LO    data2, data2, #8
        b       .Lstrcmp_tail

5:
        bics    syndrome, syndrome, #MSB
        bne     .Lstrcmp_done_equal

        /* We can only get here if the MSB of data1 contains 0, so
           fast-path the exit.  */
        ldrb    result, [src2]
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        neg     result, result
        bx      lr

6:
        .cfi_restore_state
        S2LO    data1, data1, #24
        and     data2, data2, #LSB
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
        and     tmp1, data1, const_m1, S2LO #16
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #16
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #16
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap2
4:
        S2LO    data2, data2, #16
        b       .Lstrcmp_tail
5:
        ands    syndrome, syndrome, const_m1, S2LO #16
        bne     .Lstrcmp_done_equal

        ldrh    data2, [src2]
        S2LO    data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
        lsl     data2, data2, #16
#endif
        b       .Lstrcmp_tail

6:
        S2LO    data1, data1, #16
        and     data2, data2, const_m1, S2LO #16
        b       .Lstrcmp_tail

        .p2align 5,,12  /* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
        and     tmp1, data1, #LSB
        uadd8   syndrome, data1, const_m1
        eors    syndrome, tmp1, data2, S2LO #24
        sel     syndrome, syndrome, const_m1
        bne     4f
        cbnz    syndrome, 5f
        ldr     data2, [src2], #4
        eor     tmp1, tmp1, data1
        cmp     tmp1, data2, S2HI #8
        bne     6f
        ldr     data1, [src1], #4
        b       .Loverlap1
4:
        S2LO    data2, data2, #24
        b       .Lstrcmp_tail
5:
        tst     syndrome, #LSB
        bne     .Lstrcmp_done_equal
        ldr     data2, [src2]
6:
        S2LO    data1, data1, #8
        bic     data2, data2, #MSB
        b       .Lstrcmp_tail

.Lstrcmp_done_equal:
        mov     result, #0
        .cfi_remember_state
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        bx      lr

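        /* Common tail: data1 and data2 hold the words in which the first
           difference or NUL was found, already shifted/masked so that the
           interesting bytes line up.  Put them in big-endian order (if
           needed), rebuild the syndrome with the same uadd8/sel trick,
           then use clz to pick out the first byte that differs or ends
           the string and return the difference of those two bytes.  */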
.Lstrcmp_tail:
        .cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
        rev     data1, data1
        rev     data2, data2
        /* Now everything looks big-endian...  */
#endif
        uadd8   tmp1, data1, const_m1
        eor     tmp1, data1, data2
        sel     syndrome, tmp1, const_m1
        clz     tmp1, syndrome
        lsl     data1, data1, tmp1
        lsl     data2, data2, tmp1
        lsr     result, data1, #24
        ldrd    r4, r5, [sp], #16
        .cfi_restore 4
        .cfi_restore 5
        /* R6/7 not used in this sequence.  */
        .cfi_restore 6
        .cfi_restore 7
        sub     result, result, data2, lsr #24
        bx      lr
END(strcmp_a15)
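        /* For reference, the routine above follows the usual strcmp
           contract: compare byte by byte as unsigned char and return the
           difference at the first mismatch or terminator.  An equivalent
           (much slower) C model, for illustration only and not part of
           the build:

               int strcmp_ref(const char *s1, const char *s2) {
                   while (*s1 != '\0' && *s1 == *s2) {
                       s1++;
                       s2++;
                   }
                   return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
               }
        */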