/*
 * Copyright (c) 2012-2014 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

#ifdef __ARMEB__
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARMEB__ */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARMEB__ */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2

	/* Implementation of strcmp for ARMv7 when DSP instructions are
	   available.  Use ldrd to support wider loads, provided the data
	   is sufficiently aligned.  Use saturating arithmetic to optimize
	   the compares.  */
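	/* How the NUL/difference test below works: uadd8 tmp, data1, const_m1
	   adds 0xff to every byte of data1 and sets APSR.GE[i] exactly when
	   byte i is non-zero (the per-byte add carries out).  SEL then picks
	   each byte of the EOR difference where GE is set and 0xff where it
	   is clear, so the resulting syndrome word is zero if and only if
	   the two words are equal and data1 contains no NUL byte.
	   Illustrative example: data1 = data2 = 0x61006263 gives a syndrome
	   of 0x00ff0000 -- equal words, but a terminator was found.  */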

	/* Build Options:
	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
	   byte in the string.  If comparing completely random strings
	   the pre-check will save time, since there is a very high
	   probability of a mismatch in the first character: we save
	   significant overhead if this is the common case.  However,
	   if strings are likely to be identical (e.g. because we're
	   verifying a hit in a hash table), then this check is largely
	   redundant.  */
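	/* Build note (assumption, not stated elsewhere in this file): the
	   macro is never defined here, so it would typically be supplied on
	   the preprocessor command line, e.g. -DSTRCMP_NO_PRECHECK, when
	   this file is assembled.  */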


.syntax         unified
.thumb

        // To avoid warnings about deprecated instructions, add an explicit
        // arch. The code generated is exactly the same.
        .arch armv7-a

	/* Macro to compute and return the result value for word-aligned
	   cases.  */
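	/* Arguments, as used below: \synd is the syndrome word, \d1 and
	   \d2 are the data words loaded from src1 and src2 respectively,
	   and \restore_r6 is non-zero when r6/r7 were spilled and must be
	   reloaded before returning.  */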
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
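	/* Worked example (illustration only, no zero byte involved): with
	   \d1 = 0x61626364 and \d2 = 0x61626464 the syndrome is 0x00000700,
	   so CLZ returns 21.  Shifting both data words left by 21 places
	   the eight bits starting at the first differing bit in the top
	   byte; all higher bits are equal in the two words, so subtracting
	   the two extracted bytes gives the same sign as comparing the
	   first differing characters.  */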
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
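	/* Walk-through: REV moves the first byte in memory order to the
	   top of the syndrome, so CLZ rounded down to a multiple of 8 by
	   the BIC yields 8*k, where byte k is the first byte that differs
	   or is a NUL in \d1.  Shifting both (still little-endian) data
	   words right by that amount and masking with 255 extracts exactly
	   that byte from each string; their difference is the result.  */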
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
.Lstrcmp_start_addr:
#ifndef STRCMP_NO_PRECHECK
.Lfastpath_exit:
	sub	r0, r2, r3
	bx	lr
	nop
#endif

ENTRY(strcmp_a15)
#ifndef STRCMP_NO_PRECHECK
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	.Lfastpath_exit
#endif
	.cfi_sections .debug_frame
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, .Lloop_aligned8

.Lnot_aligned:
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	.Lmisaligned8

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
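	/* Illustrative little-endian example: if both pointers are 5 bytes
	   into an 8-byte block, tmp2 = (5 & 3) * 8 = 8, the mask becomes
	   0xffffffff << 8 = 0xffffff00 and ~mask = 0x000000ff.  Bit 2 of
	   the offset is set, so data1a/data2a (which hold only bytes before
	   the strings) are replaced with 0xffffffff and the bottom byte of
	   data1b/data2b is forced to 0xff.  The forced bytes are equal and
	   non-zero in both operands, so they can fake neither a difference
	   nor a terminator.  */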
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	.Lstart_realigned8
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	.Lstart_realigned8

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
.Lloop_aligned8:
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
.Lstart_realigned8:
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, .Ldiff_in_b

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for a backwards branch: CBZ/CBNZ only encode
	   forward offsets.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	.Lloop_aligned8

.Ldiff_found:
	cbnz	syndrome_a, .Ldiff_in_a

.Ldiff_in_b:
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

.Ldiff_in_a:
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
.Lmisaligned8:
	tst	tmp1, #3
	bne	.Lmisaligned4
	ands	tmp1, src1, #3
	bne	.Lmutual_align4

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
.Lloop_aligned4:
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
.Lstart_realigned4:
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, .Laligned4_done
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	.Lloop_aligned4

.Laligned4_done:
	strcmp_epilogue_aligned syndrome, data1, data2, 0

.Lmutual_align4:
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	.Lstart_realigned4

.Lmisaligned4:
	ands	tmp1, src1, #3
	beq	.Lsrc1_aligned
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	.Laligned_m2
	bcs	.Laligned_m1

#ifdef STRCMP_NO_PRECHECK
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m1:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	add	src2, src2, #4
	cbnz	data2, .Lsrc1_aligned
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbz	data2, .Lmisaligned_exit

.Laligned_m2:
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	.Lmisaligned_exit
	cbnz	data2, .Laligned_m1
#endif

.Lmisaligned_exit:
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#ifndef STRCMP_NO_PRECHECK
.Laligned_m1:
	add	src2, src2, #4
#endif
.Lsrc1_aligned:
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
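	/* .LoverlapN below handles the case where N bytes of each aligned
	   word loaded from src2 overlap the current data1 word, i.e.
	   src2 & 3 == 4 - N.  */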
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C = src2[1], Z set iff src2[0] == 0.  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
.Loverlap3:
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap3
4:
	S2LO	data2, data2, #8
	b	.Lstrcmp_tail

5:
	bics	syndrome, syndrome, #MSB
	bne	.Lstrcmp_done_equal

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap2:
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap2
4:
	S2LO	data2, data2, #16
	b	.Lstrcmp_tail
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	.Lstrcmp_done_equal

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	.Lstrcmp_tail

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	.Lstrcmp_tail

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
.Loverlap1:
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	.Loverlap1
4:
	S2LO	data2, data2, #24
	b	.Lstrcmp_tail
5:
	tst	syndrome, #LSB
	bne	.Lstrcmp_done_equal
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	.Lstrcmp_tail

.Lstrcmp_done_equal:
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

.Lstrcmp_tail:
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr
END(strcmp_a15)