1/*
2Copyright (c) 2010, 2011, 2012, 2013 Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#ifndef L
32# define L(label)	.L##label
33#endif
34
35#ifndef cfi_startproc
36# define cfi_startproc	.cfi_startproc
37#endif
38
39#ifndef cfi_endproc
40# define cfi_endproc	.cfi_endproc
41#endif
42
43#ifndef cfi_rel_offset
44# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
45#endif
46
47#ifndef cfi_restore
48# define cfi_restore(reg)	.cfi_restore reg
49#endif
50
51#ifndef cfi_adjust_cfa_offset
52# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
53#endif
54
55#ifndef cfi_remember_state
56# define cfi_remember_state	.cfi_remember_state
57#endif
58
59#ifndef cfi_restore_state
60# define cfi_restore_state	.cfi_restore_state
61#endif
62
63#ifndef ENTRY
64# define ENTRY(name)             \
65	.type name, @function;   \
66	.globl name;             \
67	.p2align 4;              \
68name:                            \
69	cfi_startproc
70#endif
71
72#ifndef END
73# define END(name)               \
74	cfi_endproc;             \
75	.size name, .-name
76#endif
77
78#ifndef MEMCMP
79# define MEMCMP	memcmp_atom
80#endif
81
82#define CFI_PUSH(REG)	\
83	cfi_adjust_cfa_offset (4);	\
84	cfi_rel_offset (REG, 0)
85
86#define CFI_POP(REG)	\
87	cfi_adjust_cfa_offset (-4);	\
88	cfi_restore (REG)
89
90#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
91#define POP(REG)	popl REG; CFI_POP (REG)
92
93#define PARMS		4
94#define BLK1		PARMS
95#define BLK2		BLK1+4
96#define LEN		BLK2+4
97#define RETURN_END	POP (%edi); POP (%esi); POP (%ebx); ret
98#define RETURN		RETURN_END; cfi_restore_state; cfi_remember_state
99
100/* Warning!
101           wmemcmp has to use SIGNED comparison for elements.
102           memcmp has to use UNSIGNED comparison for elemnts.
103*/
104
105	.text
106ENTRY (MEMCMP)
107	movl	LEN(%esp), %ecx
108
109#ifdef USE_WCHAR
110	shl	$2, %ecx
111	jz	L(zero)
112#elif defined USE_UTF16
113	shl	$1, %ecx
114	jz	L(zero)
115#endif
116
117	movl	BLK1(%esp), %eax
118	cmp	$48, %ecx
119	movl	BLK2(%esp), %edx
120	jae	L(48bytesormore)
121
122#if !defined(USE_WCHAR) && !defined(USE_UTF16)
123	cmp	$1, %ecx
124	jbe	L(less1bytes)
125#endif
126
127	PUSH	(%ebx)
128	add	%ecx, %edx
129	add	%ecx, %eax
130	jmp	L(less48bytes)
131
132	CFI_POP	(%ebx)
133
134#if !defined(USE_WCHAR) && !defined(USE_UTF16)
135	.p2align 4
136L(less1bytes):
137	jb	L(zero)
138	movb	(%eax), %cl
139	cmp	(%edx), %cl
140	je	L(zero)
141	mov	$1, %eax
142	ja	L(1bytesend)
143	neg	%eax
144L(1bytesend):
145	ret
146#endif
147
148	.p2align 4
149L(zero):
150	xor	%eax, %eax
151	ret
152
153	.p2align 4
154L(48bytesormore):
155	PUSH	(%ebx)
156	PUSH	(%esi)
157	PUSH	(%edi)
158	cfi_remember_state
159	movdqu	(%eax), %xmm3
160	movdqu	(%edx), %xmm0
161	movl	%eax, %edi
162	movl	%edx, %esi
163	pcmpeqb	%xmm0, %xmm3
164	pmovmskb %xmm3, %edx
165	lea	16(%edi), %edi
166
167	sub	$0xffff, %edx
168	lea	16(%esi), %esi
169	jnz	L(less16bytes)
170	mov	%edi, %edx
171	and	$0xf, %edx
172	xor	%edx, %edi
173	sub	%edx, %esi
174	add	%edx, %ecx
175	mov	%esi, %edx
176	and	$0xf, %edx
177	jz	L(shr_0)
178	xor	%edx, %esi
179
180#if !defined(USE_WCHAR) && !defined(USE_UTF16)
181	cmp	$8, %edx
182	jae	L(next_unaligned_table)
183	cmp	$0, %edx
184	je	L(shr_0)
185	cmp	$1, %edx
186	je	L(shr_1)
187	cmp	$2, %edx
188	je	L(shr_2)
189	cmp	$3, %edx
190	je	L(shr_3)
191	cmp	$4, %edx
192	je	L(shr_4)
193	cmp	$5, %edx
194	je	L(shr_5)
195	cmp	$6, %edx
196	je	L(shr_6)
197	jmp	L(shr_7)
198
199	.p2align 2
200L(next_unaligned_table):
201	cmp	$8, %edx
202	je	L(shr_8)
203	cmp	$9, %edx
204	je	L(shr_9)
205	cmp	$10, %edx
206	je	L(shr_10)
207	cmp	$11, %edx
208	je	L(shr_11)
209	cmp	$12, %edx
210	je	L(shr_12)
211	cmp	$13, %edx
212	je	L(shr_13)
213	cmp	$14, %edx
214	je	L(shr_14)
215	jmp	L(shr_15)
216#elif defined(USE_WCHAR)
217	cmp	$0, %edx
218	je	L(shr_0)
219	cmp	$4, %edx
220	je	L(shr_4)
221	cmp	$8, %edx
222	je	L(shr_8)
223	jmp	L(shr_12)
224#elif defined(USE_UTF16)
225	cmp	$0, %edx
226	je	L(shr_0)
227	cmp	$2, %edx
228	je	L(shr_2)
229	cmp	$4, %edx
230	je	L(shr_4)
231	cmp	$6, %edx
232	je	L(shr_6)
233	cmp	$8, %edx
234	je	L(shr_8)
235	cmp	$10, %edx
236	je	L(shr_10)
237	cmp	$12, %edx
238	je	L(shr_12)
239	jmp	L(shr_14)
240#endif
241
242	.p2align 4
243L(shr_0):
244	cmp	$80, %ecx
245	jae	L(shr_0_gobble)
246	lea	-48(%ecx), %ecx
247	xor	%eax, %eax
248	movaps	(%esi), %xmm1
249	pcmpeqb	(%edi), %xmm1
250	movaps	16(%esi), %xmm2
251	pcmpeqb	16(%edi), %xmm2
252	pand	%xmm1, %xmm2
253	pmovmskb %xmm2, %edx
254	add	$32, %edi
255	add	$32, %esi
256	sub	$0xffff, %edx
257	jnz	L(exit)
258
259	lea	(%ecx, %edi,1), %eax
260	lea	(%ecx, %esi,1), %edx
261	POP	(%edi)
262	POP	(%esi)
263	jmp	L(less48bytes)
264
265	cfi_restore_state
266	cfi_remember_state
267	.p2align 4
268L(shr_0_gobble):
269	lea	-48(%ecx), %ecx
270	movdqa	(%esi), %xmm0
271	xor	%eax, %eax
272	pcmpeqb	(%edi), %xmm0
273	sub	$32, %ecx
274	movdqa	16(%esi), %xmm2
275	pcmpeqb	16(%edi), %xmm2
276L(shr_0_gobble_loop):
277	pand	%xmm0, %xmm2
278	sub	$32, %ecx
279	pmovmskb %xmm2, %edx
280	movdqa	%xmm0, %xmm1
281	movdqa	32(%esi), %xmm0
282	movdqa	48(%esi), %xmm2
283	sbb	$0xffff, %edx
284	pcmpeqb	32(%edi), %xmm0
285	pcmpeqb	48(%edi), %xmm2
286	lea	32(%edi), %edi
287	lea	32(%esi), %esi
288	jz	L(shr_0_gobble_loop)
289
290	pand	%xmm0, %xmm2
291	cmp	$0, %ecx
292	jge	L(shr_0_gobble_loop_next)
293	inc	%edx
294	add	$32, %ecx
295L(shr_0_gobble_loop_next):
296	test	%edx, %edx
297	jnz	L(exit)
298
299	pmovmskb %xmm2, %edx
300	movdqa	%xmm0, %xmm1
301	lea	32(%edi), %edi
302	lea	32(%esi), %esi
303	sub	$0xffff, %edx
304	jnz	L(exit)
305	lea	(%ecx, %edi,1), %eax
306	lea	(%ecx, %esi,1), %edx
307	POP	(%edi)
308	POP	(%esi)
309	jmp	L(less48bytes)
310
311#if !defined(USE_WCHAR) && !defined(USE_UTF16)
312	cfi_restore_state
313	cfi_remember_state
314	.p2align 4
315L(shr_1):
316	cmp	$80, %ecx
317	lea	-48(%ecx), %ecx
318	mov	%edx, %eax
319	jae	L(shr_1_gobble)
320
321	movdqa	16(%esi), %xmm1
322	movdqa	%xmm1, %xmm2
323	palignr	$1,(%esi), %xmm1
324	pcmpeqb	(%edi), %xmm1
325
326	movdqa	32(%esi), %xmm3
327	palignr	$1,%xmm2, %xmm3
328	pcmpeqb	16(%edi), %xmm3
329
330	pand	%xmm1, %xmm3
331	pmovmskb %xmm3, %edx
332	lea	32(%edi), %edi
333	lea	32(%esi), %esi
334	sub	$0xffff, %edx
335	jnz	L(exit)
336	lea	(%ecx, %edi,1), %eax
337	lea	1(%ecx, %esi,1), %edx
338	POP	(%edi)
339	POP	(%esi)
340	jmp	L(less48bytes)
341
342	cfi_restore_state
343	cfi_remember_state
344	.p2align 4
345L(shr_1_gobble):
346	sub	$32, %ecx
347	movdqa	16(%esi), %xmm0
348	palignr	$1,(%esi), %xmm0
349	pcmpeqb	(%edi), %xmm0
350
351	movdqa	32(%esi), %xmm3
352	palignr	$1,16(%esi), %xmm3
353	pcmpeqb	16(%edi), %xmm3
354
355L(shr_1_gobble_loop):
356	pand	%xmm0, %xmm3
357	sub	$32, %ecx
358	pmovmskb %xmm3, %edx
359	movdqa	%xmm0, %xmm1
360
361	movdqa	64(%esi), %xmm3
362	palignr	$1,48(%esi), %xmm3
363	sbb	$0xffff, %edx
364	movdqa	48(%esi), %xmm0
365	palignr	$1,32(%esi), %xmm0
366	pcmpeqb	32(%edi), %xmm0
367	lea	32(%esi), %esi
368	pcmpeqb	48(%edi), %xmm3
369
370	lea	32(%edi), %edi
371	jz	L(shr_1_gobble_loop)
372	pand	%xmm0, %xmm3
373
374	cmp	$0, %ecx
375	jge	L(shr_1_gobble_next)
376	inc	%edx
377	add	$32, %ecx
378L(shr_1_gobble_next):
379	test	%edx, %edx
380	jnz	L(exit)
381
382	pmovmskb %xmm3, %edx
383	movdqa	%xmm0, %xmm1
384	lea	32(%edi), %edi
385	lea	32(%esi), %esi
386	sub	$0xffff, %edx
387	jnz	L(exit)
388
389	lea	(%ecx, %edi,1), %eax
390	lea	1(%ecx, %esi,1), %edx
391	POP	(%edi)
392	POP	(%esi)
393	jmp	L(less48bytes)
394#endif
395
396
397#if !defined(USE_WCHAR)
398	cfi_restore_state
399	cfi_remember_state
400	.p2align 4
401L(shr_2):
402	cmp	$80, %ecx
403	lea	-48(%ecx), %ecx
404	mov	%edx, %eax
405	jae	L(shr_2_gobble)
406
407	movdqa	16(%esi), %xmm1
408	movdqa	%xmm1, %xmm2
409	palignr	$2,(%esi), %xmm1
410	pcmpeqb	(%edi), %xmm1
411
412	movdqa	32(%esi), %xmm3
413	palignr	$2,%xmm2, %xmm3
414	pcmpeqb	16(%edi), %xmm3
415
416	pand	%xmm1, %xmm3
417	pmovmskb %xmm3, %edx
418	lea	32(%edi), %edi
419	lea	32(%esi), %esi
420	sub	$0xffff, %edx
421	jnz	L(exit)
422	lea	(%ecx, %edi,1), %eax
423	lea	2(%ecx, %esi,1), %edx
424	POP	(%edi)
425	POP	(%esi)
426	jmp	L(less48bytes)
427
428	cfi_restore_state
429	cfi_remember_state
430	.p2align 4
431L(shr_2_gobble):
432	sub	$32, %ecx
433	movdqa	16(%esi), %xmm0
434	palignr	$2,(%esi), %xmm0
435	pcmpeqb	(%edi), %xmm0
436
437	movdqa	32(%esi), %xmm3
438	palignr	$2,16(%esi), %xmm3
439	pcmpeqb	16(%edi), %xmm3
440
441L(shr_2_gobble_loop):
442	pand	%xmm0, %xmm3
443	sub	$32, %ecx
444	pmovmskb %xmm3, %edx
445	movdqa	%xmm0, %xmm1
446
447	movdqa	64(%esi), %xmm3
448	palignr	$2,48(%esi), %xmm3
449	sbb	$0xffff, %edx
450	movdqa	48(%esi), %xmm0
451	palignr	$2,32(%esi), %xmm0
452	pcmpeqb	32(%edi), %xmm0
453	lea	32(%esi), %esi
454	pcmpeqb	48(%edi), %xmm3
455
456	lea	32(%edi), %edi
457	jz	L(shr_2_gobble_loop)
458	pand	%xmm0, %xmm3
459
460	cmp	$0, %ecx
461	jge	L(shr_2_gobble_next)
462	inc	%edx
463	add	$32, %ecx
464L(shr_2_gobble_next):
465	test	%edx, %edx
466	jnz	L(exit)
467
468	pmovmskb %xmm3, %edx
469	movdqa	%xmm0, %xmm1
470	lea	32(%edi), %edi
471	lea	32(%esi), %esi
472	sub	$0xffff, %edx
473	jnz	L(exit)
474
475	lea	(%ecx, %edi,1), %eax
476	lea	2(%ecx, %esi,1), %edx
477	POP	(%edi)
478	POP	(%esi)
479	jmp	L(less48bytes)
480#endif
481
482#if !defined(USE_WCHAR) && !defined(USE_UTF16)
483	cfi_restore_state
484	cfi_remember_state
485	.p2align 4
486L(shr_3):
487	cmp	$80, %ecx
488	lea	-48(%ecx), %ecx
489	mov	%edx, %eax
490	jae	L(shr_3_gobble)
491
492	movdqa	16(%esi), %xmm1
493	movdqa	%xmm1, %xmm2
494	palignr	$3,(%esi), %xmm1
495	pcmpeqb	(%edi), %xmm1
496
497	movdqa	32(%esi), %xmm3
498	palignr	$3,%xmm2, %xmm3
499	pcmpeqb	16(%edi), %xmm3
500
501	pand	%xmm1, %xmm3
502	pmovmskb %xmm3, %edx
503	lea	32(%edi), %edi
504	lea	32(%esi), %esi
505	sub	$0xffff, %edx
506	jnz	L(exit)
507	lea	(%ecx, %edi,1), %eax
508	lea	3(%ecx, %esi,1), %edx
509	POP	(%edi)
510	POP	(%esi)
511	jmp	L(less48bytes)
512
513	cfi_restore_state
514	cfi_remember_state
515	.p2align 4
516L(shr_3_gobble):
517	sub	$32, %ecx
518	movdqa	16(%esi), %xmm0
519	palignr	$3,(%esi), %xmm0
520	pcmpeqb	(%edi), %xmm0
521
522	movdqa	32(%esi), %xmm3
523	palignr	$3,16(%esi), %xmm3
524	pcmpeqb	16(%edi), %xmm3
525
526L(shr_3_gobble_loop):
527	pand	%xmm0, %xmm3
528	sub	$32, %ecx
529	pmovmskb %xmm3, %edx
530	movdqa	%xmm0, %xmm1
531
532	movdqa	64(%esi), %xmm3
533	palignr	$3,48(%esi), %xmm3
534	sbb	$0xffff, %edx
535	movdqa	48(%esi), %xmm0
536	palignr	$3,32(%esi), %xmm0
537	pcmpeqb	32(%edi), %xmm0
538	lea	32(%esi), %esi
539	pcmpeqb	48(%edi), %xmm3
540
541	lea	32(%edi), %edi
542	jz	L(shr_3_gobble_loop)
543	pand	%xmm0, %xmm3
544
545	cmp	$0, %ecx
546	jge	L(shr_3_gobble_next)
547	inc	%edx
548	add	$32, %ecx
549L(shr_3_gobble_next):
550	test	%edx, %edx
551	jnz	L(exit)
552
553	pmovmskb %xmm3, %edx
554	movdqa	%xmm0, %xmm1
555	lea	32(%edi), %edi
556	lea	32(%esi), %esi
557	sub	$0xffff, %edx
558	jnz	L(exit)
559
560	lea	(%ecx, %edi,1), %eax
561	lea	3(%ecx, %esi,1), %edx
562	POP	(%edi)
563	POP	(%esi)
564	jmp	L(less48bytes)
565#endif
566
567	cfi_restore_state
568	cfi_remember_state
569	.p2align 4
570L(shr_4):
571	cmp	$80, %ecx
572	lea	-48(%ecx), %ecx
573	mov	%edx, %eax
574	jae	L(shr_4_gobble)
575
576	movdqa	16(%esi), %xmm1
577	movdqa	%xmm1, %xmm2
578	palignr	$4,(%esi), %xmm1
579	pcmpeqb	(%edi), %xmm1
580
581	movdqa	32(%esi), %xmm3
582	palignr	$4,%xmm2, %xmm3
583	pcmpeqb	16(%edi), %xmm3
584
585	pand	%xmm1, %xmm3
586	pmovmskb %xmm3, %edx
587	lea	32(%edi), %edi
588	lea	32(%esi), %esi
589	sub	$0xffff, %edx
590	jnz	L(exit)
591	lea	(%ecx, %edi,1), %eax
592	lea	4(%ecx, %esi,1), %edx
593	POP	(%edi)
594	POP	(%esi)
595	jmp	L(less48bytes)
596
597	cfi_restore_state
598	cfi_remember_state
599	.p2align 4
600L(shr_4_gobble):
601	sub	$32, %ecx
602	movdqa	16(%esi), %xmm0
603	palignr	$4,(%esi), %xmm0
604	pcmpeqb	(%edi), %xmm0
605
606	movdqa	32(%esi), %xmm3
607	palignr	$4,16(%esi), %xmm3
608	pcmpeqb	16(%edi), %xmm3
609
610L(shr_4_gobble_loop):
611	pand	%xmm0, %xmm3
612	sub	$32, %ecx
613	pmovmskb %xmm3, %edx
614	movdqa	%xmm0, %xmm1
615
616	movdqa	64(%esi), %xmm3
617	palignr	$4,48(%esi), %xmm3
618	sbb	$0xffff, %edx
619	movdqa	48(%esi), %xmm0
620	palignr	$4,32(%esi), %xmm0
621	pcmpeqb	32(%edi), %xmm0
622	lea	32(%esi), %esi
623	pcmpeqb	48(%edi), %xmm3
624
625	lea	32(%edi), %edi
626	jz	L(shr_4_gobble_loop)
627	pand	%xmm0, %xmm3
628
629	cmp	$0, %ecx
630	jge	L(shr_4_gobble_next)
631	inc	%edx
632	add	$32, %ecx
633L(shr_4_gobble_next):
634	test	%edx, %edx
635	jnz	L(exit)
636
637	pmovmskb %xmm3, %edx
638	movdqa	%xmm0, %xmm1
639	lea	32(%edi), %edi
640	lea	32(%esi), %esi
641	sub	$0xffff, %edx
642	jnz	L(exit)
643
644	lea	(%ecx, %edi,1), %eax
645	lea	4(%ecx, %esi,1), %edx
646	POP	(%edi)
647	POP	(%esi)
648	jmp	L(less48bytes)
649
650#if !defined(USE_WCHAR) && !defined(USE_UTF16)
651	cfi_restore_state
652	cfi_remember_state
653	.p2align 4
654L(shr_5):
655	cmp	$80, %ecx
656	lea	-48(%ecx), %ecx
657	mov	%edx, %eax
658	jae	L(shr_5_gobble)
659
660	movdqa	16(%esi), %xmm1
661	movdqa	%xmm1, %xmm2
662	palignr	$5,(%esi), %xmm1
663	pcmpeqb	(%edi), %xmm1
664
665	movdqa	32(%esi), %xmm3
666	palignr	$5,%xmm2, %xmm3
667	pcmpeqb	16(%edi), %xmm3
668
669	pand	%xmm1, %xmm3
670	pmovmskb %xmm3, %edx
671	lea	32(%edi), %edi
672	lea	32(%esi), %esi
673	sub	$0xffff, %edx
674	jnz	L(exit)
675	lea	(%ecx, %edi,1), %eax
676	lea	5(%ecx, %esi,1), %edx
677	POP	(%edi)
678	POP	(%esi)
679	jmp	L(less48bytes)
680
681	cfi_restore_state
682	cfi_remember_state
683	.p2align 4
684L(shr_5_gobble):
685	sub	$32, %ecx
686	movdqa	16(%esi), %xmm0
687	palignr	$5,(%esi), %xmm0
688	pcmpeqb	(%edi), %xmm0
689
690	movdqa	32(%esi), %xmm3
691	palignr	$5,16(%esi), %xmm3
692	pcmpeqb	16(%edi), %xmm3
693
694L(shr_5_gobble_loop):
695	pand	%xmm0, %xmm3
696	sub	$32, %ecx
697	pmovmskb %xmm3, %edx
698	movdqa	%xmm0, %xmm1
699
700	movdqa	64(%esi), %xmm3
701	palignr	$5,48(%esi), %xmm3
702	sbb	$0xffff, %edx
703	movdqa	48(%esi), %xmm0
704	palignr	$5,32(%esi), %xmm0
705	pcmpeqb	32(%edi), %xmm0
706	lea	32(%esi), %esi
707	pcmpeqb	48(%edi), %xmm3
708
709	lea	32(%edi), %edi
710	jz	L(shr_5_gobble_loop)
711	pand	%xmm0, %xmm3
712
713	cmp	$0, %ecx
714	jge	L(shr_5_gobble_next)
715	inc	%edx
716	add	$32, %ecx
717L(shr_5_gobble_next):
718	test	%edx, %edx
719	jnz	L(exit)
720
721	pmovmskb %xmm3, %edx
722	movdqa	%xmm0, %xmm1
723	lea	32(%edi), %edi
724	lea	32(%esi), %esi
725	sub	$0xffff, %edx
726	jnz	L(exit)
727
728	lea	(%ecx, %edi,1), %eax
729	lea	5(%ecx, %esi,1), %edx
730	POP	(%edi)
731	POP	(%esi)
732	jmp	L(less48bytes)
733#endif
734
735#if !defined(USE_WCHAR)
736	cfi_restore_state
737	cfi_remember_state
738	.p2align 4
739L(shr_6):
740	cmp	$80, %ecx
741	lea	-48(%ecx), %ecx
742	mov	%edx, %eax
743	jae	L(shr_6_gobble)
744
745	movdqa	16(%esi), %xmm1
746	movdqa	%xmm1, %xmm2
747	palignr	$6,(%esi), %xmm1
748	pcmpeqb	(%edi), %xmm1
749
750	movdqa	32(%esi), %xmm3
751	palignr	$6,%xmm2, %xmm3
752	pcmpeqb	16(%edi), %xmm3
753
754	pand	%xmm1, %xmm3
755	pmovmskb %xmm3, %edx
756	lea	32(%edi), %edi
757	lea	32(%esi), %esi
758	sub	$0xffff, %edx
759	jnz	L(exit)
760	lea	(%ecx, %edi,1), %eax
761	lea	6(%ecx, %esi,1), %edx
762	POP	(%edi)
763	POP	(%esi)
764	jmp	L(less48bytes)
765
766	cfi_restore_state
767	cfi_remember_state
768	.p2align 4
769L(shr_6_gobble):
770	sub	$32, %ecx
771	movdqa	16(%esi), %xmm0
772	palignr	$6,(%esi), %xmm0
773	pcmpeqb	(%edi), %xmm0
774
775	movdqa	32(%esi), %xmm3
776	palignr	$6,16(%esi), %xmm3
777	pcmpeqb	16(%edi), %xmm3
778
779L(shr_6_gobble_loop):
780	pand	%xmm0, %xmm3
781	sub	$32, %ecx
782	pmovmskb %xmm3, %edx
783	movdqa	%xmm0, %xmm1
784
785	movdqa	64(%esi), %xmm3
786	palignr	$6,48(%esi), %xmm3
787	sbb	$0xffff, %edx
788	movdqa	48(%esi), %xmm0
789	palignr	$6,32(%esi), %xmm0
790	pcmpeqb	32(%edi), %xmm0
791	lea	32(%esi), %esi
792	pcmpeqb	48(%edi), %xmm3
793
794	lea	32(%edi), %edi
795	jz	L(shr_6_gobble_loop)
796	pand	%xmm0, %xmm3
797
798	cmp	$0, %ecx
799	jge	L(shr_6_gobble_next)
800	inc	%edx
801	add	$32, %ecx
802L(shr_6_gobble_next):
803	test	%edx, %edx
804	jnz	L(exit)
805
806	pmovmskb %xmm3, %edx
807	movdqa	%xmm0, %xmm1
808	lea	32(%edi), %edi
809	lea	32(%esi), %esi
810	sub	$0xffff, %edx
811	jnz	L(exit)
812
813	lea	(%ecx, %edi,1), %eax
814	lea	6(%ecx, %esi,1), %edx
815	POP	(%edi)
816	POP	(%esi)
817	jmp	L(less48bytes)
818#endif
819
820#if !defined(USE_WCHAR) && !defined(USE_UTF16)
821	cfi_restore_state
822	cfi_remember_state
823	.p2align 4
824L(shr_7):
825	cmp	$80, %ecx
826	lea	-48(%ecx), %ecx
827	mov	%edx, %eax
828	jae	L(shr_7_gobble)
829
830	movdqa	16(%esi), %xmm1
831	movdqa	%xmm1, %xmm2
832	palignr	$7,(%esi), %xmm1
833	pcmpeqb	(%edi), %xmm1
834
835	movdqa	32(%esi), %xmm3
836	palignr	$7,%xmm2, %xmm3
837	pcmpeqb	16(%edi), %xmm3
838
839	pand	%xmm1, %xmm3
840	pmovmskb %xmm3, %edx
841	lea	32(%edi), %edi
842	lea	32(%esi), %esi
843	sub	$0xffff, %edx
844	jnz	L(exit)
845	lea	(%ecx, %edi,1), %eax
846	lea	7(%ecx, %esi,1), %edx
847	POP	(%edi)
848	POP	(%esi)
849	jmp	L(less48bytes)
850
851	cfi_restore_state
852	cfi_remember_state
853	.p2align 4
854L(shr_7_gobble):
855	sub	$32, %ecx
856	movdqa	16(%esi), %xmm0
857	palignr	$7,(%esi), %xmm0
858	pcmpeqb	(%edi), %xmm0
859
860	movdqa	32(%esi), %xmm3
861	palignr	$7,16(%esi), %xmm3
862	pcmpeqb	16(%edi), %xmm3
863
864L(shr_7_gobble_loop):
865	pand	%xmm0, %xmm3
866	sub	$32, %ecx
867	pmovmskb %xmm3, %edx
868	movdqa	%xmm0, %xmm1
869
870	movdqa	64(%esi), %xmm3
871	palignr	$7,48(%esi), %xmm3
872	sbb	$0xffff, %edx
873	movdqa	48(%esi), %xmm0
874	palignr	$7,32(%esi), %xmm0
875	pcmpeqb	32(%edi), %xmm0
876	lea	32(%esi), %esi
877	pcmpeqb	48(%edi), %xmm3
878
879	lea	32(%edi), %edi
880	jz	L(shr_7_gobble_loop)
881	pand	%xmm0, %xmm3
882
883	cmp	$0, %ecx
884	jge	L(shr_7_gobble_next)
885	inc	%edx
886	add	$32, %ecx
887L(shr_7_gobble_next):
888	test	%edx, %edx
889	jnz	L(exit)
890
891	pmovmskb %xmm3, %edx
892	movdqa	%xmm0, %xmm1
893	lea	32(%edi), %edi
894	lea	32(%esi), %esi
895	sub	$0xffff, %edx
896	jnz	L(exit)
897
898	lea	(%ecx, %edi,1), %eax
899	lea	7(%ecx, %esi,1), %edx
900	POP	(%edi)
901	POP	(%esi)
902	jmp	L(less48bytes)
903#endif
904
905	cfi_restore_state
906	cfi_remember_state
907	.p2align 4
908L(shr_8):
909	cmp	$80, %ecx
910	lea	-48(%ecx), %ecx
911	mov	%edx, %eax
912	jae	L(shr_8_gobble)
913
914	movdqa	16(%esi), %xmm1
915	movdqa	%xmm1, %xmm2
916	palignr	$8,(%esi), %xmm1
917	pcmpeqb	(%edi), %xmm1
918
919	movdqa	32(%esi), %xmm3
920	palignr	$8,%xmm2, %xmm3
921	pcmpeqb	16(%edi), %xmm3
922
923	pand	%xmm1, %xmm3
924	pmovmskb %xmm3, %edx
925	lea	32(%edi), %edi
926	lea	32(%esi), %esi
927	sub	$0xffff, %edx
928	jnz	L(exit)
929	lea	(%ecx, %edi,1), %eax
930	lea	8(%ecx, %esi,1), %edx
931	POP	(%edi)
932	POP	(%esi)
933	jmp	L(less48bytes)
934
935	cfi_restore_state
936	cfi_remember_state
937	.p2align 4
938L(shr_8_gobble):
939	sub	$32, %ecx
940	movdqa	16(%esi), %xmm0
941	palignr	$8,(%esi), %xmm0
942	pcmpeqb	(%edi), %xmm0
943
944	movdqa	32(%esi), %xmm3
945	palignr	$8,16(%esi), %xmm3
946	pcmpeqb	16(%edi), %xmm3
947
948L(shr_8_gobble_loop):
949	pand	%xmm0, %xmm3
950	sub	$32, %ecx
951	pmovmskb %xmm3, %edx
952	movdqa	%xmm0, %xmm1
953
954	movdqa	64(%esi), %xmm3
955	palignr	$8,48(%esi), %xmm3
956	sbb	$0xffff, %edx
957	movdqa	48(%esi), %xmm0
958	palignr	$8,32(%esi), %xmm0
959	pcmpeqb	32(%edi), %xmm0
960	lea	32(%esi), %esi
961	pcmpeqb	48(%edi), %xmm3
962
963	lea	32(%edi), %edi
964	jz	L(shr_8_gobble_loop)
965	pand	%xmm0, %xmm3
966
967	cmp	$0, %ecx
968	jge	L(shr_8_gobble_next)
969	inc	%edx
970	add	$32, %ecx
971L(shr_8_gobble_next):
972	test	%edx, %edx
973	jnz	L(exit)
974
975	pmovmskb %xmm3, %edx
976	movdqa	%xmm0, %xmm1
977	lea	32(%edi), %edi
978	lea	32(%esi), %esi
979	sub	$0xffff, %edx
980	jnz	L(exit)
981
982	lea	(%ecx, %edi,1), %eax
983	lea	8(%ecx, %esi,1), %edx
984	POP	(%edi)
985	POP	(%esi)
986	jmp	L(less48bytes)
987
988#if !defined(USE_WCHAR) && !defined(USE_UTF16)
989	cfi_restore_state
990	cfi_remember_state
991	.p2align 4
992L(shr_9):
993	cmp	$80, %ecx
994	lea	-48(%ecx), %ecx
995	mov	%edx, %eax
996	jae	L(shr_9_gobble)
997
998	movdqa	16(%esi), %xmm1
999	movdqa	%xmm1, %xmm2
1000	palignr	$9,(%esi), %xmm1
1001	pcmpeqb	(%edi), %xmm1
1002
1003	movdqa	32(%esi), %xmm3
1004	palignr	$9,%xmm2, %xmm3
1005	pcmpeqb	16(%edi), %xmm3
1006
1007	pand	%xmm1, %xmm3
1008	pmovmskb %xmm3, %edx
1009	lea	32(%edi), %edi
1010	lea	32(%esi), %esi
1011	sub	$0xffff, %edx
1012	jnz	L(exit)
1013	lea	(%ecx, %edi,1), %eax
1014	lea	9(%ecx, %esi,1), %edx
1015	POP	(%edi)
1016	POP	(%esi)
1017	jmp	L(less48bytes)
1018
1019	cfi_restore_state
1020	cfi_remember_state
1021	.p2align 4
1022L(shr_9_gobble):
1023	sub	$32, %ecx
1024	movdqa	16(%esi), %xmm0
1025	palignr	$9,(%esi), %xmm0
1026	pcmpeqb	(%edi), %xmm0
1027
1028	movdqa	32(%esi), %xmm3
1029	palignr	$9,16(%esi), %xmm3
1030	pcmpeqb	16(%edi), %xmm3
1031
1032L(shr_9_gobble_loop):
1033	pand	%xmm0, %xmm3
1034	sub	$32, %ecx
1035	pmovmskb %xmm3, %edx
1036	movdqa	%xmm0, %xmm1
1037
1038	movdqa	64(%esi), %xmm3
1039	palignr	$9,48(%esi), %xmm3
1040	sbb	$0xffff, %edx
1041	movdqa	48(%esi), %xmm0
1042	palignr	$9,32(%esi), %xmm0
1043	pcmpeqb	32(%edi), %xmm0
1044	lea	32(%esi), %esi
1045	pcmpeqb	48(%edi), %xmm3
1046
1047	lea	32(%edi), %edi
1048	jz	L(shr_9_gobble_loop)
1049	pand	%xmm0, %xmm3
1050
1051	cmp	$0, %ecx
1052	jge	L(shr_9_gobble_next)
1053	inc	%edx
1054	add	$32, %ecx
1055L(shr_9_gobble_next):
1056	test	%edx, %edx
1057	jnz	L(exit)
1058
1059	pmovmskb %xmm3, %edx
1060	movdqa	%xmm0, %xmm1
1061	lea	32(%edi), %edi
1062	lea	32(%esi), %esi
1063	sub	$0xffff, %edx
1064	jnz	L(exit)
1065
1066	lea	(%ecx, %edi,1), %eax
1067	lea	9(%ecx, %esi,1), %edx
1068	POP	(%edi)
1069	POP	(%esi)
1070	jmp	L(less48bytes)
1071#endif
1072
1073#if !defined(USE_WCHAR)
1074	cfi_restore_state
1075	cfi_remember_state
1076	.p2align 4
1077L(shr_10):
1078	cmp	$80, %ecx
1079	lea	-48(%ecx), %ecx
1080	mov	%edx, %eax
1081	jae	L(shr_10_gobble)
1082
1083	movdqa	16(%esi), %xmm1
1084	movdqa	%xmm1, %xmm2
1085	palignr	$10, (%esi), %xmm1
1086	pcmpeqb	(%edi), %xmm1
1087
1088	movdqa	32(%esi), %xmm3
1089	palignr	$10,%xmm2, %xmm3
1090	pcmpeqb	16(%edi), %xmm3
1091
1092	pand	%xmm1, %xmm3
1093	pmovmskb %xmm3, %edx
1094	lea	32(%edi), %edi
1095	lea	32(%esi), %esi
1096	sub	$0xffff, %edx
1097	jnz	L(exit)
1098	lea	(%ecx, %edi,1), %eax
1099	lea	10(%ecx, %esi,1), %edx
1100	POP	(%edi)
1101	POP	(%esi)
1102	jmp	L(less48bytes)
1103
1104	cfi_restore_state
1105	cfi_remember_state
1106	.p2align 4
1107L(shr_10_gobble):
1108	sub	$32, %ecx
1109	movdqa	16(%esi), %xmm0
1110	palignr	$10, (%esi), %xmm0
1111	pcmpeqb	(%edi), %xmm0
1112
1113	movdqa	32(%esi), %xmm3
1114	palignr	$10, 16(%esi), %xmm3
1115	pcmpeqb	16(%edi), %xmm3
1116
1117L(shr_10_gobble_loop):
1118	pand	%xmm0, %xmm3
1119	sub	$32, %ecx
1120	pmovmskb %xmm3, %edx
1121	movdqa	%xmm0, %xmm1
1122
1123	movdqa	64(%esi), %xmm3
1124	palignr	$10,48(%esi), %xmm3
1125	sbb	$0xffff, %edx
1126	movdqa	48(%esi), %xmm0
1127	palignr	$10,32(%esi), %xmm0
1128	pcmpeqb	32(%edi), %xmm0
1129	lea	32(%esi), %esi
1130	pcmpeqb	48(%edi), %xmm3
1131
1132	lea	32(%edi), %edi
1133	jz	L(shr_10_gobble_loop)
1134	pand	%xmm0, %xmm3
1135
1136	cmp	$0, %ecx
1137	jge	L(shr_10_gobble_next)
1138	inc	%edx
1139	add	$32, %ecx
1140L(shr_10_gobble_next):
1141	test	%edx, %edx
1142	jnz	L(exit)
1143
1144	pmovmskb %xmm3, %edx
1145	movdqa	%xmm0, %xmm1
1146	lea	32(%edi), %edi
1147	lea	32(%esi), %esi
1148	sub	$0xffff, %edx
1149	jnz	L(exit)
1150
1151	lea	(%ecx, %edi,1), %eax
1152	lea	10(%ecx, %esi,1), %edx
1153	POP	(%edi)
1154	POP	(%esi)
1155	jmp	L(less48bytes)
1156#endif
1157
1158#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1159	cfi_restore_state
1160	cfi_remember_state
1161	.p2align 4
1162L(shr_11):
1163	cmp	$80, %ecx
1164	lea	-48(%ecx), %ecx
1165	mov	%edx, %eax
1166	jae	L(shr_11_gobble)
1167
1168	movdqa	16(%esi), %xmm1
1169	movdqa	%xmm1, %xmm2
1170	palignr	$11, (%esi), %xmm1
1171	pcmpeqb	(%edi), %xmm1
1172
1173	movdqa	32(%esi), %xmm3
1174	palignr	$11, %xmm2, %xmm3
1175	pcmpeqb	16(%edi), %xmm3
1176
1177	pand	%xmm1, %xmm3
1178	pmovmskb %xmm3, %edx
1179	lea	32(%edi), %edi
1180	lea	32(%esi), %esi
1181	sub	$0xffff, %edx
1182	jnz	L(exit)
1183	lea	(%ecx, %edi,1), %eax
1184	lea	11(%ecx, %esi,1), %edx
1185	POP	(%edi)
1186	POP	(%esi)
1187	jmp	L(less48bytes)
1188
1189	cfi_restore_state
1190	cfi_remember_state
1191	.p2align 4
1192L(shr_11_gobble):
1193	sub	$32, %ecx
1194	movdqa	16(%esi), %xmm0
1195	palignr	$11, (%esi), %xmm0
1196	pcmpeqb	(%edi), %xmm0
1197
1198	movdqa	32(%esi), %xmm3
1199	palignr	$11, 16(%esi), %xmm3
1200	pcmpeqb	16(%edi), %xmm3
1201
1202L(shr_11_gobble_loop):
1203	pand	%xmm0, %xmm3
1204	sub	$32, %ecx
1205	pmovmskb %xmm3, %edx
1206	movdqa	%xmm0, %xmm1
1207
1208	movdqa	64(%esi), %xmm3
1209	palignr	$11,48(%esi), %xmm3
1210	sbb	$0xffff, %edx
1211	movdqa	48(%esi), %xmm0
1212	palignr	$11,32(%esi), %xmm0
1213	pcmpeqb	32(%edi), %xmm0
1214	lea	32(%esi), %esi
1215	pcmpeqb	48(%edi), %xmm3
1216
1217	lea	32(%edi), %edi
1218	jz	L(shr_11_gobble_loop)
1219	pand	%xmm0, %xmm3
1220
1221	cmp	$0, %ecx
1222	jge	L(shr_11_gobble_next)
1223	inc	%edx
1224	add	$32, %ecx
1225L(shr_11_gobble_next):
1226	test	%edx, %edx
1227	jnz	L(exit)
1228
1229	pmovmskb %xmm3, %edx
1230	movdqa	%xmm0, %xmm1
1231	lea	32(%edi), %edi
1232	lea	32(%esi), %esi
1233	sub	$0xffff, %edx
1234	jnz	L(exit)
1235
1236	lea	(%ecx, %edi,1), %eax
1237	lea	11(%ecx, %esi,1), %edx
1238	POP	(%edi)
1239	POP	(%esi)
1240	jmp	L(less48bytes)
1241#endif
1242
1243	cfi_restore_state
1244	cfi_remember_state
1245	.p2align 4
1246L(shr_12):
1247	cmp	$80, %ecx
1248	lea	-48(%ecx), %ecx
1249	mov	%edx, %eax
1250	jae	L(shr_12_gobble)
1251
1252	movdqa	16(%esi), %xmm1
1253	movdqa	%xmm1, %xmm2
1254	palignr	$12, (%esi), %xmm1
1255	pcmpeqb	(%edi), %xmm1
1256
1257	movdqa	32(%esi), %xmm3
1258	palignr	$12, %xmm2, %xmm3
1259	pcmpeqb	16(%edi), %xmm3
1260
1261	pand	%xmm1, %xmm3
1262	pmovmskb %xmm3, %edx
1263	lea	32(%edi), %edi
1264	lea	32(%esi), %esi
1265	sub	$0xffff, %edx
1266	jnz	L(exit)
1267	lea	(%ecx, %edi,1), %eax
1268	lea	12(%ecx, %esi,1), %edx
1269	POP	(%edi)
1270	POP	(%esi)
1271	jmp	L(less48bytes)
1272
1273	cfi_restore_state
1274	cfi_remember_state
1275	.p2align 4
1276L(shr_12_gobble):
1277	sub	$32, %ecx
1278	movdqa	16(%esi), %xmm0
1279	palignr	$12, (%esi), %xmm0
1280	pcmpeqb	(%edi), %xmm0
1281
1282	movdqa	32(%esi), %xmm3
1283	palignr	$12, 16(%esi), %xmm3
1284	pcmpeqb	16(%edi), %xmm3
1285
1286L(shr_12_gobble_loop):
1287	pand	%xmm0, %xmm3
1288	sub	$32, %ecx
1289	pmovmskb %xmm3, %edx
1290	movdqa	%xmm0, %xmm1
1291
1292	movdqa	64(%esi), %xmm3
1293	palignr	$12,48(%esi), %xmm3
1294	sbb	$0xffff, %edx
1295	movdqa	48(%esi), %xmm0
1296	palignr	$12,32(%esi), %xmm0
1297	pcmpeqb	32(%edi), %xmm0
1298	lea	32(%esi), %esi
1299	pcmpeqb	48(%edi), %xmm3
1300
1301	lea	32(%edi), %edi
1302	jz	L(shr_12_gobble_loop)
1303	pand	%xmm0, %xmm3
1304
1305	cmp	$0, %ecx
1306	jge	L(shr_12_gobble_next)
1307	inc	%edx
1308	add	$32, %ecx
1309L(shr_12_gobble_next):
1310	test	%edx, %edx
1311	jnz	L(exit)
1312
1313	pmovmskb %xmm3, %edx
1314	movdqa	%xmm0, %xmm1
1315	lea	32(%edi), %edi
1316	lea	32(%esi), %esi
1317	sub	$0xffff, %edx
1318	jnz	L(exit)
1319
1320	lea	(%ecx, %edi,1), %eax
1321	lea	12(%ecx, %esi,1), %edx
1322	POP	(%edi)
1323	POP	(%esi)
1324	jmp	L(less48bytes)
1325
1326#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1327	cfi_restore_state
1328	cfi_remember_state
1329	.p2align 4
1330L(shr_13):
1331	cmp	$80, %ecx
1332	lea	-48(%ecx), %ecx
1333	mov	%edx, %eax
1334	jae	L(shr_13_gobble)
1335
1336	movdqa	16(%esi), %xmm1
1337	movdqa	%xmm1, %xmm2
1338	palignr	$13, (%esi), %xmm1
1339	pcmpeqb	(%edi), %xmm1
1340
1341	movdqa	32(%esi), %xmm3
1342	palignr	$13, %xmm2, %xmm3
1343	pcmpeqb	16(%edi), %xmm3
1344
1345	pand	%xmm1, %xmm3
1346	pmovmskb %xmm3, %edx
1347	lea	32(%edi), %edi
1348	lea	32(%esi), %esi
1349	sub	$0xffff, %edx
1350	jnz	L(exit)
1351	lea	(%ecx, %edi,1), %eax
1352	lea	13(%ecx, %esi,1), %edx
1353	POP	(%edi)
1354	POP	(%esi)
1355	jmp	L(less48bytes)
1356
1357	cfi_restore_state
1358	cfi_remember_state
1359	.p2align 4
1360L(shr_13_gobble):
1361	sub	$32, %ecx
1362	movdqa	16(%esi), %xmm0
1363	palignr	$13, (%esi), %xmm0
1364	pcmpeqb	(%edi), %xmm0
1365
1366	movdqa	32(%esi), %xmm3
1367	palignr	$13, 16(%esi), %xmm3
1368	pcmpeqb	16(%edi), %xmm3
1369
1370L(shr_13_gobble_loop):
1371	pand	%xmm0, %xmm3
1372	sub	$32, %ecx
1373	pmovmskb %xmm3, %edx
1374	movdqa	%xmm0, %xmm1
1375
1376	movdqa	64(%esi), %xmm3
1377	palignr	$13,48(%esi), %xmm3
1378	sbb	$0xffff, %edx
1379	movdqa	48(%esi), %xmm0
1380	palignr	$13,32(%esi), %xmm0
1381	pcmpeqb	32(%edi), %xmm0
1382	lea	32(%esi), %esi
1383	pcmpeqb	48(%edi), %xmm3
1384
1385	lea	32(%edi), %edi
1386	jz	L(shr_13_gobble_loop)
1387	pand	%xmm0, %xmm3
1388
1389	cmp	$0, %ecx
1390	jge	L(shr_13_gobble_next)
1391	inc	%edx
1392	add	$32, %ecx
1393L(shr_13_gobble_next):
1394	test	%edx, %edx
1395	jnz	L(exit)
1396
1397	pmovmskb %xmm3, %edx
1398	movdqa	%xmm0, %xmm1
1399	lea	32(%edi), %edi
1400	lea	32(%esi), %esi
1401	sub	$0xffff, %edx
1402	jnz	L(exit)
1403
1404	lea	(%ecx, %edi,1), %eax
1405	lea	13(%ecx, %esi,1), %edx
1406	POP	(%edi)
1407	POP	(%esi)
1408	jmp	L(less48bytes)
1409#endif
1410
1411#if !defined(USE_WCHAR)
1412	cfi_restore_state
1413	cfi_remember_state
1414	.p2align 4
1415L(shr_14):
1416	cmp	$80, %ecx
1417	lea	-48(%ecx), %ecx
1418	mov	%edx, %eax
1419	jae	L(shr_14_gobble)
1420
1421	movdqa	16(%esi), %xmm1
1422	movdqa	%xmm1, %xmm2
1423	palignr	$14, (%esi), %xmm1
1424	pcmpeqb	(%edi), %xmm1
1425
1426	movdqa	32(%esi), %xmm3
1427	palignr	$14, %xmm2, %xmm3
1428	pcmpeqb	16(%edi), %xmm3
1429
1430	pand	%xmm1, %xmm3
1431	pmovmskb %xmm3, %edx
1432	lea	32(%edi), %edi
1433	lea	32(%esi), %esi
1434	sub	$0xffff, %edx
1435	jnz	L(exit)
1436	lea	(%ecx, %edi,1), %eax
1437	lea	14(%ecx, %esi,1), %edx
1438	POP	(%edi)
1439	POP	(%esi)
1440	jmp	L(less48bytes)
1441
1442	cfi_restore_state
1443	cfi_remember_state
1444	.p2align 4
1445L(shr_14_gobble):
1446	sub	$32, %ecx
1447	movdqa	16(%esi), %xmm0
1448	palignr	$14, (%esi), %xmm0
1449	pcmpeqb	(%edi), %xmm0
1450
1451	movdqa	32(%esi), %xmm3
1452	palignr	$14, 16(%esi), %xmm3
1453	pcmpeqb	16(%edi), %xmm3
1454
1455L(shr_14_gobble_loop):
1456	pand	%xmm0, %xmm3
1457	sub	$32, %ecx
1458	pmovmskb %xmm3, %edx
1459	movdqa	%xmm0, %xmm1
1460
1461	movdqa	64(%esi), %xmm3
1462	palignr	$14,48(%esi), %xmm3
1463	sbb	$0xffff, %edx
1464	movdqa	48(%esi), %xmm0
1465	palignr	$14,32(%esi), %xmm0
1466	pcmpeqb	32(%edi), %xmm0
1467	lea	32(%esi), %esi
1468	pcmpeqb	48(%edi), %xmm3
1469
1470	lea	32(%edi), %edi
1471	jz	L(shr_14_gobble_loop)
1472	pand	%xmm0, %xmm3
1473
1474	cmp	$0, %ecx
1475	jge	L(shr_14_gobble_next)
1476	inc	%edx
1477	add	$32, %ecx
1478L(shr_14_gobble_next):
1479	test	%edx, %edx
1480	jnz	L(exit)
1481
1482	pmovmskb %xmm3, %edx
1483	movdqa	%xmm0, %xmm1
1484	lea	32(%edi), %edi
1485	lea	32(%esi), %esi
1486	sub	$0xffff, %edx
1487	jnz	L(exit)
1488
1489	lea	(%ecx, %edi,1), %eax
1490	lea	14(%ecx, %esi,1), %edx
1491	POP	(%edi)
1492	POP	(%esi)
1493	jmp	L(less48bytes)
1494#endif
1495
1496#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1497	cfi_restore_state
1498	cfi_remember_state
1499	.p2align 4
1500L(shr_15):
1501	cmp	$80, %ecx
1502	lea	-48(%ecx), %ecx
1503	mov	%edx, %eax
1504	jae	L(shr_15_gobble)
1505
1506	movdqa	16(%esi), %xmm1
1507	movdqa	%xmm1, %xmm2
1508	palignr	$15, (%esi), %xmm1
1509	pcmpeqb	(%edi), %xmm1
1510
1511	movdqa	32(%esi), %xmm3
1512	palignr	$15, %xmm2, %xmm3
1513	pcmpeqb	16(%edi), %xmm3
1514
1515	pand	%xmm1, %xmm3
1516	pmovmskb %xmm3, %edx
1517	lea	32(%edi), %edi
1518	lea	32(%esi), %esi
1519	sub	$0xffff, %edx
1520	jnz	L(exit)
1521	lea	(%ecx, %edi,1), %eax
1522	lea	15(%ecx, %esi,1), %edx
1523	POP	(%edi)
1524	POP	(%esi)
1525	jmp	L(less48bytes)
1526
1527	cfi_restore_state
1528	cfi_remember_state
1529	.p2align 4
1530L(shr_15_gobble):
1531	sub	$32, %ecx
1532	movdqa	16(%esi), %xmm0
1533	palignr	$15, (%esi), %xmm0
1534	pcmpeqb	(%edi), %xmm0
1535
1536	movdqa	32(%esi), %xmm3
1537	palignr	$15, 16(%esi), %xmm3
1538	pcmpeqb	16(%edi), %xmm3
1539
1540L(shr_15_gobble_loop):
1541	pand	%xmm0, %xmm3
1542	sub	$32, %ecx
1543	pmovmskb %xmm3, %edx
1544	movdqa	%xmm0, %xmm1
1545
1546	movdqa	64(%esi), %xmm3
1547	palignr	$15,48(%esi), %xmm3
1548	sbb	$0xffff, %edx
1549	movdqa	48(%esi), %xmm0
1550	palignr	$15,32(%esi), %xmm0
1551	pcmpeqb	32(%edi), %xmm0
1552	lea	32(%esi), %esi
1553	pcmpeqb	48(%edi), %xmm3
1554
1555	lea	32(%edi), %edi
1556	jz	L(shr_15_gobble_loop)
1557	pand	%xmm0, %xmm3
1558
1559	cmp	$0, %ecx
1560	jge	L(shr_15_gobble_next)
1561	inc	%edx
1562	add	$32, %ecx
1563L(shr_15_gobble_next):
1564	test	%edx, %edx
1565	jnz	L(exit)
1566
1567	pmovmskb %xmm3, %edx
1568	movdqa	%xmm0, %xmm1
1569	lea	32(%edi), %edi
1570	lea	32(%esi), %esi
1571	sub	$0xffff, %edx
1572	jnz	L(exit)
1573
1574	lea	(%ecx, %edi,1), %eax
1575	lea	15(%ecx, %esi,1), %edx
1576	POP	(%edi)
1577	POP	(%esi)
1578	jmp	L(less48bytes)
1579#endif
1580
1581	cfi_restore_state
1582	cfi_remember_state
1583	.p2align 4
1584L(exit):
1585	pmovmskb %xmm1, %ebx
1586	sub	$0xffff, %ebx
1587	jz	L(first16bytes)
1588	lea	-16(%esi), %esi
1589	lea	-16(%edi), %edi
1590	mov	%ebx, %edx
1591
1592L(first16bytes):
1593	add	%eax, %esi
1594L(less16bytes):
1595
1596#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1597	test	%dl, %dl
1598	jz	L(next_24_bytes)
1599
1600	test	$0x01, %dl
1601	jnz	L(Byte16)
1602
1603	test	$0x02, %dl
1604	jnz	L(Byte17)
1605
1606	test	$0x04, %dl
1607	jnz	L(Byte18)
1608
1609	test	$0x08, %dl
1610	jnz	L(Byte19)
1611
1612	test	$0x10, %dl
1613	jnz	L(Byte20)
1614
1615	test	$0x20, %dl
1616	jnz	L(Byte21)
1617
1618	test	$0x40, %dl
1619	jnz	L(Byte22)
1620L(Byte23):
1621	movzbl	-9(%edi), %eax
1622	movzbl	-9(%esi), %edx
1623	sub	%edx, %eax
1624	RETURN
1625
1626	.p2align 4
1627L(Byte16):
1628	movzbl	-16(%edi), %eax
1629	movzbl	-16(%esi), %edx
1630	sub	%edx, %eax
1631	RETURN
1632
1633	.p2align 4
1634L(Byte17):
1635	movzbl	-15(%edi), %eax
1636	movzbl	-15(%esi), %edx
1637	sub	%edx, %eax
1638	RETURN
1639
1640	.p2align 4
1641L(Byte18):
1642	movzbl	-14(%edi), %eax
1643	movzbl	-14(%esi), %edx
1644	sub	%edx, %eax
1645	RETURN
1646
1647	.p2align 4
1648L(Byte19):
1649	movzbl	-13(%edi), %eax
1650	movzbl	-13(%esi), %edx
1651	sub	%edx, %eax
1652	RETURN
1653
1654	.p2align 4
1655L(Byte20):
1656	movzbl	-12(%edi), %eax
1657	movzbl	-12(%esi), %edx
1658	sub	%edx, %eax
1659	RETURN
1660
1661	.p2align 4
1662L(Byte21):
1663	movzbl	-11(%edi), %eax
1664	movzbl	-11(%esi), %edx
1665	sub	%edx, %eax
1666	RETURN
1667
1668	.p2align 4
1669L(Byte22):
1670	movzbl	-10(%edi), %eax
1671	movzbl	-10(%esi), %edx
1672	sub	%edx, %eax
1673	RETURN
1674
1675	.p2align 4
1676L(next_24_bytes):
1677	lea	8(%edi), %edi
1678	lea	8(%esi), %esi
1679	test	$0x01, %dh
1680	jnz	L(Byte16)
1681
1682	test	$0x02, %dh
1683	jnz	L(Byte17)
1684
1685	test	$0x04, %dh
1686	jnz	L(Byte18)
1687
1688	test	$0x08, %dh
1689	jnz	L(Byte19)
1690
1691	test	$0x10, %dh
1692	jnz	L(Byte20)
1693
1694	test	$0x20, %dh
1695	jnz	L(Byte21)
1696
1697	test	$0x40, %dh
1698	jnz	L(Byte22)
1699
1700	.p2align 4
1701L(Byte31):
1702	movzbl	-9(%edi), %eax
1703	movzbl	-9(%esi), %edx
1704	sub	%edx, %eax
1705	RETURN_END
1706#elif defined(USE_AS_WMEMCMP)
1707
1708/* special for wmemcmp */
1709	test	%dl, %dl
1710	jz	L(next_two_double_words)
1711	and	$15, %dl
1712	jz	L(second_double_word)
1713	mov	-16(%edi), %ecx
1714	cmp	-16(%esi), %ecx
1715	mov	$1, %eax
1716	jg	L(nequal_bigger)
1717	neg	%eax
1718	RETURN
1719
1720	.p2align 4
1721L(second_double_word):
1722	mov	-12(%edi), %ecx
1723	cmp	-12(%esi), %ecx
1724	mov	$1, %eax
1725	jg	L(nequal_bigger)
1726	neg	%eax
1727	RETURN
1728
1729	.p2align 4
1730L(next_two_double_words):
1731	and	$15, %dh
1732	jz	L(fourth_double_word)
1733	mov	-8(%edi), %ecx
1734	cmp	-8(%esi), %ecx
1735	mov	$1, %eax
1736	jg	L(nequal_bigger)
1737	neg	%eax
1738	RETURN
1739
1740	.p2align 4
1741L(fourth_double_word):
1742	mov	-4(%edi), %ecx
1743	cmp	-4(%esi), %ecx
1744	mov	$1, %eax
1745	jg	L(nequal_bigger)
1746	neg	%eax
1747	RETURN
1748
1749	.p2align 4
1750L(nequal_bigger):
1751	RETURN_END
1752
1753#elif defined(USE_AS_MEMCMP16)
1754
1755/* special for __memcmp16 */
1756	test	%dl, %dl
1757	jz	L(next_four_words)
1758	test	$15, %dl
1759	jz	L(second_two_words)
1760	test	$3, %dl
1761	jz	L(second_word)
1762	movzwl	-16(%edi), %eax
1763	movzwl	-16(%esi), %ebx
1764	subl	%ebx, %eax
1765	RETURN
1766
1767	.p2align 4
1768L(second_word):
1769	movzwl	-14(%edi), %eax
1770	movzwl	-14(%esi), %ebx
1771	subl	%ebx, %eax
1772	RETURN
1773
1774	.p2align 4
1775L(second_two_words):
1776	test	$63, %dl
1777	jz	L(fourth_word)
1778	movzwl	-12(%edi), %eax
1779	movzwl	-12(%esi), %ebx
1780	subl	%ebx, %eax
1781	RETURN
1782
1783	.p2align 4
1784L(fourth_word):
1785	movzwl	-10(%edi), %eax
1786	movzwl	-10(%esi), %ebx
1787	subl	%ebx, %eax
1788	RETURN
1789
1790	.p2align 4
1791L(next_four_words):
1792	test	$15, %dh
1793	jz	L(fourth_two_words)
1794	test	$3, %dh
1795	jz	L(sixth_word)
1796	movzwl	-8(%edi), %eax
1797	movzwl	-8(%esi), %ebx
1798	subl	%ebx, %eax
1799	RETURN
1800
1801	.p2align 4
1802L(sixth_word):
1803	movzwl	-6(%edi), %eax
1804	movzwl	-6(%esi), %ebx
1805	subl	%ebx, %eax
1806	RETURN
1807
1808	.p2align 4
1809L(fourth_two_words):
1810	test	$63, %dh
1811	jz	L(eighth_word)
1812	movzwl	-4(%edi), %eax
1813	movzwl	-4(%esi), %ebx
1814	subl	%ebx, %eax
1815	RETURN
1816
1817	.p2align 4
1818L(eighth_word):
1819	movzwl	-2(%edi), %eax
1820	movzwl	-2(%esi), %ebx
1821	subl	%ebx, %eax
1822	RETURN
1823#else
1824# error Unreachable preprocessor case
1825#endif
1826
1827	CFI_PUSH (%ebx)
1828
1829	.p2align 4
1830L(more8bytes):
1831	cmp	$16, %ecx
1832	jae	L(more16bytes)
1833	cmp	$8, %ecx
1834	je	L(8bytes)
1835#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1836	cmp	$9, %ecx
1837	je	L(9bytes)
1838	cmp	$10, %ecx
1839	je	L(10bytes)
1840	cmp	$11, %ecx
1841	je	L(11bytes)
1842	cmp	$12, %ecx
1843	je	L(12bytes)
1844	cmp	$13, %ecx
1845	je	L(13bytes)
1846	cmp	$14, %ecx
1847	je	L(14bytes)
1848	jmp	L(15bytes)
1849#elif defined(USE_WCHAR) && !defined(USE_UTF16)
1850	jmp	L(12bytes)
1851#elif defined(USE_UTF16) && !defined(USE_WCHAR)
1852	cmp	$10, %ecx
1853	je	L(10bytes)
1854	cmp	$12, %ecx
1855	je	L(12bytes)
1856	jmp	L(14bytes)
1857#else
1858# error Unreachable preprocessor case
1859#endif
1860
1861	.p2align 4
1862L(more16bytes):
1863	cmp	$24, %ecx
1864	jae	L(more24bytes)
1865	cmp	$16, %ecx
1866	je	L(16bytes)
1867#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1868	cmp	$17, %ecx
1869	je	L(17bytes)
1870	cmp	$18, %ecx
1871	je	L(18bytes)
1872	cmp	$19, %ecx
1873	je	L(19bytes)
1874	cmp	$20, %ecx
1875	je	L(20bytes)
1876	cmp	$21, %ecx
1877	je	L(21bytes)
1878	cmp	$22, %ecx
1879	je	L(22bytes)
1880	jmp	L(23bytes)
1881#elif defined(USE_WCHAR) && !defined(USE_UTF16)
1882	jmp	L(20bytes)
1883#elif defined(USE_UTF16) && !defined(USE_WCHAR)
1884	cmp	$18, %ecx
1885	je	L(18bytes)
1886	cmp	$20, %ecx
1887	je	L(20bytes)
1888	jmp	L(22bytes)
1889#else
1890# error Unreachable preprocessor case
1891#endif
1892
1893	.p2align 4
1894L(more24bytes):
1895	cmp	$32, %ecx
1896	jae	L(more32bytes)
1897	cmp	$24, %ecx
1898	je	L(24bytes)
1899#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1900	cmp	$25, %ecx
1901	je	L(25bytes)
1902	cmp	$26, %ecx
1903	je	L(26bytes)
1904	cmp	$27, %ecx
1905	je	L(27bytes)
1906	cmp	$28, %ecx
1907	je	L(28bytes)
1908	cmp	$29, %ecx
1909	je	L(29bytes)
1910	cmp	$30, %ecx
1911	je	L(30bytes)
1912	jmp	L(31bytes)
1913#elif defined(USE_WCHAR) && !defined(USE_UTF16)
1914	jmp	L(28bytes)
1915#elif defined(USE_UTF16) && !defined(USE_WCHAR)
1916	cmp	$26, %ecx
1917	je	L(26bytes)
1918	cmp	$28, %ecx
1919	je	L(28bytes)
1920	jmp	L(30bytes)
1921#else
1922# error Unreachable preprocessor case
1923#endif
1924
1925	.p2align 4
1926L(more32bytes):
1927	cmp	$40, %ecx
1928	jae	L(more40bytes)
1929	cmp	$32, %ecx
1930	je	L(32bytes)
1931#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1932	cmp	$33, %ecx
1933	je	L(33bytes)
1934	cmp	$34, %ecx
1935	je	L(34bytes)
1936	cmp	$35, %ecx
1937	je	L(35bytes)
1938	cmp	$36, %ecx
1939	je	L(36bytes)
1940	cmp	$37, %ecx
1941	je	L(37bytes)
1942	cmp	$38, %ecx
1943	je	L(38bytes)
1944	jmp	L(39bytes)
1945#elif defined(USE_WCHAR) && !defined(USE_UTF16)
1946	jmp	L(36bytes)
1947#elif defined(USE_UTF16) && !defined(USE_WCHAR)
1948	cmp	$34, %ecx
1949	je	L(34bytes)
1950	cmp	$36, %ecx
1951	je	L(36bytes)
1952	jmp	L(38bytes)
1953#else
1954# error Unreachable preprocessor case
1955#endif
1956
1957	.p2align 4
1958L(less48bytes):
1959	cmp	$8, %ecx
1960	jae	L(more8bytes)
1961#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1962	cmp	$2, %ecx
1963	je	L(2bytes)
1964	cmp	$3, %ecx
1965	je	L(3bytes)
1966	cmp	$4, %ecx
1967	je	L(4bytes)
1968	cmp	$5, %ecx
1969	je	L(5bytes)
1970	cmp	$6, %ecx
1971	je	L(6bytes)
1972	jmp	L(7bytes)
1973#elif defined(USE_WCHAR) && !defined(USE_UTF16)
1974	jmp	L(4bytes)
1975#elif defined(USE_UTF16) && !defined(USE_WCHAR)
1976	cmp	$2, %ecx
1977	je	L(2bytes)
1978	cmp	$4, %ecx
1979	je	L(4bytes)
1980	jmp	L(6bytes)
1981#else
1982# error Unreachable preprocessor case
1983#endif
1984
1985	.p2align 4
1986L(more40bytes):
1987	cmp	$40, %ecx
1988	je	L(40bytes)
1989#if !defined(USE_WCHAR) && !defined(USE_UTF16)
1990	cmp	$41, %ecx
1991	je	L(41bytes)
1992	cmp	$42, %ecx
1993	je	L(42bytes)
1994	cmp	$43, %ecx
1995	je	L(43bytes)
1996	cmp	$44, %ecx
1997	je	L(44bytes)
1998	cmp	$45, %ecx
1999	je	L(45bytes)
2000	cmp	$46, %ecx
2001	je	L(46bytes)
2002	jmp	L(47bytes)
2003#elif defined(USE_UTF16) && !defined(USE_WCHAR)
2004	cmp	$42, %ecx
2005	je	L(42bytes)
2006	cmp	$44, %ecx
2007	je	L(44bytes)
2008	jmp	L(46bytes)
2009#endif
2010
2011#if !defined(USE_AS_WMEMCMP) && !defined(USE_AS_MEMCMP16)
2012	.p2align 4
2013L(44bytes):
2014	mov	-44(%eax), %ecx
2015	mov	-44(%edx), %ebx
2016	cmp	%ebx, %ecx
2017	jne	L(find_diff)
2018L(40bytes):
2019	mov	-40(%eax), %ecx
2020	mov	-40(%edx), %ebx
2021	cmp	%ebx, %ecx
2022	jne	L(find_diff)
2023L(36bytes):
2024	mov	-36(%eax), %ecx
2025	mov	-36(%edx), %ebx
2026	cmp	%ebx, %ecx
2027	jne	L(find_diff)
2028L(32bytes):
2029	mov	-32(%eax), %ecx
2030	mov	-32(%edx), %ebx
2031	cmp	%ebx, %ecx
2032	jne	L(find_diff)
2033L(28bytes):
2034	mov	-28(%eax), %ecx
2035	mov	-28(%edx), %ebx
2036	cmp	%ebx, %ecx
2037	jne	L(find_diff)
2038L(24bytes):
2039	mov	-24(%eax), %ecx
2040	mov	-24(%edx), %ebx
2041	cmp	%ebx, %ecx
2042	jne	L(find_diff)
2043L(20bytes):
2044	mov	-20(%eax), %ecx
2045	mov	-20(%edx), %ebx
2046	cmp	%ebx, %ecx
2047	jne	L(find_diff)
2048L(16bytes):
2049	mov	-16(%eax), %ecx
2050	mov	-16(%edx), %ebx
2051	cmp	%ebx, %ecx
2052	jne	L(find_diff)
2053L(12bytes):
2054	mov	-12(%eax), %ecx
2055	mov	-12(%edx), %ebx
2056	cmp	%ebx, %ecx
2057	jne	L(find_diff)
2058L(8bytes):
2059	mov	-8(%eax), %ecx
2060	mov	-8(%edx), %ebx
2061	cmp	%ebx, %ecx
2062	jne	L(find_diff)
2063L(4bytes):
2064	mov	-4(%eax), %ecx
2065	mov	-4(%edx), %ebx
2066	cmp	%ebx, %ecx
2067	mov	$0, %eax
2068	jne	L(find_diff)
2069	POP	(%ebx)
2070	ret
2071	CFI_PUSH (%ebx)
2072#elif defined(USE_AS_WMEMCMP)
2073
2074	.p2align 4
2075L(44bytes):
2076	mov	-44(%eax), %ecx
2077	cmp	-44(%edx), %ecx
2078	jne	L(find_diff)
2079L(40bytes):
2080	mov	-40(%eax), %ecx
2081	cmp	-40(%edx), %ecx
2082	jne	L(find_diff)
2083L(36bytes):
2084	mov	-36(%eax), %ecx
2085	cmp	-36(%edx), %ecx
2086	jne	L(find_diff)
2087L(32bytes):
2088	mov	-32(%eax), %ecx
2089	cmp	-32(%edx), %ecx
2090	jne	L(find_diff)
2091L(28bytes):
2092	mov	-28(%eax), %ecx
2093	cmp	-28(%edx), %ecx
2094	jne	L(find_diff)
2095L(24bytes):
2096	mov	-24(%eax), %ecx
2097	cmp	-24(%edx), %ecx
2098	jne	L(find_diff)
2099L(20bytes):
2100	mov	-20(%eax), %ecx
2101	cmp	-20(%edx), %ecx
2102	jne	L(find_diff)
2103L(16bytes):
2104	mov	-16(%eax), %ecx
2105	cmp	-16(%edx), %ecx
2106	jne	L(find_diff)
2107L(12bytes):
2108	mov	-12(%eax), %ecx
2109	cmp	-12(%edx), %ecx
2110	jne	L(find_diff)
2111L(8bytes):
2112	mov	-8(%eax), %ecx
2113	cmp	-8(%edx), %ecx
2114	jne	L(find_diff)
2115L(4bytes):
2116	mov	-4(%eax), %ecx
2117	xor	%eax, %eax
2118	cmp	-4(%edx), %ecx
2119	jne	L(find_diff)
2120	POP	(%ebx)
2121	ret
2122	CFI_PUSH (%ebx)
2123#elif defined USE_AS_MEMCMP16
2124
2125	.p2align 4
2126L(46bytes):
2127	movzwl	-46(%eax), %ecx
2128	movzwl	-46(%edx), %ebx
2129	subl	%ebx, %ecx
2130	jne	L(memcmp16_exit)
2131L(44bytes):
2132	movzwl	-44(%eax), %ecx
2133	movzwl	-44(%edx), %ebx
2134	subl	%ebx, %ecx
2135	jne	L(memcmp16_exit)
2136L(42bytes):
2137	movzwl	-42(%eax), %ecx
2138	movzwl	-42(%edx), %ebx
2139	subl	%ebx, %ecx
2140	jne	L(memcmp16_exit)
2141L(40bytes):
2142	movzwl	-40(%eax), %ecx
2143	movzwl	-40(%edx), %ebx
2144	subl	%ebx, %ecx
2145	jne	L(memcmp16_exit)
2146L(38bytes):
2147	movzwl	-38(%eax), %ecx
2148	movzwl	-38(%edx), %ebx
2149	subl	%ebx, %ecx
2150	jne	L(memcmp16_exit)
2151L(36bytes):
2152	movzwl	-36(%eax), %ecx
2153	movzwl	-36(%edx), %ebx
2154	subl	%ebx, %ecx
2155	jne	L(memcmp16_exit)
2156L(34bytes):
2157	movzwl	-34(%eax), %ecx
2158	movzwl	-34(%edx), %ebx
2159	subl	%ebx, %ecx
2160	jne	L(memcmp16_exit)
2161L(32bytes):
2162	movzwl	-32(%eax), %ecx
2163	movzwl	-32(%edx), %ebx
2164	subl	%ebx, %ecx
2165	jne	L(memcmp16_exit)
2166L(30bytes):
2167	movzwl	-30(%eax), %ecx
2168	movzwl	-30(%edx), %ebx
2169	subl	%ebx, %ecx
2170	jne	L(memcmp16_exit)
2171L(28bytes):
2172	movzwl	-28(%eax), %ecx
2173	movzwl	-28(%edx), %ebx
2174	subl	%ebx, %ecx
2175	jne	L(memcmp16_exit)
2176L(26bytes):
2177	movzwl	-26(%eax), %ecx
2178	movzwl	-26(%edx), %ebx
2179	subl	%ebx, %ecx
2180	jne	L(memcmp16_exit)
2181L(24bytes):
2182	movzwl	-24(%eax), %ecx
2183	movzwl	-24(%edx), %ebx
2184	subl	%ebx, %ecx
2185	jne	L(memcmp16_exit)
2186L(22bytes):
2187	movzwl	-22(%eax), %ecx
2188	movzwl	-22(%edx), %ebx
2189	subl	%ebx, %ecx
2190	jne	L(memcmp16_exit)
2191L(20bytes):
2192	movzwl	-20(%eax), %ecx
2193	movzwl	-20(%edx), %ebx
2194	subl	%ebx, %ecx
2195	jne	L(memcmp16_exit)
2196L(18bytes):
2197	movzwl	-18(%eax), %ecx
2198	movzwl	-18(%edx), %ebx
2199	subl	%ebx, %ecx
2200	jne	L(memcmp16_exit)
2201L(16bytes):
2202	movzwl	-16(%eax), %ecx
2203	movzwl	-16(%edx), %ebx
2204	subl	%ebx, %ecx
2205	jne	L(memcmp16_exit)
2206L(14bytes):
2207	movzwl	-14(%eax), %ecx
2208	movzwl	-14(%edx), %ebx
2209	subl	%ebx, %ecx
2210	jne	L(memcmp16_exit)
2211L(12bytes):
2212	movzwl	-12(%eax), %ecx
2213	movzwl	-12(%edx), %ebx
2214	subl	%ebx, %ecx
2215	jne	L(memcmp16_exit)
2216L(10bytes):
2217	movzwl	-10(%eax), %ecx
2218	movzwl	-10(%edx), %ebx
2219	subl	%ebx, %ecx
2220	jne	L(memcmp16_exit)
2221L(8bytes):
2222	movzwl	-8(%eax), %ecx
2223	movzwl	-8(%edx), %ebx
2224	subl	%ebx, %ecx
2225	jne	L(memcmp16_exit)
2226L(6bytes):
2227	movzwl	-6(%eax), %ecx
2228	movzwl	-6(%edx), %ebx
2229	subl	%ebx, %ecx
2230	jne	L(memcmp16_exit)
2231L(4bytes):
2232	movzwl	-4(%eax), %ecx
2233	movzwl	-4(%edx), %ebx
2234	subl	%ebx, %ecx
2235	jne	L(memcmp16_exit)
2236L(2bytes):
2237	movzwl	-2(%eax), %eax
2238	movzwl	-2(%edx), %ebx
2239	subl	%ebx, %eax
2240	POP	(%ebx)
2241	ret
2242	CFI_PUSH (%ebx)
2243#else
2244# error Unreachable preprocessor case
2245#endif
2246
2247#if !defined(USE_AS_WMEMCMP) && !defined(USE_AS_MEMCMP16)
2248
2249	.p2align 4
2250L(45bytes):
2251	mov	-45(%eax), %ecx
2252	mov	-45(%edx), %ebx
2253	cmp	%ebx, %ecx
2254	jne	L(find_diff)
2255L(41bytes):
2256	mov	-41(%eax), %ecx
2257	mov	-41(%edx), %ebx
2258	cmp	%ebx, %ecx
2259	jne	L(find_diff)
2260L(37bytes):
2261	mov	-37(%eax), %ecx
2262	mov	-37(%edx), %ebx
2263	cmp	%ebx, %ecx
2264	jne	L(find_diff)
2265L(33bytes):
2266	mov	-33(%eax), %ecx
2267	mov	-33(%edx), %ebx
2268	cmp	%ebx, %ecx
2269	jne	L(find_diff)
2270L(29bytes):
2271	mov	-29(%eax), %ecx
2272	mov	-29(%edx), %ebx
2273	cmp	%ebx, %ecx
2274	jne	L(find_diff)
2275L(25bytes):
2276	mov	-25(%eax), %ecx
2277	mov	-25(%edx), %ebx
2278	cmp	%ebx, %ecx
2279	jne	L(find_diff)
2280L(21bytes):
2281	mov	-21(%eax), %ecx
2282	mov	-21(%edx), %ebx
2283	cmp	%ebx, %ecx
2284	jne	L(find_diff)
2285L(17bytes):
2286	mov	-17(%eax), %ecx
2287	mov	-17(%edx), %ebx
2288	cmp	%ebx, %ecx
2289	jne	L(find_diff)
2290L(13bytes):
2291	mov	-13(%eax), %ecx
2292	mov	-13(%edx), %ebx
2293	cmp	%ebx, %ecx
2294	jne	L(find_diff)
2295L(9bytes):
2296	mov	-9(%eax), %ecx
2297	mov	-9(%edx), %ebx
2298	cmp	%ebx, %ecx
2299	jne	L(find_diff)
2300L(5bytes):
2301	mov	-5(%eax), %ecx
2302	mov	-5(%edx), %ebx
2303	cmp	%ebx, %ecx
2304	jne	L(find_diff)
2305	movzbl	-1(%eax), %ecx
2306	cmp	-1(%edx), %cl
2307	mov	$0, %eax
2308	jne	L(end)
2309	POP	(%ebx)
2310	ret
2311	CFI_PUSH (%ebx)
2312
2313	.p2align 4
2314L(46bytes):
2315	mov	-46(%eax), %ecx
2316	mov	-46(%edx), %ebx
2317	cmp	%ebx, %ecx
2318	jne	L(find_diff)
2319L(42bytes):
2320	mov	-42(%eax), %ecx
2321	mov	-42(%edx), %ebx
2322	cmp	%ebx, %ecx
2323	jne	L(find_diff)
2324L(38bytes):
2325	mov	-38(%eax), %ecx
2326	mov	-38(%edx), %ebx
2327	cmp	%ebx, %ecx
2328	jne	L(find_diff)
2329L(34bytes):
2330	mov	-34(%eax), %ecx
2331	mov	-34(%edx), %ebx
2332	cmp	%ebx, %ecx
2333	jne	L(find_diff)
2334L(30bytes):
2335	mov	-30(%eax), %ecx
2336	mov	-30(%edx), %ebx
2337	cmp	%ebx, %ecx
2338	jne	L(find_diff)
2339L(26bytes):
2340	mov	-26(%eax), %ecx
2341	mov	-26(%edx), %ebx
2342	cmp	%ebx, %ecx
2343	jne	L(find_diff)
2344L(22bytes):
2345	mov	-22(%eax), %ecx
2346	mov	-22(%edx), %ebx
2347	cmp	%ebx, %ecx
2348	jne	L(find_diff)
2349L(18bytes):
2350	mov	-18(%eax), %ecx
2351	mov	-18(%edx), %ebx
2352	cmp	%ebx, %ecx
2353	jne	L(find_diff)
2354L(14bytes):
2355	mov	-14(%eax), %ecx
2356	mov	-14(%edx), %ebx
2357	cmp	%ebx, %ecx
2358	jne	L(find_diff)
2359L(10bytes):
2360	mov	-10(%eax), %ecx
2361	mov	-10(%edx), %ebx
2362	cmp	%ebx, %ecx
2363	jne	L(find_diff)
2364L(6bytes):
2365	mov	-6(%eax), %ecx
2366	mov	-6(%edx), %ebx
2367	cmp	%ebx, %ecx
2368	jne	L(find_diff)
2369L(2bytes):
2370	movzwl	-2(%eax), %ecx
2371	movzwl	-2(%edx), %ebx
2372	cmp	%bl, %cl
2373	jne	L(end)
2374	cmp	%bh, %ch
2375	mov	$0, %eax
2376	jne	L(end)
2377	POP	(%ebx)
2378	ret
2379	CFI_PUSH (%ebx)
2380
2381	.p2align 4
2382L(47bytes):
2383	movl	-47(%eax), %ecx
2384	movl	-47(%edx), %ebx
2385	cmp	%ebx, %ecx
2386	jne	L(find_diff)
2387L(43bytes):
2388	movl	-43(%eax), %ecx
2389	movl	-43(%edx), %ebx
2390	cmp	%ebx, %ecx
2391	jne	L(find_diff)
2392L(39bytes):
2393	movl	-39(%eax), %ecx
2394	movl	-39(%edx), %ebx
2395	cmp	%ebx, %ecx
2396	jne	L(find_diff)
2397L(35bytes):
2398	movl	-35(%eax), %ecx
2399	movl	-35(%edx), %ebx
2400	cmp	%ebx, %ecx
2401	jne	L(find_diff)
2402L(31bytes):
2403	movl	-31(%eax), %ecx
2404	movl	-31(%edx), %ebx
2405	cmp	%ebx, %ecx
2406	jne	L(find_diff)
2407L(27bytes):
2408	movl	-27(%eax), %ecx
2409	movl	-27(%edx), %ebx
2410	cmp	%ebx, %ecx
2411	jne	L(find_diff)
2412L(23bytes):
2413	movl	-23(%eax), %ecx
2414	movl	-23(%edx), %ebx
2415	cmp	%ebx, %ecx
2416	jne	L(find_diff)
2417L(19bytes):
2418	movl	-19(%eax), %ecx
2419	movl	-19(%edx), %ebx
2420	cmp	%ebx, %ecx
2421	jne	L(find_diff)
2422L(15bytes):
2423	movl	-15(%eax), %ecx
2424	movl	-15(%edx), %ebx
2425	cmp	%ebx, %ecx
2426	jne	L(find_diff)
2427L(11bytes):
2428	movl	-11(%eax), %ecx
2429	movl	-11(%edx), %ebx
2430	cmp	%ebx, %ecx
2431	jne	L(find_diff)
2432L(7bytes):
2433	movl	-7(%eax), %ecx
2434	movl	-7(%edx), %ebx
2435	cmp	%ebx, %ecx
2436	jne	L(find_diff)
2437L(3bytes):
2438	movzwl	-3(%eax), %ecx
2439	movzwl	-3(%edx), %ebx
2440	cmpb	%bl, %cl
2441	jne	L(end)
2442	cmp	%bx, %cx
2443	jne	L(end)
2444	movzbl	-1(%eax), %eax
2445	cmpb	-1(%edx), %al
2446	mov	$0, %eax
2447	jne	L(end)
2448	POP	(%ebx)
2449	ret
2450	CFI_PUSH (%ebx)
2451
2452	.p2align 4
2453L(find_diff):
2454	cmpb	%bl, %cl
2455	jne	L(end)
2456	cmp	%bx, %cx
2457	jne	L(end)
2458	shr	$16,%ecx
2459	shr	$16,%ebx
2460	cmp	%bl, %cl
2461	jne	L(end)
2462	cmp	%bx, %cx
2463
2464	.p2align 4
2465L(end):
2466	POP	(%ebx)
2467	mov	$1, %eax
2468	ja	L(bigger)
2469	neg	%eax
2470L(bigger):
2471	ret
2472#elif defined(USE_AS_WMEMCMP)
2473
2474	.p2align 4
2475L(find_diff):
2476	POP	(%ebx)
2477	mov	$1, %eax
2478	jg	L(find_diff_bigger)
2479	neg	%eax
2480	ret
2481
2482	.p2align 4
2483L(find_diff_bigger):
2484	ret
2485
2486#elif defined(USE_AS_MEMCMP16)
2487
2488	.p2align 4
2489L(memcmp16_exit):
2490	POP	(%ebx)
2491	mov	%ecx, %eax
2492	ret
2493#else
2494# error Unreachable preprocessor case
2495#endif
2496END (MEMCMP)
2497