1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#include "cache.h"
32
/* Name under which the routine below is emitted (it is passed to ENTRY,
   which makes it a global function symbol).  A build may predefine MEMCPY
   to assemble the same code under a different name; the default is
   memcpy_atom.  */
33#ifndef MEMCPY
34# define MEMCPY	memcpy_atom
35#endif
36
/* L(label) token-pastes ".L" in front of LABEL, so every branch target in
   this file is spelled .L<label>.  The ".L" prefix is the GNU as convention
   for labels that are not exported.  May be predefined by the build.  */
37#ifndef L
38# define L(label)	.L##label
39#endif
40
/* Thin wrappers that expand to the assembler's .cfi_* call-frame-information
   directives.  Each one is only defined when the build has not already
   supplied its own definition.  */
41#ifndef cfi_startproc
42# define cfi_startproc	.cfi_startproc
43#endif
44
45#ifndef cfi_endproc
46# define cfi_endproc	.cfi_endproc
47#endif
48
/* cfi_rel_offset (reg, off) -> .cfi_rel_offset reg, off  */
49#ifndef cfi_rel_offset
50# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
51#endif
52
/* cfi_restore (reg) -> .cfi_restore reg  */
53#ifndef cfi_restore
54# define cfi_restore(reg)	.cfi_restore reg
55#endif
56
/* cfi_adjust_cfa_offset (off) -> .cfi_adjust_cfa_offset off  */
57#ifndef cfi_adjust_cfa_offset
58# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
59#endif
60
/* ENTRY (name): open a function.  Marks NAME as a function-type global
   symbol, aligns it to 2^4 = 16 bytes, emits the label itself and starts a
   CFI region.  Must be paired with END (name).  */
61#ifndef ENTRY
62# define ENTRY(name)		\
63	.type name,  @function;		\
64	.globl name;		\
65	.p2align 4;		\
66name:		\
67	cfi_startproc
68#endif
69
/* END (name): close a function.  Ends the CFI region and records the symbol
   size as the distance from NAME to the current location.  */
70#ifndef END
71# define END(name)		\
72	cfi_endproc;		\
73	.size name, .-name
74#endif
75
/* Byte offsets of the three stack arguments, used as DEST(%esp), SRC(%esp)
   and LEN(%esp) right after ENTRANCE.  PARMS is defined further down as 4
   or 8 depending on whether ENTRANCE pushes %ebx.  The macros expand
   textually (LEN becomes PARMS+4+4), and code that has pushed one more
   register addresses the same slot as e.g. DEST+4(%esp).  */
76#define DEST		PARMS
77#define SRC		DEST+4
78#define LEN		SRC+4
79
/* CFI_PUSH (REG): unwind annotations for a 4-byte push of REG -- the CFA
   offset grows by 4 and REG is recorded as saved at offset 0 from the new
   stack pointer.  It emits no instruction, so it is also used on its own
   after a jump to re-describe a register that is still on the stack in the
   code that follows.  */
80#define CFI_PUSH(REG)		\
81  cfi_adjust_cfa_offset (4);		\
82  cfi_rel_offset (REG, 0)
83
/* CFI_POP (REG): the inverse annotations -- CFA offset shrinks by 4 and REG
   is marked as restored.  */
84#define CFI_POP(REG)		\
85  cfi_adjust_cfa_offset (-4);		\
86  cfi_restore (REG)
87
/* PUSH / POP: the actual pushl / popl followed by the matching annotation.  */
88#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
89#define POP(REG)	popl REG; CFI_POP (REG)
90
/* Build-mode dependent pieces.
   PIC build (SHARED or __PIC__ defined): %ebx is used as the PIC/scratch
   register, so ENTRANCE saves it, the arguments start at offset 8, and
   jump-table entries are stored as offsets relative to the table.
   Non-PIC build: nothing is pushed on entry, the arguments start at
   offset 4, and jump tables hold absolute addresses.  */
91#if (defined SHARED || defined __PIC__)
92# define PARMS		8		/* Preserve EBX.  */
93# define ENTRANCE	PUSH (%ebx);
/* RETURN_END restores %ebx and returns.  RETURN additionally re-applies the
   push annotation after the ret, so that code placed after it is still
   described with %ebx saved on the stack.  */
94# define RETURN_END	POP (%ebx); ret
95# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* JMPTBL (I, B): one jump-table entry for target I in the table based at B;
   here the entry is the offset of I from B.  */
96# define JMPTBL(I, B)	I - B
97
/* SETUP_PIC_REG (x): call the get_pc thunk for register x (e.g. bx).  The
   thunk is not defined in this part of the file.  */
98# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
99
100/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
101	jump table with relative offsets.  INDEX is a register that contains the
102	index into the jump table.  SCALE is the scale of INDEX.  Clobbers EBX.  */
103
104# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
105    /* We first load PC into EBX.  */		\
106	SETUP_PIC_REG(bx);		\
107    /* Get the address of the jump table.  */		\
108	addl	$(TABLE - .), %ebx;		\
109    /* Get the entry and convert the relative offset to the		\
110	absolute	address.  */		\
111	addl	(%ebx, INDEX, SCALE), %ebx;		\
112    /* We loaded the jump table.  Go.  */		\
113	jmp	*%ebx
114#else
115
116# define PARMS		4
117# define ENTRANCE
118# define RETURN_END	ret
119# define RETURN		RETURN_END
/* JMPTBL (I, B): here the entry is simply the address of I.  */
120# define JMPTBL(I, B)	I
121
122/* Branch to an entry in a jump table.  TABLE is a jump table with
123	absolute offsets.  INDEX is a register that contains the index into the
124	jump table.  SCALE is the scale of INDEX.  */
125
126# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
127	jmp	*TABLE(, INDEX, SCALE)
128#endif
129
/* MEMCPY entry point, placed in section .text.ssse3.
   The three arguments are read from the stack into
     %ecx = LEN, %eax = SRC, %edx = DEST
   and these register roles hold for the rest of the routine.  In the PIC
   build ENTRANCE has pushed %ebx first (PARMS accounts for that).
   This block only classifies the request and dispatches:
     - memmove build: pick forward or backward copying;
     - length >= 48: go to L(48bytesormore);
     - otherwise: jump through a per-length table.
   The jump tables and several targets (L(fwd_write_0bytes),
   L(bk_write_less32bytes_2), L(copy_backward)) are defined outside this
   excerpt.  */
130	.section .text.ssse3,"ax",@progbits
131ENTRY (MEMCPY)
132	ENTRANCE
133	movl	LEN(%esp), %ecx
134	movl	SRC(%esp), %eax
135	movl	DEST(%esp), %edx
136
137#ifdef USE_AS_MEMMOVE
	/* dest below src (unsigned): forward copy.  dest == src: branch to
	   L(fwd_write_0bytes).  Otherwise dest is above src.  */
138	cmp	%eax, %edx
139	jb	L(copy_forward)
140	je	L(fwd_write_0bytes)
141	cmp	$32, %ecx
142	jae	L(memmove_bwd)
143	jmp	L(bk_write_less32bytes_2)
144
145	.p2align 4
146L(memmove_bwd):
	/* Compare dest with src + len.  %eax is reloaded with src before the
	   branch; movl leaves the flags of the cmp intact.  dest below
	   src + len means the regions overlap, so copy backward.  */
147	add	%ecx, %eax
148	cmp	%eax, %edx
149	movl	SRC(%esp), %eax
150	jb	L(copy_backward)
151
152L(copy_forward):
153#endif
154	cmp	$48, %ecx
155	jae	L(48bytesormore)
156
	/* Fewer than 48 bytes from here on (the label name says 32).  */
157L(fwd_write_less32bytes):
158#ifndef USE_AS_MEMMOVE
	/* Compares only the low bytes of the two pointers: taken when the low
	   byte of src is below the low byte of dest (unsigned).  */
159	cmp	%dl, %al
160	jb	L(bk_write)
161#endif
	/* Advance both pointers by len, then dispatch on len.  */
162	add	%ecx, %edx
163	add	%ecx, %eax
164	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
165#ifndef USE_AS_MEMMOVE
166	.p2align 4
167L(bk_write):
	/* Dispatch on len with both pointers still at the start.  */
168	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
169#endif
170
/* L(48bytesormore): set-up for forward copies of at least 48 bytes.
   In:  %eax = src, %edx = dest, %ecx = len.
   Out: %edx = dest rounded up to the next 16-byte boundary, %eax and %ecx
        adjusted by the same number of bytes, the caller's %edi pushed on the
        stack; control goes to L(large_page), L(shl_0) or an L(shl_table)
        entry.  L(large_page) and L(shl_table) are outside this excerpt.
   Non-memmove build: the first 16 bytes are copied right away with two
   8-byte movlpd load/store pairs.  Memmove build: the first 16 source bytes
   are only loaded (unaligned) into %xmm0 here and are stored later by the
   L(shl_*) handlers.  */
171	.p2align 4
172L(48bytesormore):
173#ifndef USE_AS_MEMMOVE
174	movlpd	(%eax), %xmm0
175	movlpd	8(%eax), %xmm1
176	movlpd	%xmm0, (%edx)
177	movlpd	%xmm1, 8(%edx)
178#else
179	movdqu	(%eax), %xmm0
180#endif
181	PUSH (%edi)
	/* %edx = (dest & -16) + 16   (1..16 bytes above the old dest)
	   %edi = old dest - %edx     (minus the number of bytes skipped)
	   %ecx = len + %edi          (bytes left after the head)
	   %eax = src - %edi          (src moved forward by the same count)  */
182	movl	%edx, %edi
183	and	$-16, %edx
184	add	$16, %edx
185	sub	%edx, %edi
186	add	%edi, %ecx
187	sub	%edi, %eax
188
	/* Compare the remaining length with the shared-cache threshold: either
	   the compile-time constant or the __x86_shared_cache_size_half
	   variable (addressed through the GOT base in the PIC build, which
	   clobbers %ebx).  */
189#ifdef SHARED_CACHE_SIZE_HALF
190	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
191#else
192# if (defined SHARED || defined __PIC__)
193	SETUP_PIC_REG(bx)
194	add	$_GLOBAL_OFFSET_TABLE_, %ebx
195	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
196# else
197	cmp	__x86_shared_cache_size_half, %ecx
198# endif
199#endif
200
	/* mov does not change the flags, so jae still tests the cmp above.
	   %edi = src & 0xf selects the handler for the source misalignment.  */
201	mov	%eax, %edi
202	jae	L(large_page)
203	and	$0xf, %edi
204	jz	L(shl_0)
205	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
206
/* L(shl_0): source and destination are both 16-byte aligned (all accesses
   below are movdqa).
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes).
   More than 127 bytes remaining are handed to L(shl_0_gobble); otherwise
   the loop below moves 32 bytes per step using %edi as a running index.  */
207	.p2align 4
208L(shl_0):
209#ifdef USE_AS_MEMMOVE
	/* Store the 16 head bytes saved in %xmm0 at the original dest; the
	   argument is at DEST+4 because %edi has been pushed.  */
210	movl	DEST+4(%esp), %edi
211	movdqu	%xmm0, (%edi)
212#endif
213	xor	%edi, %edi
214	cmp	$127, %ecx
215	ja	L(shl_0_gobble)
216	lea	-32(%ecx), %ecx
217
	/* Four unrolled 32-byte steps.  Each jb tests the borrow of the
	   preceding `sub $32'; the movdqa and lea in between do not modify
	   the flags.  */
218	.p2align 4
219L(shl_0_loop):
220	movdqa	(%eax, %edi), %xmm0
221	movdqa	16(%eax, %edi), %xmm1
222	sub	$32, %ecx
223	movdqa	%xmm0, (%edx, %edi)
224	movdqa	%xmm1, 16(%edx, %edi)
225	lea	32(%edi), %edi
226	jb	L(shl_0_end)
227
228	movdqa	(%eax, %edi), %xmm0
229	movdqa	16(%eax, %edi), %xmm1
230	sub	$32, %ecx
231	movdqa	%xmm0, (%edx, %edi)
232	movdqa	%xmm1, 16(%edx, %edi)
233	lea	32(%edi), %edi
234	jb	L(shl_0_end)
235
236	movdqa	(%eax, %edi), %xmm0
237	movdqa	16(%eax, %edi), %xmm1
238	sub	$32, %ecx
239	movdqa	%xmm0, (%edx, %edi)
240	movdqa	%xmm1, 16(%edx, %edi)
241	lea	32(%edi), %edi
242	jb	L(shl_0_end)
243
244	movdqa	(%eax, %edi), %xmm0
245	movdqa	16(%eax, %edi), %xmm1
246	sub	$32, %ecx
247	movdqa	%xmm0, (%edx, %edi)
248	movdqa	%xmm1, 16(%edx, %edi)
249	lea	32(%edi), %edi
250
	/* Undo the 32-byte bias to get the leftover count in %ecx, advance both
	   pointers by bytes copied + leftover, restore %edi and dispatch on the
	   leftover count.  */
251L(shl_0_end):
252	lea	32(%ecx), %ecx
253	add	%ecx, %edi
254	add	%edi, %edx
255	add	%edi, %eax
256	POP (%edi)
257	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
258
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
259	CFI_PUSH (%edi)
260
/* L(shl_0_gobble): aligned copy of more than 127 bytes.
   In:  %eax = src, %edx = dest (both 16-byte aligned), %ecx = bytes
        remaining, caller's %edi on the stack.
   %edi is restored here.  A length at or above the data-cache threshold
   goes to L(shl_0_gobble_mem_loop); smaller lengths use the loop below,
   which moves 128 bytes per iteration through %xmm0-%xmm7.  The remainder
   is then copied in 64/32/16-byte pieces and the last 0..15 bytes are
   handled through L(table_48bytes_fwd) (defined outside this excerpt).  */
261	.p2align 4
262L(shl_0_gobble):
263#ifdef DATA_CACHE_SIZE_HALF
264	cmp	$DATA_CACHE_SIZE_HALF, %ecx
265#else
266# if (defined SHARED || defined __PIC__)
267	SETUP_PIC_REG(bx)
268	add	$_GLOBAL_OFFSET_TABLE_, %ebx
269	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
270# else
271	cmp	__x86_data_cache_size_half, %ecx
272# endif
273#endif
	/* popl and lea leave the flags alone, so jae tests the cmp above.
	   %ecx is biased by -128 for the loop.  */
274	POP	(%edi)
275	lea	-128(%ecx), %ecx
276	jae	L(shl_0_gobble_mem_loop)
277
278	.p2align 4
279L(shl_0_gobble_cache_loop):
280	movdqa	(%eax), %xmm0
281	movdqa	0x10(%eax), %xmm1
282	movdqa	0x20(%eax), %xmm2
283	movdqa	0x30(%eax), %xmm3
284	movdqa	0x40(%eax), %xmm4
285	movdqa	0x50(%eax), %xmm5
286	movdqa	0x60(%eax), %xmm6
287	movdqa	0x70(%eax), %xmm7
288	lea	0x80(%eax), %eax
289	sub	$128, %ecx
290	movdqa	%xmm0, (%edx)
291	movdqa	%xmm1, 0x10(%edx)
292	movdqa	%xmm2, 0x20(%edx)
293	movdqa	%xmm3, 0x30(%edx)
294	movdqa	%xmm4, 0x40(%edx)
295	movdqa	%xmm5, 0x50(%edx)
296	movdqa	%xmm6, 0x60(%edx)
297	movdqa	%xmm7, 0x70(%edx)
298	lea	0x80(%edx), %edx
299
	/* Loop while the `sub $128' did not borrow.  On exit %ecx is negative;
	   the signed test against -64 decides whether at least 64 bytes are
	   left, and the lea (flag-neutral) removes the bias so %ecx is the
	   real remainder.  */
300	jae	L(shl_0_gobble_cache_loop)
301	cmp	$-0x40, %ecx
302	lea	0x80(%ecx), %ecx
303	jl	L(shl_0_cache_less_64bytes)
304
305	movdqa	(%eax), %xmm0
306	sub	$0x40, %ecx
307	movdqa	0x10(%eax), %xmm1
308	movdqa	%xmm0, (%edx)
309	movdqa	%xmm1, 0x10(%edx)
310	movdqa	0x20(%eax), %xmm0
311	movdqa	0x30(%eax), %xmm1
312	add	$0x40, %eax
313	movdqa	%xmm0, 0x20(%edx)
314	movdqa	%xmm1, 0x30(%edx)
315	add	$0x40, %edx
316
317L(shl_0_cache_less_64bytes):
318	cmp	$0x20, %ecx
319	jb	L(shl_0_cache_less_32bytes)
320	movdqa	(%eax), %xmm0
321	sub	$0x20, %ecx
322	movdqa	0x10(%eax), %xmm1
323	add	$0x20, %eax
324	movdqa	%xmm0, (%edx)
325	movdqa	%xmm1, 0x10(%edx)
326	add	$0x20, %edx
327
328L(shl_0_cache_less_32bytes):
329	cmp	$0x10, %ecx
330	jb	L(shl_0_cache_less_16bytes)
331	sub	$0x10, %ecx
332	movdqa	(%eax), %xmm0
333	add	$0x10, %eax
334	movdqa	%xmm0, (%edx)
335	add	$0x10, %edx
336
	/* Fewer than 16 bytes left: advance both pointers by the leftover count
	   and dispatch on it.  */
337L(shl_0_cache_less_16bytes):
338	add	%ecx, %edx
339	add	%ecx, %eax
340	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
341
/* L(shl_0_gobble_mem_loop): aligned copy for lengths at or above the
   data-cache threshold.  Entered from L(shl_0_gobble) with %ecx already
   biased by -128 and %edi already restored.
   Same shape as the cache loop -- 128 bytes per iteration through
   %xmm0-%xmm7, then 64/32/16-byte pieces -- but each iteration first issues
   prefetcht0 for src+0x1c0, src+0x280 and dest+0x1c0.  The last 0..15 bytes
   are handled through L(table_48bytes_fwd_align) (defined outside this
   excerpt).  */
342	.p2align 4
343L(shl_0_gobble_mem_loop):
344	prefetcht0 0x1c0(%eax)
345	prefetcht0 0x280(%eax)
346	prefetcht0 0x1c0(%edx)
347
348	movdqa	(%eax), %xmm0
349	movdqa	0x10(%eax), %xmm1
350	movdqa	0x20(%eax), %xmm2
351	movdqa	0x30(%eax), %xmm3
352	movdqa	0x40(%eax), %xmm4
353	movdqa	0x50(%eax), %xmm5
354	movdqa	0x60(%eax), %xmm6
355	movdqa	0x70(%eax), %xmm7
356	lea	0x80(%eax), %eax
357	sub	$0x80, %ecx
358	movdqa	%xmm0, (%edx)
359	movdqa	%xmm1, 0x10(%edx)
360	movdqa	%xmm2, 0x20(%edx)
361	movdqa	%xmm3, 0x30(%edx)
362	movdqa	%xmm4, 0x40(%edx)
363	movdqa	%xmm5, 0x50(%edx)
364	movdqa	%xmm6, 0x60(%edx)
365	movdqa	%xmm7, 0x70(%edx)
366	lea	0x80(%edx), %edx
367
	/* Loop while the `sub $0x80' did not borrow; then test (signed) whether
	   at least 64 bytes are left and remove the -128 bias with a
	   flag-neutral lea.  */
368	jae	L(shl_0_gobble_mem_loop)
369	cmp	$-0x40, %ecx
370	lea	0x80(%ecx), %ecx
371	jl	L(shl_0_mem_less_64bytes)
372
373	movdqa	(%eax), %xmm0
374	sub	$0x40, %ecx
375	movdqa	0x10(%eax), %xmm1
376
377	movdqa	%xmm0, (%edx)
378	movdqa	%xmm1, 0x10(%edx)
379
380	movdqa	0x20(%eax), %xmm0
381	movdqa	0x30(%eax), %xmm1
382	add	$0x40, %eax
383
384	movdqa	%xmm0, 0x20(%edx)
385	movdqa	%xmm1, 0x30(%edx)
386	add	$0x40, %edx
387
388L(shl_0_mem_less_64bytes):
389	cmp	$0x20, %ecx
390	jb	L(shl_0_mem_less_32bytes)
391	movdqa	(%eax), %xmm0
392	sub	$0x20, %ecx
393	movdqa	0x10(%eax), %xmm1
394	add	$0x20, %eax
395	movdqa	%xmm0, (%edx)
396	movdqa	%xmm1, 0x10(%edx)
397	add	$0x20, %edx
398
399L(shl_0_mem_less_32bytes):
400	cmp	$0x10, %ecx
401	jb	L(shl_0_mem_less_16bytes)
402	sub	$0x10, %ecx
403	movdqa	(%eax), %xmm0
404	add	$0x10, %eax
405	movdqa	%xmm0, (%edx)
406	add	$0x10, %edx
407
	/* Fewer than 16 bytes left: advance both pointers by the leftover count
	   and dispatch on it.  */
408L(shl_0_mem_less_16bytes):
409	add	%ecx, %edx
410	add	%ecx, %eax
411	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
412
/* L(shl_1): forward copy for a source that lies 1 byte above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-1, src+15, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $1.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
413	.p2align 4
414L(shl_1):
415#ifndef USE_AS_MEMMOVE
416	movaps	-1(%eax), %xmm1
417#else
418	movl	DEST+4(%esp), %edi
419	movaps	-1(%eax), %xmm1
420	movdqu	%xmm0, (%edi)
421#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
422#ifdef DATA_CACHE_SIZE_HALF
423	cmp	$DATA_CACHE_SIZE_HALF, %ecx
424#else
425# if (defined SHARED || defined __PIC__)
426	SETUP_PIC_REG(bx)
427	add	$_GLOBAL_OFFSET_TABLE_, %ebx
428	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
429# else
430	cmp	__x86_data_cache_size_half, %ecx
431# endif
432#endif
433	jb L(sh_1_no_prefetch)
434
435	lea	-64(%ecx), %ecx
436
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
437	.p2align 4
438L(Shl1LoopStart):
439	prefetcht0 0x1c0(%eax)
440	prefetcht0 0x1c0(%edx)
441	movaps	15(%eax), %xmm2
442	movaps	31(%eax), %xmm3
443	movaps	47(%eax), %xmm4
444	movaps	63(%eax), %xmm5
445	movaps	%xmm5, %xmm7
446	palignr	$1, %xmm4, %xmm5
447	palignr	$1, %xmm3, %xmm4
448	movaps	%xmm5, 48(%edx)
449	palignr	$1, %xmm2, %xmm3
450	lea	64(%eax), %eax
451	palignr	$1, %xmm1, %xmm2
452	movaps	%xmm4, 32(%edx)
453	movaps	%xmm3, 16(%edx)
454	movaps	%xmm7, %xmm1
455	movaps	%xmm2, (%edx)
456	lea	64(%edx), %edx
457	sub	$64, %ecx
458	ja	L(Shl1LoopStart)
459
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
460L(Shl1LoopLeave):
461	add	$32, %ecx
462	jle	L(shl_end_0)
463
464	movaps	15(%eax), %xmm2
465	movaps	31(%eax), %xmm3
466	palignr	$1, %xmm2, %xmm3
467	palignr	$1, %xmm1, %xmm2
468	movaps	%xmm2, (%edx)
469	movaps	%xmm3, 16(%edx)
470	lea	32(%edx, %ecx), %edx
471	lea	32(%eax, %ecx), %eax
472	POP (%edi)
473	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
474
475	CFI_PUSH (%edi)
476
	/* Variant without prefetch: %eax is moved down by 1 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
477	.p2align 4
478L(sh_1_no_prefetch):
479	lea	-32(%ecx), %ecx
480	lea	-1(%eax), %eax
481	xor	%edi, %edi
482
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
483	.p2align 4
484L(sh_1_no_prefetch_loop):
485	movdqa	16(%eax, %edi), %xmm2
486	sub	$32, %ecx
487	movdqa	32(%eax, %edi), %xmm3
488	movdqa	%xmm3, %xmm4
489	palignr	$1, %xmm2, %xmm3
490	palignr	$1, %xmm1, %xmm2
491	lea	32(%edi), %edi
492	movdqa	%xmm2, -32(%edx, %edi)
493	movdqa	%xmm3, -16(%edx, %edi)
494	jb	L(sh_1_end_no_prefetch_loop)
495
496	movdqa	16(%eax, %edi), %xmm2
497	sub	$32, %ecx
498	movdqa	32(%eax, %edi), %xmm3
499	movdqa	%xmm3, %xmm1
500	palignr	$1, %xmm2, %xmm3
501	palignr	$1, %xmm4, %xmm2
502	lea	32(%edi), %edi
503	movdqa	%xmm2, -32(%edx, %edi)
504	movdqa	%xmm3, -16(%edx, %edi)
505	jae	L(sh_1_no_prefetch_loop)
506
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +1 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
507L(sh_1_end_no_prefetch_loop):
508	lea	32(%ecx), %ecx
509	add	%ecx, %edi
510	add	%edi, %edx
511	lea	1(%edi, %eax), %eax
512	POP	(%edi)
513	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
514
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
515	CFI_PUSH (%edi)
516
/* L(shl_2): forward copy for a source that lies 2 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-2, src+14, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $2.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
517	.p2align 4
518L(shl_2):
519#ifndef USE_AS_MEMMOVE
520	movaps	-2(%eax), %xmm1
521#else
522	movl	DEST+4(%esp), %edi
523	movaps	-2(%eax), %xmm1
524	movdqu	%xmm0, (%edi)
525#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
526#ifdef DATA_CACHE_SIZE_HALF
527	cmp	$DATA_CACHE_SIZE_HALF, %ecx
528#else
529# if (defined SHARED || defined __PIC__)
530	SETUP_PIC_REG(bx)
531	add	$_GLOBAL_OFFSET_TABLE_, %ebx
532	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
533# else
534	cmp	__x86_data_cache_size_half, %ecx
535# endif
536#endif
537	jb L(sh_2_no_prefetch)
538
539	lea	-64(%ecx), %ecx
540
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
541	.p2align 4
542L(Shl2LoopStart):
543	prefetcht0 0x1c0(%eax)
544	prefetcht0 0x1c0(%edx)
545	movaps	14(%eax), %xmm2
546	movaps	30(%eax), %xmm3
547	movaps	46(%eax), %xmm4
548	movaps	62(%eax), %xmm5
549	movaps	%xmm5, %xmm7
550	palignr	$2, %xmm4, %xmm5
551	palignr	$2, %xmm3, %xmm4
552	movaps	%xmm5, 48(%edx)
553	palignr	$2, %xmm2, %xmm3
554	lea	64(%eax), %eax
555	palignr	$2, %xmm1, %xmm2
556	movaps	%xmm4, 32(%edx)
557	movaps	%xmm3, 16(%edx)
558	movaps	%xmm7, %xmm1
559	movaps	%xmm2, (%edx)
560	lea	64(%edx), %edx
561	sub	$64, %ecx
562	ja	L(Shl2LoopStart)
563
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
564L(Shl2LoopLeave):
565	add	$32, %ecx
566	jle	L(shl_end_0)
567
568	movaps	14(%eax), %xmm2
569	movaps	30(%eax), %xmm3
570	palignr	$2, %xmm2, %xmm3
571	palignr	$2, %xmm1, %xmm2
572	movaps	%xmm2, (%edx)
573	movaps	%xmm3, 16(%edx)
574	lea	32(%edx, %ecx), %edx
575	lea	32(%eax, %ecx), %eax
576	POP (%edi)
577	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
578
579	CFI_PUSH (%edi)
580
	/* Variant without prefetch: %eax is moved down by 2 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
581	.p2align 4
582L(sh_2_no_prefetch):
583	lea	-32(%ecx), %ecx
584	lea	-2(%eax), %eax
585	xor	%edi, %edi
586
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
587	.p2align 4
588L(sh_2_no_prefetch_loop):
589	movdqa	16(%eax, %edi), %xmm2
590	sub	$32, %ecx
591	movdqa	32(%eax, %edi), %xmm3
592	movdqa	%xmm3, %xmm4
593	palignr	$2, %xmm2, %xmm3
594	palignr	$2, %xmm1, %xmm2
595	lea	32(%edi), %edi
596	movdqa	%xmm2, -32(%edx, %edi)
597	movdqa	%xmm3, -16(%edx, %edi)
598	jb	L(sh_2_end_no_prefetch_loop)
599
600	movdqa	16(%eax, %edi), %xmm2
601	sub	$32, %ecx
602	movdqa	32(%eax, %edi), %xmm3
603	movdqa	%xmm3, %xmm1
604	palignr	$2, %xmm2, %xmm3
605	palignr	$2, %xmm4, %xmm2
606	lea	32(%edi), %edi
607	movdqa	%xmm2, -32(%edx, %edi)
608	movdqa	%xmm3, -16(%edx, %edi)
609	jae	L(sh_2_no_prefetch_loop)
610
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +2 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
611L(sh_2_end_no_prefetch_loop):
612	lea	32(%ecx), %ecx
613	add	%ecx, %edi
614	add	%edi, %edx
615	lea	2(%edi, %eax), %eax
616	POP	(%edi)
617	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
618
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
619	CFI_PUSH (%edi)
620
/* L(shl_3): forward copy for a source that lies 3 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-3, src+13, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $3.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
621	.p2align 4
622L(shl_3):
623#ifndef USE_AS_MEMMOVE
624	movaps	-3(%eax), %xmm1
625#else
626	movl	DEST+4(%esp), %edi
627	movaps	-3(%eax), %xmm1
628	movdqu	%xmm0, (%edi)
629#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
630#ifdef DATA_CACHE_SIZE_HALF
631	cmp	$DATA_CACHE_SIZE_HALF, %ecx
632#else
633# if (defined SHARED || defined __PIC__)
634	SETUP_PIC_REG(bx)
635	add	$_GLOBAL_OFFSET_TABLE_, %ebx
636	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
637# else
638	cmp	__x86_data_cache_size_half, %ecx
639# endif
640#endif
641	jb L(sh_3_no_prefetch)
642
643	lea	-64(%ecx), %ecx
644
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
645	.p2align 4
646L(Shl3LoopStart):
647	prefetcht0 0x1c0(%eax)
648	prefetcht0 0x1c0(%edx)
649	movaps	13(%eax), %xmm2
650	movaps	29(%eax), %xmm3
651	movaps	45(%eax), %xmm4
652	movaps	61(%eax), %xmm5
653	movaps	%xmm5, %xmm7
654	palignr	$3, %xmm4, %xmm5
655	palignr	$3, %xmm3, %xmm4
656	movaps	%xmm5, 48(%edx)
657	palignr	$3, %xmm2, %xmm3
658	lea	64(%eax), %eax
659	palignr	$3, %xmm1, %xmm2
660	movaps	%xmm4, 32(%edx)
661	movaps	%xmm3, 16(%edx)
662	movaps	%xmm7, %xmm1
663	movaps	%xmm2, (%edx)
664	lea	64(%edx), %edx
665	sub	$64, %ecx
666	ja	L(Shl3LoopStart)
667
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
668L(Shl3LoopLeave):
669	add	$32, %ecx
670	jle	L(shl_end_0)
671
672	movaps	13(%eax), %xmm2
673	movaps	29(%eax), %xmm3
674	palignr	$3, %xmm2, %xmm3
675	palignr	$3, %xmm1, %xmm2
676	movaps	%xmm2, (%edx)
677	movaps	%xmm3, 16(%edx)
678	lea	32(%edx, %ecx), %edx
679	lea	32(%eax, %ecx), %eax
680	POP (%edi)
681	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
682
683	CFI_PUSH (%edi)
684
	/* Variant without prefetch: %eax is moved down by 3 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
685	.p2align 4
686L(sh_3_no_prefetch):
687	lea	-32(%ecx), %ecx
688	lea	-3(%eax), %eax
689	xor	%edi, %edi
690
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
691	.p2align 4
692L(sh_3_no_prefetch_loop):
693	movdqa	16(%eax, %edi), %xmm2
694	sub	$32, %ecx
695	movdqa	32(%eax, %edi), %xmm3
696	movdqa	%xmm3, %xmm4
697	palignr	$3, %xmm2, %xmm3
698	palignr	$3, %xmm1, %xmm2
699	lea	32(%edi), %edi
700	movdqa	%xmm2, -32(%edx, %edi)
701	movdqa	%xmm3, -16(%edx, %edi)
702
703	jb	L(sh_3_end_no_prefetch_loop)
704
705	movdqa	16(%eax, %edi), %xmm2
706	sub	$32, %ecx
707	movdqa	32(%eax, %edi), %xmm3
708	movdqa	%xmm3, %xmm1
709	palignr	$3, %xmm2, %xmm3
710	palignr	$3, %xmm4, %xmm2
711	lea	32(%edi), %edi
712	movdqa	%xmm2, -32(%edx, %edi)
713	movdqa	%xmm3, -16(%edx, %edi)
714
715	jae	L(sh_3_no_prefetch_loop)
716
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +3 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
717L(sh_3_end_no_prefetch_loop):
718	lea	32(%ecx), %ecx
719	add	%ecx, %edi
720	add	%edi, %edx
721	lea	3(%edi, %eax), %eax
722	POP	(%edi)
723	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
724
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
725	CFI_PUSH (%edi)
726
/* L(shl_4): forward copy for a source that lies 4 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-4, src+12, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $4.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
727	.p2align 4
728L(shl_4):
729#ifndef USE_AS_MEMMOVE
730	movaps	-4(%eax), %xmm1
731#else
732	movl	DEST+4(%esp), %edi
733	movaps	-4(%eax), %xmm1
734	movdqu	%xmm0, (%edi)
735#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
736#ifdef DATA_CACHE_SIZE_HALF
737	cmp	$DATA_CACHE_SIZE_HALF, %ecx
738#else
739# if (defined SHARED || defined __PIC__)
740	SETUP_PIC_REG(bx)
741	add	$_GLOBAL_OFFSET_TABLE_, %ebx
742	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
743# else
744	cmp	__x86_data_cache_size_half, %ecx
745# endif
746#endif
747	jb L(sh_4_no_prefetch)
748
749	lea	-64(%ecx), %ecx
750
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
751	.p2align 4
752L(Shl4LoopStart):
753	prefetcht0 0x1c0(%eax)
754	prefetcht0 0x1c0(%edx)
755	movaps	12(%eax), %xmm2
756	movaps	28(%eax), %xmm3
757	movaps	44(%eax), %xmm4
758	movaps	60(%eax), %xmm5
759	movaps	%xmm5, %xmm7
760	palignr	$4, %xmm4, %xmm5
761	palignr	$4, %xmm3, %xmm4
762	movaps	%xmm5, 48(%edx)
763	palignr	$4, %xmm2, %xmm3
764	lea	64(%eax), %eax
765	palignr	$4, %xmm1, %xmm2
766	movaps	%xmm4, 32(%edx)
767	movaps	%xmm3, 16(%edx)
768	movaps	%xmm7, %xmm1
769	movaps	%xmm2, (%edx)
770	lea	64(%edx), %edx
771	sub	$64, %ecx
772	ja	L(Shl4LoopStart)
773
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
774L(Shl4LoopLeave):
775	add	$32, %ecx
776	jle	L(shl_end_0)
777
778	movaps	12(%eax), %xmm2
779	movaps	28(%eax), %xmm3
780	palignr	$4, %xmm2, %xmm3
781	palignr	$4, %xmm1, %xmm2
782	movaps	%xmm2, (%edx)
783	movaps	%xmm3, 16(%edx)
784	lea	32(%edx, %ecx), %edx
785	lea	32(%eax, %ecx), %eax
786	POP (%edi)
787	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
788
789	CFI_PUSH (%edi)
790
	/* Variant without prefetch: %eax is moved down by 4 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
791	.p2align 4
792L(sh_4_no_prefetch):
793	lea	-32(%ecx), %ecx
794	lea	-4(%eax), %eax
795	xor	%edi, %edi
796
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
797	.p2align 4
798L(sh_4_no_prefetch_loop):
799	movdqa	16(%eax, %edi), %xmm2
800	sub	$32, %ecx
801	movdqa	32(%eax, %edi), %xmm3
802	movdqa	%xmm3, %xmm4
803	palignr	$4, %xmm2, %xmm3
804	palignr	$4, %xmm1, %xmm2
805	lea	32(%edi), %edi
806	movdqa	%xmm2, -32(%edx, %edi)
807	movdqa	%xmm3, -16(%edx, %edi)
808
809	jb	L(sh_4_end_no_prefetch_loop)
810
811	movdqa	16(%eax, %edi), %xmm2
812	sub	$32, %ecx
813	movdqa	32(%eax, %edi), %xmm3
814	movdqa	%xmm3, %xmm1
815	palignr	$4, %xmm2, %xmm3
816	palignr	$4, %xmm4, %xmm2
817	lea	32(%edi), %edi
818	movdqa	%xmm2, -32(%edx, %edi)
819	movdqa	%xmm3, -16(%edx, %edi)
820
821	jae	L(sh_4_no_prefetch_loop)
822
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +4 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
823L(sh_4_end_no_prefetch_loop):
824	lea	32(%ecx), %ecx
825	add	%ecx, %edi
826	add	%edi, %edx
827	lea	4(%edi, %eax), %eax
828	POP	(%edi)
829	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
830
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
831	CFI_PUSH (%edi)
832
/* L(shl_5): forward copy for a source that lies 5 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-5, src+11, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $5.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
833	.p2align 4
834L(shl_5):
835#ifndef USE_AS_MEMMOVE
836	movaps	-5(%eax), %xmm1
837#else
838	movl	DEST+4(%esp), %edi
839	movaps	-5(%eax), %xmm1
840	movdqu	%xmm0, (%edi)
841#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
842#ifdef DATA_CACHE_SIZE_HALF
843	cmp	$DATA_CACHE_SIZE_HALF, %ecx
844#else
845# if (defined SHARED || defined __PIC__)
846	SETUP_PIC_REG(bx)
847	add	$_GLOBAL_OFFSET_TABLE_, %ebx
848	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
849# else
850	cmp	__x86_data_cache_size_half, %ecx
851# endif
852#endif
853	jb L(sh_5_no_prefetch)
854
855	lea	-64(%ecx), %ecx
856
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
857	.p2align 4
858L(Shl5LoopStart):
859	prefetcht0 0x1c0(%eax)
860	prefetcht0 0x1c0(%edx)
861	movaps	11(%eax), %xmm2
862	movaps	27(%eax), %xmm3
863	movaps	43(%eax), %xmm4
864	movaps	59(%eax), %xmm5
865	movaps	%xmm5, %xmm7
866	palignr	$5, %xmm4, %xmm5
867	palignr	$5, %xmm3, %xmm4
868	movaps	%xmm5, 48(%edx)
869	palignr	$5, %xmm2, %xmm3
870	lea	64(%eax), %eax
871	palignr	$5, %xmm1, %xmm2
872	movaps	%xmm4, 32(%edx)
873	movaps	%xmm3, 16(%edx)
874	movaps	%xmm7, %xmm1
875	movaps	%xmm2, (%edx)
876	lea	64(%edx), %edx
877	sub	$64, %ecx
878	ja	L(Shl5LoopStart)
879
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
880L(Shl5LoopLeave):
881	add	$32, %ecx
882	jle	L(shl_end_0)
883
884	movaps	11(%eax), %xmm2
885	movaps	27(%eax), %xmm3
886	palignr	$5, %xmm2, %xmm3
887	palignr	$5, %xmm1, %xmm2
888	movaps	%xmm2, (%edx)
889	movaps	%xmm3, 16(%edx)
890	lea	32(%edx, %ecx), %edx
891	lea	32(%eax, %ecx), %eax
892	POP (%edi)
893	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
894
895	CFI_PUSH (%edi)
896
	/* Variant without prefetch: %eax is moved down by 5 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
897	.p2align 4
898L(sh_5_no_prefetch):
899	lea	-32(%ecx), %ecx
900	lea	-5(%eax), %eax
901	xor	%edi, %edi
902
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
903	.p2align 4
904L(sh_5_no_prefetch_loop):
905	movdqa	16(%eax, %edi), %xmm2
906	sub	$32, %ecx
907	movdqa	32(%eax, %edi), %xmm3
908	movdqa	%xmm3, %xmm4
909	palignr	$5, %xmm2, %xmm3
910	palignr	$5, %xmm1, %xmm2
911	lea	32(%edi), %edi
912	movdqa	%xmm2, -32(%edx, %edi)
913	movdqa	%xmm3, -16(%edx, %edi)
914
915	jb	L(sh_5_end_no_prefetch_loop)
916
917	movdqa	16(%eax, %edi), %xmm2
918	sub	$32, %ecx
919	movdqa	32(%eax, %edi), %xmm3
920	movdqa	%xmm3, %xmm1
921	palignr	$5, %xmm2, %xmm3
922	palignr	$5, %xmm4, %xmm2
923	lea	32(%edi), %edi
924	movdqa	%xmm2, -32(%edx, %edi)
925	movdqa	%xmm3, -16(%edx, %edi)
926
927	jae	L(sh_5_no_prefetch_loop)
928
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +5 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
929L(sh_5_end_no_prefetch_loop):
930	lea	32(%ecx), %ecx
931	add	%ecx, %edi
932	add	%edi, %edx
933	lea	5(%edi, %eax), %eax
934	POP	(%edi)
935	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
936
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
937	CFI_PUSH (%edi)
938
/* L(shl_6): forward copy for a source that lies 6 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-6, src+10, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $6.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
939	.p2align 4
940L(shl_6):
941#ifndef USE_AS_MEMMOVE
942	movaps	-6(%eax), %xmm1
943#else
944	movl	DEST+4(%esp), %edi
945	movaps	-6(%eax), %xmm1
946	movdqu	%xmm0, (%edi)
947#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
948#ifdef DATA_CACHE_SIZE_HALF
949	cmp	$DATA_CACHE_SIZE_HALF, %ecx
950#else
951# if (defined SHARED || defined __PIC__)
952	SETUP_PIC_REG(bx)
953	add	$_GLOBAL_OFFSET_TABLE_, %ebx
954	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
955# else
956	cmp	__x86_data_cache_size_half, %ecx
957# endif
958#endif
959	jb L(sh_6_no_prefetch)
960
961	lea	-64(%ecx), %ecx
962
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
963	.p2align 4
964L(Shl6LoopStart):
965	prefetcht0 0x1c0(%eax)
966	prefetcht0 0x1c0(%edx)
967	movaps	10(%eax), %xmm2
968	movaps	26(%eax), %xmm3
969	movaps	42(%eax), %xmm4
970	movaps	58(%eax), %xmm5
971	movaps	%xmm5, %xmm7
972	palignr	$6, %xmm4, %xmm5
973	palignr	$6, %xmm3, %xmm4
974	movaps	%xmm5, 48(%edx)
975	palignr	$6, %xmm2, %xmm3
976	lea	64(%eax), %eax
977	palignr	$6, %xmm1, %xmm2
978	movaps	%xmm4, 32(%edx)
979	movaps	%xmm3, 16(%edx)
980	movaps	%xmm7, %xmm1
981	movaps	%xmm2, (%edx)
982	lea	64(%edx), %edx
983	sub	$64, %ecx
984	ja	L(Shl6LoopStart)
985
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
986L(Shl6LoopLeave):
987	add	$32, %ecx
988	jle	L(shl_end_0)
989
990	movaps	10(%eax), %xmm2
991	movaps	26(%eax), %xmm3
992	palignr	$6, %xmm2, %xmm3
993	palignr	$6, %xmm1, %xmm2
994	movaps	%xmm2, (%edx)
995	movaps	%xmm3, 16(%edx)
996	lea	32(%edx, %ecx), %edx
997	lea	32(%eax, %ecx), %eax
998	POP (%edi)
999	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1000
1001	CFI_PUSH (%edi)
1002
	/* Variant without prefetch: %eax is moved down by 6 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
1003	.p2align 4
1004L(sh_6_no_prefetch):
1005	lea	-32(%ecx), %ecx
1006	lea	-6(%eax), %eax
1007	xor	%edi, %edi
1008
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
1009	.p2align 4
1010L(sh_6_no_prefetch_loop):
1011	movdqa	16(%eax, %edi), %xmm2
1012	sub	$32, %ecx
1013	movdqa	32(%eax, %edi), %xmm3
1014	movdqa	%xmm3, %xmm4
1015	palignr	$6, %xmm2, %xmm3
1016	palignr	$6, %xmm1, %xmm2
1017	lea	32(%edi), %edi
1018	movdqa	%xmm2, -32(%edx, %edi)
1019	movdqa	%xmm3, -16(%edx, %edi)
1020
1021	jb	L(sh_6_end_no_prefetch_loop)
1022
1023	movdqa	16(%eax, %edi), %xmm2
1024	sub	$32, %ecx
1025	movdqa	32(%eax, %edi), %xmm3
1026	movdqa	%xmm3, %xmm1
1027	palignr	$6, %xmm2, %xmm3
1028	palignr	$6, %xmm4, %xmm2
1029	lea	32(%edi), %edi
1030	movdqa	%xmm2, -32(%edx, %edi)
1031	movdqa	%xmm3, -16(%edx, %edi)
1032
1033	jae	L(sh_6_no_prefetch_loop)
1034
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +6 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
1035L(sh_6_end_no_prefetch_loop):
1036	lea	32(%ecx), %ecx
1037	add	%ecx, %edi
1038	add	%edi, %edx
1039	lea	6(%edi, %eax), %eax
1040	POP	(%edi)
1041	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1042
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
1043	CFI_PUSH (%edi)
1044
/* L(shl_7): forward copy for a source that lies 7 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-7, src+9, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $7.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
1045	.p2align 4
1046L(shl_7):
1047#ifndef USE_AS_MEMMOVE
1048	movaps	-7(%eax), %xmm1
1049#else
1050	movl	DEST+4(%esp), %edi
1051	movaps	-7(%eax), %xmm1
1052	movdqu	%xmm0, (%edi)
1053#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
1054#ifdef DATA_CACHE_SIZE_HALF
1055	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1056#else
1057# if (defined SHARED || defined __PIC__)
1058	SETUP_PIC_REG(bx)
1059	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1060	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1061# else
1062	cmp	__x86_data_cache_size_half, %ecx
1063# endif
1064#endif
1065	jb L(sh_7_no_prefetch)
1066
1067	lea	-64(%ecx), %ecx
1068
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
1069	.p2align 4
1070L(Shl7LoopStart):
1071	prefetcht0 0x1c0(%eax)
1072	prefetcht0 0x1c0(%edx)
1073	movaps	9(%eax), %xmm2
1074	movaps	25(%eax), %xmm3
1075	movaps	41(%eax), %xmm4
1076	movaps	57(%eax), %xmm5
1077	movaps	%xmm5, %xmm7
1078	palignr	$7, %xmm4, %xmm5
1079	palignr	$7, %xmm3, %xmm4
1080	movaps	%xmm5, 48(%edx)
1081	palignr	$7, %xmm2, %xmm3
1082	lea	64(%eax), %eax
1083	palignr	$7, %xmm1, %xmm2
1084	movaps	%xmm4, 32(%edx)
1085	movaps	%xmm3, 16(%edx)
1086	movaps	%xmm7, %xmm1
1087	movaps	%xmm2, (%edx)
1088	lea	64(%edx), %edx
1089	sub	$64, %ecx
1090	ja	L(Shl7LoopStart)
1091
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
1092L(Shl7LoopLeave):
1093	add	$32, %ecx
1094	jle	L(shl_end_0)
1095
1096	movaps	9(%eax), %xmm2
1097	movaps	25(%eax), %xmm3
1098	palignr	$7, %xmm2, %xmm3
1099	palignr	$7, %xmm1, %xmm2
1100	movaps	%xmm2, (%edx)
1101	movaps	%xmm3, 16(%edx)
1102	lea	32(%edx, %ecx), %edx
1103	lea	32(%eax, %ecx), %eax
1104	POP (%edi)
1105	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1106
1107	CFI_PUSH (%edi)
1108
	/* Variant without prefetch: %eax is moved down by 7 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
1109	.p2align 4
1110L(sh_7_no_prefetch):
1111	lea	-32(%ecx), %ecx
1112	lea	-7(%eax), %eax
1113	xor	%edi, %edi
1114
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
1115	.p2align 4
1116L(sh_7_no_prefetch_loop):
1117	movdqa	16(%eax, %edi), %xmm2
1118	sub	$32, %ecx
1119	movdqa	32(%eax, %edi), %xmm3
1120	movdqa	%xmm3, %xmm4
1121	palignr	$7, %xmm2, %xmm3
1122	palignr	$7, %xmm1, %xmm2
1123	lea	32(%edi), %edi
1124	movdqa	%xmm2, -32(%edx, %edi)
1125	movdqa	%xmm3, -16(%edx, %edi)
1126	jb	L(sh_7_end_no_prefetch_loop)
1127
1128	movdqa	16(%eax, %edi), %xmm2
1129	sub	$32, %ecx
1130	movdqa	32(%eax, %edi), %xmm3
1131	movdqa	%xmm3, %xmm1
1132	palignr	$7, %xmm2, %xmm3
1133	palignr	$7, %xmm4, %xmm2
1134	lea	32(%edi), %edi
1135	movdqa	%xmm2, -32(%edx, %edi)
1136	movdqa	%xmm3, -16(%edx, %edi)
1137	jae	L(sh_7_no_prefetch_loop)
1138
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +7 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
1139L(sh_7_end_no_prefetch_loop):
1140	lea	32(%ecx), %ecx
1141	add	%ecx, %edi
1142	add	%edi, %edx
1143	lea	7(%edi, %eax), %eax
1144	POP	(%edi)
1145	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1146
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
1147	CFI_PUSH (%edi)
1148
/* L(shl_8): forward copy for a source that lies 8 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-8, src+8, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $8.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
1149	.p2align 4
1150L(shl_8):
1151#ifndef USE_AS_MEMMOVE
1152	movaps	-8(%eax), %xmm1
1153#else
1154	movl	DEST+4(%esp), %edi
1155	movaps	-8(%eax), %xmm1
1156	movdqu	%xmm0, (%edi)
1157#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
1158#ifdef DATA_CACHE_SIZE_HALF
1159	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1160#else
1161# if (defined SHARED || defined __PIC__)
1162	SETUP_PIC_REG(bx)
1163	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1164	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1165# else
1166	cmp	__x86_data_cache_size_half, %ecx
1167# endif
1168#endif
1169	jb L(sh_8_no_prefetch)
1170
1171	lea	-64(%ecx), %ecx
1172
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
1173	.p2align 4
1174L(Shl8LoopStart):
1175	prefetcht0 0x1c0(%eax)
1176	prefetcht0 0x1c0(%edx)
1177	movaps	8(%eax), %xmm2
1178	movaps	24(%eax), %xmm3
1179	movaps	40(%eax), %xmm4
1180	movaps	56(%eax), %xmm5
1181	movaps	%xmm5, %xmm7
1182	palignr	$8, %xmm4, %xmm5
1183	palignr	$8, %xmm3, %xmm4
1184	movaps	%xmm5, 48(%edx)
1185	palignr	$8, %xmm2, %xmm3
1186	lea	64(%eax), %eax
1187	palignr	$8, %xmm1, %xmm2
1188	movaps	%xmm4, 32(%edx)
1189	movaps	%xmm3, 16(%edx)
1190	movaps	%xmm7, %xmm1
1191	movaps	%xmm2, (%edx)
1192	lea	64(%edx), %edx
1193	sub	$64, %ecx
1194	ja	L(Shl8LoopStart)
1195
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  (This label is spelled
	   LoopLeave8, unlike the ShlNLoopLeave labels of the sibling
	   handlers.)  */
1196L(LoopLeave8):
1197	add	$32, %ecx
1198	jle	L(shl_end_0)
1199
1200	movaps	8(%eax), %xmm2
1201	movaps	24(%eax), %xmm3
1202	palignr	$8, %xmm2, %xmm3
1203	palignr	$8, %xmm1, %xmm2
1204	movaps	%xmm2, (%edx)
1205	movaps	%xmm3, 16(%edx)
1206	lea	32(%edx, %ecx), %edx
1207	lea	32(%eax, %ecx), %eax
1208	POP (%edi)
1209	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1210
1211	CFI_PUSH (%edi)
1212
	/* Variant without prefetch: %eax is moved down by 8 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
1213	.p2align 4
1214L(sh_8_no_prefetch):
1215	lea	-32(%ecx), %ecx
1216	lea	-8(%eax), %eax
1217	xor	%edi, %edi
1218
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
1219	.p2align 4
1220L(sh_8_no_prefetch_loop):
1221	movdqa	16(%eax, %edi), %xmm2
1222	sub	$32, %ecx
1223	movdqa	32(%eax, %edi), %xmm3
1224	movdqa	%xmm3, %xmm4
1225	palignr	$8, %xmm2, %xmm3
1226	palignr	$8, %xmm1, %xmm2
1227	lea	32(%edi), %edi
1228	movdqa	%xmm2, -32(%edx, %edi)
1229	movdqa	%xmm3, -16(%edx, %edi)
1230	jb	L(sh_8_end_no_prefetch_loop)
1231
1232	movdqa	16(%eax, %edi), %xmm2
1233	sub	$32, %ecx
1234	movdqa	32(%eax, %edi), %xmm3
1235	movdqa	%xmm3, %xmm1
1236	palignr	$8, %xmm2, %xmm3
1237	palignr	$8, %xmm4, %xmm2
1238	lea	32(%edi), %edi
1239	movdqa	%xmm2, -32(%edx, %edi)
1240	movdqa	%xmm3, -16(%edx, %edi)
1241	jae	L(sh_8_no_prefetch_loop)
1242
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +8 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
1243L(sh_8_end_no_prefetch_loop):
1244	lea	32(%ecx), %ecx
1245	add	%ecx, %edi
1246	add	%edi, %edx
1247	lea	8(%edi, %eax), %eax
1248	POP	(%edi)
1249	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1250
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
1251	CFI_PUSH (%edi)
1252
/* L(shl_9): forward copy for a source that lies 9 bytes above a 16-byte
   boundary; the destination in %edx is 16-byte aligned.  All loads are
   aligned (they address src-9, src+7, ...) and every output vector is
   spliced from two neighbouring source vectors with palignr $9.  %xmm1
   carries the older neighbour from one step to the next.
   In:  %eax = src, %edx = dest, %ecx = bytes remaining, caller's %edi on
        the stack (memmove build: %xmm0 = first 16 source bytes, stored to
        the original dest here, after %xmm1 has been loaded).
   Out: %edi restored, %eax/%edx advanced to the end of the data, %ecx =
        leftover count, dispatched through L(table_48bytes_fwd); or a jump
        to L(shl_end_0).  Both targets are defined outside this excerpt.
   NOTE(review): presumably reached through L(shl_table) -- confirm.  */
1253	.p2align 4
1254L(shl_9):
1255#ifndef USE_AS_MEMMOVE
1256	movaps	-9(%eax), %xmm1
1257#else
1258	movl	DEST+4(%esp), %edi
1259	movaps	-9(%eax), %xmm1
1260	movdqu	%xmm0, (%edi)
1261#endif
	/* Below the data-cache threshold use the loop without prefetch.  */
1262#ifdef DATA_CACHE_SIZE_HALF
1263	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1264#else
1265# if (defined SHARED || defined __PIC__)
1266	SETUP_PIC_REG(bx)
1267	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1268	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1269# else
1270	cmp	__x86_data_cache_size_half, %ecx
1271# endif
1272#endif
1273	jb L(sh_9_no_prefetch)
1274
1275	lea	-64(%ecx), %ecx
1276
	/* 64 bytes per iteration with prefetch 0x1c0 bytes ahead of both
	   pointers.  %xmm7 keeps an unshifted copy of the last source vector,
	   which becomes %xmm1 for the next iteration.  */
1277	.p2align 4
1278L(Shl9LoopStart):
1279	prefetcht0 0x1c0(%eax)
1280	prefetcht0 0x1c0(%edx)
1281	movaps	7(%eax), %xmm2
1282	movaps	23(%eax), %xmm3
1283	movaps	39(%eax), %xmm4
1284	movaps	55(%eax), %xmm5
1285	movaps	%xmm5, %xmm7
1286	palignr	$9, %xmm4, %xmm5
1287	palignr	$9, %xmm3, %xmm4
1288	movaps	%xmm5, 48(%edx)
1289	palignr	$9, %xmm2, %xmm3
1290	lea	64(%eax), %eax
1291	palignr	$9, %xmm1, %xmm2
1292	movaps	%xmm4, 32(%edx)
1293	movaps	%xmm3, 16(%edx)
1294	movaps	%xmm7, %xmm1
1295	movaps	%xmm2, (%edx)
1296	lea	64(%edx), %edx
1297	sub	$64, %ecx
1298	ja	L(Shl9LoopStart)
1299
	/* After the add, %ecx = bytes remaining - 32.  If that is <= 0 control
	   goes to L(shl_end_0); otherwise 32 more bytes are copied and both
	   pointers are moved to the end of the data.  */
1300L(Shl9LoopLeave):
1301	add	$32, %ecx
1302	jle	L(shl_end_0)
1303
1304	movaps	7(%eax), %xmm2
1305	movaps	23(%eax), %xmm3
1306	palignr	$9, %xmm2, %xmm3
1307	palignr	$9, %xmm1, %xmm2
1308
1309	movaps	%xmm2, (%edx)
1310	movaps	%xmm3, 16(%edx)
1311	lea	32(%edx, %ecx), %edx
1312	lea	32(%eax, %ecx), %eax
1313	POP (%edi)
1314	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1315
1316	CFI_PUSH (%edi)
1317
	/* Variant without prefetch: %eax is moved down by 9 so that it is
	   16-byte aligned, %ecx is biased by -32 and %edi indexes both
	   buffers.  */
1318	.p2align 4
1319L(sh_9_no_prefetch):
1320	lea	-32(%ecx), %ecx
1321	lea	-9(%eax), %eax
1322	xor	%edi, %edi
1323
	/* Two 32-byte steps per iteration; the carried vector alternates between
	   %xmm1 and %xmm4.  jb/jae test the borrow of `sub $32' -- the
	   instructions in between do not modify the flags.  */
1324	.p2align 4
1325L(sh_9_no_prefetch_loop):
1326	movdqa	16(%eax, %edi), %xmm2
1327	sub	$32, %ecx
1328	movdqa	32(%eax, %edi), %xmm3
1329	movdqa	%xmm3, %xmm4
1330	palignr	$9, %xmm2, %xmm3
1331	palignr	$9, %xmm1, %xmm2
1332	lea	32(%edi), %edi
1333	movdqa	%xmm2, -32(%edx, %edi)
1334	movdqa	%xmm3, -16(%edx, %edi)
1335	jb	L(sh_9_end_no_prefetch_loop)
1336
1337	movdqa	16(%eax, %edi), %xmm2
1338	sub	$32, %ecx
1339	movdqa	32(%eax, %edi), %xmm3
1340	movdqa	%xmm3, %xmm1
1341	palignr	$9, %xmm2, %xmm3
1342	palignr	$9, %xmm4, %xmm2
1343	lea	32(%edi), %edi
1344	movdqa	%xmm2, -32(%edx, %edi)
1345	movdqa	%xmm3, -16(%edx, %edi)
1346	jae	L(sh_9_no_prefetch_loop)
1347
	/* Remove the -32 bias, advance both pointers to the end of the data
	   (the +9 undoes the alignment adjustment of %eax) and dispatch on the
	   leftover count.  */
1348L(sh_9_end_no_prefetch_loop):
1349	lea	32(%ecx), %ecx
1350	add	%ecx, %edi
1351	add	%edi, %edx
1352	lea	9(%edi, %eax), %eax
1353	POP	(%edi)
1354	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1355
	/* Annotation only: the code that follows is entered with %edi still
	   pushed.  */
1356	CFI_PUSH (%edi)
1357
1358	.p2align 4
/*
 * L(shl_10): forward copy for a source that sits 10 bytes past a 16-byte
 * boundary.  All 16-byte loads and stores are aligned accesses (movaps and
 * movdqa fault otherwise, so (%eax - 10) and %edx are 16-byte aligned here);
 * palignr $10 splices two neighbouring source chunks into the 16 bytes the
 * destination expects.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter,
 *   %xmm1 = most recently loaded source chunk, carried into the next step.
 * POP, CFI_PUSH and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
 * part of the file.
 */
1359L(shl_10):
1360#ifndef USE_AS_MEMMOVE
1361	movaps	-10(%eax), %xmm1
1362#else
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
1363	movl	DEST+4(%esp), %edi
1364	movaps	-10(%eax), %xmm1
1365	movdqu	%xmm0, (%edi)
1366#endif
	/* Counts below half the data cache size use the loop without
	   prefetch (unsigned compare).  */
1367#ifdef DATA_CACHE_SIZE_HALF
1368	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1369#else
1370# if (defined SHARED || defined __PIC__)
1371	SETUP_PIC_REG(bx)
1372	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1373	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1374# else
1375	cmp	__x86_data_cache_size_half, %ecx
1376# endif
1377#endif
1378	jb L(sh_10_no_prefetch)
1379
	/* Pre-bias the counter by one 64-byte iteration.  */
1380	lea	-64(%ecx), %ecx
1381
1382	.p2align 4
	/* 64 bytes per iteration, prefetching 0x1c0 bytes ahead of both
	   cursors.  */
1383L(Shl10LoopStart):
1384	prefetcht0 0x1c0(%eax)
1385	prefetcht0 0x1c0(%edx)
1386	movaps	6(%eax), %xmm2
1387	movaps	22(%eax), %xmm3
1388	movaps	38(%eax), %xmm4
1389	movaps	54(%eax), %xmm5
	/* Keep an unshifted copy of the newest chunk; it becomes the carry
	   in %xmm1 for the next iteration.  */
1390	movaps	%xmm5, %xmm7
1391	palignr	$10, %xmm4, %xmm5
1392	palignr	$10, %xmm3, %xmm4
1393	movaps	%xmm5, 48(%edx)
1394	palignr	$10, %xmm2, %xmm3
1395	lea	64(%eax), %eax
1396	palignr	$10, %xmm1, %xmm2
1397	movaps	%xmm4, 32(%edx)
1398	movaps	%xmm3, 16(%edx)
1399	movaps	%xmm7, %xmm1
1400	movaps	%xmm2, (%edx)
1401	lea	64(%edx), %edx
	/* Loop while the counter stays above zero (ja: no borrow and
	   non-zero result).  */
1402	sub	$64, %ecx
1403	ja	L(Shl10LoopStart)
1404
	/* Add 32 back to the counter; a result <= 0 is handed to
	   L(shl_end_0), otherwise one more 32-byte pair is copied here.  */
1405L(Shl10LoopLeave):
1406	add	$32, %ecx
1407	jle	L(shl_end_0)
1408
1409	movaps	6(%eax), %xmm2
1410	movaps	22(%eax), %xmm3
1411	palignr	$10, %xmm2, %xmm3
1412	palignr	$10, %xmm1, %xmm2
1413
1414	movaps	%xmm2, (%edx)
1415	movaps	%xmm3, 16(%edx)
	/* Advance both cursors by the 32 bytes just copied plus the %ecx
	   bytes still pending, then dispatch on %ecx through the jump table
	   (table contents are outside this chunk; presumably the
	   fwd_write_* tails, which address backwards from %eax/%edx).  */
1416	lea	32(%edx, %ecx), %edx
1417	lea	32(%eax, %ecx), %eax
1418	POP (%edi)
1419	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1420
1421	CFI_PUSH (%edi)
1422
1423	.p2align 4
	/* No-prefetch path: bias the counter by 32, rebase %eax onto the
	   aligned chunk boundary and use %edi as a running byte offset.  */
1424L(sh_10_no_prefetch):
1425	lea	-32(%ecx), %ecx
1426	lea	-10(%eax), %eax
1427	xor	%edi, %edi
1428
1429	.p2align 4
	/* Unrolled twice so the carried chunk alternates between %xmm1 and
	   %xmm4.  The jb/jae at the end of each half test the flags of the
	   sub: movdqa, palignr and lea in between do not modify EFLAGS.  */
1430L(sh_10_no_prefetch_loop):
1431	movdqa	16(%eax, %edi), %xmm2
1432	sub	$32, %ecx
1433	movdqa	32(%eax, %edi), %xmm3
1434	movdqa	%xmm3, %xmm4
1435	palignr	$10, %xmm2, %xmm3
1436	palignr	$10, %xmm1, %xmm2
1437	lea	32(%edi), %edi
1438	movdqa	%xmm2, -32(%edx, %edi)
1439	movdqa	%xmm3, -16(%edx, %edi)
1440	jb	L(sh_10_end_no_prefetch_loop)
1441
1442	movdqa	16(%eax, %edi), %xmm2
1443	sub	$32, %ecx
1444	movdqa	32(%eax, %edi), %xmm3
1445	movdqa	%xmm3, %xmm1
1446	palignr	$10, %xmm2, %xmm3
1447	palignr	$10, %xmm4, %xmm2
1448	lea	32(%edi), %edi
1449	movdqa	%xmm2, -32(%edx, %edi)
1450	movdqa	%xmm3, -16(%edx, %edi)
1451	jae	L(sh_10_no_prefetch_loop)
1452
	/* Undo the 32-byte bias, fold the offset plus the pending count into
	   both cursors (adding the 10 back to the source) and dispatch the
	   tail on %ecx.  */
1453L(sh_10_end_no_prefetch_loop):
1454	lea	32(%ecx), %ecx
1455	add	%ecx, %edi
1456	add	%edi, %edx
1457	lea	10(%edi, %eax), %eax
1458	POP	(%edi)
1459	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1460
1461	CFI_PUSH (%edi)
1462
1463	.p2align 4
/*
 * L(shl_11): forward copy for a source that sits 11 bytes past a 16-byte
 * boundary.  All 16-byte loads and stores are aligned accesses (movaps and
 * movdqa fault otherwise, so (%eax - 11) and %edx are 16-byte aligned here);
 * palignr $11 splices two neighbouring source chunks into the 16 bytes the
 * destination expects.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter,
 *   %xmm1 = most recently loaded source chunk, carried into the next step.
 * POP, CFI_PUSH and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
 * part of the file.
 */
1464L(shl_11):
1465#ifndef USE_AS_MEMMOVE
1466	movaps	-11(%eax), %xmm1
1467#else
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
1468	movl	DEST+4(%esp), %edi
1469	movaps	-11(%eax), %xmm1
1470	movdqu	%xmm0, (%edi)
1471#endif
	/* Counts below half the data cache size use the loop without
	   prefetch (unsigned compare).  */
1472#ifdef DATA_CACHE_SIZE_HALF
1473	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1474#else
1475# if (defined SHARED || defined __PIC__)
1476	SETUP_PIC_REG(bx)
1477	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1478	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1479# else
1480	cmp	__x86_data_cache_size_half, %ecx
1481# endif
1482#endif
1483	jb L(sh_11_no_prefetch)
1484
	/* Pre-bias the counter by one 64-byte iteration.  */
1485	lea	-64(%ecx), %ecx
1486
1487	.p2align 4
	/* 64 bytes per iteration, prefetching 0x1c0 bytes ahead of both
	   cursors.  */
1488L(Shl11LoopStart):
1489	prefetcht0 0x1c0(%eax)
1490	prefetcht0 0x1c0(%edx)
1491	movaps	5(%eax), %xmm2
1492	movaps	21(%eax), %xmm3
1493	movaps	37(%eax), %xmm4
1494	movaps	53(%eax), %xmm5
	/* Keep an unshifted copy of the newest chunk; it becomes the carry
	   in %xmm1 for the next iteration.  */
1495	movaps	%xmm5, %xmm7
1496	palignr	$11, %xmm4, %xmm5
1497	palignr	$11, %xmm3, %xmm4
1498	movaps	%xmm5, 48(%edx)
1499	palignr	$11, %xmm2, %xmm3
1500	lea	64(%eax), %eax
1501	palignr	$11, %xmm1, %xmm2
1502	movaps	%xmm4, 32(%edx)
1503	movaps	%xmm3, 16(%edx)
1504	movaps	%xmm7, %xmm1
1505	movaps	%xmm2, (%edx)
1506	lea	64(%edx), %edx
	/* Loop while the counter stays above zero (ja: no borrow and
	   non-zero result).  */
1507	sub	$64, %ecx
1508	ja	L(Shl11LoopStart)
1509
	/* Add 32 back to the counter; a result <= 0 is handed to
	   L(shl_end_0), otherwise one more 32-byte pair is copied here.  */
1510L(Shl11LoopLeave):
1511	add	$32, %ecx
1512	jle	L(shl_end_0)
1513
1514	movaps	5(%eax), %xmm2
1515	movaps	21(%eax), %xmm3
1516	palignr	$11, %xmm2, %xmm3
1517	palignr	$11, %xmm1, %xmm2
1518
1519	movaps	%xmm2, (%edx)
1520	movaps	%xmm3, 16(%edx)
	/* Advance both cursors by the 32 bytes just copied plus the %ecx
	   bytes still pending, then dispatch on %ecx through the jump table
	   (table contents are outside this chunk; presumably the
	   fwd_write_* tails, which address backwards from %eax/%edx).  */
1521	lea	32(%edx, %ecx), %edx
1522	lea	32(%eax, %ecx), %eax
1523	POP (%edi)
1524	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1525
1526	CFI_PUSH (%edi)
1527
1528	.p2align 4
	/* No-prefetch path: bias the counter by 32, rebase %eax onto the
	   aligned chunk boundary and use %edi as a running byte offset.  */
1529L(sh_11_no_prefetch):
1530	lea	-32(%ecx), %ecx
1531	lea	-11(%eax), %eax
1532	xor	%edi, %edi
1533
1534	.p2align 4
	/* Unrolled twice so the carried chunk alternates between %xmm1 and
	   %xmm4.  The jb/jae at the end of each half test the flags of the
	   sub: movdqa, palignr and lea in between do not modify EFLAGS.  */
1535L(sh_11_no_prefetch_loop):
1536	movdqa	16(%eax, %edi), %xmm2
1537	sub	$32, %ecx
1538	movdqa	32(%eax, %edi), %xmm3
1539	movdqa	%xmm3, %xmm4
1540	palignr	$11, %xmm2, %xmm3
1541	palignr	$11, %xmm1, %xmm2
1542	lea	32(%edi), %edi
1543	movdqa	%xmm2, -32(%edx, %edi)
1544	movdqa	%xmm3, -16(%edx, %edi)
1545	jb	L(sh_11_end_no_prefetch_loop)
1546
1547	movdqa	16(%eax, %edi), %xmm2
1548	sub	$32, %ecx
1549	movdqa	32(%eax, %edi), %xmm3
1550	movdqa	%xmm3, %xmm1
1551	palignr	$11, %xmm2, %xmm3
1552	palignr	$11, %xmm4, %xmm2
1553	lea	32(%edi), %edi
1554	movdqa	%xmm2, -32(%edx, %edi)
1555	movdqa	%xmm3, -16(%edx, %edi)
1556	jae	L(sh_11_no_prefetch_loop)
1557
	/* Undo the 32-byte bias, fold the offset plus the pending count into
	   both cursors (adding the 11 back to the source) and dispatch the
	   tail on %ecx.  */
1558L(sh_11_end_no_prefetch_loop):
1559	lea	32(%ecx), %ecx
1560	add	%ecx, %edi
1561	add	%edi, %edx
1562	lea	11(%edi, %eax), %eax
1563	POP	(%edi)
1564	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1565
1566	CFI_PUSH (%edi)
1567
1568	.p2align 4
/*
 * L(shl_12): forward copy for a source that sits 12 bytes past a 16-byte
 * boundary.  All 16-byte loads and stores are aligned accesses (movaps and
 * movdqa fault otherwise, so (%eax - 12) and %edx are 16-byte aligned here);
 * palignr $12 splices two neighbouring source chunks into the 16 bytes the
 * destination expects.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter,
 *   %xmm1 = most recently loaded source chunk, carried into the next step.
 * POP, CFI_PUSH and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
 * part of the file.
 */
1569L(shl_12):
1570#ifndef USE_AS_MEMMOVE
1571	movaps	-12(%eax), %xmm1
1572#else
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
1573	movl	DEST+4(%esp), %edi
1574	movaps	-12(%eax), %xmm1
1575	movdqu	%xmm0, (%edi)
1576#endif
	/* Counts below half the data cache size use the loop without
	   prefetch (unsigned compare).  */
1577#ifdef DATA_CACHE_SIZE_HALF
1578	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1579#else
1580# if (defined SHARED || defined __PIC__)
1581	SETUP_PIC_REG(bx)
1582	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1583	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1584# else
1585	cmp	__x86_data_cache_size_half, %ecx
1586# endif
1587#endif
1588	jb L(sh_12_no_prefetch)
1589
	/* Pre-bias the counter by one 64-byte iteration.  */
1590	lea	-64(%ecx), %ecx
1591
1592	.p2align 4
	/* 64 bytes per iteration, prefetching 0x1c0 bytes ahead of both
	   cursors.  */
1593L(Shl12LoopStart):
1594	prefetcht0 0x1c0(%eax)
1595	prefetcht0 0x1c0(%edx)
1596	movaps	4(%eax), %xmm2
1597	movaps	20(%eax), %xmm3
1598	movaps	36(%eax), %xmm4
1599	movaps	52(%eax), %xmm5
	/* Keep an unshifted copy of the newest chunk; it becomes the carry
	   in %xmm1 for the next iteration.  */
1600	movaps	%xmm5, %xmm7
1601	palignr	$12, %xmm4, %xmm5
1602	palignr	$12, %xmm3, %xmm4
1603	movaps	%xmm5, 48(%edx)
1604	palignr	$12, %xmm2, %xmm3
1605	lea	64(%eax), %eax
1606	palignr	$12, %xmm1, %xmm2
1607	movaps	%xmm4, 32(%edx)
1608	movaps	%xmm3, 16(%edx)
1609	movaps	%xmm7, %xmm1
1610	movaps	%xmm2, (%edx)
1611	lea	64(%edx), %edx
	/* Loop while the counter stays above zero (ja: no borrow and
	   non-zero result).  */
1612	sub	$64, %ecx
1613	ja	L(Shl12LoopStart)
1614
	/* Add 32 back to the counter; a result <= 0 is handed to
	   L(shl_end_0), otherwise one more 32-byte pair is copied here.  */
1615L(Shl12LoopLeave):
1616	add	$32, %ecx
1617	jle	L(shl_end_0)
1618
1619	movaps	4(%eax), %xmm2
1620	movaps	20(%eax), %xmm3
1621	palignr	$12, %xmm2, %xmm3
1622	palignr	$12, %xmm1, %xmm2
1623
1624	movaps	%xmm2, (%edx)
1625	movaps	%xmm3, 16(%edx)
	/* Advance both cursors by the 32 bytes just copied plus the %ecx
	   bytes still pending, then dispatch on %ecx through the jump table
	   (table contents are outside this chunk; presumably the
	   fwd_write_* tails, which address backwards from %eax/%edx).  */
1626	lea	32(%edx, %ecx), %edx
1627	lea	32(%eax, %ecx), %eax
1628	POP (%edi)
1629	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1630
1631	CFI_PUSH (%edi)
1632
1633	.p2align 4
	/* No-prefetch path: bias the counter by 32, rebase %eax onto the
	   aligned chunk boundary and use %edi as a running byte offset.  */
1634L(sh_12_no_prefetch):
1635	lea	-32(%ecx), %ecx
1636	lea	-12(%eax), %eax
1637	xor	%edi, %edi
1638
1639	.p2align 4
	/* Unrolled twice so the carried chunk alternates between %xmm1 and
	   %xmm4.  The jb/jae at the end of each half test the flags of the
	   sub: movdqa, palignr and lea in between do not modify EFLAGS.  */
1640L(sh_12_no_prefetch_loop):
1641	movdqa	16(%eax, %edi), %xmm2
1642	sub	$32, %ecx
1643	movdqa	32(%eax, %edi), %xmm3
1644	movdqa	%xmm3, %xmm4
1645	palignr	$12, %xmm2, %xmm3
1646	palignr	$12, %xmm1, %xmm2
1647	lea	32(%edi), %edi
1648	movdqa	%xmm2, -32(%edx, %edi)
1649	movdqa	%xmm3, -16(%edx, %edi)
1650	jb	L(sh_12_end_no_prefetch_loop)
1651
1652	movdqa	16(%eax, %edi), %xmm2
1653	sub	$32, %ecx
1654	movdqa	32(%eax, %edi), %xmm3
1655	movdqa	%xmm3, %xmm1
1656	palignr	$12, %xmm2, %xmm3
1657	palignr	$12, %xmm4, %xmm2
1658	lea	32(%edi), %edi
1659	movdqa	%xmm2, -32(%edx, %edi)
1660	movdqa	%xmm3, -16(%edx, %edi)
1661	jae	L(sh_12_no_prefetch_loop)
1662
	/* Undo the 32-byte bias, fold the offset plus the pending count into
	   both cursors (adding the 12 back to the source) and dispatch the
	   tail on %ecx.  */
1663L(sh_12_end_no_prefetch_loop):
1664	lea	32(%ecx), %ecx
1665	add	%ecx, %edi
1666	add	%edi, %edx
1667	lea	12(%edi, %eax), %eax
1668	POP	(%edi)
1669	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1670
1671	CFI_PUSH (%edi)
1672
1673	.p2align 4
/*
 * L(shl_13): forward copy for a source that sits 13 bytes past a 16-byte
 * boundary.  All 16-byte loads and stores are aligned accesses (movaps and
 * movdqa fault otherwise, so (%eax - 13) and %edx are 16-byte aligned here);
 * palignr $13 splices two neighbouring source chunks into the 16 bytes the
 * destination expects.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter,
 *   %xmm1 = most recently loaded source chunk, carried into the next step.
 * POP, CFI_PUSH and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
 * part of the file.
 */
1674L(shl_13):
1675#ifndef USE_AS_MEMMOVE
1676	movaps	-13(%eax), %xmm1
1677#else
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
1678	movl	DEST+4(%esp), %edi
1679	movaps	-13(%eax), %xmm1
1680	movdqu	%xmm0, (%edi)
1681#endif
	/* Counts below half the data cache size use the loop without
	   prefetch (unsigned compare).  */
1682#ifdef DATA_CACHE_SIZE_HALF
1683	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1684#else
1685# if (defined SHARED || defined __PIC__)
1686	SETUP_PIC_REG(bx)
1687	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1688	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1689# else
1690	cmp	__x86_data_cache_size_half, %ecx
1691# endif
1692#endif
1693	jb L(sh_13_no_prefetch)
1694
	/* Pre-bias the counter by one 64-byte iteration.  */
1695	lea	-64(%ecx), %ecx
1696
1697	.p2align 4
	/* 64 bytes per iteration, prefetching 0x1c0 bytes ahead of both
	   cursors.  */
1698L(Shl13LoopStart):
1699	prefetcht0 0x1c0(%eax)
1700	prefetcht0 0x1c0(%edx)
1701	movaps	3(%eax), %xmm2
1702	movaps	19(%eax), %xmm3
1703	movaps	35(%eax), %xmm4
1704	movaps	51(%eax), %xmm5
	/* Keep an unshifted copy of the newest chunk; it becomes the carry
	   in %xmm1 for the next iteration.  */
1705	movaps	%xmm5, %xmm7
1706	palignr	$13, %xmm4, %xmm5
1707	palignr	$13, %xmm3, %xmm4
1708	movaps	%xmm5, 48(%edx)
1709	palignr	$13, %xmm2, %xmm3
1710	lea	64(%eax), %eax
1711	palignr	$13, %xmm1, %xmm2
1712	movaps	%xmm4, 32(%edx)
1713	movaps	%xmm3, 16(%edx)
1714	movaps	%xmm7, %xmm1
1715	movaps	%xmm2, (%edx)
1716	lea	64(%edx), %edx
	/* Loop while the counter stays above zero (ja: no borrow and
	   non-zero result).  */
1717	sub	$64, %ecx
1718	ja	L(Shl13LoopStart)
1719
	/* Add 32 back to the counter; a result <= 0 is handed to
	   L(shl_end_0), otherwise one more 32-byte pair is copied here.  */
1720L(Shl13LoopLeave):
1721	add	$32, %ecx
1722	jle	L(shl_end_0)
1723
1724	movaps	3(%eax), %xmm2
1725	movaps	19(%eax), %xmm3
1726	palignr	$13, %xmm2, %xmm3
1727	palignr	$13, %xmm1, %xmm2
1728
1729	movaps	%xmm2, (%edx)
1730	movaps	%xmm3, 16(%edx)
	/* Advance both cursors by the 32 bytes just copied plus the %ecx
	   bytes still pending, then dispatch on %ecx through the jump table
	   (table contents are outside this chunk; presumably the
	   fwd_write_* tails, which address backwards from %eax/%edx).  */
1731	lea	32(%edx, %ecx), %edx
1732	lea	32(%eax, %ecx), %eax
1733	POP (%edi)
1734	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1735
1736	CFI_PUSH (%edi)
1737
1738	.p2align 4
	/* No-prefetch path: bias the counter by 32, rebase %eax onto the
	   aligned chunk boundary and use %edi as a running byte offset.  */
1739L(sh_13_no_prefetch):
1740	lea	-32(%ecx), %ecx
1741	lea	-13(%eax), %eax
1742	xor	%edi, %edi
1743
1744	.p2align 4
	/* Unrolled twice so the carried chunk alternates between %xmm1 and
	   %xmm4.  The jb/jae at the end of each half test the flags of the
	   sub: movdqa, palignr and lea in between do not modify EFLAGS.  */
1745L(sh_13_no_prefetch_loop):
1746	movdqa	16(%eax, %edi), %xmm2
1747	sub	$32, %ecx
1748	movdqa	32(%eax, %edi), %xmm3
1749	movdqa	%xmm3, %xmm4
1750	palignr	$13, %xmm2, %xmm3
1751	palignr	$13, %xmm1, %xmm2
1752	lea	32(%edi), %edi
1753	movdqa	%xmm2, -32(%edx, %edi)
1754	movdqa	%xmm3, -16(%edx, %edi)
1755	jb	L(sh_13_end_no_prefetch_loop)
1756
1757	movdqa	16(%eax, %edi), %xmm2
1758	sub	$32, %ecx
1759	movdqa	32(%eax, %edi), %xmm3
1760	movdqa	%xmm3, %xmm1
1761	palignr	$13, %xmm2, %xmm3
1762	palignr	$13, %xmm4, %xmm2
1763	lea	32(%edi), %edi
1764	movdqa	%xmm2, -32(%edx, %edi)
1765	movdqa	%xmm3, -16(%edx, %edi)
1766	jae	L(sh_13_no_prefetch_loop)
1767
	/* Undo the 32-byte bias, fold the offset plus the pending count into
	   both cursors (adding the 13 back to the source) and dispatch the
	   tail on %ecx.  */
1768L(sh_13_end_no_prefetch_loop):
1769	lea	32(%ecx), %ecx
1770	add	%ecx, %edi
1771	add	%edi, %edx
1772	lea	13(%edi, %eax), %eax
1773	POP	(%edi)
1774	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1775
1776	CFI_PUSH (%edi)
1777
1778	.p2align 4
/*
 * L(shl_14): forward copy for a source that sits 14 bytes past a 16-byte
 * boundary.  All 16-byte loads and stores are aligned accesses (movaps and
 * movdqa fault otherwise, so (%eax - 14) and %edx are 16-byte aligned here);
 * palignr $14 splices two neighbouring source chunks into the 16 bytes the
 * destination expects.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter,
 *   %xmm1 = most recently loaded source chunk, carried into the next step.
 * POP, CFI_PUSH and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
 * part of the file.
 */
1779L(shl_14):
1780#ifndef USE_AS_MEMMOVE
1781	movaps	-14(%eax), %xmm1
1782#else
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
1783	movl	DEST+4(%esp), %edi
1784	movaps	-14(%eax), %xmm1
1785	movdqu	%xmm0, (%edi)
1786#endif
	/* Counts below half the data cache size use the loop without
	   prefetch (unsigned compare).  */
1787#ifdef DATA_CACHE_SIZE_HALF
1788	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1789#else
1790# if (defined SHARED || defined __PIC__)
1791	SETUP_PIC_REG(bx)
1792	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1793	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1794# else
1795	cmp	__x86_data_cache_size_half, %ecx
1796# endif
1797#endif
1798	jb L(sh_14_no_prefetch)
1799
	/* Pre-bias the counter by one 64-byte iteration.  */
1800	lea	-64(%ecx), %ecx
1801
1802	.p2align 4
	/* 64 bytes per iteration, prefetching 0x1c0 bytes ahead of both
	   cursors.  */
1803L(Shl14LoopStart):
1804	prefetcht0 0x1c0(%eax)
1805	prefetcht0 0x1c0(%edx)
1806	movaps	2(%eax), %xmm2
1807	movaps	18(%eax), %xmm3
1808	movaps	34(%eax), %xmm4
1809	movaps	50(%eax), %xmm5
	/* Keep an unshifted copy of the newest chunk; it becomes the carry
	   in %xmm1 for the next iteration.  */
1810	movaps	%xmm5, %xmm7
1811	palignr	$14, %xmm4, %xmm5
1812	palignr	$14, %xmm3, %xmm4
1813	movaps	%xmm5, 48(%edx)
1814	palignr	$14, %xmm2, %xmm3
1815	lea	64(%eax), %eax
1816	palignr	$14, %xmm1, %xmm2
1817	movaps	%xmm4, 32(%edx)
1818	movaps	%xmm3, 16(%edx)
1819	movaps	%xmm7, %xmm1
1820	movaps	%xmm2, (%edx)
1821	lea	64(%edx), %edx
	/* Loop while the counter stays above zero (ja: no borrow and
	   non-zero result).  */
1822	sub	$64, %ecx
1823	ja	L(Shl14LoopStart)
1824
	/* Add 32 back to the counter; a result <= 0 is handed to
	   L(shl_end_0), otherwise one more 32-byte pair is copied here.  */
1825L(Shl14LoopLeave):
1826	add	$32, %ecx
1827	jle	L(shl_end_0)
1828
1829	movaps	2(%eax), %xmm2
1830	movaps	18(%eax), %xmm3
1831	palignr	$14, %xmm2, %xmm3
1832	palignr	$14, %xmm1, %xmm2
1833
1834	movaps	%xmm2, (%edx)
1835	movaps	%xmm3, 16(%edx)
	/* Advance both cursors by the 32 bytes just copied plus the %ecx
	   bytes still pending, then dispatch on %ecx through the jump table
	   (table contents are outside this chunk; presumably the
	   fwd_write_* tails, which address backwards from %eax/%edx).  */
1836	lea	32(%edx, %ecx), %edx
1837	lea	32(%eax, %ecx), %eax
1838	POP (%edi)
1839	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1840
1841	CFI_PUSH (%edi)
1842
1843	.p2align 4
	/* No-prefetch path: bias the counter by 32, rebase %eax onto the
	   aligned chunk boundary and use %edi as a running byte offset.  */
1844L(sh_14_no_prefetch):
1845	lea	-32(%ecx), %ecx
1846	lea	-14(%eax), %eax
1847	xor	%edi, %edi
1848
1849	.p2align 4
	/* Unrolled twice so the carried chunk alternates between %xmm1 and
	   %xmm4.  The jb/jae at the end of each half test the flags of the
	   sub: movdqa, palignr and lea in between do not modify EFLAGS.  */
1850L(sh_14_no_prefetch_loop):
1851	movdqa	16(%eax, %edi), %xmm2
1852	sub	$32, %ecx
1853	movdqa	32(%eax, %edi), %xmm3
1854	movdqa	%xmm3, %xmm4
1855	palignr	$14, %xmm2, %xmm3
1856	palignr	$14, %xmm1, %xmm2
1857	lea	32(%edi), %edi
1858	movdqa	%xmm2, -32(%edx, %edi)
1859	movdqa	%xmm3, -16(%edx, %edi)
1860	jb	L(sh_14_end_no_prefetch_loop)
1861
1862	movdqa	16(%eax, %edi), %xmm2
1863	sub	$32, %ecx
1864	movdqa	32(%eax, %edi), %xmm3
1865	movdqa	%xmm3, %xmm1
1866	palignr	$14, %xmm2, %xmm3
1867	palignr	$14, %xmm4, %xmm2
1868	lea	32(%edi), %edi
1869	movdqa	%xmm2, -32(%edx, %edi)
1870	movdqa	%xmm3, -16(%edx, %edi)
1871	jae	L(sh_14_no_prefetch_loop)
1872
	/* Undo the 32-byte bias, fold the offset plus the pending count into
	   both cursors (adding the 14 back to the source) and dispatch the
	   tail on %ecx.  */
1873L(sh_14_end_no_prefetch_loop):
1874	lea	32(%ecx), %ecx
1875	add	%ecx, %edi
1876	add	%edi, %edx
1877	lea	14(%edi, %eax), %eax
1878	POP	(%edi)
1879	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1880
1881	CFI_PUSH (%edi)
1882
1883	.p2align 4
/*
 * L(shl_15): forward copy for a source that sits 15 bytes past a 16-byte
 * boundary.  All 16-byte loads and stores are aligned accesses (movaps and
 * movdqa fault otherwise, so (%eax - 15) and %edx are 16-byte aligned here);
 * palignr $15 splices two neighbouring source chunks into the 16 bytes the
 * destination expects.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter,
 *   %xmm1 = most recently loaded source chunk, carried into the next step.
 * POP, CFI_PUSH and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this
 * part of the file.
 */
1884L(shl_15):
1885#ifndef USE_AS_MEMMOVE
1886	movaps	-15(%eax), %xmm1
1887#else
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
1888	movl	DEST+4(%esp), %edi
1889	movaps	-15(%eax), %xmm1
1890	movdqu	%xmm0, (%edi)
1891#endif
	/* Counts below half the data cache size use the loop without
	   prefetch (unsigned compare).  */
1892#ifdef DATA_CACHE_SIZE_HALF
1893	cmp	$DATA_CACHE_SIZE_HALF, %ecx
1894#else
1895# if (defined SHARED || defined __PIC__)
1896	SETUP_PIC_REG(bx)
1897	add	$_GLOBAL_OFFSET_TABLE_, %ebx
1898	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1899# else
1900	cmp	__x86_data_cache_size_half, %ecx
1901# endif
1902#endif
1903	jb L(sh_15_no_prefetch)
1904
	/* Pre-bias the counter by one 64-byte iteration.  */
1905	lea	-64(%ecx), %ecx
1906
1907	.p2align 4
	/* 64 bytes per iteration, prefetching 0x1c0 bytes ahead of both
	   cursors.  */
1908L(Shl15LoopStart):
1909	prefetcht0 0x1c0(%eax)
1910	prefetcht0 0x1c0(%edx)
1911	movaps	1(%eax), %xmm2
1912	movaps	17(%eax), %xmm3
1913	movaps	33(%eax), %xmm4
1914	movaps	49(%eax), %xmm5
	/* Keep an unshifted copy of the newest chunk; it becomes the carry
	   in %xmm1 for the next iteration.  */
1915	movaps	%xmm5, %xmm7
1916	palignr	$15, %xmm4, %xmm5
1917	palignr	$15, %xmm3, %xmm4
1918	movaps	%xmm5, 48(%edx)
1919	palignr	$15, %xmm2, %xmm3
1920	lea	64(%eax), %eax
1921	palignr	$15, %xmm1, %xmm2
1922	movaps	%xmm4, 32(%edx)
1923	movaps	%xmm3, 16(%edx)
1924	movaps	%xmm7, %xmm1
1925	movaps	%xmm2, (%edx)
1926	lea	64(%edx), %edx
	/* Loop while the counter stays above zero (ja: no borrow and
	   non-zero result).  */
1927	sub	$64, %ecx
1928	ja	L(Shl15LoopStart)
1929
	/* Add 32 back to the counter; a result <= 0 is handed to
	   L(shl_end_0), otherwise one more 32-byte pair is copied here.  */
1930L(Shl15LoopLeave):
1931	add	$32, %ecx
1932	jle	L(shl_end_0)
1933
1934	movaps	1(%eax), %xmm2
1935	movaps	17(%eax), %xmm3
1936	palignr	$15, %xmm2, %xmm3
1937	palignr	$15, %xmm1, %xmm2
1938
1939	movaps	%xmm2, (%edx)
1940	movaps	%xmm3, 16(%edx)
	/* Advance both cursors by the 32 bytes just copied plus the %ecx
	   bytes still pending, then dispatch on %ecx through the jump table
	   (table contents are outside this chunk; presumably the
	   fwd_write_* tails, which address backwards from %eax/%edx).  */
1941	lea	32(%edx, %ecx), %edx
1942	lea	32(%eax, %ecx), %eax
1943	POP (%edi)
1944	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1945
1946	CFI_PUSH (%edi)
1947
1948	.p2align 4
	/* No-prefetch path: bias the counter by 32, rebase %eax onto the
	   aligned chunk boundary and use %edi as a running byte offset.  */
1949L(sh_15_no_prefetch):
1950	lea	-32(%ecx), %ecx
1951	lea	-15(%eax), %eax
1952	xor	%edi, %edi
1953
1954	.p2align 4
	/* Unrolled twice so the carried chunk alternates between %xmm1 and
	   %xmm4.  The jb/jae at the end of each half test the flags of the
	   sub: movdqa, palignr and lea in between do not modify EFLAGS.  */
1955L(sh_15_no_prefetch_loop):
1956	movdqa	16(%eax, %edi), %xmm2
1957	sub	$32, %ecx
1958	movdqa	32(%eax, %edi), %xmm3
1959	movdqa	%xmm3, %xmm4
1960	palignr	$15, %xmm2, %xmm3
1961	palignr	$15, %xmm1, %xmm2
1962	lea	32(%edi), %edi
1963	movdqa	%xmm2, -32(%edx, %edi)
1964	movdqa	%xmm3, -16(%edx, %edi)
1965	jb	L(sh_15_end_no_prefetch_loop)
1966
1967	movdqa	16(%eax, %edi), %xmm2
1968	sub	$32, %ecx
1969	movdqa	32(%eax, %edi), %xmm3
1970	movdqa	%xmm3, %xmm1
1971	palignr	$15, %xmm2, %xmm3
1972	palignr	$15, %xmm4, %xmm2
1973	lea	32(%edi), %edi
1974	movdqa	%xmm2, -32(%edx, %edi)
1975	movdqa	%xmm3, -16(%edx, %edi)
1976	jae	L(sh_15_no_prefetch_loop)
1977
	/* Undo the 32-byte bias, fold the offset plus the pending count into
	   both cursors (adding the 15 back to the source) and dispatch the
	   tail on %ecx.  */
1978L(sh_15_end_no_prefetch_loop):
1979	lea	32(%ecx), %ecx
1980	add	%ecx, %edi
1981	add	%edi, %edx
1982	lea	15(%edi, %eax), %eax
1983	POP	(%edi)
1984	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1985
1986	CFI_PUSH (%edi)
1987
1988	.p2align 4
/*
 * L(shl_end_0): shared exit taken from the L(Shl*LoopLeave) blocks when the
 * counter is <= 0 after their "add $32".  Adds a further 32 to %ecx, moves
 * both cursors forward by that amount and dispatches on %ecx through
 * L(table_48bytes_fwd).  POP and BRANCH_TO_JMPTBL_ENTRY are macros defined
 * outside this part of the file.
 */
1989L(shl_end_0):
1990	lea	32(%ecx), %ecx
1991	lea	(%edx, %ecx), %edx
1992	lea	(%eax, %ecx), %eax
1993	POP	(%edi)
1994	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1995
1996	.p2align 4
/*
 * Forward tail copy for 44/36/28/20/12/4 bytes.  The negative offsets mean
 * %eax and %edx point just past the end of the source and destination.
 * Each label moves 8 bytes through %xmm0 and falls through to the next
 * smaller size; the last 4 bytes go through %ecx.  RETURN is a macro
 * defined outside this part of the file.
 */
1997L(fwd_write_44bytes):
1998	movq	-44(%eax), %xmm0
1999	movq	%xmm0, -44(%edx)
2000L(fwd_write_36bytes):
2001	movq	-36(%eax), %xmm0
2002	movq	%xmm0, -36(%edx)
2003L(fwd_write_28bytes):
2004	movq	-28(%eax), %xmm0
2005	movq	%xmm0, -28(%edx)
2006L(fwd_write_20bytes):
2007	movq	-20(%eax), %xmm0
2008	movq	%xmm0, -20(%edx)
2009L(fwd_write_12bytes):
2010	movq	-12(%eax), %xmm0
2011	movq	%xmm0, -12(%edx)
2012L(fwd_write_4bytes):
2013	movl	-4(%eax), %ecx
2014	movl	%ecx, -4(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2015#ifdef USE_AS_MEMPCPY
2016	movl	%edx, %eax
2017#else
2018	movl	DEST(%esp), %eax
2019#endif
2020	RETURN
2021
2022	.p2align 4
/*
 * Forward tail copy for 40/32/24/16/8/0 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through to the next smaller
 * size; L(fwd_write_0bytes) only produces the return value.  RETURN is a
 * macro defined outside this part of the file.
 */
2023L(fwd_write_40bytes):
2024	movq	-40(%eax), %xmm0
2025	movq	%xmm0, -40(%edx)
2026L(fwd_write_32bytes):
2027	movq	-32(%eax), %xmm0
2028	movq	%xmm0, -32(%edx)
2029L(fwd_write_24bytes):
2030	movq	-24(%eax), %xmm0
2031	movq	%xmm0, -24(%edx)
2032L(fwd_write_16bytes):
2033	movq	-16(%eax), %xmm0
2034	movq	%xmm0, -16(%edx)
2035L(fwd_write_8bytes):
2036	movq	-8(%eax), %xmm0
2037	movq	%xmm0, -8(%edx)
2038L(fwd_write_0bytes):
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2039#ifdef USE_AS_MEMPCPY
2040	movl	%edx, %eax
2041#else
2042	movl	DEST(%esp), %eax
2043#endif
2044	RETURN
2045
2046	.p2align 4
/*
 * Forward tail copy for exactly 5 bytes, done as two overlapping 4-byte
 * moves (offsets -5 and -4 from the end cursors).  Both loads are issued
 * before either store, and %eax is reused as scratch for the second dword
 * since the source cursor is not needed afterwards.  RETURN is a macro
 * defined outside this part of the file.
 */
2047L(fwd_write_5bytes):
2048	movl	-5(%eax), %ecx
2049	movl	-4(%eax), %eax
2050	movl	%ecx, -5(%edx)
2051	movl	%eax, -4(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2052#ifdef USE_AS_MEMPCPY
2053	movl	%edx, %eax
2054#else
2055	movl	DEST(%esp), %eax
2056#endif
2057	RETURN
2058
2059	.p2align 4
/*
 * Forward tail copy for 45/37/29/21/13 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through; the final 5 bytes
 * are a 4-byte move at -5 plus a single byte at -1.  RETURN is a macro
 * defined outside this part of the file.
 */
2060L(fwd_write_45bytes):
2061	movq	-45(%eax), %xmm0
2062	movq	%xmm0, -45(%edx)
2063L(fwd_write_37bytes):
2064	movq	-37(%eax), %xmm0
2065	movq	%xmm0, -37(%edx)
2066L(fwd_write_29bytes):
2067	movq	-29(%eax), %xmm0
2068	movq	%xmm0, -29(%edx)
2069L(fwd_write_21bytes):
2070	movq	-21(%eax), %xmm0
2071	movq	%xmm0, -21(%edx)
2072L(fwd_write_13bytes):
2073	movq	-13(%eax), %xmm0
2074	movq	%xmm0, -13(%edx)
2075	movl	-5(%eax), %ecx
2076	movl	%ecx, -5(%edx)
2077	movzbl	-1(%eax), %ecx
2078	movb	%cl, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2079#ifdef USE_AS_MEMPCPY
2080	movl	%edx, %eax
2081#else
2082	movl	DEST(%esp), %eax
2083#endif
2084	RETURN
2085
2086	.p2align 4
/*
 * Forward tail copy for 41/33/25/17/9/1 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through; the last byte is
 * moved through %cl.  RETURN is a macro defined outside this part of the
 * file.
 */
2087L(fwd_write_41bytes):
2088	movq	-41(%eax), %xmm0
2089	movq	%xmm0, -41(%edx)
2090L(fwd_write_33bytes):
2091	movq	-33(%eax), %xmm0
2092	movq	%xmm0, -33(%edx)
2093L(fwd_write_25bytes):
2094	movq	-25(%eax), %xmm0
2095	movq	%xmm0, -25(%edx)
2096L(fwd_write_17bytes):
2097	movq	-17(%eax), %xmm0
2098	movq	%xmm0, -17(%edx)
2099L(fwd_write_9bytes):
2100	movq	-9(%eax), %xmm0
2101	movq	%xmm0, -9(%edx)
2102L(fwd_write_1bytes):
2103	movzbl	-1(%eax), %ecx
2104	movb	%cl, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2105#ifdef USE_AS_MEMPCPY
2106	movl	%edx, %eax
2107#else
2108	movl	DEST(%esp), %eax
2109#endif
2110	RETURN
2111
2112	.p2align 4
/*
 * Forward tail copy for 46/38/30/22/14/6 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through; the final 6 bytes
 * are a 4-byte move at -6 plus a 2-byte move at -2.  RETURN is a macro
 * defined outside this part of the file.
 */
2113L(fwd_write_46bytes):
2114	movq	-46(%eax), %xmm0
2115	movq	%xmm0, -46(%edx)
2116L(fwd_write_38bytes):
2117	movq	-38(%eax), %xmm0
2118	movq	%xmm0, -38(%edx)
2119L(fwd_write_30bytes):
2120	movq	-30(%eax), %xmm0
2121	movq	%xmm0, -30(%edx)
2122L(fwd_write_22bytes):
2123	movq	-22(%eax), %xmm0
2124	movq	%xmm0, -22(%edx)
2125L(fwd_write_14bytes):
2126	movq	-14(%eax), %xmm0
2127	movq	%xmm0, -14(%edx)
2128L(fwd_write_6bytes):
2129	movl	-6(%eax), %ecx
2130	movl	%ecx, -6(%edx)
2131	movzwl	-2(%eax), %ecx
2132	movw	%cx, -2(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2133#ifdef USE_AS_MEMPCPY
2134	movl	%edx, %eax
2135#else
2136	movl	DEST(%esp), %eax
2137#endif
2138	RETURN
2139
2140	.p2align 4
/*
 * Forward tail copy for 42/34/26/18/10/2 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through; the last 2 bytes
 * are moved through %cx.  RETURN is a macro defined outside this part of
 * the file.
 */
2141L(fwd_write_42bytes):
2142	movq	-42(%eax), %xmm0
2143	movq	%xmm0, -42(%edx)
2144L(fwd_write_34bytes):
2145	movq	-34(%eax), %xmm0
2146	movq	%xmm0, -34(%edx)
2147L(fwd_write_26bytes):
2148	movq	-26(%eax), %xmm0
2149	movq	%xmm0, -26(%edx)
2150L(fwd_write_18bytes):
2151	movq	-18(%eax), %xmm0
2152	movq	%xmm0, -18(%edx)
2153L(fwd_write_10bytes):
2154	movq	-10(%eax), %xmm0
2155	movq	%xmm0, -10(%edx)
2156L(fwd_write_2bytes):
2157	movzwl	-2(%eax), %ecx
2158	movw	%cx, -2(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2159#ifdef USE_AS_MEMPCPY
2160	movl	%edx, %eax
2161#else
2162	movl	DEST(%esp), %eax
2163#endif
2164	RETURN
2165
2166	.p2align 4
/*
 * Forward tail copy for 47/39/31/23/15/7 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through; the final 7 bytes
 * are 4 bytes at -7, 2 bytes at -3 and 1 byte at -1.  RETURN is a macro
 * defined outside this part of the file.
 */
2167L(fwd_write_47bytes):
2168	movq	-47(%eax), %xmm0
2169	movq	%xmm0, -47(%edx)
2170L(fwd_write_39bytes):
2171	movq	-39(%eax), %xmm0
2172	movq	%xmm0, -39(%edx)
2173L(fwd_write_31bytes):
2174	movq	-31(%eax), %xmm0
2175	movq	%xmm0, -31(%edx)
2176L(fwd_write_23bytes):
2177	movq	-23(%eax), %xmm0
2178	movq	%xmm0, -23(%edx)
2179L(fwd_write_15bytes):
2180	movq	-15(%eax), %xmm0
2181	movq	%xmm0, -15(%edx)
2182L(fwd_write_7bytes):
2183	movl	-7(%eax), %ecx
2184	movl	%ecx, -7(%edx)
	/* Both remaining loads come before their stores; %eax doubles as
	   scratch for the last byte because the source cursor is no longer
	   needed.  */
2185	movzwl	-3(%eax), %ecx
2186	movzbl	-1(%eax), %eax
2187	movw	%cx, -3(%edx)
2188	movb	%al, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2189#ifdef USE_AS_MEMPCPY
2190	movl	%edx, %eax
2191#else
2192	movl	DEST(%esp), %eax
2193#endif
2194	RETURN
2195
2196	.p2align 4
/*
 * Forward tail copy for 43/35/27/19/11/3 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  Each
 * label moves 8 bytes through %xmm0 and falls through; the final 3 bytes
 * are 2 bytes at -3 and 1 byte at -1.  RETURN is a macro defined outside
 * this part of the file.
 */
2197L(fwd_write_43bytes):
2198	movq	-43(%eax), %xmm0
2199	movq	%xmm0, -43(%edx)
2200L(fwd_write_35bytes):
2201	movq	-35(%eax), %xmm0
2202	movq	%xmm0, -35(%edx)
2203L(fwd_write_27bytes):
2204	movq	-27(%eax), %xmm0
2205	movq	%xmm0, -27(%edx)
2206L(fwd_write_19bytes):
2207	movq	-19(%eax), %xmm0
2208	movq	%xmm0, -19(%edx)
2209L(fwd_write_11bytes):
2210	movq	-11(%eax), %xmm0
2211	movq	%xmm0, -11(%edx)
2212L(fwd_write_3bytes):
	/* Both loads come before the stores; %eax doubles as scratch for the
	   last byte because the source cursor is no longer needed.  */
2213	movzwl	-3(%eax), %ecx
2214	movzbl	-1(%eax), %eax
2215	movw	%cx, -3(%edx)
2216	movb	%al, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2217#ifdef USE_AS_MEMPCPY
2218	movl	%edx, %eax
2219#else
2220	movl	DEST(%esp), %eax
2221#endif
2222	RETURN
2223
2224	.p2align 4
/*
 * Aligned forward tail copy for 40/24/8/0 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa, which requires both effective addresses to be
 * 16-byte aligned; the final 8 bytes use movq.  RETURN is a macro defined
 * outside this part of the file.
 */
2225L(fwd_write_40bytes_align):
2226	movdqa	-40(%eax), %xmm0
2227	movdqa	%xmm0, -40(%edx)
2228L(fwd_write_24bytes_align):
2229	movdqa	-24(%eax), %xmm0
2230	movdqa	%xmm0, -24(%edx)
2231L(fwd_write_8bytes_align):
2232	movq	-8(%eax), %xmm0
2233	movq	%xmm0, -8(%edx)
2234L(fwd_write_0bytes_align):
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2235#ifdef USE_AS_MEMPCPY
2236	movl	%edx, %eax
2237#else
2238	movl	DEST(%esp), %eax
2239#endif
2240	RETURN
2241
2242	.p2align 4
/*
 * Aligned forward tail copy for 32/16 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  Both steps
 * use movdqa, which requires 16-byte-aligned effective addresses.  RETURN
 * is a macro defined outside this part of the file.
 */
2243L(fwd_write_32bytes_align):
2244	movdqa	-32(%eax), %xmm0
2245	movdqa	%xmm0, -32(%edx)
2246L(fwd_write_16bytes_align):
2247	movdqa	-16(%eax), %xmm0
2248	movdqa	%xmm0, -16(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2249#ifdef USE_AS_MEMPCPY
2250	movl	%edx, %eax
2251#else
2252	movl	DEST(%esp), %eax
2253#endif
2254	RETURN
2255
2256	.p2align 4
/*
 * Forward tail copy for exactly 5 bytes (entry used by the *_align tail
 * labels), done as two overlapping 4-byte moves at offsets -5 and -4 from
 * the end cursors.  Both loads are issued before either store, and %eax is
 * reused as scratch for the second dword.  RETURN is a macro defined
 * outside this part of the file.
 */
2257L(fwd_write_5bytes_align):
2258	movl	-5(%eax), %ecx
2259	movl	-4(%eax), %eax
2260	movl	%ecx, -5(%edx)
2261	movl	%eax, -4(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2262#ifdef USE_AS_MEMPCPY
2263	movl	%edx, %eax
2264#else
2265	movl	DEST(%esp), %eax
2266#endif
2267	RETURN
2268
2269	.p2align 4
/*
 * Aligned forward tail copy for 45/29/13 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * the last 13 bytes are 8 bytes at -13, 4 bytes at -5 and 1 byte at -1.
 * RETURN is a macro defined outside this part of the file.
 */
2270L(fwd_write_45bytes_align):
2271	movdqa	-45(%eax), %xmm0
2272	movdqa	%xmm0, -45(%edx)
2273L(fwd_write_29bytes_align):
2274	movdqa	-29(%eax), %xmm0
2275	movdqa	%xmm0, -29(%edx)
2276L(fwd_write_13bytes_align):
2277	movq	-13(%eax), %xmm0
2278	movq	%xmm0, -13(%edx)
2279	movl	-5(%eax), %ecx
2280	movl	%ecx, -5(%edx)
2281	movzbl	-1(%eax), %ecx
2282	movb	%cl, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2283#ifdef USE_AS_MEMPCPY
2284	movl	%edx, %eax
2285#else
2286	movl	DEST(%esp), %eax
2287#endif
2288	RETURN
2289
2290	.p2align 4
/*
 * Aligned forward tail copy for 37/21 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last 5 bytes are 4 bytes at -5 and 1 byte at -1.  RETURN is a macro
 * defined outside this part of the file.
 */
2291L(fwd_write_37bytes_align):
2292	movdqa	-37(%eax), %xmm0
2293	movdqa	%xmm0, -37(%edx)
2294L(fwd_write_21bytes_align):
2295	movdqa	-21(%eax), %xmm0
2296	movdqa	%xmm0, -21(%edx)
2297	movl	-5(%eax), %ecx
2298	movl	%ecx, -5(%edx)
2299	movzbl	-1(%eax), %ecx
2300	movb	%cl, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2301#ifdef USE_AS_MEMPCPY
2302	movl	%edx, %eax
2303#else
2304	movl	DEST(%esp), %eax
2305#endif
2306	RETURN
2307
2308	.p2align 4
/*
 * Aligned forward tail copy for 41/25/9/1 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * then 8 bytes at -9 and 1 byte at -1.  RETURN is a macro defined outside
 * this part of the file.
 */
2309L(fwd_write_41bytes_align):
2310	movdqa	-41(%eax), %xmm0
2311	movdqa	%xmm0, -41(%edx)
2312L(fwd_write_25bytes_align):
2313	movdqa	-25(%eax), %xmm0
2314	movdqa	%xmm0, -25(%edx)
2315L(fwd_write_9bytes_align):
2316	movq	-9(%eax), %xmm0
2317	movq	%xmm0, -9(%edx)
2318L(fwd_write_1bytes_align):
2319	movzbl	-1(%eax), %ecx
2320	movb	%cl, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2321#ifdef USE_AS_MEMPCPY
2322	movl	%edx, %eax
2323#else
2324	movl	DEST(%esp), %eax
2325#endif
2326	RETURN
2327
2328	.p2align 4
/*
 * Aligned forward tail copy for 33/17 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last byte is moved through %cl.  RETURN is a macro defined outside this
 * part of the file.
 */
2329L(fwd_write_33bytes_align):
2330	movdqa	-33(%eax), %xmm0
2331	movdqa	%xmm0, -33(%edx)
2332L(fwd_write_17bytes_align):
2333	movdqa	-17(%eax), %xmm0
2334	movdqa	%xmm0, -17(%edx)
2335	movzbl	-1(%eax), %ecx
2336	movb	%cl, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2337#ifdef USE_AS_MEMPCPY
2338	movl	%edx, %eax
2339#else
2340	movl	DEST(%esp), %eax
2341#endif
2342	RETURN
2343
2344	.p2align 4
/*
 * Aligned forward tail copy for 46/30/14/6 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * then 8 bytes at -14, 4 bytes at -6 and 2 bytes at -2.  RETURN is a macro
 * defined outside this part of the file.
 */
2345L(fwd_write_46bytes_align):
2346	movdqa	-46(%eax), %xmm0
2347	movdqa	%xmm0, -46(%edx)
2348L(fwd_write_30bytes_align):
2349	movdqa	-30(%eax), %xmm0
2350	movdqa	%xmm0, -30(%edx)
2351L(fwd_write_14bytes_align):
2352	movq	-14(%eax), %xmm0
2353	movq	%xmm0, -14(%edx)
2354L(fwd_write_6bytes_align):
2355	movl	-6(%eax), %ecx
2356	movl	%ecx, -6(%edx)
2357	movzwl	-2(%eax), %ecx
2358	movw	%cx, -2(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2359#ifdef USE_AS_MEMPCPY
2360	movl	%edx, %eax
2361#else
2362	movl	DEST(%esp), %eax
2363#endif
2364	RETURN
2365
2366	.p2align 4
/*
 * Aligned forward tail copy for 38/22 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last 6 bytes are 4 bytes at -6 and 2 bytes at -2.  RETURN is a macro
 * defined outside this part of the file.
 */
2367L(fwd_write_38bytes_align):
2368	movdqa	-38(%eax), %xmm0
2369	movdqa	%xmm0, -38(%edx)
2370L(fwd_write_22bytes_align):
2371	movdqa	-22(%eax), %xmm0
2372	movdqa	%xmm0, -22(%edx)
2373	movl	-6(%eax), %ecx
2374	movl	%ecx, -6(%edx)
2375	movzwl	-2(%eax), %ecx
2376	movw	%cx, -2(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2377#ifdef USE_AS_MEMPCPY
2378	movl	%edx, %eax
2379#else
2380	movl	DEST(%esp), %eax
2381#endif
2382	RETURN
2383
2384	.p2align 4
/*
 * Aligned forward tail copy for 42/26/10/2 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * then 8 bytes at -10 and 2 bytes at -2.  RETURN is a macro defined
 * outside this part of the file.
 */
2385L(fwd_write_42bytes_align):
2386	movdqa	-42(%eax), %xmm0
2387	movdqa	%xmm0, -42(%edx)
2388L(fwd_write_26bytes_align):
2389	movdqa	-26(%eax), %xmm0
2390	movdqa	%xmm0, -26(%edx)
2391L(fwd_write_10bytes_align):
2392	movq	-10(%eax), %xmm0
2393	movq	%xmm0, -10(%edx)
2394L(fwd_write_2bytes_align):
2395	movzwl	-2(%eax), %ecx
2396	movw	%cx, -2(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2397#ifdef USE_AS_MEMPCPY
2398	movl	%edx, %eax
2399#else
2400	movl	DEST(%esp), %eax
2401#endif
2402	RETURN
2403
2404	.p2align 4
/*
 * Aligned forward tail copy for 34/18 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last 2 bytes are moved through %cx.  RETURN is a macro defined outside
 * this part of the file.
 */
2405L(fwd_write_34bytes_align):
2406	movdqa	-34(%eax), %xmm0
2407	movdqa	%xmm0, -34(%edx)
2408L(fwd_write_18bytes_align):
2409	movdqa	-18(%eax), %xmm0
2410	movdqa	%xmm0, -18(%edx)
2411	movzwl	-2(%eax), %ecx
2412	movw	%cx, -2(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2413#ifdef USE_AS_MEMPCPY
2414	movl	%edx, %eax
2415#else
2416	movl	DEST(%esp), %eax
2417#endif
2418	RETURN
2419
2420	.p2align 4
/*
 * Aligned forward tail copy for 47/31/15/7 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * then 8 bytes at -15, 4 bytes at -7, 2 bytes at -3 and 1 byte at -1.
 * RETURN is a macro defined outside this part of the file.
 */
2421L(fwd_write_47bytes_align):
2422	movdqa	-47(%eax), %xmm0
2423	movdqa	%xmm0, -47(%edx)
2424L(fwd_write_31bytes_align):
2425	movdqa	-31(%eax), %xmm0
2426	movdqa	%xmm0, -31(%edx)
2427L(fwd_write_15bytes_align):
2428	movq	-15(%eax), %xmm0
2429	movq	%xmm0, -15(%edx)
2430L(fwd_write_7bytes_align):
2431	movl	-7(%eax), %ecx
2432	movl	%ecx, -7(%edx)
	/* Both remaining loads come before their stores; %eax doubles as
	   scratch for the last byte because the source cursor is no longer
	   needed.  */
2433	movzwl	-3(%eax), %ecx
2434	movzbl	-1(%eax), %eax
2435	movw	%cx, -3(%edx)
2436	movb	%al, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2437#ifdef USE_AS_MEMPCPY
2438	movl	%edx, %eax
2439#else
2440	movl	DEST(%esp), %eax
2441#endif
2442	RETURN
2443
2444	.p2align 4
/*
 * Aligned forward tail copy for 39/23 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last 7 bytes are 4 bytes at -7, 2 bytes at -3 and 1 byte at -1.  RETURN
 * is a macro defined outside this part of the file.
 */
2445L(fwd_write_39bytes_align):
2446	movdqa	-39(%eax), %xmm0
2447	movdqa	%xmm0, -39(%edx)
2448L(fwd_write_23bytes_align):
2449	movdqa	-23(%eax), %xmm0
2450	movdqa	%xmm0, -23(%edx)
2451	movl	-7(%eax), %ecx
2452	movl	%ecx, -7(%edx)
	/* Both remaining loads come before their stores; %eax doubles as
	   scratch for the last byte because the source cursor is no longer
	   needed.  */
2453	movzwl	-3(%eax), %ecx
2454	movzbl	-1(%eax), %eax
2455	movw	%cx, -3(%edx)
2456	movb	%al, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2457#ifdef USE_AS_MEMPCPY
2458	movl	%edx, %eax
2459#else
2460	movl	DEST(%esp), %eax
2461#endif
2462	RETURN
2463
2464	.p2align 4
/*
 * Aligned forward tail copy for 43/27/11/3 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * then 8 bytes at -11, 2 bytes at -3 and 1 byte at -1.  RETURN is a macro
 * defined outside this part of the file.
 */
2465L(fwd_write_43bytes_align):
2466	movdqa	-43(%eax), %xmm0
2467	movdqa	%xmm0, -43(%edx)
2468L(fwd_write_27bytes_align):
2469	movdqa	-27(%eax), %xmm0
2470	movdqa	%xmm0, -27(%edx)
2471L(fwd_write_11bytes_align):
2472	movq	-11(%eax), %xmm0
2473	movq	%xmm0, -11(%edx)
2474L(fwd_write_3bytes_align):
	/* Both loads come before the stores; %eax doubles as scratch for the
	   last byte because the source cursor is no longer needed.  */
2475	movzwl	-3(%eax), %ecx
2476	movzbl	-1(%eax), %eax
2477	movw	%cx, -3(%edx)
2478	movb	%al, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2479#ifdef USE_AS_MEMPCPY
2480	movl	%edx, %eax
2481#else
2482	movl	DEST(%esp), %eax
2483#endif
2484	RETURN
2485
2486	.p2align 4
/*
 * Aligned forward tail copy for 35/19 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last 3 bytes are 2 bytes at -3 and 1 byte at -1.  RETURN is a macro
 * defined outside this part of the file.
 */
2487L(fwd_write_35bytes_align):
2488	movdqa	-35(%eax), %xmm0
2489	movdqa	%xmm0, -35(%edx)
2490L(fwd_write_19bytes_align):
2491	movdqa	-19(%eax), %xmm0
2492	movdqa	%xmm0, -19(%edx)
	/* Both loads come before the stores; %eax doubles as scratch for the
	   last byte because the source cursor is no longer needed.  */
2493	movzwl	-3(%eax), %ecx
2494	movzbl	-1(%eax), %eax
2495	movw	%cx, -3(%edx)
2496	movb	%al, -1(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2497#ifdef USE_AS_MEMPCPY
2498	movl	%edx, %eax
2499#else
2500	movl	DEST(%esp), %eax
2501#endif
2502	RETURN
2503
2504	.p2align 4
/*
 * Aligned forward tail copy for 44/28/12/4 bytes.  %eax and %edx point just
 * past the end of the source and destination (negative offsets).  The
 * 16-byte steps use movdqa (16-byte-aligned effective addresses required);
 * then 8 bytes at -12 and 4 bytes at -4.  RETURN is a macro defined
 * outside this part of the file.
 */
2505L(fwd_write_44bytes_align):
2506	movdqa	-44(%eax), %xmm0
2507	movdqa	%xmm0, -44(%edx)
2508L(fwd_write_28bytes_align):
2509	movdqa	-28(%eax), %xmm0
2510	movdqa	%xmm0, -28(%edx)
2511L(fwd_write_12bytes_align):
2512	movq	-12(%eax), %xmm0
2513	movq	%xmm0, -12(%edx)
2514L(fwd_write_4bytes_align):
2515	movl	-4(%eax), %ecx
2516	movl	%ecx, -4(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2517#ifdef USE_AS_MEMPCPY
2518	movl	%edx, %eax
2519#else
2520	movl	DEST(%esp), %eax
2521#endif
2522	RETURN
2523
2524	.p2align 4
/*
 * Aligned forward tail copy for 36/20 bytes.  %eax and %edx point just past
 * the end of the source and destination (negative offsets).  The 16-byte
 * steps use movdqa (16-byte-aligned effective addresses required); the
 * last 4 bytes are moved through %ecx.  This block ends with RETURN_END
 * rather than RETURN; both macros are defined outside this part of the
 * file.
 */
2525L(fwd_write_36bytes_align):
2526	movdqa	-36(%eax), %xmm0
2527	movdqa	%xmm0, -36(%edx)
2528L(fwd_write_20bytes_align):
2529	movdqa	-20(%eax), %xmm0
2530	movdqa	%xmm0, -20(%edx)
2531	movl	-4(%eax), %ecx
2532	movl	%ecx, -4(%edx)
	/* Return value: the end-of-destination cursor for the mempcpy build,
	   otherwise the value reloaded from DEST(%esp).  */
2533#ifdef USE_AS_MEMPCPY
2534	movl	%edx, %eax
2535#else
2536	movl	DEST(%esp), %eax
2537#endif
2538	RETURN_END
2539
2540	CFI_PUSH (%edi)
2541
2542	.p2align 4
/*
 * L(large_page): forward copy using unaligned loads (movdqu) and
 * non-temporal stores (movntdq).  movntdq requires a 16-byte-aligned
 * destination, so %edx is aligned on this path.
 *   %eax = source cursor, %edx = destination cursor, %ecx = byte counter.
 * NOTE(review): the condition that selects this path is outside this
 * chunk -- presumably a size threshold; confirm at the branch site.
 * POP and BRANCH_TO_JMPTBL_ENTRY are macros defined outside this part of
 * the file.
 */
2543L(large_page):
2544	movdqu	(%eax), %xmm1
2545#ifdef USE_AS_MEMMOVE
	/* memmove build: store %xmm0 to the address read from DEST+4(%esp).
	   NOTE(review): %xmm0 and the stack slot are set up before this
	   chunk -- confirm what they hold.  */
2546	movl	DEST+4(%esp), %edi
2547	movdqu	%xmm0, (%edi)
2548#endif
2549	lea	16(%eax), %eax
2550	movntdq	%xmm1, (%edx)
2551	lea	16(%edx), %edx
	/* 0x90 = the 0x10 bytes just copied plus a 0x80 pre-bias, so the
	   loop can test the borrow of its own "sub $0x80".  */
2552	lea	-0x90(%ecx), %ecx
2553	POP (%edi)
2554
2555	.p2align 4
	/* 128 bytes per iteration: eight unaligned loads, then eight
	   non-temporal stores.  The jae tests the flags of the sub; movntdq
	   and lea in between do not modify EFLAGS.  */
2556L(large_page_loop):
2557	movdqu	(%eax), %xmm0
2558	movdqu	0x10(%eax), %xmm1
2559	movdqu	0x20(%eax), %xmm2
2560	movdqu	0x30(%eax), %xmm3
2561	movdqu	0x40(%eax), %xmm4
2562	movdqu	0x50(%eax), %xmm5
2563	movdqu	0x60(%eax), %xmm6
2564	movdqu	0x70(%eax), %xmm7
2565	lea	0x80(%eax), %eax
2566
2567	sub	$0x80, %ecx
2568	movntdq	%xmm0, (%edx)
2569	movntdq	%xmm1, 0x10(%edx)
2570	movntdq	%xmm2, 0x20(%edx)
2571	movntdq	%xmm3, 0x30(%edx)
2572	movntdq	%xmm4, 0x40(%edx)
2573	movntdq	%xmm5, 0x50(%edx)
2574	movntdq	%xmm6, 0x60(%edx)
2575	movntdq	%xmm7, 0x70(%edx)
2576	lea	0x80(%edx), %edx
2577	jae	L(large_page_loop)
	/* Compare the biased counter with -0x40 (signed), then remove the
	   0x80 bias with lea so the flags survive for the jl: fewer than
	   0x40 bytes left skips the 64-byte step.  */
2578	cmp	$-0x40, %ecx
2579	lea	0x80(%ecx), %ecx
2580	jl	L(large_page_less_64bytes)
2581
2582	movdqu	(%eax), %xmm0
2583	movdqu	0x10(%eax), %xmm1
2584	movdqu	0x20(%eax), %xmm2
2585	movdqu	0x30(%eax), %xmm3
2586	lea	0x40(%eax), %eax
2587
2588	movntdq	%xmm0, (%edx)
2589	movntdq	%xmm1, 0x10(%edx)
2590	movntdq	%xmm2, 0x20(%edx)
2591	movntdq	%xmm3, 0x30(%edx)
2592	lea	0x40(%edx), %edx
2593	sub	$0x40, %ecx
	/* Optional 32-byte step when at least 32 bytes remain.  */
2594L(large_page_less_64bytes):
2595	cmp	$32, %ecx
2596	jb	L(large_page_less_32bytes)
2597	movdqu	(%eax), %xmm0
2598	movdqu	0x10(%eax), %xmm1
2599	lea	0x20(%eax), %eax
2600	movntdq	%xmm0, (%edx)
2601	movntdq	%xmm1, 0x10(%edx)
2602	lea	0x20(%edx), %edx
2603	sub	$0x20, %ecx
	/* Move both cursors past the remaining %ecx bytes, fence the
	   non-temporal stores with sfence, then dispatch on %ecx through
	   the jump table (table contents are outside this chunk).  */
2604L(large_page_less_32bytes):
2605	add	%ecx, %edx
2606	add	%ecx, %eax
2607	sfence
2608	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2609
2610	.p2align 4
/*
 * Tail copy (bk_ variant) for 44/36/28/20/12/4/0 bytes.  The non-negative
 * offsets mean %eax and %edx point at the start of the remaining source
 * and destination bytes.  The highest-addressed 8 bytes are moved first
 * and each label falls through to the next smaller size; the lowest 4
 * bytes go through %ecx.  RETURN is a macro defined outside this part of
 * the file.
 */
2611L(bk_write_44bytes):
2612	movq	36(%eax), %xmm0
2613	movq	%xmm0, 36(%edx)
2614L(bk_write_36bytes):
2615	movq	28(%eax), %xmm0
2616	movq	%xmm0, 28(%edx)
2617L(bk_write_28bytes):
2618	movq	20(%eax), %xmm0
2619	movq	%xmm0, 20(%edx)
2620L(bk_write_20bytes):
2621	movq	12(%eax), %xmm0
2622	movq	%xmm0, 12(%edx)
2623L(bk_write_12bytes):
2624	movq	4(%eax), %xmm0
2625	movq	%xmm0, 4(%edx)
2626L(bk_write_4bytes):
2627	movl	(%eax), %ecx
2628	movl	%ecx, (%edx)
2629L(bk_write_0bytes):
	/* Return value: the value at DEST(%esp); the mempcpy build adds the
	   value at LEN(%esp) to it.  */
2630	movl	DEST(%esp), %eax
2631#ifdef USE_AS_MEMPCPY
2632	movl	LEN(%esp), %ecx
2633	add	%ecx, %eax
2634#endif
2635	RETURN
2636
2637	.p2align 4
/*
 * Tail copy (bk_ variant) for 40/32/24/16/8 bytes.  %eax and %edx point at
 * the start of the remaining source and destination bytes (non-negative
 * offsets).  The highest-addressed 8 bytes are moved first and each label
 * falls through to the next smaller size.  RETURN is a macro defined
 * outside this part of the file.
 */
2638L(bk_write_40bytes):
2639	movq	32(%eax), %xmm0
2640	movq	%xmm0, 32(%edx)
2641L(bk_write_32bytes):
2642	movq	24(%eax), %xmm0
2643	movq	%xmm0, 24(%edx)
2644L(bk_write_24bytes):
2645	movq	16(%eax), %xmm0
2646	movq	%xmm0, 16(%edx)
2647L(bk_write_16bytes):
2648	movq	8(%eax), %xmm0
2649	movq	%xmm0, 8(%edx)
2650L(bk_write_8bytes):
2651	movq	(%eax), %xmm0
2652	movq	%xmm0, (%edx)
	/* Return value: the value at DEST(%esp); the mempcpy build adds the
	   value at LEN(%esp) to it.  */
2653	movl	DEST(%esp), %eax
2654#ifdef USE_AS_MEMPCPY
2655	movl	LEN(%esp), %ecx
2656	add	%ecx, %eax
2657#endif
2658	RETURN
2659
	.p2align 4
/* Tail handlers reached through L(table_48_bytes_bwd) for residual
   sizes 45, 37, 29, 21, 13, 5 and 1.
   EAX = source, EDX = destination, both at the lowest byte of the
   residual region.  8-byte movq steps from the top down, then a 4-byte
   move at offset 1 and a final single byte at offset 0.  */
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	/* Zero-extending byte load; only CL is stored.  */
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	/* Return value: the DEST stack slot, plus the LEN slot when built
	   with USE_AS_MEMPCPY (macros defined outside this view).  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN
2688
	.p2align 4
/* Tail handlers reached through L(table_48_bytes_bwd) for residual
   sizes 41, 33, 25, 17 and 9.
   EAX = source, EDX = destination, both at the lowest byte of the
   residual region.  8-byte movq steps from the top down to offset 1,
   then a final single byte at offset 0.  */
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	/* Lowest byte last: zero-extending load, CL stored.  */
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
	/* Return value: the DEST stack slot, plus the LEN slot when built
	   with USE_AS_MEMPCPY (macros defined outside this view).  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN
2713
	.p2align 4
/* Tail handlers reached through L(table_48_bytes_bwd) for residual
   sizes 46, 38, 30, 22, 14 and 6.
   EAX = source, EDX = destination, both at the lowest byte of the
   residual region.  8-byte movq steps from the top down to offset 6,
   then a 4-byte move at offset 2 and a 2-byte move at offset 0.  */
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	/* Lowest 2 bytes last: zero-extending word load, CX stored.  */
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	/* Return value: the DEST stack slot, plus the LEN slot when built
	   with USE_AS_MEMPCPY (macros defined outside this view).  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN
2741
	.p2align 4
/* Tail handlers reached through L(table_48_bytes_bwd) for residual
   sizes 42, 34, 26, 18, 10 and 2.
   EAX = source, EDX = destination, both at the lowest byte of the
   residual region.  8-byte movq steps from the top down to offset 2,
   then a final 2-byte move at offset 0.  */
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	/* Zero-extending word load; only CX is stored.  */
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
	/* Return value: the DEST stack slot, plus the LEN slot when built
	   with USE_AS_MEMPCPY (macros defined outside this view).  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN
2767
	.p2align 4
/* Tail handlers reached through L(table_48_bytes_bwd) for residual
   sizes 47, 39, 31, 23, 15 and 7.
   EAX = source, EDX = destination, both at the lowest byte of the
   residual region.  8-byte movq steps from the top down to offset 7,
   then 4 bytes at offset 3, 2 bytes at offset 1 and 1 byte at
   offset 0.  */
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* The final byte is loaded into EAX itself; the source pointer is
	   not needed afterwards because EAX is reloaded just below.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	/* Return value: the DEST stack slot, plus the LEN slot when built
	   with USE_AS_MEMPCPY (macros defined outside this view).  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	RETURN
2797
	.p2align 4
/* Tail handlers reached through L(table_48_bytes_bwd) for residual
   sizes 43, 35, 27, 19, 11 and 3.
   EAX = source, EDX = destination, both at the lowest byte of the
   residual region.  8-byte movq steps from the top down to offset 3,
   then 2 bytes at offset 1 and 1 byte at offset 0.  */
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* The final byte is loaded into EAX itself; the source pointer is
	   not needed afterwards because EAX is reloaded just below.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
	/* Return value: the DEST stack slot, plus the LEN slot when built
	   with USE_AS_MEMPCPY (macros defined outside this view).  */
	movl	DEST(%esp), %eax
#ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
#endif
	/* This group ends with RETURN_END rather than RETURN; both macros
	   are defined outside this view.  NOTE(review): presumably the
	   variant for the last return before the out-of-line sections --
	   confirm against the macro definition.  */
	RETURN_END
2825
2826
	/* Jump tables live in their own allocatable data section; the
	   previous section is restored by the matching .popsection.  */
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
/* Forward tail dispatch table: entry N (one .int each) refers to
   L(fwd_write_Nbytes) for N = 0..47.  It is indexed by the residual byte
   count through BRANCH_TO_JMPTBL_ENTRY (..., %ecx, 4).
   Each entry is built by the JMPTBL macro from the target label and the
   table label.  JMPTBL is defined outside this view -- presumably it
   yields a table-relative offset for PIC builds; confirm against the
   macro definition.  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2878
	.p2align 2
/* Second forward dispatch table: entry N refers to
   L(fwd_write_Nbytes_align) for N = 0..47, built with the same JMPTBL
   macro (defined outside this view).  Neither the handlers nor the
   dispatch sites for this table are visible here; the "_align" suffix
   presumably denotes variants that may assume an aligned destination --
   TODO confirm.  */
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
2929
	.p2align 2
/* Dispatch table with 16 entries: entry N refers to L(shl_N) for
   N = 0..15, built with the JMPTBL macro (defined outside this view).
   The L(shl_*) routines and the dispatch site are not visible here;
   presumably the index is the source misalignment within a 16-byte
   block -- TODO confirm against the dispatch site.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))
2948
	.p2align 2
/* Backward tail dispatch table: entry N refers to L(bk_write_Nbytes)
   for N = 0..47, built with the JMPTBL macro (defined outside this
   view).  It is indexed by the residual byte count through
   BRANCH_TO_JMPTBL_ENTRY (..., %ecx, 4) in the backward-copy path.  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	/* End of the jump-table data; return to the section that was
	   active before the matching .pushsection.  */
	.popsection
3001
#ifdef USE_AS_MEMMOVE
	.p2align 4
/* Backward (high-to-low) copy, built only when USE_AS_MEMMOVE is
   defined.
   Registers as the code below uses them: on entry EAX = source start,
   EDX = destination start, ECX = byte count.  EDI is saved and becomes
   the source end cursor; EDX becomes the destination end cursor.  Both
   cursors point one past the next byte to copy and move downwards.  */
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	/* edx = dst + len, edi = src + len.  */
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* If the destination end is not 4-byte aligned, peel 1-3 bytes
	   first in L(bk_align).  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte chunk as four 8-byte moves, highest address
	   first, then lower both cursors by 32.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Convert the end cursors into start-of-residual pointers, which is
	   what the L(bk_write_*) handlers expect: EAX = source, EDX =
	   destination, ECX = residual count used as the table index.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* Not reached by fall-through.  The code after this point is entered
	   by jumps taken while EDI is still pushed, so the unwind
	   description is put back into that state.  CFI_PUSH is defined
	   outside this view; presumably it emits CFI directives only.  */
	CFI_PUSH (%edi)
3043
	.p2align 4
/* Bring the destination end cursor (EDX) down to a 4-byte boundary by
   copying 1, 2 or 3 bytes, then continue at L(bk_aligned_4).
   EDI = source end cursor, ECX = bytes remaining.  Copies of 8 bytes or
   fewer skip the alignment and go straight to the tail dispatch.  */
L(bk_align):
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Odd address: copy one byte.  EAX is only a scratch register
	   here.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	/* EDX is now even; it is 4-byte aligned unless bit 1 is set.  */
	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Copy two bytes to reach the 4-byte boundary.  */
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)
3068
	.p2align 4
/* Backward copy of 64 bytes or more.  EDX (destination end cursor) is
   4-byte aligned on the visible path into this label; it is lowered to
   a 16-byte boundary with up to three 4-byte copies so that the main
   loop can use aligned 16-byte stores.  EDI = source end cursor,
   ECX = bytes remaining, EAX = scratch.  */
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned, but not 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	/* Third and last step: a 4-byte aligned address is at most 12 bytes
	   above a 16-byte boundary, so no further test is needed.  */
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* At most 12 bytes were consumed above.  With the visible entry
	   (ECX >= 64 from L(bk_aligned_4)) at least 52 remain, so the
	   32-byte step can be entered without re-checking for 32.
	   Otherwise fall through into the 64-byte loop.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)
3104
	.p2align 4
/* Main backward loop: 64 bytes per iteration, highest 16-byte chunk
   first.  Loads are unaligned (movdqu); stores use movdqa and therefore
   rely on EDX being 16-byte aligned, as arranged by the preceding
   alignment steps.  EDI = source cursor, EDX = destination cursor,
   ECX = bytes remaining.  */
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	/* Fewer than 64 bytes left: finish with the 32-byte step and/or the
	   tail jump table.  */
	jmp	L(bk_write_64bytesless)

#endif
3123
3124END (MEMCPY)
3125