/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove_generic
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
#define PARMS		8		/* Return address + saved %ebx.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)
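
/* Register use throughout: %eax = src, %edx = dst (also the return value),
	%ecx = len.  ENTRANCE pushes %ebx, so the return address sits at
	4(%esp) and the arguments start at PARMS = 8(%esp).  */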

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
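/* Equal pointers need no copy.  When dst > src an overlapping forward copy
	could clobber source bytes before they are read, so the backward path
	is taken; dst < src falls through to the forward copy.  */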

/* Now check the length.  The cases [0..16], [17..32], [33..64] and
	[65..128] bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address.  Load both the head of the copy and the
	first aligned chunk before doing any stores, in case the regions overlap.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
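/* %eax now holds src - dst, so (%eax, %edi) addresses the source byte
	that corresponds to the destination byte at %edi.  */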

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
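/* %ebx is the last 64-byte boundary inside the destination; the main loop
	stores aligned 64-byte blocks while %edi is below it.  */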
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)
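/* Copies of at least half the shared cache size take the non-temporal
	store path below to avoid polluting the cache.  */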

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything below %edi in the destination has been copied.  %ecx now
	holds the number of bytes left to copy.  Compute the corresponding
	source position in %esi.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

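/* The CFI_POP macros below emit unwind directives only, no actual pops:
	every path that reaches L(mm_recalc_len) has already restored %edi and
	%esi, so the unwind state is adjusted to match.  */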
	CFI_POP (%edi)
	CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
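/* Everything at or above %ebx in the destination has already been stored,
	so %ecx = %ebx - dst is the prefix that is still left to copy.  */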
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now check the length.  The cases [0..16], [17..32], [33..64] and
	[65..128] bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [17..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [33..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [65..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address.  Load both the tail of the copy and the
	last aligned chunk before doing any stores, in case the regions overlap.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
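/* %esi now holds src - dst, so (%edi, %esi) addresses the source byte
	that corresponds to the destination byte at %edi.  */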

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx
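/* %ebx is the first 64-byte boundary above dst; the backward loop runs
	while %edi is still above it.  */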

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
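/* The last 8 bytes are done; drop them from the length and let the code
	above finish the remaining bytes.  */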
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Forward copy loop for very large lengths, using non-temporal stores.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
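/* Non-temporal stores are weakly ordered; make them globally visible
	before returning to ordinary stores.  */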
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Backward copy loop for very large lengths, using non-temporal stores.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)