1/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/*
 * Stand-alone build only: helper macros, the wcscpy_ssse3 entry point and
 * the short-string fast path.  Everything up to the matching #endif is
 * skipped when USE_AS_WCSCAT is defined.
 * NOTE(review): presumably the file that defines USE_AS_WCSCAT supplies its
 * own macros and reaches the code after the #endif with %ecx, %edx and %edi
 * already set up -- confirm against that file.
 */
31#ifndef USE_AS_WCSCAT
32
/* L(label) expands to .Llabel, i.e. an assembler-local (non-exported) label.  */
33# ifndef L
34#  define L(label)	.L##label
35# endif
36
/* Thin wrappers around the assembler CFI (unwind information) directives.
   Each one is only defined if nothing defined it earlier.  */
37# ifndef cfi_startproc
38#  define cfi_startproc	.cfi_startproc
39# endif
40
41# ifndef cfi_endproc
42#  define cfi_endproc	.cfi_endproc
43# endif
44
45# ifndef cfi_rel_offset
46#  define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
47# endif
48
49# ifndef cfi_restore
50#  define cfi_restore(reg)	.cfi_restore reg
51# endif
52
53# ifndef cfi_adjust_cfa_offset
54#  define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
55# endif
56
/* ENTRY(name): declare NAME as a global function symbol, align it to
   16 bytes, emit its label and open its CFI region.  */
57# ifndef ENTRY
58#  define ENTRY(name)	\
59	.type name, @function;	\
60	.globl name;	\
61	.p2align 4;	\
62name:	\
63	cfi_startproc
64# endif
65
/* END(name): close the CFI region and record the symbol size.  */
66# ifndef END
67#  define END(name)	\
68	cfi_endproc;	\
69	.size name, .-name
70# endif
71
/* CFI_PUSH(REG): unwind bookkeeping for a 4-byte push of REG (the CFA
   offset grows by 4 and REG is recorded as saved at offset 0).  */
72# define CFI_PUSH(REG)	\
73	cfi_adjust_cfa_offset (4);	\
74	cfi_rel_offset (REG, 0)
75
/* CFI_POP(REG): the inverse bookkeeping for a 4-byte pop of REG.  */
76# define CFI_POP(REG)	\
77	cfi_adjust_cfa_offset (-4);	\
78	cfi_restore (REG)
79
/* PUSH/POP: the real pushl/popl instruction followed by its CFI note.  */
80# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
81# define POP(REG)	popl REG; CFI_POP (REG)
82
/* Offset from %esp at function entry to the first stack argument (the
   4 bytes below it hold the return address).  */
83# define PARMS	4
/* RETURN: restore %edi and return.  The CFI_PUSH placed after the ret emits
   no instruction; it re-declares %edi as saved for whatever code textually
   follows the expansion.  */
84# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
85
/* Stack argument offsets.  STR1 is loaded into %edx and written through
   (destination); STR2 is loaded into %ecx and read from (source).  LEN is
   not referenced anywhere in the visible code.  */
86# define STR1	PARMS
87# define STR2	STR1+4
88# define LEN	STR2+4
89
90.text
91ENTRY (wcscpy_ssse3)
/* edx = destination pointer, ecx = source pointer.  */
92	mov	STR1(%esp), %edx
93	mov	STR2(%esp), %ecx
94
/*
 * Short-string fast path: test the first four 32-bit elements of the source
 * one at a time.  If the k-th one is zero, branch to L(ExitTail<4k>), which
 * copies 4k bytes and returns.  No register has been pushed at this point.
 */
95	cmpl	$0, (%ecx)
96	jz	L(ExitTail4)
97	cmpl	$0, 4(%ecx)
98	jz	L(ExitTail8)
99	cmpl	$0, 8(%ecx)
100	jz	L(ExitTail12)
101	cmpl	$0, 12(%ecx)
102	jz	L(ExitTail16)
103
/* Longer string: save %edi and keep the original destination in it; the
   non-tail exit paths load %eax from %edi before returning.  */
104	PUSH	(%edi)
105	mov	%edx, %edi
106#endif
/*
 * Common body.  Here %ecx = source and %edx = destination.  %esi is saved
 * because it is used below as a byte offset / scratch register; the exit
 * paths pop it again.
 */
107	PUSH	(%esi)
/* esi = first 16-byte boundary strictly above the source pointer.  */
108	lea	16(%ecx), %esi
109
110	and	$-16, %esi
111
/*
 * xmm0 = per-dword "equals zero" mask of the aligned 16 bytes at (%esi).
 * Independently, the first 16 source bytes are copied to the destination
 * with unaligned moves.
 */
112	pxor	%xmm0, %xmm0
113	pcmpeqd	(%esi), %xmm0
114	movdqu	(%ecx), %xmm1
115	movdqu	%xmm1, (%edx)
116
/* eax = one bit per byte of that mask; esi becomes the byte distance from
   the source pointer to the aligned chunk that was just examined.  */
117	pmovmskb %xmm0, %eax
118	sub	%ecx, %esi
119
/* A zero dword in that chunk: finish in the shared tail, which adds esi to
   both pointers and decodes eax.  */
120	test	%eax, %eax
121	jnz	L(CopyFrom1To16Bytes)
122
/*
 * No zero dword found, so the compare left xmm0 all-zero again.
 * Round the destination up to the next 16-byte boundary and advance the
 * source by the same number of bytes (eax = old dst - new dst, which is
 * negative, so the subtraction from ecx moves ecx forward).  The bytes that
 * are skipped were already written by the unaligned store above.
 */
123	mov	%edx, %eax
124	lea	16(%edx), %edx
125	and	$-16, %edx
126	sub	%edx, %eax
127
128	sub	%eax, %ecx
/* eax = source address modulo 16; the and also sets ZF for the jz below.  */
129	mov	%ecx, %eax
130	and	$0xf, %eax
/* mov leaves the flags untouched, so clearing esi this way keeps ZF from
   the and intact.  esi = 0 is the starting offset for L(Align16Both).  */
131	mov	$0, %esi
132
/* Dispatch on the source misalignment relative to the now aligned
   destination: 0 -> Align16Both, 4 -> Shl4, 8 -> Shl8, anything else is
   handled as 12 -> Shl12.  */
133	jz	L(Align16Both)
134	cmp	$4, %eax
135	je	L(Shl4)
136	cmp	$8, %eax
137	je	L(Shl8)
138	jmp	L(Shl12)
139
/*
 * L(Align16Both): source (%ecx) and destination (%edx) are both 16-byte
 * aligned, so every access is a movaps.  On entry esi = 0 and xmm0 = 0.
 *
 * Unrolled head (six tests): load the next 16-byte chunk, store the previous
 * one, compare the new chunk dword-wise against zero and add 16 to esi.
 * When a test fires, esi is the offset (from ecx/edx) of the chunk holding
 * the terminator and eax is its mask, which is what L(CopyFrom1To16Bytes)
 * expects.  When a test does not fire the compare result is all-zero, so
 * xmm0 is zero again for the next test.
 */
140L(Align16Both):
141	movaps	(%ecx), %xmm1
142	movaps	16(%ecx), %xmm2
143	movaps	%xmm1, (%edx)
144	pcmpeqd	%xmm2, %xmm0
145	pmovmskb %xmm0, %eax
146	lea	16(%esi), %esi
147
148	test	%eax, %eax
149	jnz	L(CopyFrom1To16Bytes)
150
151	movaps	16(%ecx, %esi), %xmm3
152	movaps	%xmm2, (%edx, %esi)
153	pcmpeqd	%xmm3, %xmm0
154	pmovmskb %xmm0, %eax
155	lea	16(%esi), %esi
156
157	test	%eax, %eax
158	jnz	L(CopyFrom1To16Bytes)
159
160	movaps	16(%ecx, %esi), %xmm4
161	movaps	%xmm3, (%edx, %esi)
162	pcmpeqd	%xmm4, %xmm0
163	pmovmskb %xmm0, %eax
164	lea	16(%esi), %esi
165
166	test	%eax, %eax
167	jnz	L(CopyFrom1To16Bytes)
168
169	movaps	16(%ecx, %esi), %xmm1
170	movaps	%xmm4, (%edx, %esi)
171	pcmpeqd	%xmm1, %xmm0
172	pmovmskb %xmm0, %eax
173	lea	16(%esi), %esi
174
175	test	%eax, %eax
176	jnz	L(CopyFrom1To16Bytes)
177
178	movaps	16(%ecx, %esi), %xmm2
179	movaps	%xmm1, (%edx, %esi)
180	pcmpeqd	%xmm2, %xmm0
181	pmovmskb %xmm0, %eax
182	lea	16(%esi), %esi
183
184	test	%eax, %eax
185	jnz	L(CopyFrom1To16Bytes)
186
187	movaps	16(%ecx, %esi), %xmm3
188	movaps	%xmm2, (%edx, %esi)
189	pcmpeqd	%xmm3, %xmm0
190	pmovmskb %xmm0, %eax
191	lea	16(%esi), %esi
192
193	test	%eax, %eax
194	jnz	L(CopyFrom1To16Bytes)
195
/*
 * Still no terminator.  Store the last loaded chunk, then round the source
 * position of the next chunk DOWN to a 64-byte boundary and move edx by the
 * same displacement (eax = old ecx - new ecx), so both pointers stay in
 * step.  Rounding down means some bytes may be copied a second time.
 */
196	movaps	%xmm3, (%edx, %esi)
197	mov	%ecx, %eax
198	lea	16(%ecx, %esi), %ecx
199	and	$-0x40, %ecx
200	sub	%ecx, %eax
201	sub	%eax, %edx
202
/* esi = -64: inside the loop both pointers are advanced by 64 before the
   test, so -64 is the offset of the first chunk of the current group.  */
203	mov	$-0x40, %esi
204
/*
 * L(Aligned64Loop): handle 64 bytes per iteration.  xmm4..xmm7 keep the
 * four chunks in memory order (xmm4, xmm5, xmm6, xmm7); xmm2/xmm3 are
 * combined with pminub into the byte-wise minimum of all four, which is
 * then compared dword-wise against xmm0 (zero).  xmm0 is only read here.
 */
205L(Aligned64Loop):
206	movaps	(%ecx), %xmm2
207	movaps	32(%ecx), %xmm3
208	movaps	%xmm2, %xmm4
209	movaps	16(%ecx), %xmm5
210	movaps	%xmm3, %xmm6
211	movaps	48(%ecx), %xmm7
212	pminub	%xmm5, %xmm2
213	pminub	%xmm7, %xmm3
214	pminub	%xmm2, %xmm3
215	lea	64(%edx), %edx
216	pcmpeqd	%xmm0, %xmm3
217	lea	64(%ecx), %ecx
218	pmovmskb %xmm3, %eax
219
/* Nothing suspicious: store all four chunks (edx was already advanced,
   hence the negative displacements) and continue.  */
220	test	%eax, %eax
221	jnz	L(Aligned64Leave)
222	movaps	%xmm4, -64(%edx)
223	movaps	%xmm5, -48(%edx)
224	movaps	%xmm6, -32(%edx)
225	movaps	%xmm7, -16(%edx)
226	jmp	L(Aligned64Loop)
227
/*
 * L(Aligned64Leave): the combined minimum contained a zero dword.  Because
 * pminub works per byte, that dword may be assembled from bytes of
 * different chunks, so each chunk is re-tested exactly, in order.  Before
 * each further test the previous chunk is stored and esi grows by 16, so on
 * a hit esi (-64, -48, -32, -16) addresses the chunk with the terminator
 * relative to the already advanced pointers.
 */
228L(Aligned64Leave):
229	pcmpeqd	%xmm4, %xmm0
230	pmovmskb %xmm0, %eax
231	test	%eax, %eax
232	jnz	L(CopyFrom1To16Bytes)
233
234	pcmpeqd	%xmm5, %xmm0
235	pmovmskb %xmm0, %eax
236	movaps	%xmm4, -64(%edx)
237	lea	16(%esi), %esi
238	test	%eax, %eax
239	jnz	L(CopyFrom1To16Bytes)
240
241	pcmpeqd	%xmm6, %xmm0
242	pmovmskb %xmm0, %eax
243	movaps	%xmm5, -48(%edx)
244	lea	16(%esi), %esi
245	test	%eax, %eax
246	jnz	L(CopyFrom1To16Bytes)
247
248	movaps	%xmm6, -32(%edx)
249	pcmpeqd	%xmm7, %xmm0
250	pmovmskb %xmm0, %eax
251	lea	16(%esi), %esi
252	test	%eax, %eax
253	jnz	L(CopyFrom1To16Bytes)
254
/* None of the four chunks really holds a zero dword: store the last one,
   reset esi to -64 and resume the 64-byte loop.  */
255	mov	$-0x40, %esi
256	movaps	%xmm7, -16(%edx)
257	jmp	L(Aligned64Loop)
258
/*
 * L(Shl4): reached when the destination is 16-byte aligned and the source
 * address is 4 modulo 16.  The source is read as aligned 16-byte chunks
 * (at ecx-4, ecx+12, ecx+28, ...); palignr $4 joins two neighbouring chunks
 * and extracts the 16 bytes that start at %ecx, which are stored aligned at
 * (%edx).  xmm1 and xmm3 alternate as "previous chunk", xmm2 is the newest
 * chunk, and xmm0 is zero before every pcmpeqd (a test that does not fire
 * leaves it all-zero).
 */
259	.p2align 4
260L(Shl4):
261	movaps	-4(%ecx), %xmm1
262	movaps	12(%ecx), %xmm2
/* L(Shl4Start) is also the re-entry point from the 64-byte loop below; it
   expects xmm1 = chunk at ecx-4 and xmm2 = chunk at ecx+12.  */
263L(Shl4Start):
264	pcmpeqd	%xmm2, %xmm0
265	pmovmskb %xmm0, %eax
266	movaps	%xmm2, %xmm3
267
268	test	%eax, %eax
269	jnz	L(Shl4LoopExit)
270
/* Unrolled steps: store the spliced 16 bytes, load the next aligned chunk,
   advance both pointers by 16 and test the new chunk for a zero dword.  */
271	palignr	$4, %xmm1, %xmm2
272	movaps	%xmm2, (%edx)
273	movaps	28(%ecx), %xmm2
274
275	pcmpeqd	%xmm2, %xmm0
276	lea	16(%edx), %edx
277	pmovmskb %xmm0, %eax
278	lea	16(%ecx), %ecx
279	movaps	%xmm2, %xmm1
280
281	test	%eax, %eax
282	jnz	L(Shl4LoopExit)
283
284	palignr	$4, %xmm3, %xmm2
285	movaps	%xmm2, (%edx)
286	movaps	28(%ecx), %xmm2
287
288	pcmpeqd	%xmm2, %xmm0
289	lea	16(%edx), %edx
290	pmovmskb %xmm0, %eax
291	lea	16(%ecx), %ecx
292	movaps	%xmm2, %xmm3
293
294	test	%eax, %eax
295	jnz	L(Shl4LoopExit)
296
297	palignr	$4, %xmm1, %xmm2
298	movaps	%xmm2, (%edx)
299	movaps	28(%ecx), %xmm2
300
301	pcmpeqd	%xmm2, %xmm0
302	lea	16(%edx), %edx
303	pmovmskb %xmm0, %eax
304	lea	16(%ecx), %ecx
305
306	test	%eax, %eax
307	jnz	L(Shl4LoopExit)
308
/* Store the last spliced block; ecx now becomes the address of the next
   aligned chunk that has not been loaded yet.  */
309	palignr	$4, %xmm3, %xmm2
310	movaps	%xmm2, (%edx)
311	lea	28(%ecx), %ecx
312	lea	16(%edx), %edx
313
/*
 * Round that chunk address down to a 64-byte boundary, move edx back by the
 * same number of bytes (eax), then bias ecx by -12 so that 12(%ecx) is the
 * 64-byte aligned chunk and -4(%ecx) is the aligned chunk just before it.
 */
314	mov	%ecx, %eax
315	and	$-0x40, %ecx
316	sub	%ecx, %eax
317	lea	-12(%ecx), %ecx
318	sub	%eax, %edx
319
320	movaps	-4(%ecx), %xmm1
321
/*
 * L(Shl4LoopStart): 64 bytes per iteration.  xmm2..xmm5 are the four new
 * aligned chunks and xmm1 the chunk before them.  xmm6/xmm7 build the
 * byte-wise minimum of the four chunks, which is compared dword-wise with
 * xmm0 (zero); afterwards xmm7 keeps an unshifted copy of the last chunk.
 */
322L(Shl4LoopStart):
323	movaps	12(%ecx), %xmm2
324	movaps	28(%ecx), %xmm3
325	movaps	%xmm3, %xmm6
326	movaps	44(%ecx), %xmm4
327	movaps	%xmm4, %xmm7
328	movaps	60(%ecx), %xmm5
329	pminub	%xmm2, %xmm6
330	pminub	%xmm5, %xmm7
331	pminub	%xmm6, %xmm7
332	pcmpeqd	%xmm0, %xmm7
333	pmovmskb %xmm7, %eax
334	movaps	%xmm5, %xmm7
335	palignr	$4, %xmm4, %xmm5
336	palignr	$4, %xmm3, %xmm4
/* Possible terminator (the byte-wise minimum may also form a zero dword out
   of bytes from different chunks): re-examine one chunk at a time from
   L(Shl4Start); xmm1 and xmm2 still hold what it expects.  */
337	test	%eax, %eax
338	jnz	L(Shl4Start)
339
/* No terminator: finish the remaining splices, store 64 bytes, advance both
   pointers and make the last chunk the new "previous" chunk.  */
340	palignr	$4, %xmm2, %xmm3
341	lea	64(%ecx), %ecx
342	palignr	$4, %xmm1, %xmm2
343	movaps	%xmm7, %xmm1
344	movaps	%xmm5, 48(%edx)
345	movaps	%xmm4, 32(%edx)
346	movaps	%xmm3, 16(%edx)
347	movaps	%xmm2, (%edx)
348	lea	64(%edx), %edx
349	jmp	L(Shl4LoopStart)
350
/*
 * L(Shl4LoopExit): the aligned chunk at 12(%ecx) contains a zero dword and
 * eax is its mask.  First copy the 12 bytes in front of it (8 via xmm0,
 * 4 via esi as scratch), restore esi, and step both pointers over those
 * 12 bytes so they address the chunk described by eax.
 */
351L(Shl4LoopExit):
352	movlpd	(%ecx), %xmm0
353	movl	8(%ecx), %esi
354	movlpd	%xmm0, (%edx)
355	movl	%esi, 8(%edx)
356	POP	(%esi)
357	add	$12, %edx
358	add	$12, %ecx
/* Mask decode: pmovmskb yields 4 bits per dword.  al == 0 means the zero
   dword is the 3rd or 4th (ExitHigh); bit 0 set means it is the 1st
   (Exit4); otherwise it is the 2nd, so copy 8 bytes here and return the
   value kept in edi.  */
359	test	%al, %al
360	jz	L(ExitHigh)
361	test	$0x01, %al
362	jnz	L(Exit4)
363	movlpd	(%ecx), %xmm0
364	movlpd	%xmm0, (%edx)
365	movl	%edi, %eax
366	RETURN
367
/* Unwind bookkeeping only (no instruction): the code that follows runs with
   esi still pushed, so re-declare it after the POP above.  */
368	CFI_PUSH	(%esi)
369
/*
 * L(Shl8): reached when the destination is 16-byte aligned and the source
 * address is 8 modulo 16.  The source is read as aligned 16-byte chunks
 * (at ecx-8, ecx+8, ecx+24, ...); palignr $8 joins two neighbouring chunks
 * and extracts the 16 bytes that start at %ecx, which are stored aligned at
 * (%edx).  xmm1 and xmm3 alternate as "previous chunk", xmm2 is the newest
 * chunk, and xmm0 is zero before every pcmpeqd (a test that does not fire
 * leaves it all-zero).
 */
370	.p2align 4
371L(Shl8):
372	movaps	-8(%ecx), %xmm1
373	movaps	8(%ecx), %xmm2
/* L(Shl8Start) is also the re-entry point from the 64-byte loop below; it
   expects xmm1 = chunk at ecx-8 and xmm2 = chunk at ecx+8.  */
374L(Shl8Start):
375	pcmpeqd	%xmm2, %xmm0
376	pmovmskb %xmm0, %eax
377	movaps	%xmm2, %xmm3
378
379	test	%eax, %eax
380	jnz	L(Shl8LoopExit)
381
/* Unrolled steps: store the spliced 16 bytes, load the next aligned chunk,
   advance both pointers by 16 and test the new chunk for a zero dword.  */
382	palignr	$8, %xmm1, %xmm2
383	movaps	%xmm2, (%edx)
384	movaps	24(%ecx), %xmm2
385
386	pcmpeqd	%xmm2, %xmm0
387	lea	16(%edx), %edx
388	pmovmskb %xmm0, %eax
389	lea	16(%ecx), %ecx
390	movaps	%xmm2, %xmm1
391
392	test	%eax, %eax
393	jnz	L(Shl8LoopExit)
394
395	palignr	$8, %xmm3, %xmm2
396	movaps	%xmm2, (%edx)
397	movaps	24(%ecx), %xmm2
398
399	pcmpeqd	%xmm2, %xmm0
400	lea	16(%edx), %edx
401	pmovmskb %xmm0, %eax
402	lea	16(%ecx), %ecx
403	movaps	%xmm2, %xmm3
404
405	test	%eax, %eax
406	jnz	L(Shl8LoopExit)
407
408	palignr	$8, %xmm1, %xmm2
409	movaps	%xmm2, (%edx)
410	movaps	24(%ecx), %xmm2
411
412	pcmpeqd	%xmm2, %xmm0
413	lea	16(%edx), %edx
414	pmovmskb %xmm0, %eax
415	lea	16(%ecx), %ecx
416
417	test	%eax, %eax
418	jnz	L(Shl8LoopExit)
419
/* Store the last spliced block; ecx now becomes the address of the next
   aligned chunk that has not been loaded yet.  */
420	palignr	$8, %xmm3, %xmm2
421	movaps	%xmm2, (%edx)
422	lea	24(%ecx), %ecx
423	lea	16(%edx), %edx
424
/*
 * Round that chunk address down to a 64-byte boundary, move edx back by the
 * same number of bytes (eax), then bias ecx by -8 so that 8(%ecx) is the
 * 64-byte aligned chunk and -8(%ecx) is the aligned chunk just before it.
 */
425	mov	%ecx, %eax
426	and	$-0x40, %ecx
427	sub	%ecx, %eax
428	lea	-8(%ecx), %ecx
429	sub	%eax, %edx
430
431	movaps	-8(%ecx), %xmm1
432
/*
 * L(Shl8LoopStart): 64 bytes per iteration.  xmm2..xmm5 are the four new
 * aligned chunks and xmm1 the chunk before them.  xmm6/xmm7 build the
 * byte-wise minimum of the four chunks, which is compared dword-wise with
 * xmm0 (zero); afterwards xmm7 keeps an unshifted copy of the last chunk.
 */
433L(Shl8LoopStart):
434	movaps	8(%ecx), %xmm2
435	movaps	24(%ecx), %xmm3
436	movaps	%xmm3, %xmm6
437	movaps	40(%ecx), %xmm4
438	movaps	%xmm4, %xmm7
439	movaps	56(%ecx), %xmm5
440	pminub	%xmm2, %xmm6
441	pminub	%xmm5, %xmm7
442	pminub	%xmm6, %xmm7
443	pcmpeqd	%xmm0, %xmm7
444	pmovmskb %xmm7, %eax
445	movaps	%xmm5, %xmm7
446	palignr	$8, %xmm4, %xmm5
447	palignr	$8, %xmm3, %xmm4
/* Possible terminator (the byte-wise minimum may also form a zero dword out
   of bytes from different chunks): re-examine one chunk at a time from
   L(Shl8Start); xmm1 and xmm2 still hold what it expects.  */
448	test	%eax, %eax
449	jnz	L(Shl8Start)
450
/* No terminator: finish the remaining splices, store 64 bytes, advance both
   pointers and make the last chunk the new "previous" chunk.  */
451	palignr	$8, %xmm2, %xmm3
452	lea	64(%ecx), %ecx
453	palignr	$8, %xmm1, %xmm2
454	movaps	%xmm7, %xmm1
455	movaps	%xmm5, 48(%edx)
456	movaps	%xmm4, 32(%edx)
457	movaps	%xmm3, 16(%edx)
458	movaps	%xmm2, (%edx)
459	lea	64(%edx), %edx
460	jmp	L(Shl8LoopStart)
461
/*
 * L(Shl8LoopExit): the aligned chunk at 8(%ecx) contains a zero dword and
 * eax is its mask.  Copy the 8 bytes in front of it, restore esi, and step
 * both pointers over those 8 bytes so they address the chunk described by
 * eax.
 */
462L(Shl8LoopExit):
463	movlpd	(%ecx), %xmm0
464	movlpd	%xmm0, (%edx)
465	POP	(%esi)
466	add	$8, %edx
467	add	$8, %ecx
/* Mask decode: pmovmskb yields 4 bits per dword.  al == 0 means the zero
   dword is the 3rd or 4th (ExitHigh); bit 0 set means it is the 1st
   (Exit4); otherwise it is the 2nd, so copy 8 bytes here and return the
   value kept in edi.  */
468	test	%al, %al
469	jz	L(ExitHigh)
470	test	$0x01, %al
471	jnz	L(Exit4)
472	movlpd	(%ecx), %xmm0
473	movlpd	%xmm0, (%edx)
474	movl	%edi, %eax
475	RETURN
476
/* Unwind bookkeeping only (no instruction): the code that follows runs with
   esi still pushed, so re-declare it after the POP above.  */
477	CFI_PUSH	(%esi)
478
/*
 * L(Shl12): reached from the dispatcher for every source misalignment that
 * is not 0, 4 or 8; the load offsets below assume the source address is
 * 12 modulo 16 and the destination is 16-byte aligned.  The source is read
 * as aligned 16-byte chunks (at ecx-12, ecx+4, ecx+20, ...); palignr $12
 * joins two neighbouring chunks and extracts the 16 bytes that start at
 * %ecx, which are stored aligned at (%edx).  xmm1 and xmm3 alternate as
 * "previous chunk", xmm2 is the newest chunk, and xmm0 is zero before every
 * pcmpeqd (a test that does not fire leaves it all-zero).
 */
479	.p2align 4
480L(Shl12):
481	movaps	-12(%ecx), %xmm1
482	movaps	4(%ecx), %xmm2
/* L(Shl12Start) is also the re-entry point from the 64-byte loop below; it
   expects xmm1 = chunk at ecx-12 and xmm2 = chunk at ecx+4.  */
483L(Shl12Start):
484	pcmpeqd	%xmm2, %xmm0
485	pmovmskb %xmm0, %eax
486	movaps	%xmm2, %xmm3
487
488	test	%eax, %eax
489	jnz	L(Shl12LoopExit)
490
/* Unrolled steps: store the spliced 16 bytes, load the next aligned chunk,
   advance both pointers by 16 and test the new chunk for a zero dword.  */
491	palignr	$12, %xmm1, %xmm2
492	movaps	%xmm2, (%edx)
493	movaps	20(%ecx), %xmm2
494
495	pcmpeqd	%xmm2, %xmm0
496	lea	16(%edx), %edx
497	pmovmskb %xmm0, %eax
498	lea	16(%ecx), %ecx
499	movaps	%xmm2, %xmm1
500
501	test	%eax, %eax
502	jnz	L(Shl12LoopExit)
503
504	palignr	$12, %xmm3, %xmm2
505	movaps	%xmm2, (%edx)
506	movaps	20(%ecx), %xmm2
507
508	pcmpeqd	%xmm2, %xmm0
509	lea	16(%edx), %edx
510	pmovmskb %xmm0, %eax
511	lea	16(%ecx), %ecx
512	movaps	%xmm2, %xmm3
513
514	test	%eax, %eax
515	jnz	L(Shl12LoopExit)
516
517	palignr	$12, %xmm1, %xmm2
518	movaps	%xmm2, (%edx)
519	movaps	20(%ecx), %xmm2
520
521	pcmpeqd	%xmm2, %xmm0
522	lea	16(%edx), %edx
523	pmovmskb %xmm0, %eax
524	lea	16(%ecx), %ecx
525
526	test	%eax, %eax
527	jnz	L(Shl12LoopExit)
528
/* Store the last spliced block; ecx now becomes the address of the next
   aligned chunk that has not been loaded yet.  */
529	palignr	$12, %xmm3, %xmm2
530	movaps	%xmm2, (%edx)
531	lea	20(%ecx), %ecx
532	lea	16(%edx), %edx
533
/*
 * Round that chunk address down to a 64-byte boundary, move edx back by the
 * same number of bytes (eax), then bias ecx by -4 so that 4(%ecx) is the
 * 64-byte aligned chunk and -12(%ecx) is the aligned chunk just before it.
 */
534	mov	%ecx, %eax
535	and	$-0x40, %ecx
536	sub	%ecx, %eax
537	lea	-4(%ecx), %ecx
538	sub	%eax, %edx
539
540	movaps	-12(%ecx), %xmm1
541
/*
 * L(Shl12LoopStart): 64 bytes per iteration.  xmm2..xmm5 are the four new
 * aligned chunks and xmm1 the chunk before them.  xmm6/xmm7 build the
 * byte-wise minimum of the four chunks, which is compared dword-wise with
 * xmm0 (zero); afterwards xmm7 keeps an unshifted copy of the last chunk.
 */
542L(Shl12LoopStart):
543	movaps	4(%ecx), %xmm2
544	movaps	20(%ecx), %xmm3
545	movaps	%xmm3, %xmm6
546	movaps	36(%ecx), %xmm4
547	movaps	%xmm4, %xmm7
548	movaps	52(%ecx), %xmm5
549	pminub	%xmm2, %xmm6
550	pminub	%xmm5, %xmm7
551	pminub	%xmm6, %xmm7
552	pcmpeqd	%xmm0, %xmm7
553	pmovmskb %xmm7, %eax
554	movaps	%xmm5, %xmm7
555	palignr	$12, %xmm4, %xmm5
556	palignr	$12, %xmm3, %xmm4
/* Possible terminator (the byte-wise minimum may also form a zero dword out
   of bytes from different chunks): re-examine one chunk at a time from
   L(Shl12Start); xmm1 and xmm2 still hold what it expects.  */
557	test	%eax, %eax
558	jnz	L(Shl12Start)
559
/* No terminator: finish the remaining splices, store 64 bytes, advance both
   pointers and make the last chunk the new "previous" chunk.  */
560	palignr	$12, %xmm2, %xmm3
561	lea	64(%ecx), %ecx
562	palignr	$12, %xmm1, %xmm2
563	movaps	%xmm7, %xmm1
564	movaps	%xmm5, 48(%edx)
565	movaps	%xmm4, 32(%edx)
566	movaps	%xmm3, 16(%edx)
567	movaps	%xmm2, (%edx)
568	lea	64(%edx), %edx
569	jmp	L(Shl12LoopStart)
570
/*
 * L(Shl12LoopExit): the aligned chunk at 4(%ecx) contains a zero dword and
 * eax is its mask.  Copy the single dword in front of it (esi is scratch),
 * then set esi = 4 and fall through into L(CopyFrom1To16Bytes), which adds
 * esi to both pointers before decoding eax.
 */
571L(Shl12LoopExit):
572	movl	(%ecx), %esi
573	movl	%esi, (%edx)
574	mov	$4, %esi
575
/*
 * L(CopyFrom1To16Bytes): shared tail, reached by jumps and by fall-through
 * from L(Shl12LoopExit) directly above.
 * In:  esi = byte offset to add to ecx (source) and edx (destination) so
 *            that they address the 16-byte chunk holding the terminator;
 *      eax = pmovmskb result of the dword-wise compare of that chunk with
 *            zero (4 mask bits per dword, lowest dword in the lowest bits);
 *      edi = value to return.
 * Copies the chunk up to and including the first zero dword, pops esi and
 * edi, and returns with eax = edi.
 */
576	.p2align 4
577L(CopyFrom1To16Bytes):
578	add	%esi, %edx
579	add	%esi, %ecx
580
581	POP	(%esi)
/* al covers dwords 0 and 1; if it is empty the zero dword is 2 or 3.  */
582	test	%al, %al
583	jz	L(ExitHigh)
/* Bit 0 set: dword 0 is the terminator.  */
584	test	$0x01, %al
585	jnz	L(Exit4)
/* Otherwise dword 1 is the terminator: copy 8 bytes.  */
586L(Exit8):
587	movlpd	(%ecx), %xmm0
588	movlpd	%xmm0, (%edx)
589	movl	%edi, %eax
590	RETURN
591
/* Zero dword is in the upper half: bit 0 of ah (mask bit 8) set means
   dword 2 (copy 12 bytes), otherwise dword 3 (copy all 16 bytes).  */
592	.p2align 4
593L(ExitHigh):
594	test	$0x01, %ah
595	jnz	L(Exit12)
596L(Exit16):
597	movdqu	(%ecx), %xmm0
598	movdqu	%xmm0, (%edx)
599	movl	%edi, %eax
600	RETURN
601
/* Terminator is the first dword: copy 4 bytes (eax is scratch until it is
   loaded with the return value).  */
602	.p2align 4
603L(Exit4):
604	movl	(%ecx), %eax
605	movl	%eax, (%edx)
606	movl	%edi, %eax
607	RETURN
608
/* Terminator is the third dword: copy 8 + 4 bytes.  */
609	.p2align 4
610L(Exit12):
611	movlpd	(%ecx), %xmm0
612	movlpd	%xmm0, (%edx)
613	movl	8(%ecx), %eax
614	movl	%eax, 8(%edx)
615	movl	%edi, %eax
616	RETURN
617
/* Unwind bookkeeping only (no instruction): cancels the CFI_PUSH (%edi)
   emitted by the last RETURN, because the code that follows runs with
   nothing pushed.  */
618CFI_POP	(%edi)
619
/*
 * L(ExitTail4/8/12/16): exits for strings whose terminator is within the
 * first four dwords.  They are branched to from the entry checks before any
 * register is pushed, so they end in a plain ret.  Each copies the stated
 * number of bytes from (%ecx) to (%edx), terminator included, and returns
 * the destination pointer from %edx in %eax.
 */
620	.p2align 4
621L(ExitTail4):
/* eax is scratch for the 4-byte copy before it receives the return value.  */
622	movl	(%ecx), %eax
623	movl	%eax, (%edx)
624	movl	%edx, %eax
625	ret
626
627	.p2align 4
628L(ExitTail8):
629	movlpd	(%ecx), %xmm0
630	movlpd	%xmm0, (%edx)
631	movl	%edx, %eax
632	ret
633
634	.p2align 4
635L(ExitTail12):
/* 8 bytes through xmm0, then the remaining 4 through eax.  */
636	movlpd	(%ecx), %xmm0
637	movlpd	%xmm0, (%edx)
638	movl	8(%ecx), %eax
639	movl	%eax, 8(%edx)
640	movl	%edx, %eax
641	ret
642
643	.p2align 4
644L(ExitTail16):
645	movdqu	(%ecx), %xmm0
646	movdqu	%xmm0, (%edx)
647	movl	%edx, %eax
648	ret
649
/* The function is only closed here in the stand-alone build, matching the
   ENTRY that is emitted under the same condition.  */
650#ifndef USE_AS_WCSCAT
651END (wcscpy_ssse3)
652#endif
653