/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29
30#include <private/bionic_asm.h>
31
32        .text
33        .syntax unified
34        .fpu    neon
35
36#define CACHE_LINE_SIZE         (64)
37#define MEMCPY_BLOCK_SIZE_SMALL (32768)
38#define MEMCPY_BLOCK_SIZE_MID   (1048576)
39#define PREFETCH_DISTANCE_NEAR  (CACHE_LINE_SIZE*4)
40#define PREFETCH_DISTANCE_MID   (CACHE_LINE_SIZE*4)
41#define PREFETCH_DISTANCE_FAR   (CACHE_LINE_SIZE*16)
42
43ENTRY(memmove_a15)
44        cmp         r2, #0
45        cmpne       r0, r1
46        bxeq        lr
47        subs        r3, r0, r1
48        bls         .L_jump_to_memcpy
49        cmp         r2, r3
50        bhi         .L_reversed_memcpy
51
52.L_jump_to_memcpy:
53        b           __memcpy
54
55.L_reversed_memcpy:
56        push        {r0, lr}
57        .cfi_def_cfa_offset 8
58        .cfi_rel_offset r0, 0
59        .cfi_rel_offset lr, 4
60
61        add         r0, r0, r2
62        add         r1, r1, r2
63
64        /* preload next cache line */
65        pld         [r1, #-CACHE_LINE_SIZE]
66        pld         [r1, #-CACHE_LINE_SIZE*2]
67
68.L_reversed_memcpy_align_dest:
69        /* Deal with very small blocks (< 32bytes) asap */
70        cmp         r2, #32
71        blo         .L_reversed_memcpy_lt_32bytes
72        /* no need to align if len < 128 bytes */
73        cmp         r2, #128
74        blo         .L_reversed_memcpy_lt_128bytes
75        /* align destination to 64 bytes (1 cache line) */
76        ands        r3, r0, #0x3f
77        beq         .L_reversed_memcpy_dispatch
78        sub         r2, r2, r3
790:      /* copy 1 byte */
80        movs        ip, r3, lsl #31
81        ldrbmi      ip, [r1, #-1]!
82        strbmi      ip, [r0, #-1]!
831:      /* copy 2 bytes */
84        ldrbcs      ip, [r1, #-1]!
85        strbcs      ip, [r0, #-1]!
86        ldrbcs      ip, [r1, #-1]!
87        strbcs      ip, [r0, #-1]!
882:      /* copy 4 bytes */
89        movs        ip, r3, lsl #29
90        bpl         3f
91        sub         r1, r1, #4
92        sub         r0, r0, #4
93        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
94        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
953:      /* copy 8 bytes */
96        bcc         4f
97        sub         r1, r1, #8
98        sub         r0, r0, #8
99        vld1.8      {d0}, [r1]
100        vst1.8      {d0}, [r0, :64]
1014:      /* copy 16 bytes */
102        movs        ip, r3, lsl #27
103        bpl         5f
104        sub         r1, r1, #16
105        sub         r0, r0, #16
106        vld1.8      {q0}, [r1]
107        vst1.8      {q0}, [r0, :128]
1085:      /* copy 32 bytes */
109        bcc         .L_reversed_memcpy_dispatch
110        sub         r1, r1, #32
111        sub         r0, r0, #32
112        vld1.8      {q0, q1}, [r1]
113        vst1.8      {q0, q1}, [r0, :256]
114
115.L_reversed_memcpy_dispatch:
116        /* preload more cache lines */
117        pld         [r1, #-CACHE_LINE_SIZE*3]
118        pld         [r1, #-CACHE_LINE_SIZE*4]
119
120        cmp         r2, #MEMCPY_BLOCK_SIZE_SMALL
121        blo         .L_reversed_memcpy_neon_pld_near
122        cmp         r2, #MEMCPY_BLOCK_SIZE_MID
123        blo         .L_reversed_memcpy_neon_pld_mid
124        b           .L_reversed_memcpy_neon_pld_far
125
126.L_reversed_memcpy_neon_pld_near:
127        /* less than 128 bytes? */
128        subs        r2, r2, #128
129        blo         1f
130        sub         r1, r1, #32
131        sub         r0, r0, #32
132        mov         r3, #-32
133        .align      4
1340:
135        /* copy 128 bytes in each loop */
136        subs        r2, r2, #128
137
138        /* preload to cache */
139        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
140        /* copy a cache line */
141        vld1.8      {q0, q1}, [r1], r3
142        vst1.8      {q0, q1}, [r0, :256], r3
143        vld1.8      {q0, q1}, [r1], r3
144        vst1.8      {q0, q1}, [r0, :256], r3
145
146        /* preload to cache */
147        pld         [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
148        /* copy a cache line */
149        vld1.8      {q0, q1}, [r1], r3
150        vst1.8      {q0, q1}, [r0, :256], r3
151        vld1.8      {q0, q1}, [r1], r3
152        vst1.8      {q0, q1}, [r0, :256], r3
153
154        bhs         0b
155        add         r1, r1, #32
156        add         r0, r0, #32
1571:
158        adds        r2, r2, #128
159        bne         .L_reversed_memcpy_lt_128bytes
160        pop         {r0, pc}
161
162.L_reversed_memcpy_neon_pld_mid:
163        subs        r2, r2, #128
164        sub         r1, r1, #32
165        sub         r0, r0, #32
166        mov         r3, #-32
167        .align      4
1680:
169        /* copy 128 bytes in each loop */
170        subs        r2, r2, #128
171
172        /* preload to cache */
173        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
174        /* copy a cache line */
175        vld1.8      {q0, q1}, [r1], r3
176        vst1.8      {q0, q1}, [r0, :256], r3
177        vld1.8      {q0, q1}, [r1], r3
178        vst1.8      {q0, q1}, [r0, :256], r3
179
180        /* preload to cache */
181        pld         [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
182        /* copy a cache line */
183        vld1.8      {q0, q1}, [r1], r3
184        vst1.8      {q0, q1}, [r0, :256], r3
185        vld1.8      {q0, q1}, [r1], r3
186        vst1.8      {q0, q1}, [r0, :256], r3
187
188        bhs         0b
189        add         r1, r1, #32
190        add         r0, r0, #32
1911:
192        adds        r2, r2, #128
193        bne         .L_reversed_memcpy_lt_128bytes
194        pop         {r0, pc}
195
196.L_reversed_memcpy_neon_pld_far:
197        sub         r2, r2, #128
198        sub         r0, r0, #128
199        sub         r1, r1, #128
200        .align      4
2010:
202        /* copy 128 bytes in each loop */
203        subs        r2, r2, #128
204
205        /* preload to cache */
206        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
207        pld         [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
208        /* read */
209        vld1.8      {q0, q1}, [r1]!
210        vld1.8      {q2, q3}, [r1]!
211        vld1.8      {q8, q9}, [r1]!
212        vld1.8      {q10, q11}, [r1]!
213        /* write */
214        vst1.8      {q0, q1}, [r0, :256]!
215        vst1.8      {q2, q3}, [r0, :256]!
216        vst1.8      {q8, q9}, [r0, :256]!
217        vst1.8      {q10, q11}, [r0, :256]!
218
219        sub         r0, r0, #256
220        sub         r1, r1, #256
221        bhs         0b
222        add         r0, r0, #128
223        add         r1, r1, #128
2241:
225        adds        r2, r2, #128
226        bne         .L_reversed_memcpy_lt_128bytes
227        pop         {r0, pc}
228
229.L_reversed_memcpy_lt_128bytes:
2306:      /* copy 64 bytes */
231        movs        ip, r2, lsl #26
232        bcc         5f
233        sub         r1, r1, #32
234        sub         r0, r0, #32
235        vld1.8      {q0, q1}, [r1]
236        vst1.8      {q0, q1}, [r0]
237        sub         r1, r1, #32
238        sub         r0, r0, #32
239        vld1.8      {q0, q1}, [r1]
240        vst1.8      {q0, q1}, [r0]
2415:      /* copy 32 bytes */
242        bpl         4f
243        sub         r1, r1, #32
244        sub         r0, r0, #32
245        vld1.8      {q0, q1}, [r1]
246        vst1.8      {q0, q1}, [r0]
247.L_reversed_memcpy_lt_32bytes:
2484:      /* copy 16 bytes */
249        movs        ip, r2, lsl #28
250        bcc         3f
251        sub         r1, r1, #16
252        sub         r0, r0, #16
253        vld1.8      {q0}, [r1]
254        vst1.8      {q0}, [r0]
2553:      /* copy 8 bytes */
256        bpl         2f
257        sub         r1, r1, #8
258        sub         r0, r0, #8
259        vld1.8      {d0}, [r1]
260        vst1.8      {d0}, [r0]
2612:      /* copy 4 bytes */
262        ands        ip, r2, #0x4
263        beq         1f
264        sub         r1, r1, #4
265        sub         r0, r0, #4
266        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]
267        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]
2681:      /* copy 2 bytes */
269        movs        ip, r2, lsl #31
270        ldrbcs      ip, [r1, #-1]!
271        strbcs      ip, [r0, #-1]!
272        ldrbcs      ip, [r1, #-1]!
273        strbcs      ip, [r0, #-1]!
2740:      /* copy 1 byte */
275        ldrbmi      ip, [r1, #-1]!
276        strbmi      ip, [r0, #-1]!
277
278        pop         {r0, pc}
279
280END(memmove_a15)
281