1/*
2 * Copyright (C) 2017 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *  * Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 *  * Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in
14 *    the documentation and/or other materials provided with the
15 *    distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
24 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
25 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
27 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <private/bionic_asm.h>
32
33#define PLDOFFS (16)
34#define PLDSIZE (128) /* L2 cache line size */
35
36        .syntax unified
37
38        // To avoid warning about deprecated instructions, add an explicit
39        // arch. The code generated is exactly the same.
40        .arch armv7-a
41
42        .code 32
//-----------------------------------------------------------------------
// void* __memcpy_kryo(void* dst, const void* src, size_t n)
// ABI:   AAPCS (ARM 32-bit), ARM state (.code 32)
// In:    r0 = dst, r1 = src, r2 = n (byte count)
// Out:   r0 = original dst (saved on the stack at entry, restored at exit)
// Clobb: r1, r2, r3, r12, q0-q3, q8-q11, flags
//
// Strategy: dispatch on size, then copy in 128-byte NEON chunks with a
// software prefetch (PLD) pipeline tuned for PLDOFFS=16 lines of
// PLDSIZE=128 bytes, falling through to 32-byte, 16-byte, and scalar
// tail copies. Assumes dst/src are word-aligned for the NEON paths
// (standard for this bionic variant) — overlapping regions are not
// supported (memcpy contract).
//-----------------------------------------------------------------------
ENTRY(__memcpy_kryo)
        push            {r0}                    // save dst so we can return it
        .cfi_def_cfa_offset 4
        .cfi_rel_offset r0, 0
        // Size dispatch: route small copies straight to the tail code.
        cmp             r2, #4
        blt             .Lneon_lt4              // n < 4:  byte/halfword tail only
        cmp             r2, #16
        blt             .Lneon_lt16             // n < 16: word-granularity tail
        cmp             r2, #32
        blt             .Lneon_16               // n < 32: one 16-byte NEON chunk + tail
        cmp              r2, #128
        blt              .Lneon_copy_32_a       // n < 128: 32-byte NEON loop
        /* Copy blocks of 128-bytes (word-aligned) at a time*/
        /* Code below is optimized for PLDSIZE=128 only */
        mov             r12, r2, lsr #7         // r12 = number of 128-byte blocks
        cmp             r12, #PLDOFFS
        ble             .Lneon_copy_128_loop_nopld  // too few blocks to amortize prefetch
        // Prefetch pipeline: run (blocks - PLDOFFS) iterations with PLD,
        // then drain the last PLDOFFS already-prefetched blocks without PLD.
        sub             r12, #PLDOFFS
        pld             [r1, #(PLDOFFS-1)*PLDSIZE]   // prime the prefetch stream
.Lneon_copy_128_loop_outer:
        // Prefetch both halves of the cache line PLDOFFS lines ahead.
        pld             [r1, #(PLDOFFS*PLDSIZE)]
        pld             [r1, #(PLDOFFS)*(PLDSIZE)+64]
        vld1.32         {q0, q1}, [r1]!         // load 128 bytes (8 q-registers)
        vld1.32         {q2, q3}, [r1]!
        vld1.32         {q8, q9}, [r1]!
        vld1.32         {q10, q11}, [r1]!
        subs            r12, r12, #1            // NEON ld/st do not touch flags
        vst1.32         {q0, q1}, [r0]!         // store 128 bytes
        vst1.32         {q2, q3}, [r0]!
        vst1.32         {q8, q9}, [r0]!
        vst1.32         {q10, q11}, [r0]!
        bne             .Lneon_copy_128_loop_outer
        mov             r12, #PLDOFFS           // drain the PLDOFFS prefetched blocks
.Lneon_copy_128_loop_nopld:
        vld1.32         {q0, q1}, [r1]!
        vld1.32         {q2, q3}, [r1]!
        vld1.32         {q8, q9}, [r1]!
        vld1.32         {q10, q11}, [r1]!
        subs            r12, r12, #1
        vst1.32         {q0, q1}, [r0]!
        vst1.32         {q2, q3}, [r0]!
        vst1.32         {q8, q9}, [r0]!
        vst1.32         {q10, q11}, [r0]!
        bne             .Lneon_copy_128_loop_nopld
        ands            r2, r2, #0x7f           // r2 = remaining bytes (< 128)
        beq             .Lneon_exit
        cmp             r2, #32
        blt             .Lneon_16
        nop                                     // padding; presumably aligns the
                                                // loop entry — TODO confirm
        /* Copy blocks of 32-bytes (word aligned) at a time*/
.Lneon_copy_32_a:
        mov             r12, r2, lsr #5         // r12 = number of 32-byte blocks
.Lneon_copy_32_loop_a:
        vld1.32         {q0,q1}, [r1]!
        subs            r12, r12, #1
        vst1.32         {q0,q1}, [r0]!
        bne             .Lneon_copy_32_loop_a
        ands            r2, r2, #0x1f           // r2 = remaining bytes (< 32)
        beq             .Lneon_exit
.Lneon_16:
        subs            r2, r2, #16
        blt             .Lneon_lt16             // fewer than 16 bytes: restore count
                                                // semantics via the shifted-flag tail
        vld1.32         {q8}, [r1]!             // copy one 16-byte chunk
        vst1.32         {q8}, [r0]!
        beq             .Lneon_exit             // flags still from subs: exactly 16 left
.Lneon_lt16:
        // Tail trick: shift the low size bits into the flags so each
        // remaining power-of-two chunk costs one conditional branch.
        // lsl #29 puts bit 3 of r2 into C and bit 2 into N.
        movs            r12, r2, lsl #29
        bcc             .Lneon_skip8            // bit 3 clear: no 8-byte chunk
        ldr             r3, [r1], #4            // copy 8 bytes as two words
        ldr             r12, [r1], #4
        str             r3, [r0], #4
        str             r12, [r0], #4
.Lneon_skip8:
        bpl             .Lneon_lt4              // bit 2 clear (N=0): no 4-byte chunk
                                                // (ldr/str above preserve flags)
        ldr             r3, [r1], #4
        str             r3, [r0], #4
.Lneon_lt4:
        // lsl #31 puts bit 1 of r2 into C and bit 0 into N.
        movs            r2, r2, lsl #31
        bcc             .Lneon_lt2              // bit 1 clear: no halfword
        ldrh            r3, [r1], #2
        strh            r3, [r0], #2
.Lneon_lt2:
        bpl             .Lneon_exit             // bit 0 clear (N=0): no final byte
        ldrb            r12, [r1]
        strb            r12, [r0]
.Lneon_exit:
        pop             {r0}                    // return the original dst pointer
        bx              lr

END(__memcpy_kryo)
133