/*
 * Copyright (C) 2012,2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Target: AArch64 (A64 Advanced SIMD), GNU assembler syntax, run through the
 * C preprocessor (the ENTRY/END helpers below are cpp macros).
 */

#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * Register arguments:
 *      x0 = dst
 *      x1 = y0 base pointer
 *      x2 = y1 base pointer
 *      x3 = y2 base pointer
 *      x4 = coeffs   (signed 16-bit; 32 bytes are read, lanes v0.h[0..7] and
 *                     v1.h[0] are used, three per row in row order)
 *      x5 = length / 2, i.e. the loop iteration count
 *
 * Per iteration: 16 bytes are read from each of the three row pointers,
 * 8 bytes are written to dst, and all four pointers advance by 8 bytes.
 * Horizontal taps are 4 bytes apart, so the data is treated as 4-channel
 * 8-bit pixels and each iteration produces two output pixels.  Output pixel
 * n uses input pixels n, n+1, n+2 of every row, i.e. the row pointers are
 * expected to address the left-most tap.
 *
 * A count of zero returns immediately without touching memory.
 * NOTE(review): the count is consumed as a full 64-bit register; confirm the
 * C prototype passes a 64-bit type (or that the upper half is always zero).
 *
 * Clobbers: x0-x6, v0-v7, flags.  v8-v15 are spilled and reloaded as 64-bit
 * (D) values, which is the callee-saved portion of those registers.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        /* The loop below is bottom-tested: a zero count would wrap around. */
        cbz         x5, 2f

        /* Reserve 64 bytes of stack and spill d8-d15 into it. */
        sub         x6, sp, #64
        sub         sp, sp, #64
        st1         {v8.1d-v11.1d}, [x6], #32
        st1         {v12.1d-v15.1d}, [x6]

        /* Load the coefficients into v0 and v1 */
        ld1         {v0.8h, v1.8h}, [x4]

        /* x4 is free now; reuse it as the per-iteration source stride (8 bytes) */
        mov         x4, #8

1:
        /* Load 16 bytes per row and post-increment each pointer by x4 = 8 */
        ld1         {v13.16b}, [x1], x4
        ld1         {v14.16b}, [x2], x4
        ld1         {v15.16b}, [x3], x4

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x1, x4]      // TODO: test this
//      prfm        PLDL1KEEP,[x2, x4]      // TODO: test this
//      prfm        PLDL1KEEP,[x3, x4]      // TODO: test this

        /* Zero-extend the 8-bit channels to 16 bits */
        uxtl        v2.8h, v13.8b
        uxtl2       v3.8h, v13.16b
        uxtl        v4.8h, v14.8b
        uxtl2       v5.8h, v14.16b
        uxtl        v6.8h, v15.8b
        uxtl2       v7.8h, v15.16b

/*
        Widened source pixels (4 x 16-bit lanes each), pixel 0..3 per row:
                row y0:  v2lo, v2hi, v3lo, v3hi
                row y1:  v4lo, v4hi, v5lo, v5hi
                row y2:  v6lo, v6hi, v7lo, v7hi

        v8 accumulates output pixel 0 (taps: pixels 0,1,2 of every row),
        v9 accumulates output pixel 1 (taps: pixels 1,2,3 of every row).
*/

        /* Row y0, coefficients v0.h[0..2] */
        smull       v8.4s, v2.4h, v0.h[0]
        smull2      v9.4s, v2.8h, v0.h[0]
        smlal2      v8.4s, v2.8h, v0.h[1]
        smlal       v9.4s, v3.4h, v0.h[1]
        smlal       v8.4s, v3.4h, v0.h[2]
        smlal2      v9.4s, v3.8h, v0.h[2]

        /* Row y1, coefficients v0.h[3..5] */
        smlal       v8.4s, v4.4h, v0.h[3]
        smlal2      v9.4s, v4.8h, v0.h[3]
        smlal2      v8.4s, v4.8h, v0.h[4]
        smlal       v9.4s, v5.4h, v0.h[4]
        smlal       v8.4s, v5.4h, v0.h[5]
        smlal2      v9.4s, v5.8h, v0.h[5]

        /* Row y2, coefficients v0.h[6..7] and v1.h[0] */
        smlal       v8.4s, v6.4h, v0.h[6]
        smlal2      v9.4s, v6.8h, v0.h[6]
        smlal2      v8.4s, v6.8h, v0.h[7]
        smlal       v9.4s, v7.4h, v0.h[7]
        smlal       v8.4s, v7.4h, v1.h[0]
        smlal2      v9.4s, v7.8h, v1.h[0]

        /*
         * Narrow 32 -> 16 bit with a truncating shift right by 8
         * (presumably the coefficients carry 8 fractional bits).
         */
        shrn        v8.4h, v8.4s, #8
        shrn2       v8.8h, v9.4s, #8

        /* Narrow 16 -> 8 bit with signed-to-unsigned saturation, then store */
        sqxtun      v8.8b, v8.8h
        st1         {v8.8b}, [x0], #8

        /* Are we done yet? */
        subs        x5, x5, #1
        bne         1b

        /* We're done: reload d8-d15 and release the stack space */
        ld1         {v8.1d-v11.1d}, [sp], #32
        ld1         {v12.1d-v15.1d}, [sp], #32
2:
        ret
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
 * rsdIntrinsicConvolve5x5_K
 *
 * Register arguments:
 *      x0 = dst
 *      x1 = y0 base pointer
 *      x2 = y1 base pointer
 *      x3 = y2 base pointer
 *      x4 = y3 base pointer
 *      x5 = y4 base pointer
 *      x6 = coeffs   (signed 16-bit; 56 bytes are read, lanes v0.h[0] through
 *                     v3.h[0] are used, five per row in row order)
 *      x7 = loop iteration count; every iteration emits 8 bytes (two 4-byte
 *           pixels), matching the "length / 2" convention of the 3x3 routine
 *
 * Per iteration: 24 bytes are read from each of the five row pointers,
 * 8 bytes are written to dst, and all six pointers advance by 8 bytes.
 * Output pixel n uses input pixels n .. n+4 of every row.
 *
 * A count of zero returns immediately without touching memory.
 * NOTE(review): the count is consumed as a full 64-bit register; confirm the
 * C prototype passes a 64-bit type (or that the upper half is always zero).
 *
 * Clobbers: x0-x8, v0-v5, flags.  v8-v15 are spilled and reloaded as 64-bit
 * (D) values, which is the callee-saved portion of those registers.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        /* The loop below is bottom-tested: a zero count would wrap around. */
        cbz         x7, 2f

        /* Reserve 64 bytes of stack and spill d8-d15 into it. */
        sub         x8, sp, #64
        sub         sp, sp, #64
        st1         {v8.1d-v11.1d}, [x8], #32
        st1         {v12.1d-v15.1d}, [x8]

        /* Load the coefficients into v0-v2 (full) and v3 (low half) */
        ld1         {v0.8h-v2.8h}, [x6], #48
        ld1         {v3.4h}, [x6], #8

        /* Bias added to every accumulator before the final narrowing shift */
        movi        v15.4s, #0x7f

        /* x6 is free now; reuse it as the per-iteration source stride (8 bytes) */
        mov         x6, #8

1:
        /* Load 24 bytes per row (three D registers) and post-increment by x6 = 8 */
        ld1         {v9.8b-v11.8b}, [x1], x6        // y0 ( y - 2 )
        ld1         {v12.8b-v14.8b}, [x2], x6       // y1 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x1, x6]      // TODO: test this
//      prfm        PLDL1KEEP,[x2, x6]      // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b

/*
        Widened source pixels (4 x 16-bit lanes each), pixel 0..5 per row:
                first row:   v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
                second row:  v12lo, v12hi, v13lo, v13hi, v14lo, v14hi

        v4 accumulates output pixel 0 (taps: pixels 0..4 of every row),
        v5 accumulates output pixel 1 (taps: pixels 1..5 of every row).
*/
        /* Row y0, coefficients v0.h[0..4] */
        smull       v4.4s, v9.4h, v0.h[0]
        smull2      v5.4s, v9.8h, v0.h[0]
        smlal2      v4.4s, v9.8h, v0.h[1]
        smlal       v5.4s, v10.4h, v0.h[1]
        smlal       v4.4s, v10.4h, v0.h[2]
        smlal2      v5.4s, v10.8h, v0.h[2]
        smlal2      v4.4s, v10.8h, v0.h[3]
        smlal       v5.4s, v11.4h, v0.h[3]
        smlal       v4.4s, v11.4h, v0.h[4]
        smlal2      v5.4s, v11.8h, v0.h[4]

        /* Row y1, coefficients v0.h[5..7] and v1.h[0..1] */
        smlal       v4.4s, v12.4h, v0.h[5]
        smlal2      v5.4s, v12.8h, v0.h[5]
        smlal2      v4.4s, v12.8h, v0.h[6]
        smlal       v5.4s, v13.4h, v0.h[6]
        smlal       v4.4s, v13.4h, v0.h[7]
        smlal2      v5.4s, v13.8h, v0.h[7]
        smlal2      v4.4s, v13.8h, v1.h[0]
        smlal       v5.4s, v14.4h, v1.h[0]
        smlal       v4.4s, v14.4h, v1.h[1]
        smlal2      v5.4s, v14.8h, v1.h[1]

        /* Next 2 rows */
        /* Load 24 bytes per row (three D registers) and post-increment by x6 = 8 */
        ld1         {v9.8b-v11.8b}, [x3], x6        // y2 ( y )
        ld1         {v12.8b-v14.8b}, [x4], x6       // y3 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x3, x6]      // TODO: test this
//      prfm        PLDL1KEEP,[x4, x6]      // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        uxtl        v12.8h, v12.8b
        uxtl        v13.8h, v13.8b
        uxtl        v14.8h, v14.8b

        /* Same register layout as for the first two rows */

        /* Row y2, coefficients v1.h[2..6] */
        smlal       v4.4s, v9.4h, v1.h[2]
        smlal2      v5.4s, v9.8h, v1.h[2]
        smlal2      v4.4s, v9.8h, v1.h[3]
        smlal       v5.4s, v10.4h, v1.h[3]
        smlal       v4.4s, v10.4h, v1.h[4]
        smlal2      v5.4s, v10.8h, v1.h[4]
        smlal2      v4.4s, v10.8h, v1.h[5]
        smlal       v5.4s, v11.4h, v1.h[5]
        smlal       v4.4s, v11.4h, v1.h[6]
        smlal2      v5.4s, v11.8h, v1.h[6]

        /* Row y3, coefficients v1.h[7] and v2.h[0..3] */
        smlal       v4.4s, v12.4h, v1.h[7]
        smlal2      v5.4s, v12.8h, v1.h[7]
        smlal2      v4.4s, v12.8h, v2.h[0]
        smlal       v5.4s, v13.4h, v2.h[0]
        smlal       v4.4s, v13.4h, v2.h[1]
        smlal2      v5.4s, v13.8h, v2.h[1]
        smlal2      v4.4s, v13.8h, v2.h[2]
        smlal       v5.4s, v14.4h, v2.h[2]
        smlal       v4.4s, v14.4h, v2.h[3]
        smlal2      v5.4s, v14.8h, v2.h[3]

        /* Last row */
        /* Load 24 bytes (three D registers) and post-increment by x6 = 8 */
        ld1         {v9.8b-v11.8b}, [x5], x6        // y4 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm        PLDL1KEEP,[x5, x6]      // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b

        /* Pixels 0..5 of the last row: v9lo, v9hi, v10lo, v10hi, v11lo, v11hi */

        /* Row y4, coefficients v2.h[4..7] and v3.h[0] */
        smlal       v4.4s, v9.4h, v2.h[4]
        smlal2      v5.4s, v9.8h, v2.h[4]
        smlal2      v4.4s, v9.8h, v2.h[5]
        smlal       v5.4s, v10.4h, v2.h[5]
        smlal       v4.4s, v10.4h, v2.h[6]
        smlal2      v5.4s, v10.8h, v2.h[6]
        smlal2      v4.4s, v10.8h, v2.h[7]
        smlal       v5.4s, v11.4h, v2.h[7]
        smlal       v4.4s, v11.4h, v3.h[0]
        smlal2      v5.4s, v11.8h, v3.h[0]

        /*
         * Add the 0x7f bias held in v15.
         * NOTE(review): rshrn below is itself a rounding shift, so the bias
         * and the rounding constant are both applied; this differs from the
         * plain truncating shrn of the 3x3 routine.  Left as-is because
         * changing it would change the output - confirm this is intended.
         */
        add         v4.4s, v4.4s, v15.4s
        add         v5.4s, v5.4s, v15.4s

        /* Narrow 32 -> 16 bit with a rounding shift right by 8 */
        rshrn       v4.4h, v4.4s, #8
        rshrn2      v4.8h, v5.4s, #8

        /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        sqxtun      v4.8b, v4.8h

        st1         {v4.8b}, [x0], #8       // store the two output pixels and advance dst

        /* Are we done? */
        subs        x7, x7, #1
        bne         1b

        /* Yup: reload d8-d15 and release the stack space */
        ld1         {v8.1d-v11.1d}, [sp], #32
        ld1         {v12.1d-v15.1d}, [sp], #32
2:
        ret

END(rsdIntrinsicConvolve5x5_K)