/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

//#define ARCH_ARM64_USE_BLUR_PRELOAD

/* Number of fractional bits to preserve in intermediate results.  The
 * intermediate storage is 16-bit, and we started with 8-bit data (the
 * integer part), so this should be between 0 and 8.
 */
.set FRACTION_BITS, 7
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 */
.macro ifcc zzz:vararg
.if cc
        \zzz
.endif
.endm

/* It's not always clear that prefetching is beneficial and this needs
 * further testing on different cores, so it's made switchable here.
 */
#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
#define VERTPLD(...) prfm PLDL1KEEP, [__VA_ARGS__]
#else
#define VERTPLD(...) nop
#endif
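
/* A hedged note on the fixed-point pipeline (illustration only, not part of
 * the build): pixels enter as u8, the vertical pass accumulates u8 x u16
 * coefficient products in 32 bits and narrows to a u16 intermediate keeping
 * FRACTION_BITS of fraction, and the horizontal pass accumulates u16 x u16
 * products in 32 bits before narrowing back to u8.  The `uqrshrn`
 * instructions below perform the equivalent of this C sketch (saturating
 * rounding right-shifts; the coefficient scaling itself is produced by the
 * caller and is an assumption here):
 *
 *      static inline uint16_t narrow_vert(uint64_t vsum) {
 *          uint64_t v = (vsum + (1u << (15 - 7))) >> (16 - 7); // 7 == FRACTION_BITS
 *          return v > 0xffff ? 0xffff : (uint16_t)v;
 *      }
 *      static inline uint8_t narrow_horz(uint64_t hsum) {
 *          uint64_t h = (hsum + (1u << 15)) >> 16;
 *          if (h > 0xffff) h = 0xffff;
 *          h = (h + (1u << (7 - 1))) >> 7;                     // 7 == FRACTION_BITS
 *          return h > 0xff ? 0xff : (uint8_t)h;
 *      }
 */
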
/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top
 * or bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch
 * directly into the relevant part of the code for an arbitrary convolution
 * radius.  Two variants of the loop are produced; one eliminates the
 * clamping code for a slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup (r, unless clipped to top of source image)
 *      x7 -- rdn (r, unless clipped to bottom of source image)
 *      x12 -- switch index
 *      v0-v3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      v10,v11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      v12-v15 = temporary sums
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

        ld1      {v15.16b}, [x1], #16
        mov      x10, x15

        uxtl     v14.8h, v15.8b
        VERTPLD(x1, #16)
        uxtl2    v15.8h, v15.16b
  .if \max_r < 16 // approximate
        ifcc adr    \reg, 1f
  .else
        ifcc adrp   \reg, 1f
        ifcc add    \reg, \reg, #:lo12:1f
  .endif

        umull    v12.4s, v14.4h, v0.h[0]
        ifcc sub    \reg, \reg, x5, LSL #6
        umull2   v13.4s, v14.8h, v0.h[0]
        mov      x11, x19
        umull    v14.4s, v15.4h, v0.h[0]
        ifcc add    \reg, \reg, x5, LSL #3
        umull2   v15.4s, v15.8h, v0.h[0]
        br       \reg

  /* This version of the vertical fetch loop body is used away from the
   * edges of the source image.  The pointers start at the top and bottom
   * source rows and work their way towards the centre on each iteration.
   * This way the number of taps used can be controlled by jumping directly
   * into the middle of the loop and running to completion.
   * If the loop body changes size then the code which calculates the
   * address of the initial iteration must be updated accordingly.
   */
  .macro vertfetch_noclamp i, dreg
    .if 0 < \i && \i <= \max_r
        ld1      {v10.16b}, [x10], x2
        ld1      {v11.16b}, [x11], x13
        uaddl    v16.8h, v10.8b, v11.8b
        uaddl2   v11.8h, v10.16b, v11.16b
        umlal    v12.4s, v16.4h, \dreg
        umlal2   v13.4s, v16.8h, \dreg
        VERTPLD(x10, #32)
        umlal    v14.4s, v11.4h, \dreg
        VERTPLD(x11, #32)
        umlal2   v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* This version of the vertical fetch loop body is used near the edges of
   * the source image, where one or both of the accesses may start with a
   * clamped value, and the row addresses only begin to change after some
   * number of iterations before the end.
   * If the loop body changes size then the code which calculates the
   * address of the initial iteration must be updated accordingly.
   */
  .macro vertfetch_clamped i, dreg
    .if 0 < \i && \i <= \max_r
        ld1      {v10.16b}, [x10], x2
        cmp      x6, #\i
        ld1      {v11.16b}, [x11], x13
        csel     x10, x15, x10, lo
        uaddl    v16.8h, v10.8b, v11.8b
        cmp      x7, #\i
        uaddl2   v11.8h, v10.16b, v11.16b
        csel     x11, x19, x11, lo
        umlal    v12.4s, v16.4h, \dreg
        umlal2   v13.4s, v16.8h, \dreg
        VERTPLD(x10, #32)
        umlal    v14.4s, v11.4h, \dreg
        VERTPLD(x11, #32)
        umlal2   v15.4s, v11.8h, \dreg
    .endif
  .endm
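
  /* Illustrative sketch (not part of the build): per column, the two loop
   * bodies above implement the symmetric vertical convolution below; the
   * unrolled taps run from the outermost pair inwards so that `br \reg` can
   * land on iteration i == r directly.  `vconv_column` is a hypothetical
   * name for this sketch:
   *
   *      static uint32_t vconv_column(const uint8_t *col, ptrdiff_t pitch,
   *                                   const uint16_t *coeff, int r) {
   *          const uint8_t *top = col - (ptrdiff_t)r * pitch;
   *          const uint8_t *bot = col + (ptrdiff_t)r * pitch;
   *          uint32_t sum = (uint32_t)coeff[0] * col[0];  // centre tap
   *          for (int i = r; i > 0; i--) {
   *              sum += (uint32_t)coeff[i] * ((uint32_t)top[0] + bot[0]);
   *              top += pitch;   // ld1 {v10.16b}, [x10], x2
   *              bot -= pitch;   // ld1 {v11.16b}, [x11], x13
   *          }
   *          return sum;
   *      }
   */
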
  /* Entry into this unrolled loop is computed as a negative index from
   * \labelc at the end of the block.
   */
  .align 4
  vertfetch_clamped 27, v3.h[3]
  vertfetch_clamped 26, v3.h[2]
  vertfetch_clamped 25, v3.h[1]
  vertfetch_clamped 24, v3.h[0]
  vertfetch_clamped 23, v2.h[7]
  vertfetch_clamped 22, v2.h[6]
  vertfetch_clamped 21, v2.h[5]
  vertfetch_clamped 20, v2.h[4]
  vertfetch_clamped 19, v2.h[3]
  vertfetch_clamped 18, v2.h[2]
  vertfetch_clamped 17, v2.h[1]
  vertfetch_clamped 16, v2.h[0]
  vertfetch_clamped 15, v1.h[7]
  vertfetch_clamped 14, v1.h[6]
  vertfetch_clamped 13, v1.h[5]
  vertfetch_clamped 12, v1.h[4]
  vertfetch_clamped 11, v1.h[3]
  vertfetch_clamped 10, v1.h[2]
  vertfetch_clamped  9, v1.h[1]
  vertfetch_clamped  8, v1.h[0]
  vertfetch_clamped  7, v0.h[7]
  vertfetch_clamped  6, v0.h[6]
  vertfetch_clamped  5, v0.h[5]
  vertfetch_clamped  4, v0.h[4]
  vertfetch_clamped  3, v0.h[3]
  vertfetch_clamped  2, v0.h[2]
  vertfetch_clamped  1, v0.h[1]
  vertfetch_clamped  0, v0.h[0]
  1:
  \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelnc at the end of the block.
   */
  .align 4
  vertfetch_noclamp 27, v3.h[3]
  vertfetch_noclamp 26, v3.h[2]
  vertfetch_noclamp 25, v3.h[1]
  vertfetch_noclamp 24, v3.h[0]
  vertfetch_noclamp 23, v2.h[7]
  vertfetch_noclamp 22, v2.h[6]
  vertfetch_noclamp 21, v2.h[5]
  vertfetch_noclamp 20, v2.h[4]
  vertfetch_noclamp 19, v2.h[3]
  vertfetch_noclamp 18, v2.h[2]
  vertfetch_noclamp 17, v2.h[1]
  vertfetch_noclamp 16, v2.h[0]
  vertfetch_noclamp 15, v1.h[7]
  vertfetch_noclamp 14, v1.h[6]
  vertfetch_noclamp 13, v1.h[5]
  vertfetch_noclamp 12, v1.h[4]
  vertfetch_noclamp 11, v1.h[3]
  vertfetch_noclamp 10, v1.h[2]
  vertfetch_noclamp  9, v1.h[1]
  vertfetch_noclamp  8, v1.h[0]
  vertfetch_noclamp  7, v0.h[7]
  vertfetch_noclamp  6, v0.h[6]
  vertfetch_noclamp  5, v0.h[5]
  vertfetch_noclamp  4, v0.h[4]
  vertfetch_noclamp  3, v0.h[3]
  vertfetch_noclamp  2, v0.h[2]
  vertfetch_noclamp  1, v0.h[1]
  vertfetch_noclamp  0, v0.h[0]
  \labelnc :

  .purgem vertfetch_clamped
  .purgem vertfetch_noclamp

  2:    uqrshrn  v10.4h, v12.4s, #16 - FRACTION_BITS
        add      x15, x15, #16
        uqrshrn2 v10.8h, v13.4s, #16 - FRACTION_BITS
        add      x19, x19, #16
        uqrshrn  v11.4h, v14.4s, #16 - FRACTION_BITS
        uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/
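
/* Sketch of the edge handling in vertfetch_clamped above (illustration
 * only): while the remaining tap index exceeds rup or rdn, the respective
 * row pointer is reset to the edge row, so edge pixels are re-read rather
 * than reading out of bounds.  In hypothetical C:
 *
 *      static void clamp_rows(const uint8_t **top, const uint8_t **bot,
 *                             const uint8_t *top_row,
 *                             const uint8_t *bot_row,
 *                             int i, int rup, int rdn) {
 *          if (rup < i) *top = top_row; // cmp x6, #i ; csel x10, x15, x10, lo
 *          if (rdn < i) *bot = bot_row; // cmp x7, #i ; csel x11, x19, x11, lo
 *      }
 */
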
/* Some portion of the convolution window (as much as will fit, and all of
 * it for the uchar1 cases) is kept in the register file to avoid
 * unnecessary memory accesses.  This forces the horizontal loops to be
 * unrolled because there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside,
 * so that jumping into the middle of the block bypasses the unwanted window
 * taps.
 *
 * There are several variants of the macro because of the fixed offsets of
 * the taps -- the wider the maximum radius the further the centre tap is
 * from the most recently fetched data.  This means that pre-filling the
 * window requires more data that won't be used and it means that rotating
 * the window involves more mov operations.
 *
 * When the window gets too big the buffer at [x9] is used.
 *
 * Input:
 *      v16-v31,v4-v11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer
 *      v12-v13 -- temporaries for load and vext operations.
 *      v14-v15 -- intermediate sums
 */
#define TUNED_LIST1 8, 16
.macro hconv1_8 /*{{{*/

.rodata
200:    .hword -4
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .hword 107f-100f
        .hword 108f-100f
        .align 4
.text
        umlal    v14.4s, v9.4h, v0.h[0]
        umull    v14.4s, v9.4h, v0.h[0]
        umull2   v15.4s, v9.8h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
108:    umlal    v14.4s, v8.4h, v1.h[0]
        umlal2   v15.4s, v8.8h, v1.h[0]
        umlal    v14.4s, v10.4h, v1.h[0]
        umlal2   v15.4s, v10.8h, v1.h[0]
107:    ext      v12.16b, v8.16b, v9.16b, #1*2
        ext      v13.16b, v9.16b, v10.16b, #7*2
        umlal    v14.4s, v12.4h, v0.h[7]
        umlal2   v15.4s, v12.8h, v0.h[7]
        umlal    v14.4s, v13.4h, v0.h[7]
        umlal2   v15.4s, v13.8h, v0.h[7]
106:    ext      v12.16b, v8.16b, v9.16b, #2*2
        ext      v13.16b, v9.16b, v10.16b, #6*2
        umlal    v14.4s, v12.4h, v0.h[6]
        umlal2   v15.4s, v12.8h, v0.h[6]
        umlal    v14.4s, v13.4h, v0.h[6]
        umlal2   v15.4s, v13.8h, v0.h[6]
105:    ext      v12.16b, v8.16b, v9.16b, #3*2
        ext      v13.16b, v9.16b, v10.16b, #5*2
        umlal    v14.4s, v12.4h, v0.h[5]
        umlal2   v15.4s, v12.8h, v0.h[5]
        umlal    v14.4s, v13.4h, v0.h[5]
        umlal2   v15.4s, v13.8h, v0.h[5]
104:    //ext    v12.16b, v8.16b, v9.16b, #4*2
        //ext    v13.16b, v9.16b, v10.16b, #4*2
        umlal2   v14.4s, v8.8h, v0.h[4]
        umlal    v15.4s, v9.4h, v0.h[4]
        umlal2   v14.4s, v9.8h, v0.h[4]
        umlal    v15.4s, v10.4h, v0.h[4]
103:    ext      v12.16b, v8.16b, v9.16b, #5*2
        ext      v13.16b, v9.16b, v10.16b, #3*2
        umlal    v14.4s, v12.4h, v0.h[3]
        umlal2   v15.4s, v12.8h, v0.h[3]
        umlal    v14.4s, v13.4h, v0.h[3]
        umlal2   v15.4s, v13.8h, v0.h[3]
102:    ext      v12.16b, v8.16b, v9.16b, #6*2
        ext      v13.16b, v9.16b, v10.16b, #2*2
        umlal    v14.4s, v12.4h, v0.h[2]
        umlal2   v15.4s, v12.8h, v0.h[2]
        umlal    v14.4s, v13.4h, v0.h[2]
        umlal2   v15.4s, v13.8h, v0.h[2]
101:    ext      v12.16b, v8.16b, v9.16b, #7*2
        ext      v13.16b, v9.16b, v10.16b, #1*2
        umlal    v14.4s, v12.4h, v0.h[1]
        umlal2   v15.4s, v12.8h, v0.h[1]
        umlal    v14.4s, v13.4h, v0.h[1]
        umlal2   v15.4s, v13.8h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/
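
/* Illustrative sketch (not part of the build): the `ext` pairs in the
 * hconv1_* macros select the x-i and x+i neighbours of each output column
 * from the 16-bit window, so per column the horizontal pass computes the
 * loop below.  `win` and `hconv1_column` are hypothetical names:
 *
 *      static uint32_t hconv1_column(const uint16_t *win, int x,
 *                                    const uint16_t *coeff, int r) {
 *          uint32_t sum = (uint32_t)coeff[0] * win[x];
 *          for (int i = r; i > 0; i--)     // unrolled; entered via 200b
 *              sum += (uint32_t)coeff[i]
 *                   * ((uint32_t)win[x - i] + win[x + i]);
 *          return sum;
 *      }
 */
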
.macro hconv1_16 /*{{{*/
.rodata
200:    .hword -4
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .hword 107f-100f
        .hword 108f-100f
        .hword 109f-100f
        .hword 110f-100f
        .hword 111f-100f
        .hword 112f-100f
        .hword 113f-100f
        .hword 114f-100f
        .hword 115f-100f
        .hword 116f-100f
        .align 4

.text
        umull    v14.4s, v8.4h, v0.h[0]
        umull2   v15.4s, v8.8h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
116:    //ext    v12.16b, v6.16b, v7.16b, #0*2
        //ext    v13.16b, v10.16b, v11.16b, #0*2
        umlal    v14.4s, v6.4h, v2.h[0]
        umlal2   v15.4s, v6.8h, v2.h[0]
        umlal    v14.4s, v10.4h, v2.h[0]
        umlal2   v15.4s, v10.8h, v2.h[0]
115:    ext      v12.16b, v6.16b, v7.16b, #1*2
        ext      v13.16b, v9.16b, v10.16b, #7*2
        umlal    v14.4s, v12.4h, v1.h[7]
        umlal2   v15.4s, v12.8h, v1.h[7]
        umlal    v14.4s, v13.4h, v1.h[7]
        umlal2   v15.4s, v13.8h, v1.h[7]
114:    ext      v12.16b, v6.16b, v7.16b, #2*2
        ext      v13.16b, v9.16b, v10.16b, #6*2
        umlal    v14.4s, v12.4h, v1.h[6]
        umlal2   v15.4s, v12.8h, v1.h[6]
        umlal    v14.4s, v13.4h, v1.h[6]
        umlal2   v15.4s, v13.8h, v1.h[6]
113:    ext      v12.16b, v6.16b, v7.16b, #3*2
        ext      v13.16b, v9.16b, v10.16b, #5*2
        umlal    v14.4s, v12.4h, v1.h[5]
        umlal2   v15.4s, v12.8h, v1.h[5]
        umlal    v14.4s, v13.4h, v1.h[5]
        umlal2   v15.4s, v13.8h, v1.h[5]
112:    //ext    v12.16b, v6.16b, v7.16b, #4*2
        //ext    v13.16b, v9.16b, v10.16b, #4*2
        umlal2   v14.4s, v6.8h, v1.h[4]
        umlal    v15.4s, v7.4h, v1.h[4]
        umlal2   v14.4s, v9.8h, v1.h[4]
        umlal    v15.4s, v10.4h, v1.h[4]
111:    ext      v12.16b, v6.16b, v7.16b, #5*2
        ext      v13.16b, v9.16b, v10.16b, #3*2
        umlal    v14.4s, v12.4h, v1.h[3]
        umlal2   v15.4s, v12.8h, v1.h[3]
        umlal    v14.4s, v13.4h, v1.h[3]
        umlal2   v15.4s, v13.8h, v1.h[3]
110:    ext      v12.16b, v6.16b, v7.16b, #6*2
        ext      v13.16b, v9.16b, v10.16b, #2*2
        umlal    v14.4s, v12.4h, v1.h[2]
        umlal2   v15.4s, v12.8h, v1.h[2]
        umlal    v14.4s, v13.4h, v1.h[2]
        umlal2   v15.4s, v13.8h, v1.h[2]
109:    ext      v12.16b, v6.16b, v7.16b, #7*2
        ext      v13.16b, v9.16b, v10.16b, #1*2
        umlal    v14.4s, v12.4h, v1.h[1]
        umlal2   v15.4s, v12.8h, v1.h[1]
        umlal    v14.4s, v13.4h, v1.h[1]
        umlal2   v15.4s, v13.8h, v1.h[1]
108:    //ext    v12.16b, v7.16b, v8.16b, #0*2
        //ext    v13.16b, v9.16b, v10.16b, #0*2
        umlal    v14.4s, v7.4h, v1.h[0]
        umlal2   v15.4s, v7.8h, v1.h[0]
        umlal    v14.4s, v9.4h, v1.h[0]
        umlal2   v15.4s, v9.8h, v1.h[0]
107:    ext      v12.16b, v7.16b, v8.16b, #1*2
        ext      v13.16b, v8.16b, v9.16b, #7*2
        umlal    v14.4s, v12.4h, v0.h[7]
        umlal2   v15.4s, v12.8h, v0.h[7]
        umlal    v14.4s, v13.4h, v0.h[7]
        umlal2   v15.4s, v13.8h, v0.h[7]
106:    ext      v12.16b, v7.16b, v8.16b, #2*2
        ext      v13.16b, v8.16b, v9.16b, #6*2
        umlal    v14.4s, v12.4h, v0.h[6]
        umlal2   v15.4s, v12.8h, v0.h[6]
        umlal    v14.4s, v13.4h, v0.h[6]
        umlal2   v15.4s, v13.8h, v0.h[6]
105:    ext      v12.16b, v7.16b, v8.16b, #3*2
        ext      v13.16b, v8.16b, v9.16b, #5*2
        umlal    v14.4s, v12.4h, v0.h[5]
        umlal2   v15.4s, v12.8h, v0.h[5]
        umlal    v14.4s, v13.4h, v0.h[5]
        umlal2   v15.4s, v13.8h, v0.h[5]
104:    //ext    v12.16b, v7.16b, v8.16b, #4*2
        //ext    v13.16b, v8.16b, v9.16b, #4*2
        umlal2   v14.4s, v7.8h, v0.h[4]
        umlal    v15.4s, v8.4h, v0.h[4]
        umlal2   v14.4s, v8.8h, v0.h[4]
        umlal    v15.4s, v9.4h, v0.h[4]
103:    ext      v12.16b, v7.16b, v8.16b, #5*2
        ext      v13.16b, v8.16b, v9.16b, #3*2
        umlal    v14.4s, v12.4h, v0.h[3]
        umlal2   v15.4s, v12.8h, v0.h[3]
        umlal    v14.4s, v13.4h, v0.h[3]
        umlal2   v15.4s, v13.8h, v0.h[3]
102:    ext      v12.16b, v7.16b, v8.16b, #6*2
        ext      v13.16b, v8.16b, v9.16b, #2*2
        umlal    v14.4s, v12.4h, v0.h[2]
        umlal2   v15.4s, v12.8h, v0.h[2]
        umlal    v14.4s, v13.4h, v0.h[2]
        umlal2   v15.4s, v13.8h, v0.h[2]
101:    ext      v12.16b, v7.16b, v8.16b, #7*2
        ext      v13.16b, v8.16b, v9.16b, #1*2
        umlal    v14.4s, v12.4h, v0.h[1]
        umlal2   v15.4s, v12.8h, v0.h[1]
        umlal    v14.4s, v13.4h, v0.h[1]
        umlal2   v15.4s, v13.8h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        mov      v6.16b, v7.16b
        mov      v7.16b, v8.16b
        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/
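
/* A note on the dispatch sequence shared by all the hconv macros
 * (illustration only): the .rodata table at 200b holds signed halfword
 * offsets from the `100:` anchor to each numbered tap block, so the radius
 * in x5 is turned into a direct branch into the unrolled code.  In rough
 * GNU C (labels-as-values extension), the idea is:
 *
 *      int16_t delta = offsets[r];             // ldrsh x12, [x16, x5, LSL #1]
 *      void *entry = (char *)&&anchor + delta; // adr x16, 100f ; add x12, x12, x16
 *      goto *entry;                            // 100: br x12
 *
 * Only radii listed in the table are reachable; entry 0 (-4) appears to be
 * unused padding.
 */
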
.macro hconv1_25 /*{{{*/
.rodata
200:    .hword -4
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .hword 107f-100f
        .hword 108f-100f
        .hword 109f-100f
        .hword 110f-100f
        .hword 111f-100f
        .hword 112f-100f
        .hword 113f-100f
        .hword 114f-100f
        .hword 115f-100f
        .hword 116f-100f
        .hword 117f-100f
        .hword 118f-100f
        .hword 119f-100f
        .hword 120f-100f
        .hword 121f-100f
        .hword 122f-100f
        .hword 123f-100f
        .hword 124f-100f
        .hword 125f-100f
        .align 4
.text
        ext      v12.16b, v6.16b, v7.16b, #7*2
        umull    v14.4s, v12.4h, v0.h[0]
        umull2   v15.4s, v12.8h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
125:    ext      v12.16b, v31.16b, v4.16b, #6*2
        ext      v13.16b, v10.16b, v11.16b, #0*2
        umlal    v14.4s, v12.4h, v3.h[1]
        umlal2   v15.4s, v12.8h, v3.h[1]
        umlal    v14.4s, v13.4h, v3.h[1]
        umlal2   v15.4s, v13.8h, v3.h[1]
124:    ext      v12.16b, v31.16b, v4.16b, #7*2
        ext      v13.16b, v9.16b, v10.16b, #7*2
        umlal    v14.4s, v12.4h, v3.h[0]
        umlal2   v15.4s, v12.8h, v3.h[0]
        umlal    v14.4s, v13.4h, v3.h[0]
        umlal2   v15.4s, v13.8h, v3.h[0]
123:    ext      v12.16b, v4.16b, v5.16b, #0*2
        ext      v13.16b, v9.16b, v10.16b, #6*2
        umlal    v14.4s, v12.4h, v2.h[7]
        umlal2   v15.4s, v12.8h, v2.h[7]
        umlal    v14.4s, v13.4h, v2.h[7]
        umlal2   v15.4s, v13.8h, v2.h[7]
122:    ext      v12.16b, v4.16b, v5.16b, #1*2
        ext      v13.16b, v9.16b, v10.16b, #5*2
        umlal    v14.4s, v12.4h, v2.h[6]
        umlal2   v15.4s, v12.8h, v2.h[6]
        umlal    v14.4s, v13.4h, v2.h[6]
        umlal2   v15.4s, v13.8h, v2.h[6]
121:    ext      v12.16b, v4.16b, v5.16b, #2*2
        ext      v13.16b, v9.16b, v10.16b, #4*2
        umlal    v14.4s, v12.4h, v2.h[5]
        umlal2   v15.4s, v12.8h, v2.h[5]
        umlal    v14.4s, v13.4h, v2.h[5]
        umlal2   v15.4s, v13.8h, v2.h[5]
120:    ext      v12.16b, v4.16b, v5.16b, #3*2
        ext      v13.16b, v9.16b, v10.16b, #3*2
        umlal    v14.4s, v12.4h, v2.h[4]
        umlal2   v15.4s, v12.8h, v2.h[4]
        umlal    v14.4s, v13.4h, v2.h[4]
        umlal2   v15.4s, v13.8h, v2.h[4]
119:    ext      v12.16b, v4.16b, v5.16b, #4*2
        ext      v13.16b, v9.16b, v10.16b, #2*2
        umlal    v14.4s, v12.4h, v2.h[3]
        umlal2   v15.4s, v12.8h, v2.h[3]
        umlal    v14.4s, v13.4h, v2.h[3]
        umlal2   v15.4s, v13.8h, v2.h[3]
118:    ext      v12.16b, v4.16b, v5.16b, #5*2
        ext      v13.16b, v9.16b, v10.16b, #1*2
        umlal    v14.4s, v12.4h, v2.h[2]
        umlal2   v15.4s, v12.8h, v2.h[2]
        umlal    v14.4s, v13.4h, v2.h[2]
        umlal2   v15.4s, v13.8h, v2.h[2]
117:    ext      v12.16b, v4.16b, v5.16b, #6*2
        ext      v13.16b, v9.16b, v10.16b, #0*2
        umlal    v14.4s, v12.4h, v2.h[1]
        umlal2   v15.4s, v12.8h, v2.h[1]
        umlal    v14.4s, v13.4h, v2.h[1]
        umlal2   v15.4s, v13.8h, v2.h[1]
116:    ext      v12.16b, v4.16b, v5.16b, #7*2
        ext      v13.16b, v8.16b, v9.16b, #7*2
        umlal    v14.4s, v12.4h, v2.h[0]
        umlal2   v15.4s, v12.8h, v2.h[0]
        umlal    v14.4s, v13.4h, v2.h[0]
        umlal2   v15.4s, v13.8h, v2.h[0]
115:    ext      v12.16b, v5.16b, v6.16b, #0*2
        ext      v13.16b, v8.16b, v9.16b, #6*2
        umlal    v14.4s, v12.4h, v1.h[7]
        umlal2   v15.4s, v12.8h, v1.h[7]
        umlal    v14.4s, v13.4h, v1.h[7]
        umlal2   v15.4s, v13.8h, v1.h[7]
114:    ext      v12.16b, v5.16b, v6.16b, #1*2
        ext      v13.16b, v8.16b, v9.16b, #5*2
        umlal    v14.4s, v12.4h, v1.h[6]
        umlal2   v15.4s, v12.8h, v1.h[6]
        umlal    v14.4s, v13.4h, v1.h[6]
        umlal2   v15.4s, v13.8h, v1.h[6]
113:    ext      v12.16b, v5.16b, v6.16b, #2*2
        ext      v13.16b, v8.16b, v9.16b, #4*2
        umlal    v14.4s, v12.4h, v1.h[5]
        umlal2   v15.4s, v12.8h, v1.h[5]
        umlal    v14.4s, v13.4h, v1.h[5]
        umlal2   v15.4s, v13.8h, v1.h[5]
112:    ext      v12.16b, v5.16b, v6.16b, #3*2
        ext      v13.16b, v8.16b, v9.16b, #3*2
        umlal    v14.4s, v12.4h, v1.h[4]
        umlal2   v15.4s, v12.8h, v1.h[4]
        umlal    v14.4s, v13.4h, v1.h[4]
        umlal2   v15.4s, v13.8h, v1.h[4]
111:    ext      v12.16b, v5.16b, v6.16b, #4*2
        ext      v13.16b, v8.16b, v9.16b, #2*2
        umlal    v14.4s, v12.4h, v1.h[3]
        umlal2   v15.4s, v12.8h, v1.h[3]
        umlal    v14.4s, v13.4h, v1.h[3]
        umlal2   v15.4s, v13.8h, v1.h[3]
110:    ext      v12.16b, v5.16b, v6.16b, #5*2
        ext      v13.16b, v8.16b, v9.16b, #1*2
        umlal    v14.4s, v12.4h, v1.h[2]
        umlal2   v15.4s, v12.8h, v1.h[2]
        umlal    v14.4s, v13.4h, v1.h[2]
        umlal2   v15.4s, v13.8h, v1.h[2]
109:    ext      v12.16b, v5.16b, v6.16b, #6*2
        ext      v13.16b, v8.16b, v9.16b, #0*2
        umlal    v14.4s, v12.4h, v1.h[1]
        umlal2   v15.4s, v12.8h, v1.h[1]
        umlal    v14.4s, v13.4h, v1.h[1]
        umlal2   v15.4s, v13.8h, v1.h[1]
108:    ext      v12.16b, v5.16b, v6.16b, #7*2
        ext      v13.16b, v7.16b, v8.16b, #7*2
        umlal    v14.4s, v12.4h, v1.h[0]
        umlal2   v15.4s, v12.8h, v1.h[0]
        umlal    v14.4s, v13.4h, v1.h[0]
        umlal2   v15.4s, v13.8h, v1.h[0]
107:    ext      v12.16b, v6.16b, v7.16b, #0*2
        ext      v13.16b, v7.16b, v8.16b, #6*2
        umlal    v14.4s, v12.4h, v0.h[7]
        umlal2   v15.4s, v12.8h, v0.h[7]
        umlal    v14.4s, v13.4h, v0.h[7]
        umlal2   v15.4s, v13.8h, v0.h[7]
106:    ext      v12.16b, v6.16b, v7.16b, #1*2
        ext      v13.16b, v7.16b, v8.16b, #5*2
        umlal    v14.4s, v12.4h, v0.h[6]
        umlal2   v15.4s, v12.8h, v0.h[6]
        umlal    v14.4s, v13.4h, v0.h[6]
        umlal2   v15.4s, v13.8h, v0.h[6]
105:    ext      v12.16b, v6.16b, v7.16b, #2*2
        ext      v13.16b, v7.16b, v8.16b, #4*2
        umlal    v14.4s, v12.4h, v0.h[5]
        umlal2   v15.4s, v12.8h, v0.h[5]
        umlal    v14.4s, v13.4h, v0.h[5]
        umlal2   v15.4s, v13.8h, v0.h[5]
104:    ext      v12.16b, v6.16b, v7.16b, #3*2
        ext      v13.16b, v7.16b, v8.16b, #3*2
        umlal    v14.4s, v12.4h, v0.h[4]
        umlal2   v15.4s, v12.8h, v0.h[4]
        umlal    v14.4s, v13.4h, v0.h[4]
        umlal2   v15.4s, v13.8h, v0.h[4]
103:    ext      v12.16b, v6.16b, v7.16b, #4*2
        ext      v13.16b, v7.16b, v8.16b, #2*2
        umlal    v14.4s, v12.4h, v0.h[3]
        umlal2   v15.4s, v12.8h, v0.h[3]
        umlal    v14.4s, v13.4h, v0.h[3]
        umlal2   v15.4s, v13.8h, v0.h[3]
102:    ext      v12.16b, v6.16b, v7.16b, #5*2
        ext      v13.16b, v7.16b, v8.16b, #1*2
        umlal    v14.4s, v12.4h, v0.h[2]
        umlal2   v15.4s, v12.8h, v0.h[2]
        umlal    v14.4s, v13.4h, v0.h[2]
        umlal2   v15.4s, v13.8h, v0.h[2]
101:    ext      v12.16b, v6.16b, v7.16b, #6*2
        ext      v13.16b, v7.16b, v8.16b, #0*2
        umlal    v14.4s, v12.4h, v0.h[1]
        umlal2   v15.4s, v12.8h, v0.h[1]
        umlal    v14.4s, v13.4h, v0.h[1]
        umlal2   v15.4s, v13.8h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        mov      v31.16b, v4.16b
        mov      v4.16b, v5.16b
        mov      v5.16b, v6.16b
        mov      v6.16b, v7.16b
        mov      v7.16b, v8.16b
        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/
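
/* Illustrative note (not part of the build): in the uchar4 variants below,
 * one pixel spans four consecutive 16-bit columns, so tap i lies 4*i lanes
 * from the centre and always starts on a 64-bit register half.  The
 * hconv4_* macros can therefore address register halves directly
 * (umlal/umlal2 on .4h/.8h views) instead of assembling misaligned windows
 * with `ext`.  Per channel, in hypothetical C:
 *
 *      static uint32_t hconv4_channel(const uint16_t *win, int x,
 *                                     const uint16_t *coeff, int r) {
 *          uint32_t sum = (uint32_t)coeff[0] * win[x];
 *          for (int i = r; i > 0; i--)
 *              sum += (uint32_t)coeff[i]
 *                   * ((uint32_t)win[x - 4 * i] + win[x + 4 * i]);
 *          return sum;
 *      }
 */
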
#define TUNED_LIST4 6, 12, 20
.macro hconv4_6 /*{{{*/
.rodata
200:    .hword -4
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .align 4
.text
        umull    v14.4s, v7.4h, v0.h[0]
        umull2   v15.4s, v7.8h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
106:    umlal    v14.4s, v4.4h, v0.h[6]
        umlal2   v15.4s, v4.8h, v0.h[6]
        umlal    v14.4s, v10.4h, v0.h[6]
        umlal2   v15.4s, v10.8h, v0.h[6]
105:    umlal2   v14.4s, v4.8h, v0.h[5]
        umlal    v15.4s, v5.4h, v0.h[5]
        umlal2   v14.4s, v9.8h, v0.h[5]
        umlal    v15.4s, v10.4h, v0.h[5]
104:    umlal    v14.4s, v5.4h, v0.h[4]
        umlal2   v15.4s, v5.8h, v0.h[4]
        umlal    v14.4s, v9.4h, v0.h[4]
        umlal2   v15.4s, v9.8h, v0.h[4]
103:    umlal2   v14.4s, v5.8h, v0.h[3]
        umlal    v15.4s, v6.4h, v0.h[3]
        umlal2   v14.4s, v8.8h, v0.h[3]
        umlal    v15.4s, v9.4h, v0.h[3]
102:    umlal    v14.4s, v6.4h, v0.h[2]
        umlal2   v15.4s, v6.8h, v0.h[2]
        umlal    v14.4s, v8.4h, v0.h[2]
        umlal2   v15.4s, v8.8h, v0.h[2]
101:    umlal2   v14.4s, v6.8h, v0.h[1]
        umlal    v15.4s, v7.4h, v0.h[1]
        umlal2   v14.4s, v7.8h, v0.h[1]
        umlal    v15.4s, v8.4h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        mov      v4.16b, v5.16b
        mov      v5.16b, v6.16b
        mov      v6.16b, v7.16b
        mov      v7.16b, v8.16b
        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/

.macro hconv4_12 /*{{{*/
.rodata
200:    .hword -4 //Might need to remove these...
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .hword 107f-100f
        .hword 108f-100f
        .hword 109f-100f
        .hword 110f-100f
        .hword 111f-100f
        .hword 112f-100f
        .align 4
.text
        umull    v14.4s, v4.4h, v0.h[0]
        umull2   v15.4s, v4.8h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
112:    umlal    v14.4s, v26.4h, v1.h[4]
        umlal2   v15.4s, v26.8h, v1.h[4]
        umlal    v14.4s, v10.4h, v1.h[4]
        umlal2   v15.4s, v10.8h, v1.h[4]
111:    umlal2   v14.4s, v26.8h, v1.h[3]
        umlal    v15.4s, v27.4h, v1.h[3]
        umlal2   v14.4s, v9.8h, v1.h[3]
        umlal    v15.4s, v10.4h, v1.h[3]
110:    umlal    v14.4s, v27.4h, v1.h[2]
        umlal2   v15.4s, v27.8h, v1.h[2]
        umlal    v14.4s, v9.4h, v1.h[2]
        umlal2   v15.4s, v9.8h, v1.h[2]
109:    umlal2   v14.4s, v27.8h, v1.h[1]
        umlal    v15.4s, v28.4h, v1.h[1]
        umlal2   v14.4s, v8.8h, v1.h[1]
        umlal    v15.4s, v9.4h, v1.h[1]
108:    umlal    v14.4s, v28.4h, v1.h[0]
        umlal2   v15.4s, v28.8h, v1.h[0]
        umlal    v14.4s, v8.4h, v1.h[0]
        umlal2   v15.4s, v8.8h, v1.h[0]
107:    umlal2   v14.4s, v28.8h, v0.h[7]
        umlal    v15.4s, v29.4h, v0.h[7]
        umlal2   v14.4s, v7.8h, v0.h[7]
        umlal    v15.4s, v8.4h, v0.h[7]
106:    umlal    v14.4s, v29.4h, v0.h[6]
        umlal2   v15.4s, v29.8h, v0.h[6]
        umlal    v14.4s, v7.4h, v0.h[6]
        umlal2   v15.4s, v7.8h, v0.h[6]
105:    umlal2   v14.4s, v29.8h, v0.h[5]
        umlal    v15.4s, v30.4h, v0.h[5]
        umlal2   v14.4s, v6.8h, v0.h[5]
        umlal    v15.4s, v7.4h, v0.h[5]
104:    umlal    v14.4s, v30.4h, v0.h[4]
        umlal2   v15.4s, v30.8h, v0.h[4]
        umlal    v14.4s, v6.4h, v0.h[4]
        umlal2   v15.4s, v6.8h, v0.h[4]
103:    umlal2   v14.4s, v30.8h, v0.h[3]
        umlal    v15.4s, v31.4h, v0.h[3]
        umlal2   v14.4s, v5.8h, v0.h[3]
        umlal    v15.4s, v6.4h, v0.h[3]
102:    umlal    v14.4s, v31.4h, v0.h[2]
        umlal2   v15.4s, v31.8h, v0.h[2]
        umlal    v14.4s, v5.4h, v0.h[2]
        umlal2   v15.4s, v5.8h, v0.h[2]
101:    umlal2   v14.4s, v31.8h, v0.h[1]
        umlal    v15.4s, v4.4h, v0.h[1]
        umlal2   v14.4s, v4.8h, v0.h[1]
        umlal    v15.4s, v5.4h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        mov      v26.16b, v27.16b
        mov      v27.16b, v28.16b
        mov      v28.16b, v29.16b
        mov      v29.16b, v30.16b
        mov      v30.16b, v31.16b
        mov      v31.16b, v4.16b
        mov      v4.16b, v5.16b
        mov      v5.16b, v6.16b
        mov      v6.16b, v7.16b
        mov      v7.16b, v8.16b
        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/

.macro hconv4_20 /*{{{*/
.rodata
200:    .hword -4
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .hword 107f-100f
        .hword 108f-100f
        .hword 109f-100f
        .hword 110f-100f
        .hword 111f-100f
        .hword 112f-100f
        .hword 113f-100f
        .hword 114f-100f
        .hword 115f-100f
        .hword 116f-100f
        .hword 117f-100f
        .hword 118f-100f
        .hword 119f-100f
        .hword 120f-100f
        .align 4
.text
        umull    v14.4s, v28.4h, v0.h[0]
        umull2   v15.4s, v28.8h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
120:    umlal    v14.4s, v18.4h, v2.h[4]
        umlal2   v15.4s, v18.8h, v2.h[4]
        umlal    v14.4s, v10.4h, v2.h[4]
        umlal2   v15.4s, v10.8h, v2.h[4]
119:    umlal2   v14.4s, v18.8h, v2.h[3]
        umlal    v15.4s, v19.4h, v2.h[3]
        umlal2   v14.4s, v9.8h, v2.h[3]
        umlal    v15.4s, v10.4h, v2.h[3]
118:    umlal    v14.4s, v19.4h, v2.h[2]
        umlal2   v15.4s, v19.8h, v2.h[2]
        umlal    v14.4s, v9.4h, v2.h[2]
        umlal2   v15.4s, v9.8h, v2.h[2]
117:    umlal2   v14.4s, v19.8h, v2.h[1]
        umlal    v15.4s, v20.4h, v2.h[1]
        umlal2   v14.4s, v8.8h, v2.h[1]
        umlal    v15.4s, v9.4h, v2.h[1]
116:    umlal    v14.4s, v20.4h, v2.h[0]
        umlal2   v15.4s, v20.8h, v2.h[0]
        umlal    v14.4s, v8.4h, v2.h[0]
        umlal2   v15.4s, v8.8h, v2.h[0]
115:    umlal2   v14.4s, v20.8h, v1.h[7]
        umlal    v15.4s, v21.4h, v1.h[7]
        umlal2   v14.4s, v7.8h, v1.h[7]
        umlal    v15.4s, v8.4h, v1.h[7]
114:    umlal    v14.4s, v21.4h, v1.h[6]
        umlal2   v15.4s, v21.8h, v1.h[6]
        umlal    v14.4s, v7.4h, v1.h[6]
        umlal2   v15.4s, v7.8h, v1.h[6]
113:    umlal2   v14.4s, v21.8h, v1.h[5]
        umlal    v15.4s, v22.4h, v1.h[5]
        umlal2   v14.4s, v6.8h, v1.h[5]
        umlal    v15.4s, v7.4h, v1.h[5]
112:    umlal    v14.4s, v22.4h, v1.h[4]
        umlal2   v15.4s, v22.8h, v1.h[4]
        umlal    v14.4s, v6.4h, v1.h[4]
        umlal2   v15.4s, v6.8h, v1.h[4]
111:    umlal2   v14.4s, v22.8h, v1.h[3]
        umlal    v15.4s, v23.4h, v1.h[3]
        umlal2   v14.4s, v5.8h, v1.h[3]
        umlal    v15.4s, v6.4h, v1.h[3]
110:    umlal    v14.4s, v23.4h, v1.h[2]
        umlal2   v15.4s, v23.8h, v1.h[2]
        umlal    v14.4s, v5.4h, v1.h[2]
        umlal2   v15.4s, v5.8h, v1.h[2]
109:    umlal2   v14.4s, v23.8h, v1.h[1]
        umlal    v15.4s, v24.4h, v1.h[1]
        umlal2   v14.4s, v4.8h, v1.h[1]
        umlal    v15.4s, v5.4h, v1.h[1]
108:    umlal    v14.4s, v24.4h, v1.h[0]
        umlal2   v15.4s, v24.8h, v1.h[0]
        umlal    v14.4s, v4.4h, v1.h[0]
        umlal2   v15.4s, v4.8h, v1.h[0]
107:    umlal2   v14.4s, v24.8h, v0.h[7]
        umlal    v15.4s, v25.4h, v0.h[7]
        umlal2   v14.4s, v31.8h, v0.h[7]
        umlal    v15.4s, v4.4h, v0.h[7]
106:    umlal    v14.4s, v25.4h, v0.h[6]
        umlal2   v15.4s, v25.8h, v0.h[6]
        umlal    v14.4s, v31.4h, v0.h[6]
        umlal2   v15.4s, v31.8h, v0.h[6]
105:    umlal2   v14.4s, v25.8h, v0.h[5]
        umlal    v15.4s, v26.4h, v0.h[5]
        umlal2   v14.4s, v30.8h, v0.h[5]
        umlal    v15.4s, v31.4h, v0.h[5]
104:    umlal    v14.4s, v26.4h, v0.h[4]
        umlal2   v15.4s, v26.8h, v0.h[4]
        umlal    v14.4s, v30.4h, v0.h[4]
        umlal2   v15.4s, v30.8h, v0.h[4]
103:    umlal2   v14.4s, v26.8h, v0.h[3]
        umlal    v15.4s, v27.4h, v0.h[3]
        umlal2   v14.4s, v29.8h, v0.h[3]
        umlal    v15.4s, v30.4h, v0.h[3]
102:    umlal    v14.4s, v27.4h, v0.h[2]
        umlal2   v15.4s, v27.8h, v0.h[2]
        umlal    v14.4s, v29.4h, v0.h[2]
        umlal2   v15.4s, v29.8h, v0.h[2]
101:    umlal2   v14.4s, v27.8h, v0.h[1]
        umlal    v15.4s, v28.4h, v0.h[1]
        umlal2   v14.4s, v28.8h, v0.h[1]
        umlal    v15.4s, v29.4h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        mov      v18.16b, v19.16b
        mov      v19.16b, v20.16b
        mov      v20.16b, v21.16b
        mov      v21.16b, v22.16b
        mov      v22.16b, v23.16b
        mov      v23.16b, v24.16b
        mov      v24.16b, v25.16b
        mov      v25.16b, v26.16b
        mov      v26.16b, v27.16b
        mov      v27.16b, v28.16b
        mov      v28.16b, v29.16b
        mov      v29.16b, v30.16b
        mov      v30.16b, v31.16b
        mov      v31.16b, v4.16b
        mov      v4.16b, v5.16b
        mov      v5.16b, v6.16b
        mov      v6.16b, v7.16b
        mov      v7.16b, v8.16b
        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/
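
/* Note on the spill-buffer addressing used by hconv4_25 below (sketch
 * only): the window no longer fits in the register file, so the oldest
 * sixteen columns live in a 64-byte ring at [x9].  The buffer is allocated
 * with its low seven address bits clear, so `bic x12, x12, #0x40` after
 * each pointer advance folds any carry out of bit 6 back to the start of
 * the ring, which is equivalent to the modulo below:
 *
 *      enum { RING_BYTES = 0x40 };
 *      static uint8_t *ring_at(uint8_t *base, size_t idx) {
 *          // assumes base is 128-byte aligned and idx < 2 * RING_BYTES
 *          return base + (idx & (RING_BYTES - 1));
 *      }
 */
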
.macro hconv4_25 /*{{{*/
.rodata
200:    .hword -4
        .hword 101f-100f
        .hword 102f-100f
        .hword 103f-100f
        .hword 104f-100f
        .hword 105f-100f
        .hword 106f-100f
        .hword 107f-100f
        .hword 108f-100f
        .hword 109f-100f
        .hword 110f-100f
        .hword 111f-100f
        .hword 112f-100f
        .hword 113f-100f
        .hword 114f-100f
        .hword 115f-100f
        .hword 116f-100f
        .hword 117f-100f
        .hword 118f-100f
        .hword 119f-100f
        .hword 120f-100f
        .hword 121f-100f
        .hword 122f-100f
        .hword 123f-100f
        .hword 124f-100f
        .hword 125f-100f
        .align 4
.text
        umull2   v14.4s, v25.8h, v0.h[0]
        umull    v15.4s, v26.4h, v0.h[0]

        adrp     x16, 200b
        add      x16, x16, :lo12:200b
        ldrsh    x12, [x16, x5, LSL #1]
        adr      x16, 100f
        add      x12, x12, x16
100:    br       x12
125:    ld1      {v12.8h}, [x9]
        umlal    v14.4s, v12.4h, v3.h[1]
        umlal2   v15.4s, v12.8h, v3.h[1]
        umlal    v14.4s, v10.4h, v3.h[1]
        umlal2   v15.4s, v10.8h, v3.h[1]
124:    add      x12, x9, #0x08
        bic      x12, x12, #0x40
        ld1      {v12.4h}, [x12], #8
        bic      x12, x12, #0x40
        ld1      {v13.4h}, [x12]
        umlal    v14.4s, v12.4h, v3.h[0]
        umlal    v15.4s, v13.4h, v3.h[0]
        umlal2   v14.4s, v9.8h, v3.h[0]
        umlal    v15.4s, v10.4h, v3.h[0]
123:    add      x12, x9, #0x10
        bic      x12, x12, #0x40
        ld1      {v12.8h}, [x12]
        umlal    v14.4s, v12.4h, v2.h[7]
        umlal2   v15.4s, v12.8h, v2.h[7]
        umlal    v14.4s, v9.4h, v2.h[7]
        umlal2   v15.4s, v9.8h, v2.h[7]
122:    add      x12, x9, #0x18
        bic      x12, x12, #0x40
        ld1      {v12.4h}, [x12], #8
        bic      x12, x12, #0x40
        ld1      {v13.4h}, [x12]
        umlal    v14.4s, v12.4h, v2.h[6]
        umlal    v15.4s, v13.4h, v2.h[6]
        umlal2   v14.4s, v8.8h, v2.h[6]
        umlal    v15.4s, v9.4h, v2.h[6]
121:    add      x12, x9, #0x20
        bic      x12, x12, #0x40
        ld1      {v12.8h}, [x12]
        umlal    v14.4s, v12.4h, v2.h[5]
        umlal2   v15.4s, v12.8h, v2.h[5]
        umlal    v14.4s, v8.4h, v2.h[5]
        umlal2   v15.4s, v8.8h, v2.h[5]
120:    add      x12, x9, #0x28
        bic      x12, x12, #0x40
        ld1      {v12.4h}, [x12], #8
        bic      x12, x12, #0x40
        ld1      {v13.4h}, [x12]
        umlal    v14.4s, v12.4h, v2.h[4]
        umlal    v15.4s, v13.4h, v2.h[4]
        umlal2   v14.4s, v7.8h, v2.h[4]
        umlal    v15.4s, v8.4h, v2.h[4]
119:    add      x12, x9, #0x30
        bic      x12, x12, #0x40
        ld1      {v12.8h}, [x12]
        umlal    v14.4s, v12.4h, v2.h[3]
        umlal2   v15.4s, v12.8h, v2.h[3]
        umlal    v14.4s, v7.4h, v2.h[3]
        umlal2   v15.4s, v7.8h, v2.h[3]
118:    add      x12, x9, #0x38
        bic      x12, x12, #0x40
        ld1      {v12.4h}, [x12]
        umlal    v14.4s, v12.4h, v2.h[2]
        umlal    v15.4s, v17.4h, v2.h[2]
        umlal2   v14.4s, v6.8h, v2.h[2]
        umlal    v15.4s, v7.4h, v2.h[2]
117:    umlal    v14.4s, v17.4h, v2.h[1]
        umlal2   v15.4s, v17.8h, v2.h[1]
        umlal    v14.4s, v6.4h, v2.h[1]
        umlal2   v15.4s, v6.8h, v2.h[1]
116:    umlal2   v14.4s, v17.8h, v2.h[0]
        umlal    v15.4s, v18.4h, v2.h[0]
        umlal2   v14.4s, v5.8h, v2.h[0]
        umlal    v15.4s, v6.4h, v2.h[0]
115:    umlal    v14.4s, v18.4h, v1.h[7]
        umlal2   v15.4s, v18.8h, v1.h[7]
        umlal    v14.4s, v5.4h, v1.h[7]
        umlal2   v15.4s, v5.8h, v1.h[7]
114:    umlal2   v14.4s, v18.8h, v1.h[6]
        umlal    v15.4s, v19.4h, v1.h[6]
        umlal2   v14.4s, v4.8h, v1.h[6]
        umlal    v15.4s, v5.4h, v1.h[6]
113:    umlal    v14.4s, v19.4h, v1.h[5]
        umlal2   v15.4s, v19.8h, v1.h[5]
        umlal    v14.4s, v4.4h, v1.h[5]
        umlal2   v15.4s, v4.8h, v1.h[5]
112:    umlal2   v14.4s, v19.8h, v1.h[4]
        umlal    v15.4s, v20.4h, v1.h[4]
        umlal2   v14.4s, v31.8h, v1.h[4]
        umlal    v15.4s, v4.4h, v1.h[4]
111:    umlal    v14.4s, v20.4h, v1.h[3]
        umlal2   v15.4s, v20.8h, v1.h[3]
        umlal    v14.4s, v31.4h, v1.h[3]
        umlal2   v15.4s, v31.8h, v1.h[3]
110:    umlal2   v14.4s, v20.8h, v1.h[2]
        umlal    v15.4s, v21.4h, v1.h[2]
        umlal2   v14.4s, v30.8h, v1.h[2]
        umlal    v15.4s, v31.4h, v1.h[2]
109:    umlal    v14.4s, v21.4h, v1.h[1]
        umlal2   v15.4s, v21.8h, v1.h[1]
        umlal    v14.4s, v30.4h, v1.h[1]
        umlal2   v15.4s, v30.8h, v1.h[1]
108:    umlal2   v14.4s, v21.8h, v1.h[0]
        umlal    v15.4s, v22.4h, v1.h[0]
        umlal2   v14.4s, v29.8h, v1.h[0]
        umlal    v15.4s, v30.4h, v1.h[0]
107:    umlal    v14.4s, v22.4h, v0.h[7]
        umlal2   v15.4s, v22.8h, v0.h[7]
        umlal    v14.4s, v29.4h, v0.h[7]
        umlal2   v15.4s, v29.8h, v0.h[7]
106:    umlal2   v14.4s, v22.8h, v0.h[6]
        umlal    v15.4s, v23.4h, v0.h[6]
        umlal2   v14.4s, v28.8h, v0.h[6]
        umlal    v15.4s, v29.4h, v0.h[6]
105:    umlal    v14.4s, v23.4h, v0.h[5]
        umlal2   v15.4s, v23.8h, v0.h[5]
        umlal    v14.4s, v28.4h, v0.h[5]
        umlal2   v15.4s, v28.8h, v0.h[5]
104:    umlal2   v14.4s, v23.8h, v0.h[4]
        umlal    v15.4s, v24.4h, v0.h[4]
        umlal2   v14.4s, v27.8h, v0.h[4]
        umlal    v15.4s, v28.4h, v0.h[4]
103:    umlal    v14.4s, v24.4h, v0.h[3]
        umlal2   v15.4s, v24.8h, v0.h[3]
        umlal    v14.4s, v27.4h, v0.h[3]
        umlal2   v15.4s, v27.8h, v0.h[3]
102:    umlal2   v14.4s, v24.8h, v0.h[2]
        umlal    v15.4s, v25.4h, v0.h[2]
        umlal2   v14.4s, v26.8h, v0.h[2]
        umlal    v15.4s, v27.4h, v0.h[2]
101:    umlal    v14.4s, v25.4h, v0.h[1]
        umlal2   v15.4s, v25.8h, v0.h[1]
        umlal    v14.4s, v26.4h, v0.h[1]
        umlal2   v15.4s, v26.8h, v0.h[1]

        uqrshrn  v14.4h, v14.4s, #16
        uqrshrn2 v14.8h, v15.4s, #16
        uqrshrn  v15.8b, v14.8h, #FRACTION_BITS

        st1      {v17.16b}, [x9], #16
        bic      x9, x9, #0x40
        mov      v17.16b, v18.16b
        mov      v18.16b, v19.16b
        mov      v19.16b, v20.16b
        mov      v20.16b, v21.16b
        mov      v21.16b, v22.16b
        mov      v22.16b, v23.16b
        mov      v23.16b, v24.16b
        mov      v24.16b, v25.16b
        mov      v25.16b, v26.16b
        mov      v26.16b, v27.16b
        mov      v27.16b, v28.16b
        mov      v28.16b, v29.16b
        mov      v29.16b, v30.16b
        mov      v30.16b, v31.16b
        mov      v31.16b, v4.16b
        mov      v4.16b, v5.16b
        mov      v5.16b, v6.16b
        mov      v6.16b, v7.16b
        mov      v7.16b, v8.16b
        mov      v8.16b, v9.16b
        mov      v9.16b, v10.16b
        mov      v10.16b, v11.16b
.endm /*}}}*/
/* Dedicated function wrapper for the fetch macro, for the cases where
 * performance isn't that important, to keep code size down.
 */
PRIVATE(fetch_generic_asm)
        stp      x10, x11, [sp, #-16]!
        fetch
        ldp      x10, x11, [sp], #16
        ret
END(fetch_generic_asm)


/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
 * beyond that limit, and filling the rest of the vector with the last legal
 * pixel.
 * Result is in v10 and v11.  v8 and v9 are filled with the first legal
 * pixel.
 * Note: This function can read beyond the right edge of input if the image
 * is narrower than 16 bytes.
 */
PRIVATE(fetch_clampleft1)
        stp      x29, x30, [sp, #-16]!
        bl       fetch_generic_asm
        dup      v8.8h, v10.h[0]
        dup      v9.8h, v10.h[0]
        ands     x12, x10, #15
        beq      1f
        sub      x1, x1, x12
        sub      x15, x15, x12
        sub      x19, x19, x12
        sub      x10, x10, x12
        sub      x12, sp, x12, LSL #1
        sub      sp, sp, #64
        sub      x12, x12, #32
        st1      {v8.8h,v9.8h,v10.8h,v11.8h}, [sp]
        ld1      {v10.8h,v11.8h}, [x12]
        add      sp, sp, #64
1:      ldp      x29, x30, [sp], #16
        ret
END(fetch_clampleft1)

PRIVATE(fetch_clampleft4)
        stp      x29, x30, [sp, #-16]!
        bl       fetch_generic_asm
        dup      v8.2d, v10.d[0]
        dup      v9.2d, v10.d[0]
        ands     x12, x10, #15
        beq      1f
        sub      x1, x1, x12
        sub      x15, x15, x12
        sub      x19, x19, x12
        sub      x10, x10, x12
        sub      x12, sp, x12, LSL #1
        sub      sp, sp, #64
        sub      x12, x12, #32
        st1      {v8.8h,v9.8h,v10.8h,v11.8h}, [sp]
        ld1      {v10.8h,v11.8h}, [x12]
        add      sp, sp, #64
1:      ldp      x29, x30, [sp], #16
        ret
END(fetch_clampleft4)

/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data,
 * avoiding reading memory beyond that limit, and filling the rest of the
 * vector with the last legal pixel.
 * Result is in v10 and v11.  v12 and v13 are filled with the last legal
 * pixel.
 * Note: This function can read beyond the left edge of input if the image
 * is narrower than 16 bytes.
 */
PRIVATE(fetch_clampright1)
        stp      x29, x30, [sp, #-16]!
        sub      x12, xzr, x11
        ands     x12, x12, #15
        beq      1f
        sub      x1, x1, x12
        sub      x15, x15, x12
        sub      x19, x19, x12
        bl       fetch_generic_asm
        dup      v12.8h, v11.h[7]
        dup      v13.8h, v11.h[7]
        sub      x12, xzr, x11
        and      x12, x12, #15
        sub      sp, sp, #64
        add      x12, sp, x12, LSL #1
        st1      {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
        ld1      {v10.8h,v11.8h}, [x12]
        add      sp, sp, #64
        ldp      x29, x30, [sp], #16
        ret
1:      bl       fetch_generic_asm
        dup      v12.8h, v11.h[7]
        dup      v13.8h, v11.h[7]
        ldp      x29, x30, [sp], #16
        ret
END(fetch_clampright1)

PRIVATE(fetch_clampright4)
        stp      x29, x30, [sp, #-16]!
        sub      x12, xzr, x11
        ands     x12, x12, #15
        beq      1f
        sub      x1, x1, x12
        sub      x15, x15, x12
        sub      x19, x19, x12
        bl       fetch_generic_asm
        dup      v12.2d, v11.d[1]
        dup      v13.2d, v11.d[1]
        sub      x12, xzr, x11
        and      x12, x12, #15
        sub      sp, sp, #64
        add      x12, sp, x12, LSL #1
        st1      {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
        ld1      {v10.8h,v11.8h}, [x12]
        add      sp, sp, #64
        ldp      x29, x30, [sp], #16
        ret
1:      bl       fetch_generic_asm
        dup      v12.2d, v11.d[1]
        dup      v13.2d, v11.d[1]
        ldp      x29, x30, [sp], #16
        ret
END(fetch_clampright4)
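
/* Sketch of the realignment trick shared by the four clamp fetchers above
 * (illustration only): padding and fetched data are stored contiguously to
 * a scratch area on the stack, then one misaligned 32-byte reload selects
 * the view in which the data sits at the correct offset within the chunk.
 * For the left-clamped u8 case, with hypothetical names:
 *
 *      static void realign_left(uint16_t chunk[16], uint16_t fill, int n) {
 *          uint16_t buf[32];                        // the 64-byte stack area
 *          for (int i = 0; i < 16; i++) buf[i] = fill;          // v8,v9
 *          for (int i = 0; i < 16; i++) buf[16 + i] = chunk[i]; // v10,v11
 *          for (int i = 0; i < 16; i++) chunk[i] = buf[16 - n + i];
 *      }
 *
 * where n is the number of columns by which the pointers were rewound.
 */
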
/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
 * value across to fill the rest of the register pair.  Used for filling the
 * right hand edge of the window when reading too close to the right hand
 * edge of the image.
 * Also returns a dup-ed copy of the last element in v12 for the tail-fill
 * case (this happens incidentally in the common path, but must be done
 * deliberately in the fast-out path).
 */
PRIVATE(prefill_sweepright1)
        ands     x12, x11, #15
        beq      1f
        sub      x12, x12, #1
        sub      sp, sp, #64
        st1      {v10.8h,v11.8h}, [sp]
        add      x12, sp, x12, LSL #1
        ld1r     {v12.8h}, [x12]
        ld1r     {v13.8h}, [x12]
        st1      {v12.8h,v13.8h}, [x12]
        ld1      {v10.8h,v11.8h}, [sp]
        add      sp, sp, #64
        ret
1:      dup      v12.8h, v11.h[7]
        dup      v13.8h, v11.h[7]
        ret
END(prefill_sweepright1)

PRIVATE(prefill_sweepright4)
        ands     x12, x11, #15
        beq      1f
        sub      x12, x12, #4
        sub      sp, sp, #64
        st1      {v10.8h,v11.8h}, [sp]
        add      x12, sp, x12, LSL #1
        ld1r     {v12.2d}, [x12]
        ld1r     {v13.2d}, [x12]
        st1      {v12.8h,v13.8h}, [x12]
        ld1      {v10.8h,v11.8h}, [sp]
        add      sp, sp, #64
        ret
1:      dup      v12.2d, v11.d[1]
        dup      v13.2d, v11.d[1]
        ret
END(prefill_sweepright4)

/* The main loop keeps a sliding window of data that has already been
 * convolved in the vertical axis for the current line.  This usually stays
 * in the register file, but spills to memory for large windows.  The first
 * thing that needs to be done at start-up is to fill this window with image
 * data, taking into account the padding needed if the left or right edges
 * of the image fall within this window.
 */

/* Because the window is in the register file writes to it cannot be indexed
 * by another register.  Consequently the fill loops are unrolled to address
 * the registers directly.  This macro distinguishes between writes to the
 * register file and writes to the spill buffer (indicated by a destination
 * register named xx).
 */
.macro prefill_out ra, rb, sra, srb
  .ifc \ra,xx
    .ifc \rb,xx
        st1      {\sra,\srb}, [x9], #32
    .else
        bic      x9, x9, #0x40
        st1      {\sra}, [x9], #16
        mov      \rb, \srb
    .endif
  .else
    .ifnc \ra,\sra
        mov      \ra, \sra
    .endif
    .ifnc \rb,\srb
        mov      \rb, \srb
    .endif
  .endif
.endm
/* This macro provides the list of registers representing the window, and
 * the cases where the register file is too small and a spill buffer is
 * used instead.
 * Since several specialisations of each function are generated, this also
 * culls superfluous iterations, and sets the variable `i` for subsequent
 * macros indicating the current index into the window.
 */
.macro prefill_list, macro, nextmacro, max_r, step, label
  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
    .if windowsize >= (\line * 16)
      .set i, windowsize - (\line * 16)
\label\macro\line:
        prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
    .endif
  .endm
        ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
        ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
        ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
        ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
        ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
        ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
        ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
        ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
        ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
        ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
        ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
        ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
        ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
\label\macro\()0:
        b        \label\()_end
  .purgem ifneeded
.endm

/* These macros represent the possible stages of filling the window.
 * Each macro is unrolled enough times that it can fill the entire window
 * itself, but normally it will have to hand control to subsequent macros
 * part-way through and this is done using labels named \next and \after,
 * where \next is the next macro starting at the same window position and
 * \after is the next macro starting after the current window position.
 */

/* leftfill: v8 and v9 contain the left padding value.  While the window
 * extends outside of the image on the left-hand side, and at least 16 more
 * padding values are needed in the window, store v8 and v9 into the window.
 * Otherwise skip forward to storing image data.
 */
.macro prefill_leftfill, next, after, ra, rb, step
        cmp      x10, #i+16
        blo      \next
        prefill_out \ra, \rb, v8.16b, v9.16b
.endm

/* leftedge: The very first non-fill or partial-fill chunk from the image is
 * already loaded (as it was used to calculate the left padding value), so
 * store it here, and then drop into the regular load/store cycle in the
 * next macro.
 */
.macro prefill_leftedge, next, after, ra, rb, step
1:      prefill_out \ra, \rb, v10.16b, v11.16b
        b        \after
.endm

/* dofetch: Copy chunks of the image into the window without any
 * complications from edge conditions.
 */
.macro prefill_dofetch, next, after, ra, rb, step
        cmp      x11, #i+16
        bls      \next
        bl       fetch_generic_asm
        prefill_out \ra, \rb, v10.16b, v11.16b
.endm
/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
 * the right-hand edge of the image.  In that case sweep the last valid
 * pixel across the rest of the chunk, and in either case prepare padding
 * data in v12 and v13 for the next macro.  This is done in
 * fetch_clampright.
 * This only happens once before going on to the next macro.
 * Sometimes leftedge also covers the rightedge case, in which case this has
 * to be skipped altogether.
 */
.macro prefill_rightedge, next, after, ra, rb, step
        cmp      x11, #i
        bls      \next
        bl       fetch_clampright\step
        prefill_out \ra, \rb, v10.16b, v11.16b
        b        \after
.endm

/* rightfill: The rest of the window is simply filled with right padding
 * from v12 and v13.
 */
.macro prefill_rightfill, next, after, ra, rb, step
        prefill_out \ra, \rb, v12.16b, v13.16b
.endm

/* Here all of the macros above are unrolled and laid out in the proper
 * order.
 */
.macro prefill_body, max_r, step, label
        prefill_list leftfill,  leftedge,  \max_r, \step, \label
        prefill_list leftedge,  dofetch,   \max_r, \step, \label
        prefill_list dofetch,   rightedge, \max_r, \step, \label
        prefill_list rightedge, rightfill, \max_r, \step, \label
        prefill_list rightfill, oops,      \max_r, \step, \label
\label\()_end:
.endm
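
/* Sketch of the window-extent arithmetic performed by the `prefill` macro
 * below (illustration only, mirroring its subs/csel sequences; left_avail
 * and right_avail are hypothetical names for x8 and x4):
 *
 *      // windowsize: 2*r columns of step channels, rounded up to a chunk
 *      int windowsize = (2 * max_r * step + 15) & ~15;
 *      int centertap  = windowsize - max_r * step;
 *      // x10: first window column holding real data
 *      uint64_t x10 = (uint64_t)centertap > left_avail
 *                   ? centertap - left_avail : 0;
 *      // x11: one past the last window column holding real data
 *      uint64_t x11 = right_avail >= (uint64_t)(windowsize - centertap)
 *                   ? (uint64_t)windowsize
 *                   : right_avail + centertap;
 */
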
/* Fill the convolution window with context data.  The aim here is to load
 * exactly 2*r columns, and in the main loop to read as many columns as will
 * be written.  This is complicated by the window being divided into chunks
 * at register boundaries, and the need to handle cases when the input
 * starts very close to the left or right (or both) edges of the image and
 * the need to fill the spaces that leaves with left and right edge padding
 * values.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x3 -- count
 *      x4 -- available image data right of src pointer
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x8 -- available image data left of src pointer
 *      x9 -- buffer (if needed)
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x4 -= min(inlen, count + windowsize - centertap)
 *      x1 += min(inlen, count + windowsize - centertap)
 *      x15 += min(inlen, count + windowsize - centertap)
 *      x19 += min(inlen, count + windowsize - centertap)
 * Modifies:
 *      x10 -- fill start index in the window
 *      x11 -- fill stop index in the window
 *      x12 -- scratch
 */
.macro prefill step=1, max_r=25, label=xx
.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
.set centertap, (windowsize - \max_r * \step)
        mov      x10, #centertap
        subs     x10, x10, x8
        csel     x10, xzr, x10, lo

        subs     x11, x4, #windowsize - centertap
        csel     x11, xzr, x11, hs
        add      x11, x11, #windowsize

        /* x10 indicates where in the window legal image data begins.
         * x11 indicates where in the window legal image data ends.
         * When starting near the centre of a large image these would be
         * zero and windowsize respectively, but when starting near the
         * edges this can change.
         * When starting on the leftmost pixel, x10 will be centertap.
         * When starting on the rightmost pixel, x11 will be centertap+1.
         */

        /* x4 indicates how much data there is between the current pointers
         * and the right edge of the image.  The pointers currently point
         * to the data needed at centertap.  The subsequent code will
         * consume (windowsize - x10) data, but only the data from
         * centertap to windowsize comes out of x4's budget.
         */
1:      subs     x4, x4, #windowsize - centertap
        csel     x4, xzr, x4, lo

        /* And the pointers need to rewind to the start of the window.
         */
        sub      x1, x1, #centertap
        sub      x15, x15, #centertap
        sub      x19, x19, #centertap

        /* Unless x8 indicated that there wasn't that much data available.
         */
        add      x1, x1, x10
        add      x15, x15, x10
        add      x19, x19, x10

        /* Get the first chunk, and add padding to align it to the window
         * if necessary.
         */
        bl       fetch_clampleft\step

        /* Sometimes the start and the end of the window are in the same
         * chunk.  In that case both ends need filler at the outset.
         */
        sub      x12, x11, #1
        eor      x12, x10, x12
        cmp      x12, #16
        bhs      1f
        bl       prefill_sweepright\step

        /* Iterate through all the points in the window and fill them in
         * with padding or image data as needed.
         */
1:      prefill_body \max_r, \step, \label
.endm
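
/* Shape of the steady state implemented by conv_body below, as a hedged C
 * sketch (bookkeeping only; fetch16 and hconv8 are hypothetical stand-ins
 * for the fetch macro and the \core expansion):
 *
 *      inlen = inlen < ((count + 15) & ~15)    // cmp x4, x12
 *            ? inlen : ((count + 15) & ~15);   // csel x4, x12, x4, hi
 *      inlen -= 16;                            // 5: subs x4, x4, #16
 *      while ((int64_t)inlen > 0) {            //    bhi 3b
 *          fetch16();          // 3: vertical pass, 16 new window columns
 *          hconv8(); hconv8(); //    two \core + st1 pairs, 8 bytes each
 *          count -= 16;
 *          inlen -= 16;
 *      }
 *      // then: tail handling with clamped fetches and piecewise stores
 */
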
/* The main body of the convolve functions.  Having already pre-filled the
 * convolution window with 2*r input values, the logic settles into a
 * regular pattern of reading and writing at a 1:1 rate until either input
 * or output expires.  The input leads the output by r values, so when
 * processing all the way to the right-hand edge, or within r pixels of that
 * edge, the input will run out first.  In the case of very narrow images,
 * or sub-windows starting near the right edge, the input may already have
 * run out while the convolution window was being filled and this loop will
 * start with a zero-length input.
 *
 * Once the input runs out, the rest of the output must be processed by
 * padding the remainder of the window with pad value from the last valid
 * pixel from the source.
 *
 * Input:
 *      x0 = dst
 *      x1 = src
 *      x2 = pitch
 *      x3 = count
 *      x4 = inlen
 *      x5 = r
 *      x6 = rup
 *      x7 = rdn
 *      x9 = buffer
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Modifies:
 *      x8 = fetch code pointer
 */
.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""

        /* If x4 >= x3 then there's no need for clipping.  The main loop
         * needs to exit when either x3 or x4 runs out, so clamp x4 to be
         * no greater than x3 and use x4 for the loop.
         * However, if x4 comes out of the loop with less than 16 bytes
         * left, a partial read would be necessary to avoid reading beyond
         * the end of the image.  To avoid this, clamp x4 to the next
         * multiple of 16, which is still sufficient to force it out of the
         * loop but doesn't imply a rewind.
         */
        add      x12, x3, #15
        bic      x12, x12, #15
        cmp      x4, x12
        csel     x4, x12, x4, hi

        /* First calculate the entry-point into the internal fetch logic.
         * This is done so the same function can service several kernel
         * sizes.
         */
        adrp     x8, \labelnc
        add      x8, x8, #:lo12:\labelnc
        sub      x8, x8, x5, LSL #5
        sub      x8, x8, x5, LSL #3
        cmp      x5, x6
        ccmp     x5, x7, #0, eq
        beq      5f

        /* if (r != rup || r != rdn) then the address-clamping table should
         * be used rather than the short-cut version.
         */
        adrp     x8, \labelc
        add      x8, x8, #:lo12:\labelc
        sub      x8, x8, x5, LSL #6
        add      x8, x8, x5, LSL #3
        b        5f

        /* Main loop: ... */
        .align 4
3:      /* first perform a vertical convolution from memory to get the next
         * 16 taps of the horizontal window into the register file...
         */
        fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

        /* ...then perform a horizontal convolution on that window to
         * produce eight output bytes, and slide the window along.
         * This has to be done twice to match the 16-way vertical pass.
         * It would be preferable to have twice the work done in \core, but
         * that would demand yet another variant on those macros and would
         * perturb the register allocation severely.
         */
        \core
        st1      {v15.8b}, [x0], #8
        \core
        st1      {v15.8b}, [x0], #8

        sub      x3, x3, #16
5:      subs     x4, x4, #16
        bhi      3b
        /* Here there are 16 or fewer bytes available before the edge of
         * the source image.  x4 holds that count minus 16 (because it was
         * decremented before the first iteration ran).  The last read may
         * not be a whole chunk, and beyond that a fill value must be used.
         *
         * Of course, none of that matters if there's no more output to
         * produce...
         */
        cbz      x3, 5f

        /* Oh well. */
        adds     x4, x4, #16
        bne      1f
  .if \step==1
        dup      v10.8h, v9.h[7]
        dup      v11.8h, v9.h[7]
  .else
        dup      v10.2d, v9.d[1]
        dup      v11.2d, v9.d[1]
  .endif
        b        3f

        /* To avoid reading past the end of the input, rewind pointers by
         * (16 - x4) to ensure that they're exactly 16 bytes from the edge.
         */
1:      mov      x11, x4
        bl       fetch_clampright\step
        /* Now to put this padding to use, perform any remaining
         * iterations.  This is done at half the rate of the main loop,
         * because there's no longer pressure from a 16-lane window filler.
         */
3:      \core
  .if \step==1
        dup      v11.8h, v11.h[7]
  .else
        dup      v11.2d, v11.d[1]
  .endif
        subs     x3, x3, #8
        blo      4f
        st1      {v15.8b}, [x0], #8
        bne      3b
        b        5f

        /* If the final iteration contained 0 < l < 8 values, then perform
         * a piecewise store of the final vector.
         */
4:      tbz      x3, #2, 1f
        st1      {v15.s}[0], [x0], #4
        ext      v15.8b, v15.8b, v15.8b, #4
1:      tbz      x3, #1, 1f
        st1      {v15.h}[0], [x0], #2
        ext      v15.8b, v15.8b, v15.8b, #2
1:      tbz      x3, #0, 5f
        st1      {v15.b}[0], [x0], #1
        ext      v15.8b, v15.8b, v15.8b, #1
5:      mov      x0, #0
.endm


.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
        stp      x29,x30, [sp, #-16]!

        prefill step=1, max_r=\r, label=.Lcnv1_\r

        conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

        ldp      x29,x30, [sp], #16
        ret
END(convolve1_\r)
.endr

.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
        sub      x9, sp, #0x40
        stp      x29,x30, [sp, #-(16 + 0x40 + 0x80)]!
        bic      x9, x9, #0x7f

        /* x9 now points to a 0x40 byte buffer on the stack whose address
         * has the low 7 bits clear.  This allows easy address calculation
         * in the wrap-around cases.
         */

        prefill step=4, max_r=\r, label=.Lcnv4_\r

        conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

        ldp      x29,x30, [sp], #(16 + 0x40 + 0x80)
        ret
END(convolve4_\r)
.endr
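
/* Note on the dispatch pattern used by both entry points below
 * (illustration only): x30 (the link register) is pointed at the local
 * epilogue with `adr x30, 1f` before branching, so the selected convolve*
 * specialisation returns straight to the register-restore sequence; in
 * effect a tail call with an explicit return address.  The radius test
 * itself is a simple cascade, sketched here for the uchar1 case:
 *
 *      if (r <= 8)  { convolve1_8();  goto restore; } // TUNED_LIST1 = 8, 16
 *      if (r <= 16) { convolve1_16(); goto restore; }
 *      convolve1_25();                                // catch-all, r <= 25
 *      restore: ;
 */
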
/* void rsdIntrinsicBlurU1_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 */
ENTRY(rsdIntrinsicBlurU1_K)
        stp      x19,x30, [sp, #-16]!
        sub      x8, sp, #32
        sub      sp, sp, #64
        st1      {v8.1d - v11.1d}, [sp]
        st1      {v12.1d - v15.1d}, [x8]
        mov      x8, x5            // x
        ldr      w5, [sp,#80]      // r
        sub      x9, x2, x8        // w - x
        sub      x10, x3, x6       // h - y
        mov      x2, x4            // pitch
        mov      x3, x7            // count
        sub      x7, x10, #1       // h - y - 1
        mov      x4, x9            // inlen = (w - x)

        ldr      x12, [sp, #88]    // tab

        add      x1, x1, x8        // src += x

        cmp      x6, x5
        csel     x6, x5, x6, hs    // rup = min(r, y)
        cmp      x7, x5
        csel     x7, x5, x7, hs    // rdn = min(r, h - y - 1)

        sub      x13, xzr, x2      // -pitch
        msub     x15, x2, x6, x1
        madd     x19, x2, x7, x1

        ld1      {v0.8h,v1.8h}, [x12], #32
        ld1      {v2.8h,v3.8h}, [x12], #32

        adr      x30, 1f
  .irp r, TUNED_LIST1
        cmp      x5, #\r
        bls      convolve1_\r
  .endr
        b        convolve1_25

1:      ld1      {v8.1d - v11.1d}, [sp], #32
        ld1      {v12.1d - v15.1d}, [sp], #32
        ldp      x19,x30, [sp], #16
        ret
END(rsdIntrinsicBlurU1_K)

/* void rsdIntrinsicBlurU4_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 */
ENTRY(rsdIntrinsicBlurU4_K)
        stp      x19,x30, [sp, #-16]!
        sub      x8, sp, #32
        sub      sp, sp, #64
        st1      {v8.1d - v11.1d}, [sp]
        st1      {v12.1d - v15.1d}, [x8]
        lsl      x8, x5, #2        // x
        lsl      x2, x2, #2
        ldr      w5, [sp,#80]      // r
        sub      x9, x2, x8        // w - x
        sub      x10, x3, x6       // h - y
        mov      x2, x4            // pitch
        lsl      x3, x7, #2        // count
        sub      x7, x10, #1       // h - y - 1
        mov      x4, x9            // inlen = (w - x)

        ldr      x12, [sp, #88]    // tab

        add      x1, x1, x8        // in += x

        cmp      x6, x5
        csel     x6, x5, x6, hs    // rup = min(r, y)
        cmp      x7, x5
        csel     x7, x5, x7, hs    // rdn = min(r, h - y - 1)

        sub      x13, xzr, x2
        msub     x15, x2, x6, x1
        madd     x19, x2, x7, x1

        ld1      {v0.8h,v1.8h}, [x12], #32
        ld1      {v2.8h,v3.8h}, [x12], #32

        adr      x30, 1f
  .irp r, TUNED_LIST4
        cmp      x5, #\r
        bls      convolve4_\r
  .endr
        b        convolve4_25

1:      ld1      {v8.1d - v11.1d}, [sp], #32
        ld1      {v12.1d - v15.1d}, [sp], #32
        ldp      x19,x30, [sp], #16
        ret
END(rsdIntrinsicBlurU4_K)
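
/* Hedged reference model (not part of the build): a plain C rendering of
 * the whole operation for one row of uchar1 output, usable to sanity-check
 * the assembly.  The symmetric table layout (tab[0..r]) matches how v0-v3
 * are indexed above; the exact scaling of the table is chosen by the
 * caller and is assumed, not defined, here.  The shift constants encode
 * FRACTION_BITS == 7, as set at the top of this file.
 *
 *      static void blur_u1_row_ref(uint8_t *out, const uint8_t *in,
 *                                  size_t w, size_t h, size_t p, size_t x,
 *                                  size_t y, size_t count, size_t r,
 *                                  const uint16_t *tab) {
 *          for (size_t i = 0; i < count; i++) {
 *              uint64_t hsum = 0;
 *              for (ptrdiff_t dx = -(ptrdiff_t)r; dx <= (ptrdiff_t)r;
 *                   dx++) {
 *                  ptrdiff_t cx = (ptrdiff_t)(x + i) + dx;
 *                  if (cx < 0) cx = 0;              // replicate left edge
 *                  if (cx >= (ptrdiff_t)w) cx = (ptrdiff_t)w - 1;
 *                  uint64_t vsum = 0;
 *                  for (ptrdiff_t dy = -(ptrdiff_t)r; dy <= (ptrdiff_t)r;
 *                       dy++) {
 *                      ptrdiff_t cy = (ptrdiff_t)y + dy;
 *                      if (cy < 0) cy = 0;          // rup clamp
 *                      if (cy >= (ptrdiff_t)h) cy = (ptrdiff_t)h - 1; // rdn
 *                      vsum += (uint64_t)tab[dy < 0 ? -dy : dy]
 *                            * in[(size_t)cy * p + (size_t)cx];
 *                  }
 *                  uint64_t v16 = (vsum + (1u << 8)) >> 9; // uqrshrn #16-7
 *                  if (v16 > 0xffff) v16 = 0xffff;
 *                  hsum += (uint64_t)tab[dx < 0 ? -dx : dx] * v16;
 *              }
 *              uint64_t h16 = (hsum + (1u << 15)) >> 16;   // uqrshrn #16
 *              if (h16 > 0xffff) h16 = 0xffff;
 *              uint64_t o = (h16 + (1u << 6)) >> 7;        // uqrshrn #7
 *              out[i] = o > 0xff ? 0xff : (uint8_t)o;
 *          }
 *      }
 */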