/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in d6[0..3], leaving the results in
 * q12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * bit of overshoot beyond [0,1.0).
 */
.macro vert8, dstlo=d24, dsthi=d25
        vld1.u8     d16, [r4]!
        vld1.u8     d18, [r5]!
        vld1.u8     d20, [r6]!
        vld1.u8     d22, [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmull.u16   q13, d19, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlsl.u16   q13, d17, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlal.u16   q13, d21, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vmlsl.u16   q13, d23, d6[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS
        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS
.endm

/* As above, but only four 16-bit results into d25.
 */
.macro vert4
        vld1.u32    d16[0], [r4]!
        vld1.u32    d18[0], [r5]!
        vld1.u32    d20[0], [r6]!
        vld1.u32    d22[0], [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS
.endm

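/* Editor's note -- a rough scalar model of one column of the vert8/vert4
 * macros above (illustrative only, not part of the build; 'clamp_s16' is a
 * hypothetical saturating helper standing in for vqshrn):
 *
 *     // s0..s3 point at the four clamped source rows; yr[0..3] are the
 *     // 16-bit y coefficients, of which the outer two are logically
 *     // negative, hence the vmlsl (multiply-and-subtract) in the macros.
 *     int32_t acc = - (int32_t)yr[0] * s0[i]
 *                   + (int32_t)yr[1] * s1[i]
 *                   + (int32_t)yr[2] * s2[i]
 *                   - (int32_t)yr[3] * s3[i];
 *     // 8 pixel bits plus 16 coefficient bits, keeping VERTBITS fraction bits:
 *     int16_t out = clamp_s16(acc >> (8 + 16 - VERTBITS));
 */
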
/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted
 * from a CHUNKSIZE window (r1), and the threshold value for when the count
 * will be one higher than that (r0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values can be packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */

/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         r2, r0, #VECSHIFT
        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
        add         r0, r0, r2
#if defined(ARCH_ARM_USE_UDIV)
        udiv        r1, r0, r2
        mls         r0, r1, r2, r0
#else
        clz         r3, r2
        clz         r1, r0
        subs        r3, r3, r1
        movlt       r3, #0
        mov         r1, #1
        lsl         r2, r2, r3
        lsl         r3, r1, r3
        mov         r1, #0
1:      cmp         r2, r0
        addls       r1, r3
        subls       r0, r2
        lsrs        r3, r3, #1
        lsr         r2, r2, #1
        bne         1b
#endif
        bx          lr
END(rsdIntrinsicResize_oscctl_K)

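/* Editor's note -- the same calculation as a C sketch (illustrative only;
 * it mirrors the assembly above rather than replacing it):
 *
 *     uint64_t oscctl_model(uint32_t xinc) {          // xinc is 16.16 fixed point
 *         uint32_t step = xinc << VECSHIFT;           // advance per inner iteration
 *         uint32_t x = (CHUNKSIZE << 16) - 1 + step;  // CHUNKSIZE in the same 16.16 terms
 *         uint32_t quot = x / step;                   // minimum iterations per chunk (r1)
 *         uint32_t rem  = x % step;                   // threshold for one extra (r0)
 *         return ((uint64_t)quot << 32) | rem;        // packed as r1:r0 on little-endian AAPCS
 *     }
 */
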
/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
.set OSC_STORE, (BUFFER_SIZE + 0)
.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
.set OSCCTL_STORE, (BUFFER_SIZE + 8)
.set AVAIL_STORE, (BUFFER_SIZE + 16)
.set SP_STORE, (BUFFER_SIZE + 24)  /* should be +20, but rounded up to make a legal constant somewhere */

/* void rsdIntrinsicResizeB\comp\()_K(
 *             uint8_t * restrict dst,          // r0
 *             size_t count,                    // r1
 *             uint32_t xf,                     // r2
 *             uint32_t xinc,                   // r3
 *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
 *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
 *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
 *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
 *             size_t xclip,                    // [sp,#16] -> [sp,#120]
 *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
 *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
 *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
        push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
        vpush       {d8-d15}

        /* align the working buffer on the stack to make it easy to use bit
         * twiddling for address calculations and bounds tests.
         */
        sub         r12, sp, #BUFFER_SIZE + 32
        mov         lr, sp
        bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
        mov         sp, r12
        str         lr, [sp,#SP_STORE]

        ldr         r8, [lr,#136]           // yr
        adr         r9, 8f
        vld1.s32    {q4}, [r8]
        vld1.s16    {q5}, [r9]
        vqmovun.s32 d8, q4                  // yr
        vdup.s16    q6, r2
        vdup.s16    q7, r3
        vmla.s16    q6, q5, q7              // vxf
        vshl.s16    q7, q7, #VECSHIFT       // vxinc

        ldrd        r4,r5, [lr,#104]        // srcn, src0
        ldrd        r6,r7, [lr,#112]        // src1, src2

        /* Compute starting condition for oscillator used to compute ahead
         * of time how many iterations are possible before needing to
         * refill the working buffer.  This is based on the fixed-point
         * index of the last element in the vector of pixels processed in
         * each iteration, counting up until it would overflow.
         */
        sub         r8, r2, r3
        mov         r9, r3, LSL #VECSHIFT
        add         r8, r8, r9

        ldrd        r10,r11, [lr,#128]      // osc_ctl

        str         r8, [sp,#OSC_STORE]
        str         r9, [sp,#OSCSTEP_STORE]
        str         r10, [sp,#OSCCTL_STORE]
        str         r11, [sp,#OSCCTL_STORE+4]
        ldrd        r10,r11, [lr,#120]      // xclip,avail


        /* r4-r7 contain pointers to the four lines of input to be
         * convolved.  These pointers have been clamped vertically and
         * horizontally (which is why it's not a simple row/stride pair),
         * and the xclip argument (now in r10) indicates how many pixels
         * from true the x position of the pointer is.  This value should
         * be 0, 1, or 2 only.
         *
         * Start by placing four pixels worth of input at the far end of
         * the buffer.  As many as two of these may be clipped, so four
         * pixels are fetched, and then the first pixel is duplicated and
         * the data shifted according to xclip.  The source pointers are
         * then also adjusted according to xclip so that subsequent fetches
         * match.
         */
        vmov        d6, d8      /* make y coeffs available for vert4 and vert8 macros */

        sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
        add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
        add         r8, r8, #4 * COMPONENT_COUNT * 2
.if \comp == 1
        vert4
        vdup.s16    d24, d25[0]
        vst1.s16    {q12}, [r12]
        vld1.s16    {d24}, [r8]
        vst1.s16    {d24}, [r9]
.elseif \comp == 2
        vert8
        vdup.u32    q11, d24[0]
        vst1.s16    {q11,q12}, [r12]
        vld1.s16    {q12}, [r8]
        vst1.s16    {q12}, [r9]
.elseif \comp == 4
        vert8       d28, d29
        vert8       d30, d31
        vmov.u64    d24, d28
        vmov.u64    d25, d28
        vmov.u64    d26, d28
        vmov.u64    d27, d28
        vst1.s16    {q12,q13}, [r12]!
        vst1.s16    {q14,q15}, [r12]
        sub         r12, r12, #32
        vld1.s16    {q11,q12}, [r8]
        vst1.s16    {q11,q12}, [r9]
.endif
        /* Count off four pixels into the working buffer, and move count to
         * its new home.
         */
        sub         lr, r11, #4
        /* Incoming pointers were to the first _legal_ pixel.  Four pixels
         * were read unconditionally, but some may have been discarded by
         * xclip, so we rewind the pointers to compensate.
         */
        sub         r4, r4, r10, LSL #COMPONENT_SHIFT
        sub         r5, r5, r10, LSL #COMPONENT_SHIFT
        sub         r6, r6, r10, LSL #COMPONENT_SHIFT
        sub         r7, r7, r10, LSL #COMPONENT_SHIFT

        /* First tap starts where we just pre-filled, at the end of the
         * buffer.
         */
        add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16

        /* Use overflowing arithmetic to implement wraparound array
         * indexing.
         */
        mov         r2, r2, LSL #(15 - CHUNKSHIFT)
        mov         r3, r3, LSL #(15 - CHUNKSHIFT)

        str         lr, [sp,#AVAIL_STORE]

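        /* Editor's note -- a C sketch of the wraparound indexing set up above
         * (illustrative only; 'x' stands for r2 after the shift and 'buf' for
         * the aligned scratch buffer at sp):
         *
         *     // After LSL #(15 - CHUNKSHIFT) the integer part of the 16.16 x
         *     // coordinate occupies the top CHUNKSHIFT+1 bits of the word,
         *     // i.e. it is held modulo 2*CHUNKSIZE, so stepping by xinc
         *     // wraps around for free.
         *     uint32_t idx = x >> (31 - CHUNKSHIFT);             // 0 .. 2*CHUNKSIZE-1
         *     int16_t *tap = (int16_t *)((char *)buf + (idx << (COMPONENT_SHIFT + 1)));
         *     x += xinc_scaled;
         *
         * which is what the 'mov rN, r2, LSR #(31 - CHUNKSHIFT)' and
         * 'add rN, sp, ...' pairs in the inner loop below do.
         */
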
        /* Start of outermost loop.
         * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
         * number of iterations of the inner loop that can be performed and
         * get into that.
         *
         * The fill is complicated by the possibility of running out of
         * input before the scratch buffer is filled.  If this isn't a risk
         * then it's handled by the simple loop at 2:, otherwise the
         * horrible loop at 3:.
         */
1:      ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
        vmov        d6, d8      /* put y scaling coefficients somewhere handy */
        subs        lr, #CHUNKSIZE
        bge         2f          /* if at least CHUNKSIZE are available... */
        add         lr, #CHUNKSIZE  /* if they're not... */
        b           4f
        /* ..just sneaking a literal in here after this unconditional branch.. */
8:      .hword      0, 1, 2, 3, 4, 5, 6, 7
        /* basic fill loop, processing 8 bytes at a time until there are
         * fewer than eight bytes available.
         */
3:      vert8
        sub         lr, lr, #8 / COMPONENT_COUNT
        vst1.s16    {q12}, [r12]!
4:      cmp         lr, #8 / COMPONENT_COUNT - 1
        bgt         3b
.if \comp == 4
        blt         3f
        /* The last pixel (four bytes) if necessary */
        vert4
.else
        cmp         lr, #1
        blt         3f
        /* The last pixels if necessary */
        sub         r4, r4, #8
        sub         r5, r5, #8
        sub         r6, r6, #8
        sub         r7, r7, #8
        add         r4, r4, lr, LSL #COMPONENT_SHIFT
        add         r5, r5, lr, LSL #COMPONENT_SHIFT
        add         r6, r6, lr, LSL #COMPONENT_SHIFT
        add         r7, r7, lr, LSL #COMPONENT_SHIFT
        vert8
        sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
        sub         sp, sp, #32
        sub         lr, lr, #16
.if \comp == 1
        vdup.s16    q13, d25[3]
.elseif \comp == 2
        vdup.u32    q13, d25[1]
.endif
        vst1.s16    {q12,q13}, [sp]
        vld1.s16    {q12}, [lr]
        add         sp, sp, #32
        b           4f
.endif
        /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
        vdup.s16    q12, d25[3]
.elseif \comp == 2
        vdup.u32    q12, d25[1]
.elseif \comp == 4
        vmov.u64    d24, d25
.endif
4:      vst1.s16    {q12}, [r12]!
        tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
        bne         3b
        b           4f

        .align 4
2:      /* Quickly pull a chunk of data into the working buffer.
         */
        vert8
        vst1.s16    {q12}, [r12]!
        vert8
        vst1.s16    {q12}, [r12]!
        tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
        bne         2b
        cmp         lr, #0
        bne         3f
4:      /* if we end with 0 pixels left we'll have nothing handy to spread
         * across to the right, so we rewind a bit.
         */
        mov         lr, #1
        sub         r4, r4, #COMPONENT_COUNT
        sub         r5, r5, #COMPONENT_COUNT
        sub         r6, r6, #COMPONENT_COUNT
        sub         r7, r7, #COMPONENT_COUNT
3:      str         lr, [sp,#AVAIL_STORE]   /* done with available pixel count */
        add         lr, sp, #OSC_STORE
        ldrd        r8,r9, [lr,#0]          /* need osc, osc_step soon */
        ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE]  /* need osc_ctl too */

        /* copy four taps (width of cubic window) to far end for overflow
         * address handling
         */
        sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
        eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
        vld1.s16    {d28}, [lr]
.elseif \comp == 2
        vld1.s16    {q14}, [lr]
.elseif \comp == 4
        vld1.s16    {q14,q15}, [lr]
.endif
        add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
        vst1.s16    {d28}, [lr]
.elseif \comp == 2
        vst1.s16    {q14}, [lr]
.elseif \comp == 4
        vst1.s16    {q14,q15}, [lr]
.endif
        /* r11 contains the maximum possible iteration count, but if r8 is
         * greater than r10 then this indicates that the count must be
         * reduced by one for this iteration to avoid reading past the end
         * of the available data.
         */
        cmp         r10, r8
        sbc         lr, r11, #0

        mla         r8, lr, r9, r8
        sub         r8, r8, #(CHUNKSIZE << 16)

        str         r8, [sp,#OSC_STORE]     /* done with osc */

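        /* Editor's note -- the count selection and oscillator update above,
         * as a C sketch (illustrative only; 'count' and 'threshold' are the
         * two halves of osc_ctl held in r11 and r10):
         *
         *     uint32_t iters = count - (osc > threshold ? 1 : 0);  // cmp; sbc
         *     osc += iters * osc_step;                             // mla
         *     osc -= CHUNKSIZE << 16;                              // rebase for the next chunk
         */
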
        /* prefer to count pixels, rather than vectors, to clarify the tail
         * store case on exit.
         */
        mov         lr, lr, LSL #VECSHIFT
        cmp         lr, r1
        movgt       lr, r1

        sub         r1, r1, lr

        mov         lr, lr, LSL #COMPONENT_SHIFT

        vmov.i16    d10, #3
        vmov.i16    d11, #0x8000

        cmp         lr, #0
        bgt         3f
        cmp         r1, #0
        bgt         1b      /* an extreme case where we shouldn't use code in this structure */
        b           9f

        .align 4
2:      /* Inner loop continues here, but starts at 3:, see end of loop
         * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
        vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
        vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
        vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
        vst1.u8     {q8,q9}, [r0]!
.endif
        /* Inner loop:  here the four x coefficients for each tap are
         * calculated in vector code, and the addresses are calculated in
         * scalar code, and these calculations are interleaved.
         */
3:      vshr.u16    q8, q6, #1
        mov         r8, r2, LSR #(31 - CHUNKSHIFT)
        vqrdmulh.s16 q9, q8, q8
        add         r2, r2, r3
        vqrdmulh.s16 q10, q9, q8
        mov         r9, r2, LSR #(31 - CHUNKSHIFT)
        vshll.s16   q11, d18, #2
        vshll.s16   q12, d19, #2
        add         r2, r2, r3
        vmlsl.s16   q11, d20, d10
        vmlsl.s16   q12, d21, d10
        mov         r10, r2, LSR #(31 - CHUNKSHIFT)

        vhadd.s16   q0, q10, q8
        add         r2, r2, r3
        vsub.s16    q0, q9, q0
        mov         r11, r2, LSR #(31 - CHUNKSHIFT)

        vaddw.s16   q1, q11, d18
        vaddw.s16   q13, q12, d19
        add         r2, r2, r3
        vshrn.s32   d2, q1, #1
        vshrn.s32   d3, q13, #1
        add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
        vsub.s16    d2, d2, d11
        vsub.s16    d3, d3, d11     // TODO: find a wider d11 and use q-reg operation
        add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)

        vaddw.s16   q2, q11, d16
        vaddw.s16   q13, q12, d17
        add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
        vshrn.s32   d4, q2, #1
        vshrn.s32   d5, q13, #1
        add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
        vneg.s16    q2, q2

        vhsub.s16   q3, q10, q9

        /* increment the x fractional parts (overflow is ignored, as the
         * scalar arithmetic shadows this addition with full precision).
         */
        vadd.s16    q6, q6, q7

        /* At this point we have four pointers in r8-r11, pointing to the
         * four taps in the scratch buffer that must be convolved together
         * to produce an output pixel (one output pixel per pointer).
         * These pointers usually overlap, but their spacing is irregular
         * so resolving the redundancy through L1 is a pragmatic solution.
         *
         * The scratch buffer is made of signed 16-bit data, holding over
         * some extra precision, and overshoot, from the vertical pass.
         *
         * We also have the 16-bit unsigned fixed-point weights for each
         * of the four taps in q0 - q3.  That's eight pixels worth of
         * coefficients when we have only four pointers, so calculations
         * for four more pixels are interleaved with the fetch and permute
         * code for each variant in the following code.
         *
         * The data arrangement is less than ideal for any pixel format,
         * but permuting loads help to mitigate most of the problems.
         *
         * Note also that the two outside taps of a bicubic are negative,
         * but these coefficients are unsigned.  The sign is hard-coded by
         * use of multiply-and-subtract operations.
         */
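        /* Editor's note -- for orientation, a scalar model of the weights and
         * the multiply pattern used below (illustrative only, not part of the
         * build).  With x the fraction in [0,1), the usual A = -0.5 bicubic
         * (Catmull-Rom) weights for taps t[-1..2] are:
         *
         *     w[-1] = 0.5f * (-x*x*x + 2*x*x - x);
         *     w[ 0] = 0.5f * ( 3*x*x*x - 5*x*x + 2);
         *     w[ 1] = 0.5f * (-3*x*x*x + 4*x*x + x);
         *     w[ 2] = 0.5f * ( x*x*x - x*x);
         *
         * As the editor reads the vector code above, q0-q3 hold Q15
         * approximations of (w[-1], -w[0], -w[1], w[2]), so the
         * vmull/vmlsl/vmlsl/vmlal pattern in each variant below restores the
         * conventional signs:
         *
         *     int32_t acc = q0*t[-1] - q1*t[0] - q2*t[1] + q3*t[2];  // == sum of w[k]*t[k]
         *     // vqrshrn.s32 #15, then vqrshrun.s16 #(VERTBITS - 8), scale and
         *     // clamp the accumulator back to an unsigned 8-bit component.
         */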
.if \comp == 1
        /* The uchar 1 case.
         * Issue one lanewise vld4.s16 to load four consecutive pixels from
         * one pointer (one pixel) into four different registers; then load
         * four consecutive s16 values from the next pointer (pixel) into
         * the next lane of those four registers, etc., so that we finish
         * with q12 - q15 representing the four taps, and each lane
         * representing a separate pixel.
         *
         * The first vld4 uses a splat to avoid any false dependency on
         * the previous state of the register.
         */
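        /* Editor's note -- conceptually the lanewise loads below gather and
         * transpose the taps for eight pixels, in two 4x4 halves (illustrative
         * C only; tap_ptr[] stands for the eight pointers cycled through
         * r8-r11):
         *
         *     for (int px = 0; px < 8; px++) {
         *         const int16_t *t = tap_ptr[px];
         *         tap0[px] = t[0]; tap1[px] = t[1];   // one vld4 lane-load fills
         *         tap2[px] = t[2]; tap3[px] = t[3];   // one lane of each of q12-q15
         *     }
         */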
        vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
        mov         r8, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
        add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
        mov         r9, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
        add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
        mov         r10, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
        add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
        mov         r11, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
        add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
        vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
        vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
        vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]

        vmull.s16   q8, d24, d0
        vmull.s16   q9, d25, d1
        vmlsl.s16   q8, d26, d2
        vmlsl.s16   q9, d27, d3
        vmlsl.s16   q8, d28, d4
        vmlsl.s16   q9, d29, d5
        vmlal.s16   q8, d30, d6
        vmlal.s16   q9, d31, d7

        subs        lr, lr, #LOOP_OUTPUT_SIZE

        vqrshrn.s32 d16, q8, #15
        vqrshrn.s32 d17, q9, #15

        vqrshrun.s16 d16, q8, #VERTBITS - 8
.elseif \comp == 2
        /* The uchar2 case:
         * This time load pairs of values into adjacent lanes in q12 - q15
         * by aliasing them as u32 data; leaving room for only four pixels,
         * so the process has to be done twice.  This also means that the
         * coefficient registers fail to align with the coefficient data
         * (eight separate pixels), so that has to be doubled-up to match.
         */
        vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
        mov         r8, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
        add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
        mov         r9, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
        add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
        mov         r10, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
        add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
        mov         r11, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3

        /* double-up coefficients to align with component pairs */
        vmov        d20, d0
        add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
        vmov        d21, d2
        vmov        d22, d4
        vmov        d23, d6
        vzip.s16    d0, d20
        vzip.s16    d2, d21
        vzip.s16    d4, d22
        vzip.s16    d6, d23

        vmull.s16   q8, d24, d0
        vmull.s16   q9, d25, d20
        vmlsl.s16   q8, d26, d2
        vmlsl.s16   q9, d27, d21
        vmlsl.s16   q8, d28, d4
        vmlsl.s16   q9, d29, d22
        vmlal.s16   q8, d30, d6
        vmlal.s16   q9, d31, d23

        vqrshrn.s32 d16, q8, #15
        vqrshrn.s32 d17, q9, #15

        vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
        vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
        vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
        vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]

        /* double-up coefficients to align with component pairs */
        vmov        d0, d1
        vmov        d2, d3
        vmov        d4, d5
        vmov        d6, d7
        vzip.s16    d0, d1
        vzip.s16    d2, d3
        vzip.s16    d4, d5
        vzip.s16    d6, d7

        vmull.s16   q10, d24, d0
        vmull.s16   q11, d25, d1
        vmlsl.s16   q10, d26, d2
        vmlsl.s16   q11, d27, d3
        vmlsl.s16   q10, d28, d4
        vmlsl.s16   q11, d29, d5
        vmlal.s16   q10, d30, d6
        vmlal.s16   q11, d31, d7

        subs        lr, lr, #LOOP_OUTPUT_SIZE

        vqrshrn.s32 d18, q10, #15
        vqrshrn.s32 d19, q11, #15

        vqrshrun.s16 d16, q8, #VERTBITS - 8
        vqrshrun.s16 d17, q9, #VERTBITS - 8
.elseif \comp == 4
        /* The uchar4 case.
         * This case is comparatively painless because four s16s are the
         * smallest addressable unit for a vmul-by-scalar.  Rather than
         * permute the data, simply arrange the multiplies to suit the way
         * the data comes in.  That's a lot of data, though, so things
         * progress in pairs of pixels at a time.
         */
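        /* Editor's note -- in this variant each d-register loaded below holds
         * one whole pixel (four s16 components), so a multiply by a single
         * coefficient lane applies one tap weight to all four components at
         * once; conceptually (illustrative C only):
         *
         *     for (int c = 0; c < 4; c++)
         *         acc[c] = q0*t0[c] - q1*t1[c] - q2*t2[c] + q3*t3[c];
         */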
        vld1.s16    {q12,q13}, [r8]
        mov         r8, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld1.s16    {q14,q15}, [r9]
        add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
        mov         r9, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3

        vmull.s16   q8, d24, d0[0]
        vmull.s16   q9, d28, d0[1]
        vmlsl.s16   q8, d25, d2[0]
        vmlsl.s16   q9, d29, d2[1]
        vmlsl.s16   q8, d26, d4[0]
        vmlsl.s16   q9, d30, d4[1]
        vmlal.s16   q8, d27, d6[0]
        vmlal.s16   q9, d31, d6[1]

        /* And two more... */
        vld1.s16    {q12,q13}, [r10]
        add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
        mov         r10, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3
        vld1.s16    {q14,q15}, [r11]
        add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
        mov         r11, r2, LSR #(31 - CHUNKSHIFT)
        add         r2, r2, r3

        vqrshrn.s32 d16, q8, #15
        add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
        vqrshrn.s32 d17, q9, #15

        vmull.s16   q10, d24, d0[2]
        vmull.s16   q11, d28, d0[3]
        vmlsl.s16   q10, d25, d2[2]
        vmlsl.s16   q11, d29, d2[3]
        vmlsl.s16   q10, d26, d4[2]
        vmlsl.s16   q11, d30, d4[3]
        vmlal.s16   q10, d27, d6[2]
        vmlal.s16   q11, d31, d6[3]

        vqrshrn.s32 d18, q10, #15
        vqrshrn.s32 d19, q11, #15

        vqrshrun.s16 d16, q8, #VERTBITS - 8
        vqrshrun.s16 d17, q9, #VERTBITS - 8

        /* And two more... */
        vld1.s16    {q12,q13}, [r8]
        vld1.s16    {q14,q15}, [r9]

        vmull.s16   q10, d24, d1[0]
        vmull.s16   q11, d28, d1[1]
        vmlsl.s16   q10, d25, d3[0]
        vmlsl.s16   q11, d29, d3[1]
        vmlsl.s16   q10, d26, d5[0]
        vmlsl.s16   q11, d30, d5[1]
        vmlal.s16   q10, d27, d7[0]
        vmlal.s16   q11, d31, d7[1]

        /* And two more... */
        vld1.s16    {q12,q13}, [r10]
        vld1.s16    {q14,q15}, [r11]

        subs        lr, lr, #LOOP_OUTPUT_SIZE

        vqrshrn.s32 d18, q10, #15
        vqrshrn.s32 d19, q11, #15

        vmull.s16   q10, d24, d1[2]
        vmull.s16   q11, d28, d1[3]
        vmlsl.s16   q10, d25, d3[2]
        vmlsl.s16   q11, d29, d3[3]
        vmlsl.s16   q10, d26, d5[2]
        vmlsl.s16   q11, d30, d5[3]
        vmlal.s16   q10, d27, d7[2]
        vmlal.s16   q11, d31, d7[3]

        vqrshrn.s32 d20, q10, #15
        vqrshrn.s32 d21, q11, #15

        vqrshrun.s16 d18, q9, #VERTBITS - 8
        vqrshrun.s16 d19, q10, #VERTBITS - 8
.endif
        bgt         2b      /* continue inner loop */
        /* The inner loop has already been limited to ensure that none of
         * the earlier iterations could overfill the output, so the store
         * appears within the loop but after the conditional branch (at the
         * top).  At the end, provided it won't overfill, perform the final
         * store here.  If it would, then break out to the tricky tail case
         * instead.
         */
        blt         1f
        /* Store the amount of data appropriate to the configuration of the
         * instance being assembled.
         */
.if LOOP_OUTPUT_SIZE == 4
        vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
        vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
        vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
        vst1.u8     {q8,q9}, [r0]!
.endif
        b           1b      /* resume outer loop */
        /* Partial tail store case:
         * Different versions of the code need different subsets of the
         * following partial stores.  Here the number of components and the
         * size of the chunk of data produced by each inner loop iteration
         * is tested to figure out whether or not each phrase is relevant.
         */
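        /* Editor's note -- the cascade below behaves like this C sketch
         * (illustrative only; 'left' is the byte count remaining in lr, and
         * store16/store8/store_lane/rotate are hypothetical stand-ins for the
         * vst1/vmov/vext instructions used):
         *
         *     if (left & 16) { store16(dst, q8);  dst += 16; q8  = q9;  }
         *     if (left & 8)  { store8(dst, d16);  dst += 8;  d16 = d17; }
         *     if (left & 4)  { store_lane32(dst, d16); dst += 4; d16 = rotate(d16, 4); }
         *     if (left & 2)  { store_lane16(dst, d16); dst += 2; d16 = rotate(d16, 2); }
         *     if (left & 1)  { store_lane8(dst, d16); }
         *
         * with each test assembled only when it can be reached for this
         * component count and loop output size.
         */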
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:      tst         lr, #16
        beq         1f
        vst1.u8     {q8}, [r0]!
        vmov        q8, q9
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:      tst         lr, #8
        beq         1f
        vst1.u8     {d16}, [r0]!
        vmov.u8     d16, d17
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:      tst         lr, #4
        beq         1f
        vst1.u32    {d16[0]}, [r0]!
        vext.u32    d16, d16, d16, #1
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:      tst         lr, #2
        beq         1f
        vst1.u16    {d16[0]}, [r0]!
        vext.u16    d16, d16, d16, #1
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:      tst         lr, #1
        beq         1f
        vst1.u8     {d16[0]}, [r0]!
.endif
1:
9:      ldr         sp, [sp,#SP_STORE]
        vpop        {d8-d15}
        pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
END(rsdIntrinsicResizeB\comp\()_K)
.endr