/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in d6[0..3], leaving the results in
 * q12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * bit of overshoot beyond [0,1.0).
 */
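/* For reference, a scalar C sketch of what each lane of the vert8 macro below
 * computes (identifier names here are illustrative, not from this source):
 *
 *      int32_t acc = (int32_t)s0[i] * yr[1]
 *                  - (int32_t)sn[i] * yr[0]
 *                  + (int32_t)s1[i] * yr[2]
 *                  - (int32_t)s2[i] * yr[3];
 *      out[i] = saturate_s16(acc >> (8 + 16 - VERTBITS));
 *
 * where sn, s0, s1, s2 stand for the four clamped source rows addressed by
 * r4-r7 and yr[0..3] for the unsigned 16-bit y coefficients held in d6.
 */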
.macro vert8, dstlo=d24, dsthi=d25
        vld1.u8     d16, [r4]!
        vld1.u8     d18, [r5]!
        vld1.u8     d20, [r6]!
        vld1.u8     d22, [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmull.u16   q13, d19, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlsl.u16   q13, d17, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlal.u16   q13, d21, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vmlsl.u16   q13, d23, d6[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS
        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS
.endm

/* As above, but only four 16-bit results into d25.
 */
.macro vert4
        vld1.u32    d16[0], [r4]!
        vld1.u32    d18[0], [r5]!
        vld1.u32    d20[0], [r6]!
        vld1.u32    d22[0], [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS
.endm

/* During horizontal resize, having CHUNKSIZE pixels of input available means
 * being able to produce a varying amount of output, depending on the phase of
 * the data.  This function calculates the minimum number of VECSIZE chunks
 * extracted from a CHUNKSIZE window (r1), and the threshold value for when
 * the count will be one higher than that (r0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values can be packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */

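/* For illustration only, a C model of what this routine computes (identifier
 * names are ad hoc, not part of the original source):
 *
 *      uint64_t oscctl(uint32_t xinc) {
 *          uint32_t step = xinc << VECSHIFT;             // xinc * VECSIZE
 *          uint32_t num = (CHUNKSIZE << 16) - 1 + step;  // numerator above
 *          uint32_t quot = num / step;                   // returned in r1
 *          uint32_t rem = num % step;                    // returned in r0
 *          return (uint64_t)quot << 32 | rem;            // packed as r1:r0
 *      }
 */
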
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         r2, r0, #VECSHIFT
        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
        add         r0, r0, r2
#if defined(ARCH_ARM_USE_UDIV)
        udiv        r1, r0, r2
        mls         r0, r1, r2, r0
#else
        clz         r3, r2
        clz         r1, r0
        subs        r3, r3, r1
        movlt       r3, #0
        mov         r1, #1
        lsl         r2, r2, r3
        lsl         r3, r1, r3
        mov         r1, #0
1:      cmp         r2, r0
        addls       r1, r3
        subls       r0, r2
        lsrs        r3, r3, #1
        lsr         r2, r2, #1
        bne         1b
#endif
        bx          lr
END(rsdIntrinsicResize_oscctl_K)

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
.set OSC_STORE, (BUFFER_SIZE + 0)
.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
.set OSCCTL_STORE, (BUFFER_SIZE + 8)
.set AVAIL_STORE, (BUFFER_SIZE + 16)
.set SP_STORE, (BUFFER_SIZE + 24)   /* should be +20, but rounded up to make a legal constant somewhere */

/* void rsdIntrinsicResizeB\comp\()_K(
 *             uint8_t * restrict dst,          // r0
 *             size_t count,                    // r1
 *             uint32_t xf,                     // r2
 *             uint32_t xinc,                   // r3
 *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
 *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
 *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
 *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
 *             size_t xclip,                    // [sp,#16] -> [sp,#120]
 *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
 *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
 *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
            vpush       {d8-d15}

            /* align the working buffer on the stack to make it easy to use bit
             * twiddling for address calculations and bounds tests.
             */
            sub         r12, sp, #BUFFER_SIZE + 32
            mov         lr, sp
            bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
            mov         sp, r12
            str         lr, [sp,#SP_STORE]

            ldr         r8, [lr,#136]           // yr
            adr         r9, 8f
            vld1.s32    {q4}, [r8]
            vld1.s16    {q5}, [r9]
            vqmovun.s32 d8, q4                  // yr
            vdup.s16    q6, r2
            vdup.s16    q7, r3
            vmla.s16    q6, q5, q7              // vxf
            vshl.s16    q7, q7, #VECSHIFT       // vxinc

            ldrd        r4,r5, [lr,#104]        // srcn, src0
            ldrd        r6,r7, [lr,#112]        // src1, src2

            /* Compute starting condition for oscillator used to compute ahead
             * of time how many iterations are possible before needing to
             * refill the working buffer.  This is based on the fixed-point
             * index of the last element in the vector of pixels processed in
             * each iteration, counting up until it would overflow.
             */
            sub         r8, r2, r3
            mov         r9, r3, LSL #VECSHIFT
            add         r8, r8, r9

            ldrd        r10,r11, [lr,#128]      // osc_ctl

            str         r8, [sp,#OSC_STORE]
            str         r9, [sp,#OSCSTEP_STORE]
            str         r10, [sp,#OSCCTL_STORE]
            str         r11, [sp,#OSCCTL_STORE+4]
            ldrd        r10,r11, [lr,#120]      // xclip,avail


            /* r4-r7 contain pointers to the four lines of input to be
             * convolved.  These pointers have been clamped vertically and
             * horizontally (which is why it's not a simple row/stride pair),
             * and the xclip argument (now in r10) indicates how many pixels
             * from true the x position of the pointer is.  This value should
             * be 0, 1, or 2 only.
             *
             * Start by placing four pixels worth of input at the far end of
             * the buffer.  As many as two of these may be clipped, so four
             * pixels are fetched, and then the first pixel is duplicated and
             * the data shifted according to xclip.  The source pointers are
             * then also adjusted according to xclip so that subsequent fetches
             * match.
             */
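            /* Worked example (illustrative): for uchar1 data with xclip == 1,
             * the four pixels fetched are p0 p1 p2 p3, and what ends up at the
             * far end of the buffer is p0 p0 p1 p2 -- the first pixel
             * duplicated and the data shifted along by xclip -- after which
             * the source pointers are rewound by xclip so the next fetch
             * lines up.
             */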
            vmov        d6, d8  /* make y coeffs available for vert4 and vert8 macros */

            sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
            add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
            add         r8, r8, #4 * COMPONENT_COUNT * 2
.if \comp == 1
            vert4
            vdup.s16    d24, d25[0]
            vst1.s16    {q12}, [r12]
            vld1.s16    {d24}, [r8]
            vst1.s16    {d24}, [r9]
.elseif \comp == 2
            vert8
            vdup.u32    q11, d24[0]
            vst1.s16    {q11,q12}, [r12]
            vld1.s16    {q12}, [r8]
            vst1.s16    {q12}, [r9]
.elseif \comp == 4
            vert8       d28, d29
            vert8       d30, d31
            vmov.u64    d24, d28
            vmov.u64    d25, d28
            vmov.u64    d26, d28
            vmov.u64    d27, d28
            vst1.s16    {q12,q13}, [r12]!
            vst1.s16    {q14,q15}, [r12]
            sub         r12, r12, #32
            vld1.s16    {q11,q12}, [r8]
            vst1.s16    {q11,q12}, [r9]
.endif
            /* Count off four pixels into the working buffer, and move count to
             * its new home.
             */
            sub         lr, r11, #4
            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
             * were read unconditionally, but some may have been discarded by
             * xclip, so we rewind the pointers to compensate.
             */
            sub         r4, r4, r10, LSL #COMPONENT_SHIFT
            sub         r5, r5, r10, LSL #COMPONENT_SHIFT
            sub         r6, r6, r10, LSL #COMPONENT_SHIFT
            sub         r7, r7, r10, LSL #COMPONENT_SHIFT

            /* First tap starts where we just pre-filled, at the end of the
             * buffer.
             */
            add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16

            /* Use overflowing arithmetic to implement wraparound array
             * indexing.
             */
            mov         r2, r2, LSL #(15 - CHUNKSHIFT)
            mov         r3, r3, LSL #(15 - CHUNKSHIFT)
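            /* Illustrative C view of the indexing set up above (variable names
             * are ad hoc):
             *
             *      xf <<= 15 - CHUNKSHIFT;           // as done to r2/r3 here
             *      idx = xf >> (31 - CHUNKSHIFT);    // 0 .. 2*CHUNKSIZE-1
             *
             * so when the 32-bit xf overflows during xf += xinc, the recovered
             * index wraps around the 2 * CHUNKSIZE entry scratch buffer for
             * free.
             */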

            str         lr, [sp,#AVAIL_STORE]

            /* Start of outermost loop.
             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
             * number of iterations of the inner loop that can be performed and
             * get into that.
             *
             * The fill is complicated by the possibility of running out of
             * input before the scratch buffer is filled.  If this isn't a risk
             * then it's handled by the simple loop at 2:, otherwise the
             * horrible loop at 3:.
             */
1:          ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
            vmov        d6, d8              /* put y scaling coefficients somewhere handy */
            subs        lr, #CHUNKSIZE
            bge         2f                  /* if at least CHUNKSIZE are available... */
            add         lr, #CHUNKSIZE      /* if they're not... */
            b           4f
            /* ..just sneaking a literal in here after this unconditional branch.. */
8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
            /* basic fill loop, processing 8 bytes at a time until there are
             * fewer than eight bytes available.
             */
3:          vert8
            sub         lr, lr, #8 / COMPONENT_COUNT
            vst1.s16    {q12}, [r12]!
4:          cmp         lr, #8 / COMPONENT_COUNT - 1
            bgt         3b
.if \comp == 4
            blt         3f
            /* The last pixel (four bytes) if necessary */
            vert4
.else
            cmp         lr, #1
            blt         3f
            /* The last pixels if necessary */
            sub         r4, r4, #8
            sub         r5, r5, #8
            sub         r6, r6, #8
            sub         r7, r7, #8
            add         r4, r4, lr, LSL #COMPONENT_SHIFT
            add         r5, r5, lr, LSL #COMPONENT_SHIFT
            add         r6, r6, lr, LSL #COMPONENT_SHIFT
            add         r7, r7, lr, LSL #COMPONENT_SHIFT
            vert8
            sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
            sub         sp, sp, #32
            sub         lr, lr, #16
.if \comp == 1
            vdup.s16    q13, d25[3]
.elseif \comp == 2
            vdup.u32    q13, d25[1]
.endif
            vst1.s16    {q12,q13}, [sp]
            vld1.s16    {q12}, [lr]
            add         sp, sp, #32
            b           4f
.endif
            /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
            vdup.s16    q12, d25[3]
.elseif \comp == 2
            vdup.u32    q12, d25[1]
.elseif \comp == 4
            vmov.u64    d24, d25
.endif
4:          vst1.s16    {q12}, [r12]!
            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         3b
            b           4f

.align 4
2:          /* Quickly pull a chunk of data into the working buffer.
             */
            vert8
            vst1.s16    {q12}, [r12]!
            vert8
            vst1.s16    {q12}, [r12]!
            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         2b
            cmp         lr, #0
            bne         3f
4:          /* if we end with 0 pixels left we'll have nothing handy to spread
             * across to the right, so we rewind a bit.
             */
            mov         lr, #1
            sub         r4, r4, #COMPONENT_COUNT
            sub         r5, r5, #COMPONENT_COUNT
            sub         r6, r6, #COMPONENT_COUNT
            sub         r7, r7, #COMPONENT_COUNT
3:          str         lr, [sp,#AVAIL_STORE]       /* done with available pixel count */
            add         lr, sp, #OSC_STORE
            ldrd        r8,r9, [lr,#0]              /* need osc, osc_step soon */
            ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */

            /* copy four taps (width of cubic window) to far end for overflow
             * address handling
             */
            sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
            eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            vld1.s16    {d28}, [lr]
.elseif \comp == 2
            vld1.s16    {q14}, [lr]
.elseif \comp == 4
            vld1.s16    {q14,q15}, [lr]
.endif
            add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            vst1.s16    {d28}, [lr]
.elseif \comp == 2
            vst1.s16    {q14}, [lr]
.elseif \comp == 4
            vst1.s16    {q14,q15}, [lr]
.endif
            /* r11 contains the maximum possible iteration count, but if r8 is
             * greater than r10 then this indicates that the count must be
             * reduced by one for this iteration to avoid reading past the end
             * of the available data.
             */
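            /* In C terms (names illustrative only):
             *      iters = (osc > threshold) ? max_iters - 1 : max_iters;
             * implemented branchlessly below: CMP sets carry when
             * threshold >= osc, and SBC then subtracts the resulting borrow.
             */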
            cmp         r10, r8
            sbc         lr, r11, #0

            mla         r8, lr, r9, r8
            sub         r8, r8, #(CHUNKSIZE << 16)

            str         r8, [sp,#OSC_STORE]         /* done with osc */

            /* prefer to count pixels, rather than vectors, to clarify the tail
             * store case on exit.
             */
            mov         lr, lr, LSL #VECSHIFT
            cmp         lr, r1
            movgt       lr, r1

            sub         r1, r1, lr

            mov         lr, lr, LSL #COMPONENT_SHIFT

            vmov.i16    d10, #3
            vmov.i16    d11, #0x8000

            cmp         lr, #0
            bgt         3f
            cmp         r1, #0
            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
            b           9f

            .align 4
2:          /* Inner loop continues here, but starts at 3:, see end of loop
             * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
            vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
            vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
            vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
            vst1.u8     {q8,q9}, [r0]!
.endif
            /* Inner loop:  here the four x coefficients for each tap are
             * calculated in vector code, and the addresses are calculated in
             * scalar code, and these calculations are interleaved.
             */
3:          vshr.u16    q8, q6, #1
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            vqrdmulh.s16 q9, q8, q8
            add         r2, r2, r3
            vqrdmulh.s16 q10, q9, q8
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            vshll.s16   q11, d18, #2
            vshll.s16   q12, d19, #2
            add         r2, r2, r3
            vmlsl.s16   q11, d20, d10
            vmlsl.s16   q12, d21, d10
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)

            vhadd.s16   q0, q10, q8
            add         r2, r2, r3
            vsub.s16    q0, q9, q0
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)

            vaddw.s16   q1, q11, d18
            vaddw.s16   q13, q12, d19
            add         r2, r2, r3
            vshrn.s32   d2, q1, #1
            vshrn.s32   d3, q13, #1
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            vsub.s16    d2, d2, d11
            vsub.s16    d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)

            vaddw.s16   q2, q11, d16
            vaddw.s16   q13, q12, d17
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            vshrn.s32   d4, q2, #1
            vshrn.s32   d5, q13, #1
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vneg.s16    q2, q2

            vhsub.s16   q3, q10, q9

            /* increment the x fractional parts (overflow is ignored, as the
             * scalar arithmetic shadows this addition with full precision).
             */
            vadd.s16    q6, q6, q7

            /* At this point we have four pointers in r8-r11, pointing to the
             * four taps in the scratch buffer that must be convolved together
             * to produce an output pixel (one output pixel per pointer).
             * These pointers usually overlap, but their spacing is irregular
             * so resolving the redundancy through L1 is a pragmatic solution.
             *
             * The scratch buffer is made of signed 16-bit data, holding over
             * some extra precision, and overshoot, from the vertical pass.
             *
             * We also have the 16-bit unsigned fixed-point weights for each
             * of the four taps in q0 - q3.  That's eight pixels worth of
             * coefficients when we have only four pointers, so calculations
             * for four more pixels are interleaved with the fetch and permute
             * code for each variant in the following code.
             *
             * The data arrangement is less than ideal for any pixel format,
             * but permuting loads help to mitigate most of the problems.
             *
             * Note also that the two outside taps of a bicubic are negative,
             * but these coefficients are unsigned.  The sign is hard-coded by
             * use of multiply-and-subtract operations.
             */
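            /* As a scalar sketch (names illustrative), each output pixel in
             * all three variants below follows the same accumulate pattern,
             * where t0-t3 are the four taps and w0-w3 the coefficients from
             * q0-q3:
             *
             *      int32_t acc = t0*w0 - t1*w1 - t2*w2 + t3*w3;
             *      int16_t mid = sat_s16(round_shift(acc, 15));
             *      uint8_t out = sat_u8(round_shift(mid, VERTBITS - 8));
             *
             * with sat_/round_shift standing for the saturating, rounding
             * narrowing shifts performed by vqrshrn/vqrshrun.
             */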
.if \comp == 1
            /* The uchar1 case.
             * Issue one lanewise vld4.s16 to load the four consecutive taps
             * for one pointer (one pixel) into four different registers; then
             * load four consecutive s16 values from the next pointer (pixel)
             * into the next lane of those four registers, etc., so that we
             * finish with q12 - q15 representing the four taps, and each lane
             * representing a separate pixel.
             *
             * The first vld4 uses a splat to avoid any false dependency on
             * the previous state of the register.
             */
            vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
            vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
            vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]

            vmull.s16   q8, d24, d0
            vmull.s16   q9, d25, d1
            vmlsl.s16   q8, d26, d2
            vmlsl.s16   q9, d27, d3
            vmlsl.s16   q8, d28, d4
            vmlsl.s16   q9, d29, d5
            vmlal.s16   q8, d30, d6
            vmlal.s16   q9, d31, d7

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d16, q8, #15
            vqrshrn.s32 d17, q9, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
.elseif \comp == 2
            /* The uchar2 case:
             * This time load pairs of values into adjacent lanes in q12 - q15
             * by aliasing them as u32 data; leaving room for only four pixels,
             * so the process has to be done twice.  This also means that the
             * coefficient registers fail to align with the coefficient data
             * (eight separate pixels), so that has to be doubled-up to match.
             */
            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            /* double-up coefficients to align with component pairs */
            vmov        d20, d0
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vmov        d21, d2
            vmov        d22, d4
            vmov        d23, d6
            vzip.s16    d0, d20
            vzip.s16    d2, d21
            vzip.s16    d4, d22
            vzip.s16    d6, d23

            vmull.s16   q8, d24, d0
            vmull.s16   q9, d25, d20
            vmlsl.s16   q8, d26, d2
            vmlsl.s16   q9, d27, d21
            vmlsl.s16   q8, d28, d4
            vmlsl.s16   q9, d29, d22
            vmlal.s16   q8, d30, d6
            vmlal.s16   q9, d31, d23

            vqrshrn.s32 d16, q8, #15
            vqrshrn.s32 d17, q9, #15

            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]

            /* double-up coefficients to align with component pairs */
            vmov        d0, d1
            vmov        d2, d3
            vmov        d4, d5
            vmov        d6, d7
            vzip.s16    d0, d1
            vzip.s16    d2, d3
            vzip.s16    d4, d5
            vzip.s16    d6, d7

            vmull.s16   q10, d24, d0
            vmull.s16   q11, d25, d1
            vmlsl.s16   q10, d26, d2
            vmlsl.s16   q11, d27, d3
            vmlsl.s16   q10, d28, d4
            vmlsl.s16   q11, d29, d5
            vmlal.s16   q10, d30, d6
            vmlal.s16   q11, d31, d7

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
            vqrshrun.s16 d17, q9, #VERTBITS - 8
.elseif \comp == 4
            /* The uchar4 case.
             * This case is comparatively painless because four s16s are the
             * smallest addressable unit for a vmul-by-scalar.  Rather than
             * permute the data, simply arrange the multiplies to suit the way
             * the data comes in.  That's a lot of data, though, so things
             * progress in pairs of pixels at a time.
             */
            vld1.s16    {q12,q13}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld1.s16    {q14,q15}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            vmull.s16   q8, d24, d0[0]
            vmull.s16   q9, d28, d0[1]
            vmlsl.s16   q8, d25, d2[0]
            vmlsl.s16   q9, d29, d2[1]
            vmlsl.s16   q8, d26, d4[0]
            vmlsl.s16   q9, d30, d4[1]
            vmlal.s16   q8, d27, d6[0]
            vmlal.s16   q9, d31, d6[1]

            /* And two more...  */
            vld1.s16    {q12,q13}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld1.s16    {q14,q15}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            vqrshrn.s32 d16, q8, #15
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vqrshrn.s32 d17, q9, #15

            vmull.s16   q10, d24, d0[2]
            vmull.s16   q11, d28, d0[3]
            vmlsl.s16   q10, d25, d2[2]
            vmlsl.s16   q11, d29, d2[3]
            vmlsl.s16   q10, d26, d4[2]
            vmlsl.s16   q11, d30, d4[3]
            vmlal.s16   q10, d27, d6[2]
            vmlal.s16   q11, d31, d6[3]

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
            vqrshrun.s16 d17, q9, #VERTBITS - 8

            /* And two more...  */
            vld1.s16    {q12,q13}, [r8]
            vld1.s16    {q14,q15}, [r9]

            vmull.s16   q10, d24, d1[0]
            vmull.s16   q11, d28, d1[1]
            vmlsl.s16   q10, d25, d3[0]
            vmlsl.s16   q11, d29, d3[1]
            vmlsl.s16   q10, d26, d5[0]
            vmlsl.s16   q11, d30, d5[1]
            vmlal.s16   q10, d27, d7[0]
            vmlal.s16   q11, d31, d7[1]

            /* And two more...  */
            vld1.s16    {q12,q13}, [r10]
            vld1.s16    {q14,q15}, [r11]

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vmull.s16   q10, d24, d1[2]
            vmull.s16   q11, d28, d1[3]
            vmlsl.s16   q10, d25, d3[2]
            vmlsl.s16   q11, d29, d3[3]
            vmlsl.s16   q10, d26, d5[2]
            vmlsl.s16   q11, d30, d5[3]
            vmlal.s16   q10, d27, d7[2]
            vmlal.s16   q11, d31, d7[3]

            vqrshrn.s32 d20, q10, #15
            vqrshrn.s32 d21, q11, #15

            vqrshrun.s16 d18, q9, #VERTBITS - 8
            vqrshrun.s16 d19, q10, #VERTBITS - 8
.endif
            bgt         2b      /* continue inner loop */
            /* The inner loop has already been limited to ensure that none of
             * the earlier iterations could overfill the output, so the store
             * appears within the loop but after the conditional branch (at the
             * top).  At the end, provided it won't overfill, perform the final
             * store here.  If it would, then break out to the tricky tail case
             * instead.
             */
            blt         1f
            /* Store the amount of data appropriate to the configuration of the
             * instance being assembled.
             */
.if LOOP_OUTPUT_SIZE == 4
            vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
            vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
            vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
            vst1.u8     {q8,q9}, [r0]!
.endif
            b           1b              /* resume outer loop */
            /* Partial tail store case:
             * Different versions of the code need different subsets of the
             * following partial stores.  Here the number of components and the
             * size of the chunk of data produced by each inner loop iteration
             * is tested to figure out whether or not each phrase is relevant.
             */
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:          tst         lr, #16
            beq         1f
            vst1.u8     {q8}, [r0]!
            vmov        q8, q9
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:          tst         lr, #8
            beq         1f
            vst1.u8     {d16}, [r0]!
            vmov.u8     d16, d17
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:          tst         lr, #4
            beq         1f
            vst1.u32    {d16[0]}, [r0]!
            vext.u32    d16, d16, d16, #1
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:          tst         lr, #2
            beq         1f
            vst1.u16    {d16[0]}, [r0]!
            vext.u16    d16, d16, d16, #1
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:          tst         lr, #1
            beq         1f
            vst1.u8     {d16[0]}, [r0]!
.endif
1:
9:          ldr         sp, [sp,#SP_STORE]
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
END(rsdIntrinsicResizeB\comp\()_K)
.endr