1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
18#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
19#define END(f) .fnend; .size f, .-f;
20
21#define ARCH_ARM_USE_BLUR_PRELOAD
22
23.eabi_attribute 25,1 @Tag_ABI_align8_preserved
24.arm
25
26/* Number of fractional bits to preserve in intermediate results.  The
27 * intermediate storage is 16-bit, and we started with 8 bit data (the integer
28 * part), so this should be between 0 and 8.
29 */
30.set FRACTION_BITS, 7
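/* For reference, a rough C sketch of the fixed-point flow this implies,
 * assuming (as the code below does) a coefficient table of unsigned Q0.16
 * values, and ignoring the saturation that the vqrshrn instructions provide.
 * The function names and the stdint.h types are illustrative only and are not
 * part of this file:
 *
 *      // vertical pass: u8 pixel data times Q0.16 coefficients accumulates
 *      // in 32 bits, then narrows to u16 keeping FRACTION_BITS fraction bits.
 *      uint16_t vert_narrow(uint32_t acc)
 *      {
 *          return (uint16_t)((acc + (1u << (15 - FRACTION_BITS)))
 *                                          >> (16 - FRACTION_BITS));
 *      }
 *
 *      // horizontal pass: u16 intermediates times Q0.16 coefficients again
 *      // accumulate in 32 bits; dropping 16 bits and then FRACTION_BITS bits
 *      // (both with rounding) recovers the 8-bit result.
 *      uint8_t horz_narrow(uint32_t acc)
 *      {
 *          uint16_t mid = (uint16_t)((acc + 0x8000u) >> 16);
 *          return (uint8_t)((mid + (1u << (FRACTION_BITS - 1))) >> FRACTION_BITS);
 *      }
 */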
31
32.set MAX_R, 25
33
34
35/* A quick way of making a line of code conditional on some other condition.
36 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
37 * `ifcc`:
38 */
39.macro ifcc zzz:vararg
40.if cc
41            \zzz
42.endif
43.endm
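/* For example, with `.set cc, 1` the line `ifcc add r0, r0, #1` assembles the
 * add as normal, and with `.set cc, 0` it assembles to nothing at all.
 */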
44
45/* It's not always clear that prefetching is beneficial and this needs further
46 * testing on different cores, so it's made switchable here.
47 */
48#if defined(ARCH_ARM_USE_BLUR_PRELOAD)
49#define VERTPLD(...) pld [__VA_ARGS__]
50#else
51#define VERTPLD(...) nop
52#endif
53
54/* Fetch 16 columns of bytes (regardless of image format), convolve these
55 * vertically, and leave them in the register file.  If working near the top or
56 * bottom of an image then clamp the addressing while loading the data in.
57 *
58 * The convolution is fully unrolled for windows up to max_r, with the
59 * outermost edges calculated first.  This way it's possible to branch directly
60 * into the relevant part of the code for an arbitrary convolution radius.  Two
61 * variants of the loop are produced; one eliminates the clamping code for a
62 * slight speed advantage.
63 *
64 * Where the macro is called with reg=x, the specified register is taken to
65 * contain a pre-calculated pointer into one of the two loops.
66 *
67 * Input:
68 *      r1 -- src
69 *      r2 -- pitch
70 *      r5 -- r
71 *      r6 -- rup (r, unless clipped to top of source image)
72 *      r7 -- rdn (r, unless clipped to bottom of source image)
73 *      r12 -- switch index
74 *      q0-q3 -- coefficient table
75 * Output:
76 *      r1 += 16
77 *      q10,q11 -- 16 convolved columns
78 * Modifies:
79 *      r10 = upper row pointer
80 *      r11 = lower row pointer
81 *      q12-q15 = temporary sums
82 */
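/* As a rough scalar picture of what this macro computes for one of the 16
 * columns (a hedged C sketch -- the names, the clamp-by-index form, and the
 * FRACTION_BITS/stdint definitions are assumptions for illustration, not part
 * of this file):
 *
 *      uint16_t vert_column(const uint8_t *src, int pitch, int height,
 *                           int x, int y, const uint16_t *coef, int r)
 *      {
 *          uint32_t acc = (uint32_t)src[y * pitch + x] * coef[0];
 *          for (int i = 1; i <= r; i++) {
 *              int up = y - i < 0 ? 0 : y - i;                 // rup clamp
 *              int dn = y + i >= height ? height - 1 : y + i;  // rdn clamp
 *              acc += (uint32_t)(src[up * pitch + x] + src[dn * pitch + x])
 *                                                            * coef[i];
 *          }
 *          // vqrshrn.u32 #(16 - FRACTION_BITS), minus the saturation
 *          return (uint16_t)((acc + (1u << (15 - FRACTION_BITS)))
 *                                          >> (16 - FRACTION_BITS));
 *      }
 */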
83.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/
84  .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
85
86            vld1.8      {d30,d31}, [r1]
87            mls         r10, r2, r6, r1
88
89            vmovl.u8    q14, d30
90            VERTPLD(r1, #32)
91            vmovl.u8    q15, d31
92  .if \max_r < 16 // approximate
93    ifcc    adr         \reg, 1f
94  .else
95    ifcc    ldr         \reg, 2f
1:  ifcc    add         \reg, \reg, pc
97  .endif
98
99            vmull.u16   q12, d28, d0[0]
100    ifcc    sub         \reg, r5, LSL #6
101            vmull.u16   q13, d29, d0[0]
102            mla         r11, r2, r7, r1
103            vmull.u16   q14, d30, d0[0]
104            add         r1, r1, #16
105            vmull.u16   q15, d31, d0[0]
106            bx          \reg
107
108     ifcc   .align 2
109  2: ifcc   .word       1f-1b-8
110
111  /* This version of the vertical fetch loop body is used away from the edges
112   * of the source image.  The pointers start at the top and bottom source rows
113   * and work their way towards the centre on each iteration.  This way the
114   * number of taps used can be controlled by jumping directly into the middle
115   * of the loop and running to completion.
   * If the loop body changes size then the code which calculates the address of
   * the initial iteration must be updated accordingly.
118   */
119  .macro vertfetch_noclamp i, dreg
120    .if 0 < \i && \i <= \max_r
121            vld1.8      {d20,d21}, [r10], r2
122            vld1.8      {d22,d23}, [r11]
123            sub         r11, r11, r2
124            vswp        d21, d22
125            VERTPLD(r10, #32)
126            vaddl.u8    q10, d20, d21
127            vaddl.u8    q11, d22, d23
128            vmlal.u16   q12, d20, \dreg
129            VERTPLD(r11, #32)
130            vmlal.u16   q13, d21, \dreg
131            vmlal.u16   q14, d22, \dreg
132            vmlal.u16   q15, d23, \dreg
133    .endif
134  .endm
135
136  /* This version of the vertical fetch loop body is used near the edges of the
137   * source image, where one or both of the accesses may start with a clamped
138   * value, and the row addresses only begin to change after some number of
139   * iterations before the end.
   * If the loop body changes size then the code which calculates the address of
   * the initial iteration must be updated accordingly.
142   */
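  /* Note that the clamped loop body below assembles to 16 ARM instructions
   * (64 bytes) per tap -- the trailing nop appears to exist to keep it that
   * size -- which is what the `sub \reg, r5, LSL #6` entry-point calculation
   * earlier in the macro relies on.
   */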
143  .macro vertfetch_clamped i, dreg
144    .if 0 < \i && \i <= \max_r
145            vld1.8      {d20,d21}, [r10]
146            vld1.8      {d22,d23}, [r11]
147            cmp         r6, #\i
148            vswp        d21, d22
149            VERTPLD(r10, #32)
150            vaddl.u8    q10, d20, d21
151            addhs       r10, r10, r2
152            vaddl.u8    q11, d22, d23
153            cmp         r7, #\i
154            vmlal.u16   q12, d20, \dreg
155            VERTPLD(r11, #32)
156            vmlal.u16   q13, d21, \dreg
157            subhs       r11, r11, r2
158            vmlal.u16   q14, d22, \dreg
159            nop
160            vmlal.u16   q15, d23, \dreg
161    .endif
162  .endm
163
164  /* Entry into this unrolled loop is computed as a negative index from
165   * \labelc at the end of the block.
166   */
167  .align 4
168  vertfetch_clamped 27, d6[3]
169  vertfetch_clamped 26, d6[2]
170  vertfetch_clamped 25, d6[1]
171  vertfetch_clamped 24, d6[0]
172  vertfetch_clamped 23, d5[3]
173  vertfetch_clamped 22, d5[2]
174  vertfetch_clamped 21, d5[1]
175  vertfetch_clamped 20, d5[0]
176  vertfetch_clamped 19, d4[3]
177  vertfetch_clamped 18, d4[2]
178  vertfetch_clamped 17, d4[1]
179  vertfetch_clamped 16, d4[0]
180  vertfetch_clamped 15, d3[3]
181  vertfetch_clamped 14, d3[2]
182  vertfetch_clamped 13, d3[1]
183  vertfetch_clamped 12, d3[0]
184  vertfetch_clamped 11, d2[3]
185  vertfetch_clamped 10, d2[2]
186  vertfetch_clamped  9, d2[1]
187  vertfetch_clamped  8, d2[0]
188  vertfetch_clamped  7, d1[3]
189  vertfetch_clamped  6, d1[2]
190  vertfetch_clamped  5, d1[1]
191  vertfetch_clamped  4, d1[0]
192  vertfetch_clamped  3, d0[3]
193  vertfetch_clamped  2, d0[2]
194  vertfetch_clamped  1, d0[1]
195  vertfetch_clamped  0, d0[0]
196  1:
197  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */
198
199  /* Entry into this unrolled loop is computed as a negative index from
200   * \labelnc at the end of the block.
201   */
202  .align 4
203  vertfetch_noclamp 27, d6[3]
204  vertfetch_noclamp 26, d6[2]
205  vertfetch_noclamp 25, d6[1]
206  vertfetch_noclamp 24, d6[0]
207  vertfetch_noclamp 23, d5[3]
208  vertfetch_noclamp 22, d5[2]
209  vertfetch_noclamp 21, d5[1]
210  vertfetch_noclamp 20, d5[0]
211  vertfetch_noclamp 19, d4[3]
212  vertfetch_noclamp 18, d4[2]
213  vertfetch_noclamp 17, d4[1]
214  vertfetch_noclamp 16, d4[0]
215  vertfetch_noclamp 15, d3[3]
216  vertfetch_noclamp 14, d3[2]
217  vertfetch_noclamp 13, d3[1]
218  vertfetch_noclamp 12, d3[0]
219  vertfetch_noclamp 11, d2[3]
220  vertfetch_noclamp 10, d2[2]
221  vertfetch_noclamp  9, d2[1]
222  vertfetch_noclamp  8, d2[0]
223  vertfetch_noclamp  7, d1[3]
224  vertfetch_noclamp  6, d1[2]
225  vertfetch_noclamp  5, d1[1]
226  vertfetch_noclamp  4, d1[0]
227  vertfetch_noclamp  3, d0[3]
228  vertfetch_noclamp  2, d0[2]
229  vertfetch_noclamp  1, d0[1]
230  vertfetch_noclamp  0, d0[0]
231  \labelnc :
232
233  .purgem vertfetch_clamped
234  .purgem vertfetch_noclamp
235
236  2:        vqrshrn.u32 d20, q12, #16 - FRACTION_BITS
237            vqrshrn.u32 d21, q13, #16 - FRACTION_BITS
238            vqrshrn.u32 d22, q14, #16 - FRACTION_BITS
239            vqrshrn.u32 d23, q15, #16 - FRACTION_BITS
240.endm /*}}}*/
241
242/* Some portion of the convolution window (as much as will fit, and all of it
243 * for the uchar1 cases) is kept in the register file to avoid unnecessary
244 * memory accesses.  This forces the horizontal loops to be unrolled because
245 * there's no indexed addressing into the register file.
246 *
247 * As in the fetch macro, the operations are ordered from outside to inside, so
248 * that jumping into the middle of the block bypasses the unwanted window taps.
249 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius, the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * fetching more data that won't be used, and that rotating the window involves
 * more mov operations.
255 *
256 * When the buffer gets too big the buffer at [r9] is used.
257 *
258 * Input:
 *      q4-q11 -- convolution window
260 *      r9 -- pointer to additional convolution window data
261 * Output:
262 *      r9 -- updated buffer pointer (if used)
263 *      d31 -- result to be stored
264 * Modifies:
265 *      r12 -- temp buffer pointer
266 *      q12-q13 -- temporaries for load and vext operations.
267 *      q14-q15 -- intermediate sums
268 */
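/* As a rough scalar picture of one output pixel of the uchar1 horizontal pass
 * (a hedged C sketch; `win` stands for the vertically-convolved u16
 * intermediates, and the names and FRACTION_BITS definition are assumptions
 * for illustration -- the code below replaces the loop with unrolled taps and
 * a jump table, and vqrshrn additionally saturates):
 *
 *      uint8_t horz_pixel(const uint16_t *win, int x,
 *                         const uint16_t *coef, int r)
 *      {
 *          uint32_t acc = (uint32_t)win[x] * coef[0];
 *          for (int i = 1; i <= r; i++)
 *              acc += (uint32_t)(win[x - i] + win[x + i]) * coef[i];
 *          uint16_t mid = (uint16_t)((acc + 0x8000u) >> 16);
 *          return (uint8_t)((mid + (1u << (FRACTION_BITS - 1)))
 *                                          >> FRACTION_BITS);
 *      }
 */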
269#define TUNED_LIST1 8, 16
270.macro hconv1_8/*{{{*/
271            vmull.u16   q14, d18, d0[0]
272            vmull.u16   q15, d19, d0[0]
273
274            ldr         r12, [pc, r5, LSL #2]
275            add         pc, pc, r12
276            bkpt
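            /* The ldr/add pair above is a computed jump on the radius in r5:
             * because the PC reads as two instructions ahead, the ldr picks up
             * the word at 100b + (r5 - 1) * 4 and the add then lands on label
             * 10<r5>.  The bkpt occupies the slot that r5 == 0 would index (a
             * radius of zero is never dispatched here) and is never executed.
             * The same dispatch pattern is used by all of the hconv macros.
             */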
277    100:    .word 101f-100b
278            .word 102f-100b
279            .word 103f-100b
280            .word 104f-100b
281            .word 105f-100b
282            .word 106f-100b
283            .word 107f-100b
284            .word 108f-100b
285    108:    vmlal.u16   q14, d16, d2[0]
286            vmlal.u16   q15, d17, d2[0]
287            vmlal.u16   q14, d20, d2[0]
288            vmlal.u16   q15, d21, d2[0]
289    107:    vext.u16    q12, q8, q9, #1
290            vext.u16    q13, q9, q10, #7
291            vmlal.u16   q14, d24, d1[3]
292            vmlal.u16   q15, d25, d1[3]
293            vmlal.u16   q14, d26, d1[3]
294            vmlal.u16   q15, d27, d1[3]
295    106:    vext.u16    q12, q8, q9, #2
296            vext.u16    q13, q9, q10, #6
297            vmlal.u16   q14, d24, d1[2]
298            vmlal.u16   q15, d25, d1[2]
299            vmlal.u16   q14, d26, d1[2]
300            vmlal.u16   q15, d27, d1[2]
301    105:    vext.u16    q12, q8, q9, #3
302            vext.u16    q13, q9, q10, #5
303            vmlal.u16   q14, d24, d1[1]
304            vmlal.u16   q15, d25, d1[1]
305            vmlal.u16   q14, d26, d1[1]
306            vmlal.u16   q15, d27, d1[1]
307    104:    //vext.u16    q12, q8, q9, #4
308            //vext.u16    q13, q9, q10, #4
309            vmlal.u16   q14, d17, d1[0]
310            vmlal.u16   q15, d18, d1[0]
311            vmlal.u16   q14, d19, d1[0]
312            vmlal.u16   q15, d20, d1[0]
313    103:    vext.u16    q12, q8, q9, #5
314            vext.u16    q13, q9, q10, #3
315            vmlal.u16   q14, d24, d0[3]
316            vmlal.u16   q15, d25, d0[3]
317            vmlal.u16   q14, d26, d0[3]
318            vmlal.u16   q15, d27, d0[3]
319    102:    vext.u16    q12, q8, q9, #6
320            vext.u16    q13, q9, q10, #2
321            vmlal.u16   q14, d24, d0[2]
322            vmlal.u16   q15, d25, d0[2]
323            vmlal.u16   q14, d26, d0[2]
324            vmlal.u16   q15, d27, d0[2]
325    101:    vext.u16    q12, q8, q9, #7
326            vext.u16    q13, q9, q10, #1
327            vmlal.u16   q14, d24, d0[1]
328            vmlal.u16   q15, d25, d0[1]
329            vmlal.u16   q14, d26, d0[1]
330            vmlal.u16   q15, d27, d0[1]
331
332            vqrshrn.u32 d28, q14, #16
333            vqrshrn.u32 d29, q15, #16
334            vqrshrn.u16 d31, q14, #FRACTION_BITS
335
336            vmov        q8, q9
337            vmov        q9, q10
338            vmov        q10, q11
339.endm/*}}}*/
340
341.macro hconv1_16/*{{{*/
342            vmull.u16   q14, d16, d0[0]
343            vmull.u16   q15, d17, d0[0]
344
345            ldr         r12, [pc, r5, LSL #2]
346            add         pc, pc, r12
347            bkpt
348    100:    .word 101f-100b
349            .word 102f-100b
350            .word 103f-100b
351            .word 104f-100b
352            .word 105f-100b
353            .word 106f-100b
354            .word 107f-100b
355            .word 108f-100b
356            .word 109f-100b
357            .word 110f-100b
358            .word 111f-100b
359            .word 112f-100b
360            .word 113f-100b
361            .word 114f-100b
362            .word 115f-100b
363            .word 116f-100b
364    116:    //vext.u16    q12, q6, q7, #0
365            //vext.u16    q13, q10, q11, #0
366            vmlal.u16   q14, d12, d4[0]
367            vmlal.u16   q15, d13, d4[0]
368            vmlal.u16   q14, d20, d4[0]
369            vmlal.u16   q15, d21, d4[0]
370    115:    vext.u16    q12, q6, q7, #1
371            vext.u16    q13, q9, q10, #7
372            vmlal.u16   q14, d24, d3[3]
373            vmlal.u16   q15, d25, d3[3]
374            vmlal.u16   q14, d26, d3[3]
375            vmlal.u16   q15, d27, d3[3]
376    114:    vext.u16    q12, q6, q7, #2
377            vext.u16    q13, q9, q10, #6
378            vmlal.u16   q14, d24, d3[2]
379            vmlal.u16   q15, d25, d3[2]
380            vmlal.u16   q14, d26, d3[2]
381            vmlal.u16   q15, d27, d3[2]
382    113:    vext.u16    q12, q6, q7, #3
383            vext.u16    q13, q9, q10, #5
384            vmlal.u16   q14, d24, d3[1]
385            vmlal.u16   q15, d25, d3[1]
386            vmlal.u16   q14, d26, d3[1]
387            vmlal.u16   q15, d27, d3[1]
388    112:    //vext.u16    q12, q6, q7, #4
389            //vext.u16    q13, q9, q10, #4
390            vmlal.u16   q14, d13, d3[0]
391            vmlal.u16   q15, d14, d3[0]
392            vmlal.u16   q14, d19, d3[0]
393            vmlal.u16   q15, d20, d3[0]
394    111:    vext.u16    q12, q6, q7, #5
395            vext.u16    q13, q9, q10, #3
396            vmlal.u16   q14, d24, d2[3]
397            vmlal.u16   q15, d25, d2[3]
398            vmlal.u16   q14, d26, d2[3]
399            vmlal.u16   q15, d27, d2[3]
400    110:    vext.u16    q12, q6, q7, #6
401            vext.u16    q13, q9, q10, #2
402            vmlal.u16   q14, d24, d2[2]
403            vmlal.u16   q15, d25, d2[2]
404            vmlal.u16   q14, d26, d2[2]
405            vmlal.u16   q15, d27, d2[2]
406    109:    vext.u16    q12, q6, q7, #7
407            vext.u16    q13, q9, q10, #1
408            vmlal.u16   q14, d24, d2[1]
409            vmlal.u16   q15, d25, d2[1]
410            vmlal.u16   q14, d26, d2[1]
411            vmlal.u16   q15, d27, d2[1]
412    108:    //vext.u16    q12, q7, q8, #0
413            //vext.u16    q13, q9, q10, #0
414            vmlal.u16   q14, d14, d2[0]
415            vmlal.u16   q15, d15, d2[0]
416            vmlal.u16   q14, d18, d2[0]
417            vmlal.u16   q15, d19, d2[0]
418    107:    vext.u16    q12, q7, q8, #1
419            vext.u16    q13, q8, q9, #7
420            vmlal.u16   q14, d24, d1[3]
421            vmlal.u16   q15, d25, d1[3]
422            vmlal.u16   q14, d26, d1[3]
423            vmlal.u16   q15, d27, d1[3]
424    106:    vext.u16    q12, q7, q8, #2
425            vext.u16    q13, q8, q9, #6
426            vmlal.u16   q14, d24, d1[2]
427            vmlal.u16   q15, d25, d1[2]
428            vmlal.u16   q14, d26, d1[2]
429            vmlal.u16   q15, d27, d1[2]
430    105:    vext.u16    q12, q7, q8, #3
431            vext.u16    q13, q8, q9, #5
432            vmlal.u16   q14, d24, d1[1]
433            vmlal.u16   q15, d25, d1[1]
434            vmlal.u16   q14, d26, d1[1]
435            vmlal.u16   q15, d27, d1[1]
436    104:    //vext.u16    q12, q7, q8, #4
437            //vext.u16    q13, q8, q9, #4
438            vmlal.u16   q14, d15, d1[0]
439            vmlal.u16   q15, d16, d1[0]
440            vmlal.u16   q14, d17, d1[0]
441            vmlal.u16   q15, d18, d1[0]
442    103:    vext.u16    q12, q7, q8, #5
443            vext.u16    q13, q8, q9, #3
444            vmlal.u16   q14, d24, d0[3]
445            vmlal.u16   q15, d25, d0[3]
446            vmlal.u16   q14, d26, d0[3]
447            vmlal.u16   q15, d27, d0[3]
448    102:    vext.u16    q12, q7, q8, #6
449            vext.u16    q13, q8, q9, #2
450            vmlal.u16   q14, d24, d0[2]
451            vmlal.u16   q15, d25, d0[2]
452            vmlal.u16   q14, d26, d0[2]
453            vmlal.u16   q15, d27, d0[2]
454    101:    vext.u16    q12, q7, q8, #7
455            vext.u16    q13, q8, q9, #1
456            vmlal.u16   q14, d24, d0[1]
457            vmlal.u16   q15, d25, d0[1]
458            vmlal.u16   q14, d26, d0[1]
459            vmlal.u16   q15, d27, d0[1]
460
461            vqrshrn.u32 d28, q14, #16
462            vqrshrn.u32 d29, q15, #16
463            vqrshrn.u16 d31, q14, #FRACTION_BITS
464
465            vmov        q6, q7
466            vmov        q7, q8
467            vmov        q8, q9
468            vmov        q9, q10
469            vmov        q10, q11
470.endm/*}}}*/
471
472.macro hconv1_25/*{{{*/
473            vext.u16    q12, q6, q7, #7
474            vmull.u16   q14, d24, d0[0]
475            vmull.u16   q15, d25, d0[0]
476
477            ldr         r12, [pc, r5, LSL #2]
478            add         pc, pc, r12
479            bkpt
480    100:    .word 101f-100b
481            .word 102f-100b
482            .word 103f-100b
483            .word 104f-100b
484            .word 105f-100b
485            .word 106f-100b
486            .word 107f-100b
487            .word 108f-100b
488            .word 109f-100b
489            .word 110f-100b
490            .word 111f-100b
491            .word 112f-100b
492            .word 113f-100b
493            .word 114f-100b
494            .word 115f-100b
495            .word 116f-100b
496            .word 117f-100b
497            .word 118f-100b
498            .word 119f-100b
499            .word 120f-100b
500            .word 121f-100b
501            .word 122f-100b
502            .word 123f-100b
503            .word 124f-100b
504            .word 125f-100b
505    125:    vext.u16    q12, q3, q4, #6
506            vext.u16    q13, q10, q11, #0
507            vmlal.u16   q14, d24, d6[1]
508            vmlal.u16   q15, d25, d6[1]
509            vmlal.u16   q14, d26, d6[1]
510            vmlal.u16   q15, d27, d6[1]
511    124:    vext.u16    q12, q3, q4, #7
512            vext.u16    q13, q9, q10, #7
513            vmlal.u16   q14, d24, d6[0]
514            vmlal.u16   q15, d25, d6[0]
515            vmlal.u16   q14, d26, d6[0]
516            vmlal.u16   q15, d27, d6[0]
517    123:    vext.u16    q12, q4, q5, #0
518            vext.u16    q13, q9, q10, #6
519            vmlal.u16   q14, d24, d5[3]
520            vmlal.u16   q15, d25, d5[3]
521            vmlal.u16   q14, d26, d5[3]
522            vmlal.u16   q15, d27, d5[3]
523    122:    vext.u16    q12, q4, q5, #1
524            vext.u16    q13, q9, q10, #5
525            vmlal.u16   q14, d24, d5[2]
526            vmlal.u16   q15, d25, d5[2]
527            vmlal.u16   q14, d26, d5[2]
528            vmlal.u16   q15, d27, d5[2]
529    121:    vext.u16    q12, q4, q5, #2
530            vext.u16    q13, q9, q10, #4
531            vmlal.u16   q14, d24, d5[1]
532            vmlal.u16   q15, d25, d5[1]
533            vmlal.u16   q14, d26, d5[1]
534            vmlal.u16   q15, d27, d5[1]
535    120:    vext.u16    q12, q4, q5, #3
536            vext.u16    q13, q9, q10, #3
537            vmlal.u16   q14, d24, d5[0]
538            vmlal.u16   q15, d25, d5[0]
539            vmlal.u16   q14, d26, d5[0]
540            vmlal.u16   q15, d27, d5[0]
541    119:    vext.u16    q12, q4, q5, #4
542            vext.u16    q13, q9, q10, #2
543            vmlal.u16   q14, d24, d4[3]
544            vmlal.u16   q15, d25, d4[3]
545            vmlal.u16   q14, d26, d4[3]
546            vmlal.u16   q15, d27, d4[3]
547    118:    vext.u16    q12, q4, q5, #5
548            vext.u16    q13, q9, q10, #1
549            vmlal.u16   q14, d24, d4[2]
550            vmlal.u16   q15, d25, d4[2]
551            vmlal.u16   q14, d26, d4[2]
552            vmlal.u16   q15, d27, d4[2]
553    117:    vext.u16    q12, q4, q5, #6
554            vext.u16    q13, q9, q10, #0
555            vmlal.u16   q14, d24, d4[1]
556            vmlal.u16   q15, d25, d4[1]
557            vmlal.u16   q14, d26, d4[1]
558            vmlal.u16   q15, d27, d4[1]
559    116:    vext.u16    q12, q4, q5, #7
560            vext.u16    q13, q8, q9, #7
561            vmlal.u16   q14, d24, d4[0]
562            vmlal.u16   q15, d25, d4[0]
563            vmlal.u16   q14, d26, d4[0]
564            vmlal.u16   q15, d27, d4[0]
565    115:    vext.u16    q12, q5, q6, #0
566            vext.u16    q13, q8, q9, #6
567            vmlal.u16   q14, d24, d3[3]
568            vmlal.u16   q15, d25, d3[3]
569            vmlal.u16   q14, d26, d3[3]
570            vmlal.u16   q15, d27, d3[3]
571    114:    vext.u16    q12, q5, q6, #1
572            vext.u16    q13, q8, q9, #5
573            vmlal.u16   q14, d24, d3[2]
574            vmlal.u16   q15, d25, d3[2]
575            vmlal.u16   q14, d26, d3[2]
576            vmlal.u16   q15, d27, d3[2]
577    113:    vext.u16    q12, q5, q6, #2
578            vext.u16    q13, q8, q9, #4
579            vmlal.u16   q14, d24, d3[1]
580            vmlal.u16   q15, d25, d3[1]
581            vmlal.u16   q14, d26, d3[1]
582            vmlal.u16   q15, d27, d3[1]
583    112:    vext.u16    q12, q5, q6, #3
584            vext.u16    q13, q8, q9, #3
585            vmlal.u16   q14, d24, d3[0]
586            vmlal.u16   q15, d25, d3[0]
587            vmlal.u16   q14, d26, d3[0]
588            vmlal.u16   q15, d27, d3[0]
589    111:    vext.u16    q12, q5, q6, #4
590            vext.u16    q13, q8, q9, #2
591            vmlal.u16   q14, d24, d2[3]
592            vmlal.u16   q15, d25, d2[3]
593            vmlal.u16   q14, d26, d2[3]
594            vmlal.u16   q15, d27, d2[3]
595    110:    vext.u16    q12, q5, q6, #5
596            vext.u16    q13, q8, q9, #1
597            vmlal.u16   q14, d24, d2[2]
598            vmlal.u16   q15, d25, d2[2]
599            vmlal.u16   q14, d26, d2[2]
600            vmlal.u16   q15, d27, d2[2]
601    109:    vext.u16    q12, q5, q6, #6
602            vext.u16    q13, q8, q9, #0
603            vmlal.u16   q14, d24, d2[1]
604            vmlal.u16   q15, d25, d2[1]
605            vmlal.u16   q14, d26, d2[1]
606            vmlal.u16   q15, d27, d2[1]
607    108:    vext.u16    q12, q5, q6, #7
608            vext.u16    q13, q7, q8, #7
609            vmlal.u16   q14, d24, d2[0]
610            vmlal.u16   q15, d25, d2[0]
611            vmlal.u16   q14, d26, d2[0]
612            vmlal.u16   q15, d27, d2[0]
613    107:    vext.u16    q12, q6, q7, #0
614            vext.u16    q13, q7, q8, #6
615            vmlal.u16   q14, d24, d1[3]
616            vmlal.u16   q15, d25, d1[3]
617            vmlal.u16   q14, d26, d1[3]
618            vmlal.u16   q15, d27, d1[3]
619    106:    vext.u16    q12, q6, q7, #1
620            vext.u16    q13, q7, q8, #5
621            vmlal.u16   q14, d24, d1[2]
622            vmlal.u16   q15, d25, d1[2]
623            vmlal.u16   q14, d26, d1[2]
624            vmlal.u16   q15, d27, d1[2]
625    105:    vext.u16    q12, q6, q7, #2
626            vext.u16    q13, q7, q8, #4
627            vmlal.u16   q14, d24, d1[1]
628            vmlal.u16   q15, d25, d1[1]
629            vmlal.u16   q14, d26, d1[1]
630            vmlal.u16   q15, d27, d1[1]
631    104:    vext.u16    q12, q6, q7, #3
632            vext.u16    q13, q7, q8, #3
633            vmlal.u16   q14, d24, d1[0]
634            vmlal.u16   q15, d25, d1[0]
635            vmlal.u16   q14, d26, d1[0]
636            vmlal.u16   q15, d27, d1[0]
637    103:    vext.u16    q12, q6, q7, #4
638            vext.u16    q13, q7, q8, #2
639            vmlal.u16   q14, d24, d0[3]
640            vmlal.u16   q15, d25, d0[3]
641            vmlal.u16   q14, d26, d0[3]
642            vmlal.u16   q15, d27, d0[3]
643    102:    vext.u16    q12, q6, q7, #5
644            vext.u16    q13, q7, q8, #1
645            vmlal.u16   q14, d24, d0[2]
646            vmlal.u16   q15, d25, d0[2]
647            vmlal.u16   q14, d26, d0[2]
648            vmlal.u16   q15, d27, d0[2]
649    101:    vext.u16    q12, q6, q7, #6
650            vext.u16    q13, q7, q8, #0
651            vmlal.u16   q14, d24, d0[1]
652            vmlal.u16   q15, d25, d0[1]
653            vmlal.u16   q14, d26, d0[1]
654            vmlal.u16   q15, d27, d0[1]
655
656            vqrshrn.u32 d28, q14, #16
657            vqrshrn.u32 d29, q15, #16
658            vqrshrn.u16 d31, q14, #FRACTION_BITS
659
660            vmov        d7, d9
661            vmov        q4, q5
662            vmov        q5, q6
663            vmov        q6, q7
664            vmov        q7, q8
665            vmov        q8, q9
666            vmov        q9, q10
667            vmov        q10, q11
668.endm/*}}}*/
669
670#define TUNED_LIST4 6, 12
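/* In the uchar4 cases each pixel occupies four u16 lanes, so the taps at
 * distance i sit whole d-registers (one pixel each) apart and no vext
 * shuffling is needed: for the radius-6 variant below the pair of taps at
 * distance i is simply the two d-register pairs i registers either side of
 * the centre in d14/d15.
 */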
671.macro hconv4_6/*{{{*/
672            vmull.u16   q14, d14, d0[0]
673            vmull.u16   q15, d15, d0[0]
674
675            ldr         r12, [pc, r5, LSL #2]
676            add         pc, pc, r12
677            bkpt
678    100:    .word 101f-100b
679            .word 102f-100b
680            .word 103f-100b
681            .word 104f-100b
682            .word 105f-100b
683            .word 106f-100b
684    106:    vmlal.u16   q14, d8,  d1[2]
685            vmlal.u16   q15, d9,  d1[2]
686            vmlal.u16   q14, d20, d1[2]
687            vmlal.u16   q15, d21, d1[2]
688    105:    vmlal.u16   q14, d9,  d1[1]
689            vmlal.u16   q15, d10, d1[1]
690            vmlal.u16   q14, d19, d1[1]
691            vmlal.u16   q15, d20, d1[1]
692    104:    vmlal.u16   q14, d10, d1[0]
693            vmlal.u16   q15, d11, d1[0]
694            vmlal.u16   q14, d18, d1[0]
695            vmlal.u16   q15, d19, d1[0]
696    103:    vmlal.u16   q14, d11, d0[3]
697            vmlal.u16   q15, d12, d0[3]
698            vmlal.u16   q14, d17, d0[3]
699            vmlal.u16   q15, d18, d0[3]
700    102:    vmlal.u16   q14, d12, d0[2]
701            vmlal.u16   q15, d13, d0[2]
702            vmlal.u16   q14, d16, d0[2]
703            vmlal.u16   q15, d17, d0[2]
704    101:    vmlal.u16   q14, d13, d0[1]
705            vmlal.u16   q15, d14, d0[1]
706            vmlal.u16   q14, d15, d0[1]
707            vmlal.u16   q15, d16, d0[1]
708
709            vqrshrn.u32 d28, q14, #16
710            vqrshrn.u32 d29, q15, #16
711            vqrshrn.u16 d31, q14, #FRACTION_BITS
712
713            vmov        q4, q5
714            vmov        q5, q6
715            vmov        q6, q7
716            vmov        q7, q8
717            vmov        q8, q9
718            vmov        q9, q10
719            vmov        q10, q11
720.endm/*}}}*/
721
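/* For larger radii the window no longer fits in the register file, so the
 * oldest columns are kept in a spill buffer addressed by r9.  The addressing
 * below appears to treat this as a 512-byte ring: a load offset is added to
 * r9 and `bic r12, r12, #0x200` clears bit 9 to wrap it, which assumes the
 * buffer is positioned so that bit 9 is clear at its base.  Each iteration,
 * the chunk leaving the register window (q4) is appended with
 * `vst1.u8 {q4}, [r9:128]!` and r9 is re-wrapped the same way.
 */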
722.macro hconv4_12/*{{{*/
723            vmull.u16   q14, d8, d0[0]
724            vmull.u16   q15, d9, d0[0]
725
726            ldr         r12, [pc, r5, LSL #2]
727            add         pc, pc, r12
728            bkpt
729    100:    .word 101f-100b
730            .word 102f-100b
731            .word 103f-100b
732            .word 104f-100b
733            .word 105f-100b
734            .word 106f-100b
735            .word 107f-100b
736            .word 108f-100b
737            .word 109f-100b
738            .word 110f-100b
739            .word 111f-100b
740            .word 112f-100b
741    112:    add         r12, r9, #0x1a0
742            bic         r12, r12, #0x200
743            vld1.u16    {d24,d25}, [r12:128]
744            vmlal.u16   q14, d24, d3[0]
745            vmlal.u16   q15, d25, d3[0]
746            vmlal.u16   q14, d20, d3[0]
747            vmlal.u16   q15, d21, d3[0]
748    111:    add         r12, r9, #0x1a8
749            bic         r12, r12, #0x200
750            vld1.u16    {d24}, [r12:64]!
751            bic         r12, r12, #0x200
752            vld1.u16    {d25}, [r12:64]
753            vmlal.u16   q14, d24, d2[3]
754            vmlal.u16   q15, d25, d2[3]
755            vmlal.u16   q14, d19, d2[3]
756            vmlal.u16   q15, d20, d2[3]
757    110:    add         r12, r9, #0x1b0
758            bic         r12, r12, #0x200
759            vld1.u16    {d24,d25}, [r12:128]
760            vmlal.u16   q14, d24, d2[2]
761            vmlal.u16   q15, d25, d2[2]
762            vmlal.u16   q14, d18, d2[2]
763            vmlal.u16   q15, d19, d2[2]
764    109:    add         r12, r9, #0x1b8
765            bic         r12, r12, #0x200
766            vld1.u16    {d24}, [r12:64]!
767            bic         r12, r12, #0x200
768            vld1.u16    {d25}, [r12:64]
769            vmlal.u16   q14, d24, d2[1]
770            vmlal.u16   q15, d25, d2[1]
771            vmlal.u16   q14, d17, d2[1]
772            vmlal.u16   q15, d18, d2[1]
773    108:    add         r12, r9, #0x1c0
774            bic         r12, r12, #0x200
775            vld1.u16    {d24,d25}, [r12:128]
776            vmlal.u16   q14, d24, d2[0]
777            vmlal.u16   q15, d25, d2[0]
778            vmlal.u16   q14, d16, d2[0]
779            vmlal.u16   q15, d17, d2[0]
780    107:    add         r12, r9, #0x1c8
781            bic         r12, r12, #0x200
782            vld1.u16    {d24}, [r12:64]!
783            bic         r12, r12, #0x200
784            vld1.u16    {d25}, [r12:64]
785            vmlal.u16   q14, d24, d1[3]
786            vmlal.u16   q15, d25, d1[3]
787            vmlal.u16   q14, d15, d1[3]
788            vmlal.u16   q15, d16, d1[3]
789    106:    add         r12, r9, #0x1d0
790            bic         r12, r12, #0x200
791            vld1.u16    {d24,d25}, [r12:128]
792            vmlal.u16   q14, d24, d1[2]
793            vmlal.u16   q15, d25, d1[2]
794            vmlal.u16   q14, d14, d1[2]
795            vmlal.u16   q15, d15, d1[2]
796    105:    add         r12, r9, #0x1d8
797            bic         r12, r12, #0x200
798            vld1.u16    {d24}, [r12:64]!
799            bic         r12, r12, #0x200
800            vld1.u16    {d25}, [r12:64]
801            vmlal.u16   q14, d24, d1[1]
802            vmlal.u16   q15, d25, d1[1]
803            vmlal.u16   q14, d13, d1[1]
804            vmlal.u16   q15, d14, d1[1]
805    104:    add         r12, r9, #0x1e0
806            bic         r12, r12, #0x200
807            vld1.u16    {d24,d25}, [r12:128]
808            vmlal.u16   q14, d24, d1[0]
809            vmlal.u16   q15, d25, d1[0]
810            vmlal.u16   q14, d12, d1[0]
811            vmlal.u16   q15, d13, d1[0]
812    103:    add         r12, r9, #0x1e8
813            bic         r12, r12, #0x200
814            vld1.u16    {d24}, [r12:64]!
815            bic         r12, r12, #0x200
816            vld1.u16    {d25}, [r12:64]
817            vmlal.u16   q14, d24, d0[3]
818            vmlal.u16   q15, d25, d0[3]
819            vmlal.u16   q14, d11, d0[3]
820            vmlal.u16   q15, d12, d0[3]
821    102:    add         r12, r9, #0x1f0
822            bic         r12, r12, #0x200
823            vld1.u16    {d24,d25}, [r12:128]
824            vmlal.u16   q14, d24, d0[2]
825            vmlal.u16   q15, d25, d0[2]
826            vmlal.u16   q14, d10, d0[2]
827            vmlal.u16   q15, d11, d0[2]
828    101:    add         r12, r9, #0x1f8
829            bic         r12, r12, #0x200
830            vld1.u16    {d24}, [r12:64]
831            vmlal.u16   q14, d24, d0[1]
832            vmlal.u16   q15, d8,  d0[1]
833            vmlal.u16   q14, d9,  d0[1]
834            vmlal.u16   q15, d10, d0[1]
835
836            vqrshrn.u32 d28, q14, #16
837            vqrshrn.u32 d29, q15, #16
838            vqrshrn.u16 d31, q14, #FRACTION_BITS
839
840            vst1.u8     {q4}, [r9:128]!
841            bic         r9, r9, #0x200
842            vmov        q4, q5
843            vmov        q5, q6
844            vmov        q6, q7
845            vmov        q7, q8
846            vmov        q8, q9
847            vmov        q9, q10
848            vmov        q10, q11
849.endm/*}}}*/
850
851.macro hconv4_25/*{{{*/
852            add         r12, r9, #0x198
853            bic         r12, r12, #0x200
854            vld1.u16    {d24}, [r12:64]!
855            bic         r12, r12, #0x200
856            vld1.u16    {d25}, [r12:64]
857            vmull.u16   q14, d24, d0[0]
858            vmull.u16   q15, d25, d0[0]
859
860            ldr         r12, [pc, r5, LSL #2]
861            add         pc, pc, r12
862            bkpt
863    100:    .word 101f-100b
864            .word 102f-100b
865            .word 103f-100b
866            .word 104f-100b
867            .word 105f-100b
868            .word 106f-100b
869            .word 107f-100b
870            .word 108f-100b
871            .word 109f-100b
872            .word 110f-100b
873            .word 111f-100b
874            .word 112f-100b
875            .word 113f-100b
876            .word 114f-100b
877            .word 115f-100b
878            .word 116f-100b
879            .word 117f-100b
880            .word 118f-100b
881            .word 119f-100b
882            .word 120f-100b
883            .word 121f-100b
884            .word 122f-100b
885            .word 123f-100b
886            .word 124f-100b
887            .word 125f-100b
888    125:    add         r12, r9, #0x0d0
889            bic         r12, r12, #0x200
890            vld1.u16    {d24,d25}, [r12:128]
891            vmlal.u16   q14, d24, d6[1]
892            vmlal.u16   q15, d25, d6[1]
893            vmlal.u16   q14, d20, d6[1]
894            vmlal.u16   q15, d21, d6[1]
895    124:    add         r12, r9, #0x0d8
896            bic         r12, r12, #0x200
897            vld1.u16    {d24}, [r12:64]!
898            bic         r12, r12, #0x200
899            vld1.u16    {d25}, [r12]
900            vmlal.u16   q14, d24, d6[0]
901            vmlal.u16   q15, d25, d6[0]
902            vmlal.u16   q14, d19, d6[0]
903            vmlal.u16   q15, d20, d6[0]
904    123:    add         r12, r9, #0x0e0
905            bic         r12, r12, #0x200
906            vld1.u16    {d24,d25}, [r12:128]
907            vmlal.u16   q14, d24, d5[3]
908            vmlal.u16   q15, d25, d5[3]
909            vmlal.u16   q14, d18, d5[3]
910            vmlal.u16   q15, d19, d5[3]
911    122:    add         r12, r9, #0x0e8
912            bic         r12, r12, #0x200
913            vld1.u16    {d24}, [r12:64]!
914            bic         r12, r12, #0x200
915            vld1.u16    {d25}, [r12]
916            vmlal.u16   q14, d24, d5[2]
917            vmlal.u16   q15, d25, d5[2]
918            vmlal.u16   q14, d17, d5[2]
919            vmlal.u16   q15, d18, d5[2]
920    121:    add         r12, r9, #0x0f0
921            bic         r12, r12, #0x200
922            vld1.u16    {d24,d25}, [r12:128]
923            vmlal.u16   q14, d24, d5[1]
924            vmlal.u16   q15, d25, d5[1]
925            vmlal.u16   q14, d16, d5[1]
926            vmlal.u16   q15, d17, d5[1]
927    120:    add         r12, r9, #0x0f8
928            bic         r12, r12, #0x200
929            vld1.u16    {d24}, [r12:64]!
930            bic         r12, r12, #0x200
931            vld1.u16    {d25}, [r12]
932            vmlal.u16   q14, d24, d5[0]
933            vmlal.u16   q15, d25, d5[0]
934            vmlal.u16   q14, d15, d5[0]
935            vmlal.u16   q15, d16, d5[0]
936    119:    add         r12, r9, #0x100
937            bic         r12, r12, #0x200
938            vld1.u16    {d24,d25}, [r12:128]
939            vmlal.u16   q14, d24, d4[3]
940            vmlal.u16   q15, d25, d4[3]
941            vmlal.u16   q14, d14, d4[3]
942            vmlal.u16   q15, d15, d4[3]
943    118:    add         r12, r9, #0x108
944            bic         r12, r12, #0x200
945            vld1.u16    {d24}, [r12:64]!
946            bic         r12, r12, #0x200
947            vld1.u16    {d25}, [r12]
948            vmlal.u16   q14, d24, d4[2]
949            vmlal.u16   q15, d25, d4[2]
950            vmlal.u16   q14, d13, d4[2]
951            vmlal.u16   q15, d14, d4[2]
952    117:    add         r12, r9, #0x110
953            bic         r12, r12, #0x200
954            vld1.u16    {d24,d25}, [r12:128]
955            vmlal.u16   q14, d24, d4[1]
956            vmlal.u16   q15, d25, d4[1]
957            vmlal.u16   q14, d12, d4[1]
958            vmlal.u16   q15, d13, d4[1]
959    116:    add         r12, r9, #0x118
960            bic         r12, r12, #0x200
961            vld1.u16    {d24}, [r12:64]!
962            bic         r12, r12, #0x200
963            vld1.u16    {d25}, [r12]
964            vmlal.u16   q14, d24, d4[0]
965            vmlal.u16   q15, d25, d4[0]
966            vmlal.u16   q14, d11, d4[0]
967            vmlal.u16   q15, d12, d4[0]
968    115:    add         r12, r9, #0x120
969            bic         r12, r12, #0x200
970            vld1.u16    {d24,d25}, [r12:128]
971            vmlal.u16   q14, d24, d3[3]
972            vmlal.u16   q15, d25, d3[3]
973            vmlal.u16   q14, d10, d3[3]
974            vmlal.u16   q15, d11, d3[3]
975    114:    add         r12, r9, #0x128
976            bic         r12, r12, #0x200
977            vld1.u16    {d24}, [r12:64]!
978            bic         r12, r12, #0x200
979            vld1.u16    {d25}, [r12]
980            vmlal.u16   q14, d24, d3[2]
981            vmlal.u16   q15, d25, d3[2]
982            vmlal.u16   q14, d9,  d3[2]
983            vmlal.u16   q15, d10, d3[2]
984    113:    add         r12, r9, #0x130
985            bic         r12, r12, #0x200
986            vld1.u16    {d24,d25}, [r12:128]
987            vmlal.u16   q14, d24, d3[1]
988            vmlal.u16   q15, d25, d3[1]
989            vmlal.u16   q14, d8,  d3[1]
990            vmlal.u16   q15, d9,  d3[1]
991    112:    add         r12, r9, #0x138
992            bic         r12, r12, #0x200
993            vld1.u16    {d24}, [r12:64]!
994            bic         r12, r12, #0x200
995            vld1.u16    {d25}, [r12]
996                                            add         r12, r9, #0x1f8
997                                            bic         r12, r12, #0x200
998                                            vld1.u16    {d26}, [r12:64]
999            vmlal.u16   q14, d24, d3[0]
1000            vmlal.u16   q15, d25, d3[0]
1001            vmlal.u16   q14, d26, d3[0]   @ Could be d7, without the load, right?
1002            vmlal.u16   q15, d8,  d3[0]
1003    111:    add         r12, r9, #0x140
1004            bic         r12, r12, #0x200
1005            vld1.u16    {d24,d25}, [r12:128]
1006                                            add         r12, r9, #0x1f0
1007                                            bic         r12, r12, #0x200
1008                                            vld1.u16    {d26,d27}, [r12:128]
1009            vmlal.u16   q14, d24, d2[3]
1010            vmlal.u16   q15, d25, d2[3]
1011            vmlal.u16   q14, d26, d2[3]
1012            vmlal.u16   q15, d27, d2[3]
1013    110:    add         r12, r9, #0x148
1014            bic         r12, r12, #0x200
1015            vld1.u16    {d24}, [r12:64]!
1016            bic         r12, r12, #0x200
1017            vld1.u16    {d25}, [r12]
1018                                            add         r12, r9, #0x1e8
1019                                            bic         r12, r12, #0x200
1020                                            vld1.u16    {d26}, [r12:64]!
1021                                            bic         r12, r12, #0x200
1022                                            vld1.u16    {d27}, [r12:64]
1023            vmlal.u16   q14, d24, d2[2]
1024            vmlal.u16   q15, d25, d2[2]
1025            vmlal.u16   q14, d26, d2[2]
1026            vmlal.u16   q15, d27, d2[2]
1027    109:    add         r12, r9, #0x150
1028            bic         r12, r12, #0x200
1029            vld1.u16    {d24,d25}, [r12:128]
1030                                            add         r12, r9, #0x1e0
1031                                            bic         r12, r12, #0x200
1032                                            vld1.u16    {d26,d27}, [r12:128]
1033            vmlal.u16   q14, d24, d2[1]
1034            vmlal.u16   q15, d25, d2[1]
1035            vmlal.u16   q14, d26, d2[1]
1036            vmlal.u16   q15, d27, d2[1]
1037    108:    add         r12, r9, #0x158
1038            bic         r12, r12, #0x200
1039            vld1.u16    {d24}, [r12:64]!
1040            bic         r12, r12, #0x200
1041            vld1.u16    {d25}, [r12]
1042                                            add         r12, r9, #0x1d8
1043                                            bic         r12, r12, #0x200
1044                                            vld1.u16    {d26}, [r12:64]!
1045                                            bic         r12, r12, #0x200
1046                                            vld1.u16    {d27}, [r12:64]
1047            vmlal.u16   q14, d24, d2[0]
1048            vmlal.u16   q15, d25, d2[0]
1049            vmlal.u16   q14, d26, d2[0]
1050            vmlal.u16   q15, d27, d2[0]
1051    107:    add         r12, r9, #0x160
1052            bic         r12, r12, #0x200
1053            vld1.u16    {d24,d25}, [r12:128]
1054                                            add         r12, r9, #0x1d0
1055                                            bic         r12, r12, #0x200
1056                                            vld1.u16    {d26,d27}, [r12:128]
1057            vmlal.u16   q14, d24, d1[3]
1058            vmlal.u16   q15, d25, d1[3]
1059            vmlal.u16   q14, d26, d1[3]
1060            vmlal.u16   q15, d27, d1[3]
1061    106:    add         r12, r9, #0x168
1062            bic         r12, r12, #0x200
1063            vld1.u16    {d24}, [r12:64]!
1064            bic         r12, r12, #0x200
1065            vld1.u16    {d25}, [r12]
1066                                            add         r12, r9, #0x1c8
1067                                            bic         r12, r12, #0x200
1068                                            vld1.u16    {d26}, [r12:64]!
1069                                            bic         r12, r12, #0x200
1070                                            vld1.u16    {d27}, [r12:64]
1071            vmlal.u16   q14, d24, d1[2]
1072            vmlal.u16   q15, d25, d1[2]
1073            vmlal.u16   q14, d26, d1[2]
1074            vmlal.u16   q15, d27, d1[2]
1075    105:    add         r12, r9, #0x170
1076            bic         r12, r12, #0x200
1077            vld1.u16    {d24,d25}, [r12:128]
1078                                            add         r12, r9, #0x1c0
1079                                            bic         r12, r12, #0x200
1080                                            vld1.u16    {d26,d27}, [r12:128]
1081            vmlal.u16   q14, d24, d1[1]
1082            vmlal.u16   q15, d25, d1[1]
1083            vmlal.u16   q14, d26, d1[1]
1084            vmlal.u16   q15, d27, d1[1]
1085    104:    add         r12, r9, #0x178
1086            bic         r12, r12, #0x200
1087            vld1.u16    {d24}, [r12:64]!
1088            bic         r12, r12, #0x200
1089            vld1.u16    {d25}, [r12]
1090                                            add         r12, r9, #0x1b8
1091                                            bic         r12, r12, #0x200
1092                                            vld1.u16    {d26}, [r12:64]!
1093                                            bic         r12, r12, #0x200
1094                                            vld1.u16    {d27}, [r12:64]
1095            vmlal.u16   q14, d24, d1[0]
1096            vmlal.u16   q15, d25, d1[0]
1097            vmlal.u16   q14, d26, d1[0]
1098            vmlal.u16   q15, d27, d1[0]
1099    103:    add         r12, r9, #0x180
1100            bic         r12, r12, #0x200
1101            vld1.u16    {d24,d25}, [r12:128]
1102                                            add         r12, r9, #0x1b0
1103                                            bic         r12, r12, #0x200
1104                                            vld1.u16    {d26,d27}, [r12:128]
1105            vmlal.u16   q14, d24, d0[3]
1106            vmlal.u16   q15, d25, d0[3]
1107            vmlal.u16   q14, d26, d0[3]
1108            vmlal.u16   q15, d27, d0[3]
1109    102:    add         r12, r9, #0x188
1110            bic         r12, r12, #0x200
1111            vld1.u16    {d24}, [r12:64]!
1112            bic         r12, r12, #0x200
1113            vld1.u16    {d25}, [r12]
1114                                            add         r12, r9, #0x1a8
1115                                            bic         r12, r12, #0x200
1116                                            vld1.u16    {d26}, [r12:64]!
1117                                            bic         r12, r12, #0x200
1118                                            vld1.u16    {d27}, [r12:64]
1119            vmlal.u16   q14, d24, d0[2]
1120            vmlal.u16   q15, d25, d0[2]
1121            vmlal.u16   q14, d26, d0[2]
1122            vmlal.u16   q15, d27, d0[2]
1123    101:    add         r12, r9, #0x190
1124            bic         r12, r12, #0x200
1125            vld1.u16    {d24,d25}, [r12:128]!
1126            bic         r12, r12, #0x200
1127            vld1.u16    {d26,d27}, [r12:128]
1128            vmlal.u16   q14, d24, d0[1]
1129            vmlal.u16   q15, d25, d0[1]
1130            vmlal.u16   q14, d26, d0[1]
1131            vmlal.u16   q15, d27, d0[1]
1132
1133            vqrshrn.u32 d28, q14, #16
1134            vqrshrn.u32 d29, q15, #16
1135            vqrshrn.u16 d31, q14, #FRACTION_BITS
1136
1137            vst1.u8     {q4}, [r9:128]!
1138            bic         r9, r9, #0x200
1139            vmov        q4, q5
1140            vmov        q5, q6
1141            vmov        q6, q7
1142            vmov        q7, q8
1143            vmov        q8, q9
1144            vmov        q9, q10
1145            vmov        q10, q11
1146.endm/*}}}*/
1147
1148/* Dedicated function wrapper for the fetch macro, for the cases where
1149 * performance isn't that important, to keep code size down.
1150 */
1151PRIVATE(fetch_generic_asm)
1152            push        {r10,r11}
1153            fetch
1154            pop         {r10,r11}
1155            bx          lr
1156END(fetch_generic_asm)
1157
1158
1159/* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory
1160 * beyond that limit, and filling the rest of the vector with the last legal
1161 * pixel.
1162 * Result is in q10 and q11.  q8 and q9 are filled with the first legal pixel.
1163 * Note: This function can read beyond the right edge of input if the image is
1164 * narrower than 16 bytes.
1165 */
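/* The partial-chunk path below works by laying the padding and the freshly
 * fetched data out contiguously on the stack and reloading 32 bytes from an
 * offset into that buffer, so that the first (r10 & 15) lanes of the result
 * come from the padding.  A hedged C sketch of the idea (the buffer and
 * variable names are illustrative only):
 *
 *      uint16_t tmp[32];
 *      for (int i = 0; i < 16; i++)
 *          tmp[i] = first_legal_column;                      // q8,q9
 *      memcpy(&tmp[16], fetched, 16 * sizeof(uint16_t));     // q10,q11
 *      memcpy(result, &tmp[16 - (x0 & 15)], 16 * sizeof(uint16_t));
 */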
1166PRIVATE(fetch_clampleft1)
1167            push        {r12,lr}
1168            bl          fetch_generic_asm
1169            vdup.u16    q8, d20[0]
1170            vdup.u16    q9, d20[0]
1171            ands        r12, r10, #15
1172            beq         1f
1173            sub         r1, r1, r12
1174            sub         r10, r10, r12
1175            sub         sp, sp, #32
1176            vst1.u16    {q10,q11}, [sp]
1177            sub         r12, sp, r12, LSL #1
1178            sub         sp, sp, #32
1179            vst1.u16    {q8,q9}, [sp]
1180            vld1.u16    {q10,q11}, [r12]
1181            add         sp, sp, #64
1:          pop         {r12,pc}
1183END(fetch_clampleft1)
1184
1185PRIVATE(fetch_clampleft4)
1186            push        {r12,lr}
1187            bl          fetch_generic_asm
1188            vmov.u16    d16, d20
1189            vmov.u16    d17, d20
1190            vmov.u16    d18, d20
1191            vmov.u16    d19, d20
1192            ands        r12, r10, #15
1193            beq         1f
1194            sub         r1, r1, r12
1195            sub         r10, r10, r12
1196            sub         sp, sp, #32
1197            vst1.u16    {q10-q11}, [sp]
1198            sub         r12, sp, r12, LSL #1
1199            sub         sp, sp, #32
1200            vst1.u16    {q8,q9}, [sp]
1201            vld1.u16    {q10,q11}, [r12]
1202            add         sp, sp, #64
12031:          pop         {r12,pc}
1204END(fetch_clampleft4)
1205
1206/* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding
1207 * reading memory beyond that limit, and filling the rest of the vector with
1208 * the last legal pixel.
1209 * Result is in q10 and q11.  q12 and q13 are filled with the last legal pixel.
1210 * Note: This function can read beyond the left edge of input if the image is
1211 * narrower than 16 bytes.
1212 */
1213PRIVATE(fetch_clampright1)
1214            push        {r12, lr}
1215            rsb         r12, r11, #0
1216            ands        r12, r12, #15
1217            beq         1f
1218            sub         r1, r1, r12
1219            bl          fetch_generic_asm
1220            vdup.u16    q12, d23[3]
1221            vdup.u16    q13, d23[3]
1222            rsb         r12, r11, #0
1223            and         r12, r12, #15
1224            sub         sp, sp, #32
1225            vst1.u16    {q12,q13}, [sp]
1226            sub         sp, sp, #32
1227            add         r12, sp, r12, LSL #1
1228            vst1.u16    {q10,q11}, [sp]
1229            vld1.u16    {q10,q11}, [r12]
1230            add         sp, sp, #64
1231            pop         {r12,pc}
1:          bl          fetch_generic_asm
1233            vdup.u16    q12, d23[3]
1234            vdup.u16    q13, d23[3]
1235            pop         {r12,pc}
1236END(fetch_clampright1)
1237
1238PRIVATE(fetch_clampright4)
1239            push        {r12, lr}
1240            rsb         r12, r11, #0
1241            ands        r12, r12, #15
1242            beq         1f
1243            sub         r1, r1, r12
1244            bl          fetch_generic_asm
1245            vmov.u16    d24, d23
1246            vmov.u16    d25, d23
1247            vmov.u16    d26, d23
1248            vmov.u16    d27, d23
1249            rsb         r12, r11, #0
1250            and         r12, r12, #15
1251            sub         sp, sp, #32
1252            vst1.u16    {q12-q13}, [sp]
1253            sub         sp, sp, #32
1254            add         r12, sp, r12, LSL #1
1255            vst1.u16    {q10,q11}, [sp]
1256            vld1.u16    {q10,q11}, [r12]
1257            add         sp, sp, #64
1258            pop         {r12,pc}
12591:          bl          fetch_generic_asm
1260            vmov.u16    d24, d23
1261            vmov.u16    d25, d23
1262            vmov.u16    d26, d23
1263            vmov.u16    d27, d23
1264            pop         {r12,pc}
1265END(fetch_clampright4)
1266
1267/* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th
1268 * value across to fill the rest of the register pair.  Used for filling the
1269 * right hand edge of the window when reading too close to the right hand edge
1270 * of the image.
1271 * Also returns a dup-ed copy of the last element in q12 for the tail-fill
 * case (this happens incidentally in the common path, but must be done
1273 * deliberately in the fast-out path).
1274 */
1275PRIVATE(prefill_sweepright1)
1276            ands        r12, r11, #15
1277            beq         1f
1278            sub         r12, r12, #1
1279            sub         sp, sp, #64
1280            vst1.u16    {q10,q11}, [sp]
1281            add         r12, sp, r12, LSL #1
1282            vld1.u16    {d24[],d25[]}, [r12]
1283            vld1.u16    {d26[],d27[]}, [r12]
1284            vst1.u16    {q12,q13}, [r12]
1285            vld1.u16    {q10,q11}, [sp]
1286            add         sp, sp, #64
1287            bx          lr
1:          vdup.u16    q12, d23[3]
1289            vdup.u16    q13, d23[3]
1290            bx          lr
1291END(prefill_sweepright1)
1292
1293PRIVATE(prefill_sweepright4)
1294            ands        r12, r11, #15
1295            beq         1f
1296            sub         r12, r12, #4
1297            sub         sp, sp, #64
1298            vst1.u16    {q10,q11}, [sp]
1299            add         r12, sp, r12, LSL #1
1300            vld1.u64    {d24}, [r12]
1301            vld1.u64    {d25}, [r12]
1302            vld1.u64    {d26}, [r12]
1303            vld1.u64    {d27}, [r12]
1304            vst1.u16    {q12,q13}, [r12]
1305            vld1.u16    {q10,q11}, [sp]
1306            add         sp, sp, #64
1307            bx          lr
1:          vmov.u16    d24, d23
1309            vmov.u16    d25, d23
1310            vmov.u16    d26, d23
1311            vmov.u16    d27, d23
1312            bx          lr
1313END(prefill_sweepright4)
1314
1315/* The main loop keeps a sliding window of data that has already been convolved
1316 * in the vertical axis for the current line.  This usually stays in the
1317 * register file, but spills to memory for large windows.  The first thing that
1318 * needs to be done at start-up is to fill this window with image data, taking
1319 * into account the padding needed if the left or right edges of the image fall
1320 * within this window.
1321 */
1322
/* Because the window is in the register file, writes to it cannot be indexed
1324 * by another register.  Consequently the fill loops are unrolled to address
1325 * the registers directly.  This macro distinguishes between writes to the
1326 * register file and writes to the spill buffer (indicated by a destination
1327 * register named xx).
1328 */
1329.macro prefill_out ra, rb, sra, srb, srb_hi
1330  .ifc \ra,xx
1331    .ifc \rb,xx
1332            vst1.u16    {\sra,\srb}, [r9:128]!
1333    .else
1334            /* this case is used only for the last tap of uchar1 r=25 */
1335            /* discard \sra */
1336            vmov.u16    \rb, \srb_hi
1337    .endif
1338  .else
1339    .ifnc \ra,\sra
1340            vmov.u16    \ra, \sra
1341    .endif
1342    .ifnc \rb,\srb
1343            vmov.u16    \rb, \srb
1344    .endif
1345  .endif
1346.endm
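/* For example, `prefill_out q4, q5, q10, q11, d23` expands to two vmov
 * instructions (the d23 argument is only used in the xx,rb case), while
 * `prefill_out xx, xx, q10, q11, d23` expands to a single post-incremented
 * store through r9 into the spill buffer.
 */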
1347
1348/* This macro provides the list of registers representing the window, and the
1349 * cases where the register file is too small and a spill buffer is used
1350 * instead.
1351 * Since several specialisations of each function are generated, this also
1352 * culls superfluous iterations, and sets the variable `i` for subsequent
1353 * macros indicating the current index into the window.
1354 */
1355.macro prefill_list, macro, nextmacro, max_r, step, label
1356  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
1357    .if windowsize >= (\line * 16)
1358      .set i, windowsize - (\line * 16)
1359\label\macro\line:
1360            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
1361    .endif
1362  .endm
1363  .if \step > 1
1364            ifneeded \macro \nextmacro, 13, 12, xx, xx,  \step, \label
1365            ifneeded \macro \nextmacro, 12, 11, xx, xx,  \step, \label
1366            ifneeded \macro \nextmacro, 11, 10, xx, xx,  \step, \label
1367            ifneeded \macro \nextmacro, 10,  9, xx, xx,  \step, \label
1368            ifneeded \macro \nextmacro,  9,  8, xx, xx,  \step, \label
1369            ifneeded \macro \nextmacro,  8,  7, xx, xx,  \step, \label
1370            ifneeded \macro \nextmacro,  7,  6, xx, xx,  \step, \label
1371            ifneeded \macro \nextmacro,  6,  5, xx, xx,  \step, \label
1372            ifneeded \macro \nextmacro,  5,  4, xx, xx,  \step, \label
1373            ifneeded \macro \nextmacro,  4,  3, xx, xx,  \step, \label
1374  .else
1375            /* q3 normally contains the coefficient table, but it's not fully
1376             * used.  In the uchar1, r=25 case the other half of q3 is used for
1377             * the last two window taps to avoid falling out to memory.
1378             */
1379            ifneeded \macro \nextmacro,  4,  3, xx, d7,   \step, \label
1380  .endif
1381            ifneeded \macro \nextmacro,  3,  2, q4, q5,   \step, \label
1382            ifneeded \macro \nextmacro,  2,  1, q6, q7,   \step, \label
1383            ifneeded \macro \nextmacro,  1,  0, q8, q9,   \step, \label
1384
1385\label\macro\()0:
1386            b           \label\()_end
1387  .purgem ifneeded
1388.endm
1389
1390/* These macros represent the possible stages of filling the window.
1391 * Each macro is unrolled enough times that it can fill the entire window
 * itself, but normally it will have to hand control to subsequent macros
 * part-way through; this is done using labels named \next and \after, where
1394 * \next is the next macro starting at the same window position and \after is
1395 * the next macro starting after the current window position.
1396 */
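
/* Conceptually the stages below form a small state machine over the window
 * chunks (a sketch, not literal code from this file):
 *
 *   for (int i = 0; i < windowsize; i += 16) {
 *       if      (i + 16 <= fill_start)  store(left_pad);        // leftfill
 *       else if (first_chunk_pending)   store(first_chunk);     // leftedge
 *       else if (i + 16 < fill_stop)    store(fetch());         // dofetch
 *       else if (i < fill_stop)         store(fetch_clamped()); // rightedge
 *       else                            store(right_pad);       // rightfill
 *   }
 *
 * where fill_start and fill_stop correspond to r10 and r11 as set up in the
 * prefill macro further down.
 */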
1397
/* leftfill: q8 and q9 contain the left padding value.  While the window
 * extends outside of the image on the left-hand side, and at least 16 more
 * padding values are needed in the window, store q8 and q9 into the window.
 * Otherwise skip forward to storing image data.
 */
1403.macro prefill_leftfill, next, after, ra, rb, step
1404            cmp         r10, #i+16
1405            blo         \next
1406            prefill_out \ra, \rb, q8, q9, d19
1407.endm
1408
1409/* leftedge: The very first non-fill or partial-fill chunk from the image is
1410 * already loaded (as it was used to calculate the left padding value), so
1411 * store it here, and then drop into the regular load/store cycle in the next
1412 * macro.
1413 */
1414.macro prefill_leftedge, next, after, ra, rb, step
14151:          prefill_out \ra, \rb, q10, q11, d23
1416            b           \after
1417.endm
1418
1419/* dofetch: Copy chunks of the image into the window without any complications
1420 * from edge conditions.
1421 */
1422.macro prefill_dofetch, next, after, ra, rb, step
1423            cmp         r11, #i+16
1424            bls         \next
1425            bl          fetch_generic_asm
1426            prefill_out \ra, \rb, q10, q11, d23
1427.endm
1428
/* rightedge: The last fetch (currently in q10 and q11) may have gone beyond
 * the right-hand edge of the image.  In that case sweep the last valid pixel
 * across the rest of the chunk, and in either case prepare padding data in
 * q12 and q13 for the next macro.  This is done in fetch_clampright.
 * This only happens once before going on to the next macro.
 * Sometimes leftedge also covers the rightedge case, in which case this has
 * to be skipped altogether.
 */
1437.macro prefill_rightedge, next, after, ra, rb, step
1438            cmp         r11, #i
1439            bls         \next
1440            bl          fetch_clampright\step
1441            prefill_out \ra, \rb, q10, q11, d23
1442            b           \after
1443.endm
1444
/* rightfill: The rest of the window is simply filled with right padding from
 * q12 and q13.
 */
1448.macro prefill_rightfill, next, after, ra, rb, step
1449            prefill_out \ra, \rb, q12, q13, d25
1450.endm
1451
1452/* Here all of the macros above are unrolled and laid out in the proper order.
1453 */
1454.macro prefill_body, max_r, step, label
1455            prefill_list leftfill,  leftedge,   \max_r, \step, \label
1456            prefill_list leftedge,  dofetch,    \max_r, \step, \label
1457            prefill_list dofetch,   rightedge,  \max_r, \step, \label
1458            prefill_list rightedge, rightfill,  \max_r, \step, \label
1459            prefill_list rightfill, oops,       \max_r, \step, \label
1460\label\()_end:
1461.endm
1462
/* Fill the convolution window with context data.  The aim here is to load
 * exactly 2*r columns, and in the main loop to read as many columns as will
 * be written.  This is complicated by the window being divided into chunks
 * at register boundaries, and by the need to handle cases where the input
 * starts very close to the left or right (or both) edges of the image,
 * filling the gaps that leaves with left and right edge padding values.
1469 *
1470 * Input:
1471 *      r1 -- src
1472 *      r2 -- pitch
1473 *      r3 -- count
1474 *      r4 -- available image data right of src pointer
1475 *      r5 -- r
1476 *      r6 -- rup
1477 *      r7 -- rdn
1478 *      r8 -- available image data left of src pointer
1479 *      r9 -- buffer (if needed)
1480 * Output:
1481 *      r4 -= min(inlen, count + windowsize - centertap)
1482 *      r1 += min(inlen, count + windowsize - centertap)
1483 * Modifies:
1484 *      r10 -- fill start index in the window
1485 *      r11 -- fill stop index in the window
1486 *      r12 -- scratch
1487 */
1488.macro prefill step=1, max_r=25, label=xx
1489.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
1490.set centertap, (windowsize - \max_r * \step)
1491            mov         r10, #centertap
1492            subs        r10, r10, r8
1493            movlo       r10, #0
1494
1495            subs        r11, r4, #windowsize - centertap
1496            movhs       r11, #0
1497            add         r11, r11, #windowsize
1498
1499            /* r10 indicates where in the window legal image data begins.
             * r11 indicates where in the window legal image data ends.
1501             * When starting near the centre of a large image these would be
1502             * zero and windowsize respectively, but when starting near the
1503             * edges this can change.
1504             * When starting on the leftmost pixel, r10 will be centertap.
1505             * When starting on the rightmost pixel, r11 will be centertap+1.
1506             */
1507
1508            /* r4 indicates how much data there is between the current pointers
1509             * and the right edge of the image.  The pointers currently point
1510             * to the data needed at centertap.  The subsequent code will
1511             * consume (windowsize - r10) data, but only the data from
1512             * centertap to windowsize comes out of r4's budget.
1513             */
15141:          subs        r4, r4, #windowsize - centertap
1515            movlo       r4, #0
1516
1517            /* And the pointers need to rewind to the start of the window.
1518             */
1519            sub         r1, r1, #centertap
1520
            /* Unless r8 indicated that there wasn't that much data available.
             */
1523            add         r1, r1, r10
1524
1525
1526            /* Get the first chunk, and add padding to align it to the window
1527             * if necessary.
1528             */
1529            bl          fetch_clampleft\step
1530
1531            /* Sometimes the start and the end of the window are in the same
1532             * chunk.  In that case both ends need filler at the outset.
1533             */
1534            sub         r12, r11, #1
1535            eor         r12,  r10, r12
1536            cmp         r12, #16
1537            bllo        prefill_sweepright\step
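            /* The test above is equivalent to the C expression
             *   ((r10 ^ (r11 - 1)) & ~15) == 0
             * i.e. the first and last legal columns fall in the same 16-column
             * chunk, so the right-hand padding has to be swept in immediately.
             */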
1538
1539            /* Iterate through all the points in the window and fill them in
1540             * with padding or image data as needed.
1541             */
1542            prefill_body \max_r, \step, \label
1543.endm
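
/* The index arithmetic in the prefill macro amounts to the following sketch,
 * using the register roles documented above (illustrative C, not code from
 * this file):
 *
 *   windowsize = (2 * r * step + 15) & ~15;      // whole 16-column chunks
 *   centertap  = windowsize - r * step;          // window slot of src[0]
 *   r10 = centertap > r8 ? centertap - r8 : 0;   // first legal column
 *   r11 = r4 < windowsize - centertap            // one past last legal column
 *             ? centertap + r4 : windowsize;
 */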
1544
1545/* The main body of the convolve functions.  Having already pre-filled the
1546 * convolution window with 2*r input values, the logic settles into a regular
1547 * pattern of reading and writing at a 1:1 rate until either input or output
1548 * expires.  The input leads the output by r values, so when processing all the
1549 * way to the right-hand edge, or within r pixels of that edge, the input will
1550 * run out first.  In the case of very narrow images, or sub-windows starting
1551 * near the right edge, the input may already have run out while the
1552 * convolution window was being filled and this loop will start with a
1553 * zero-length input.
1554 *
 * Once the input runs out, the rest of the output must be produced by padding
 * the remainder of the window with the pad value derived from the last valid
 * pixel of the source.
1558 *
1559 * Input:
1560 *      r0 = dst
1561 *      r1 = src
1562 *      r2 = pitch
1563 *      r3 = count
1564 *      r4 = inlen
1565 *      r5 = r
1566 *      r6 = rup
1567 *      r7 = rdn
1568 *      r9 = buffer
1569 * Modifies
1570 *      r8 = fetch code pointer
1571 */
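
/* In outline, and ignoring the tail handling, the macro below behaves like
 * the following (illustrative pseudo-C, not part of this file):
 *
 *   while (inlen > 0) {              // r4, clamped to roundup16(count)
 *       fetch();                     // vertical pass: 16 new columns
 *       core(); store8(dst);         // horizontal pass: 8 output bytes
 *       core(); store8(dst);         // ...twice, to match the 16 columns
 *       count -= 16;  inlen -= 16;
 *   }
 *   // then: pad the window from the last valid pixel and drain `count`
 *   // at 8 outputs per iteration, finishing with a piecewise store.
 */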
1572.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""
1573
            /* If r4 >= r3 then there's no need for clipping.  The main loop
             * needs to exit when either r3 or r4 runs out, so clamp r4 to be
             * no greater than r3 and use r4 for the loop.
             * However, if r4 comes out of the loop with less than 16 bytes
             * left, a partial read would be necessary to avoid reading beyond
             * the end of the image.  To avoid this, clamp r4 to r3 rounded up
             * to the next multiple of 16; that's still sufficient to force it
             * out of the loop but doesn't imply a rewind.
             */
1583            add         r12, r3, #15
1584            bic         r12, r12, #15
1585            cmp         r4, r12
1586            movhi       r4, r12
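            /* i.e.  r4 = min(r4, (r3 + 15) & ~15)  */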
1587
1588            /* First calculate the entry-point into the internal fetch logic.
1589             * This is done so the same function can service several kernel
1590             * sizes.
1591             */
1592            ldr         r8, 3f
15931:          add         r8, r8, pc
1594            sub         r8, r5, LSL #5
1595            sub         r8, r5, LSL #4
1596            cmp         r5, r6
1597            cmpeq       r5, r7
1598            beq         5f
1599
1600            /* if (r != rup || r != rdn) then the address-clamping table should
1601             * be used rather than the short-cut version.
1602             */
1603            ldr         r8, 3f+4
16042:          add         r8, r8, pc
1605            sub         r8, r5, LSL #6
1606            b           5f
1607            .align 3
16083:          .word       \labelnc-1b-8
1609            .word       \labelc-2b-8
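            /* Both paths above compute r8 = label - r * K, where K is 48
             * bytes for the non-clamping variant and 64 for the clamping one
             * (matching the per-tap code size in the fetch macro), so that
             * branching via r8 enters the unrolled fetch code at the point
             * appropriate for radius r.  The extra 8 in the literal values
             * compensates for the ARM-state pc reading two words ahead.
             */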
1610
1611            /* Main loop: ... */
1612            .align 4
16133:          /* first perform a vertical convolution from memory to get the next
1614             * 16 taps of the horizontal window into the register file...
1615             */
1616            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8
1617
1618            /* ...then perform a horizontal convolution on that window to
1619             * produce eight output bytes, and slide the window along.
1620             * This has to be done twice to match the 16-way vertical pass.
1621             * It would be preferable to have twice the work done in \core, but
1622             * that would demand yet another variant on those macros and would
1623             * perturb the register allocation severely.
1624             */
1625            \core
1626            vst1.u8     {d31}, [r0]!
1627            \core
1628            vst1.u8     {d31}, [r0]!
1629
1630            sub         r3, r3, #16
16315:          subs        r4, r4, #16
1632            bhi         3b
1633            /* Here there's 16 or fewer bytes available before the edge of the
             * source image.  r4 holds that count minus 16 (because it was
1635             * decremented before the first iteration ran).  The last read may
1636             * not be a whole chunk, and beyond that a fill value must be used.
1637             *
1638             * Of course, none of that matters if there's no more output to
1639             * produce...
1640             */
1641            cmp         r3, #0
1642            beq         5f
1643
1644            /* Oh well. */
1645            adds        r4, r4, #16
1646            bne         1f
1647  .if \step==1
1648            vdup.u16    q10, d19[3]
1649            vdup.u16    q11, d19[3]
1650  .else
1651            vmov.u64    d20, d19
1652            vmov.u64    d21, d19
1653            vmov.u64    d22, d19
1654            vmov.u64    d23, d19
1655  .endif
1656            b           3f
1657
            /* To avoid reading past the end of the input, rewind the pointers
             * by (16 - r4) so that they're exactly 16 bytes from the edge.
1660             */
16611:          mov         r11, r4
1662            bl          fetch_clampright\step
1663            /* Now to put this padding to use, perform any remaining
1664             * iterations.  This is done at half the rate of the main loop,
1665             * because there's no longer pressure from a 16-lane window filler.
1666             */
16673:          \core
1668  .if \step==1
1669            vdup.u16    q11, d23[3]
1670  .else
1671            vmov.u64    d22, d23
1672  .endif
1673            subs        r3, r3, #8
1674            blo         4f
1675            vst1.u8     {d31}, [r0]!
1676            bne         3b
1677            b           5f
1678
1679            /* If the final iteration contained 0 < l < 8 values, then perform
1680             * a piecewise store of the final vector.
1681             */
16824:          tst         r3, #4
1683            beq         1f
1684            vst1.u32    {d31[0]}, [r0]!
1685            vext.u8     d31, d31, d31, #4
16861:          tst         r3, #2
1687            beq         1f
1688            vst1.u16    {d31[0]}, [r0]!
1689            vext.u8     d31, d31, d31, #2
16901:          tst         r3, #1
1691            beq         5f
1692            vst1.u8     {d31[0]}, [r0]!
1693            vext.u8     d31, d31, d31, #1
16945:          mov         r0, #0
1695.endm
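
/* The piecewise store at the end of conv_body is the usual trick for writing
 * an arbitrary 0..7 byte tail from a vector; in C it would read roughly as
 * follows, where v stands for d31 viewed as a uint64_t and n for the
 * remaining count (a sketch only):
 *
 *   if (n & 4) { memcpy(dst, &v, 4); dst += 4; v >>= 32; }
 *   if (n & 2) { memcpy(dst, &v, 2); dst += 2; v >>= 16; }
 *   if (n & 1) { *dst++ = (uint8_t)v; }
 */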
1696
1697.irp r, TUNED_LIST1, 25
1698PRIVATE(convolve1_\r)
1699            push        {r12,lr}
1700
1701            prefill     step=1, max_r=\r, label=.Lcnv1_\r
1702
1703            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
1704
1705            pop         {r12,pc}
1706END(convolve1_\r)
1707.endr
1708
1709.irp r, TUNED_LIST4, 25
1710PRIVATE(convolve4_\r)
1711            push        {r12,lr}
1712            sub         r9, sp, #0x200
1713            sub         sp, sp, #0x200 + 0x400
1714            bic         r9, r9, #0x3fc
1715
1716            /* r9 now points to a 0x200 byte buffer on the stack whose address
1717             * has the low 10 bits clear.  This allows easy address calculation
1718             * in the wrap-around cases.
1719             */
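            /* In C terms the buffer address is computed as
             *   r9 = (sp - 0x200) & ~0x3fc;
             * and since sp is at least 4-byte aligned here, that leaves the
             * low 10 bits of r9 clear, as described above.
             */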
1720
1721            prefill     step=4, max_r=\r, label=.Lcnv4_\r
1722
1723            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
1724
1725            add         sp, sp, #0x200 + 0x400
1726            pop         {r12,pc}
1727END(convolve4_\r)
1728.endr
1729
1730/* void rsdIntrinsicBlurU1_K(
1731 *                  void *out,      // r0
1732 *                  void *in,       // r1
1733 *                  size_t w,       // r2
1734 *                  size_t h,       // r3
1735 *                  size_t p,       // [sp]
1736 *                  size_t x,       // [sp,#4]
1737 *                  size_t y,       // [sp,#8]
1738 *                  size_t count,   // [sp,#12]
1739 *                  size_t r,       // [sp,#16]
1740 *                  uint16_t *tab); // [sp,#20]
1741 */
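/* A hypothetical call from the C/C++ side (illustrative only; the real
 * callers and the exact coefficient-table layout live in the accompanying
 * sources, not in this file):
 *
 *   // blur `count` pixels starting at column x of row y, writing to `out`:
 *   rsdIntrinsicBlurU1_K(out, inRowY, w, h, p, x, y, count, r, tab);
 *
 * where inRowY is assumed to point at row y of the source image, judging by
 * how rup/rdn and the pitch are used below.
 */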
1742ENTRY(rsdIntrinsicBlurU1_K)
1743            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
1744            vpush       {d8-d15}
1745            ldr         r6, [sp,#112]   // y
1746            ldr         r8, [sp,#108]   // x
1747            ldr         r5, [sp,#120]   // r
1748            sub         r4, r2, r8      // inlen = w - x
1749            sub         r7, r3, r6      // h - y
1750            ldr         r2, [sp,#104]   // pitch
1751            ldr         r3, [sp,#116]   // count
1752            sub         r7, r7, #1      // h - y - 1
1753
1754            ldr         r12, [sp,#124]
1755
1756            add         r1, r1, r8      // src += x
1757
1758            cmp         r6, r5
1759            movhi       r6, r5          // rup = min(r, y)
1760            cmp         r7, r5
1761            movhi       r7, r5          // rdn = min(r, h - y - 1)
1762
1763            vld1.u16    {d0,d1,d2,d3}, [r12]!
1764            vld1.u16    {d4,d5,d6}, [r12]!
1765
1766            adr         lr, 1f
1767  .irp r, TUNED_LIST1
1768            cmp         r5, #\r
1769            bls         convolve1_\r
1770  .endr
1771            b           convolve1_25
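            /* adr set lr to the label 1 below, so each bls above (and the
             * final b) behaves as a call whose return lands at the common
             * epilogue.
             */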
1772
17731:          vpop        {d8-d15}
1774            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
1775END(rsdIntrinsicBlurU1_K)
1776
1777/* void rsdIntrinsicBlurU4_K(
1778 *                  void *out,      // r0
1779 *                  void *in,       // r1
1780 *                  size_t w,       // r2
1781 *                  size_t h,       // r3
1782 *                  size_t p,       // [sp]
1783 *                  size_t x,       // [sp,#4]
1784 *                  size_t y,       // [sp,#8]
1785 *                  size_t count,   // [sp,#12]
1786 *                  size_t r,       // [sp,#16]
1787 *                  uint16_t *tab); // [sp,#20]
1788 */
1789ENTRY(rsdIntrinsicBlurU4_K)
1790            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
1791            vpush       {d8-d15}
1792            ldr         r6, [sp,#112]   // y
1793            ldr         r8, [sp,#108]   // x
1794            ldr         r5, [sp,#120]   // r
1795            lsl         r8, r8, #2
            rsb         r4, r8, r2, LSL #2 // inlen = (w - x) * 4
1797            sub         r7, r3, r6      // h - y
1798            ldr         r2, [sp,#104]   // pitch
1799            ldr         r3, [sp,#116]   // count
1800            sub         r7, r7, #1      // h - y - 1
            lsl         r3, r3, #2      // count, scaled to bytes
1802
1803            ldr         r12, [sp,#124]
1804
1805            add         r1, r1, r8      // in += x
1806
1807            cmp         r6, r5
1808            movhi       r6, r5          // rup = min(r, y)
1809            cmp         r7, r5
1810            movhi       r7, r5          // rdn = min(r, h - y - 1)
1811
1812            vld1.u16    {d0,d1,d2,d3}, [r12]!
1813            vld1.u16    {d4,d5,d6}, [r12]!
1814
1815            adr         lr, 1f
1816  .irp r, TUNED_LIST4
1817            cmp         r5, #\r
1818            bls         convolve4_\r
1819  .endr
1820            b           convolve4_25
1821
18221:          vpop        {d8-d15}
1823            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
1824END(rsdIntrinsicBlurU4_K)
1825