1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
/* ELF function bracketing helpers.  Note that on AArch64 gas `;` is a
 * statement separator (not a comment), so each of these expands to several
 * directives on one line.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;
20
21//#define ARCH_ARM64_USE_BLUR_PRELOAD
22
/* Number of fractional bits to preserve in intermediate results.  The
 * intermediate storage is 16-bit, and we started with 8 bit data (the integer
 * part), so this should be between 0 and 8.
 */
.set FRACTION_BITS, 7
/* Maximum supported convolution radius.  The unrolled loops below emit taps
 * up to this limit (guarded by `.if \i <= \max_r` in the fetch macro). */
.set MAX_R, 25
29
30
31/* A quick way of making a line of code conditional on some other condition.
32 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
33 * `ifcc`:
34 */
/* Emit the instruction \zzz only when the assemble-time flag `cc` is
 * non-zero.  The vararg parameter is what allows commas (operand lists) to
 * pass through unquoted. */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm
40
41/* It's not always clear that prefetching is beneficial and this needs further
42 * testing on different cores, so it's made switchable here.
43 */
#if defined(ARCH_ARM64_USE_BLUR_PRELOAD)
#define VERTPLD(...) prfm        PLDL1KEEP, [__VA_ARGS__]
#else
/* A nop rather than nothing: it keeps each unrolled loop iteration the same
 * size in both configurations, which the computed branch offsets in the
 * fetch macro below depend on. */
#define VERTPLD(...) nop
#endif
49
50/* Fetch 16 columns of bytes (regardless of image format), convolve these
51 * vertically, and leave them in the register file.  If working near the top or
52 * bottom of an image then clamp the addressing while loading the data in.
53 *
54 * The convolution is fully unrolled for windows up to max_r, with the
55 * outermost edges calculated first.  This way it's possible to branch directly
56 * into the relevant part of the code for an arbitrary convolution radius.  Two
57 * variants of the loop are produced; one eliminates the clamping code for a
58 * slight speed advantage.
59 *
60 * Where the macro is called with reg=x, the specified register is taken to
61 * contain a pre-calculated pointer into one of the two loops.
62 *
63 * Input:
64 *      x1 -- src
65 *      x2 -- pitch
66 *      x5 -- r
67 *      x6 -- rup (r, unless clipped to top of source image)
68 *      x7 -- rdn (r, unless clipped to bottom of source image)
69 *      x12 -- switch index
70 *      v0-v3 -- coefficient table
71 *      x13 = -pitch
72 *      x15 = top-row in
73 *      x19 = bottom-row in
74 * Output:
75 *      x1 += 16
76 *      v10,v11 -- 16 convolved columns
77 * Modifies:
78 *      x10 = upper row pointer
79 *      x11 = lower row pointer
80 *      v12-v15 = temporary sums
81 */
/* See the block comment above for the register contract.  When reg is left
 * as the default (x12) the entry point into the clamped loop is computed
 * here from the radius in x5; otherwise \reg is assumed to already hold a
 * pre-calculated pointer into one of the two loops. */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  /* cc enables the entry-point calculation below only for the default reg. */
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            /* Widen the 16 centre-row bytes to 16-bit columns. */
            uxtl        v14.8h, v15.8b
            VERTPLD(x1, #16)
            uxtl2       v15.8h, v15.16b
  /* Take the address of label 1 (the end of the clamped loop, below).  adr
   * reaches +/-1MB, which suffices for the smaller unrolled loops; larger
   * ones need the adrp/add pair. */
  .if \max_r < 16 // approximate: keep adr while its +/-1MB reach is enough
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

  /* Start the accumulators with the centre tap, and (interleaved for
   * scheduling) step the entry pointer back by r iterations:
   * r*64 - r*8 = r*56 bytes, i.e. 14 four-byte instructions per unrolled
   * clamped iteration below. */
            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* This version of the vertical fetch loop body is used away from the edges
   * of the source image.  The pointers start at the top and bottom source rows
   * and work their way towards the centre on each iteration.  This way the
   * number of taps used can be controlled by jumping directly into the middle
   * of the loop and running to completion.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly.
   */
  .macro vertfetch_noclamp i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            ld1         {v11.16b}, [x11], x13
            /* Sum the symmetric pair of rows, then accumulate with the
             * shared coefficient for this tap. */
            uaddl       v16.8h, v10.8b, v11.8b
            uaddl2      v11.8h, v10.16b, v11.16b
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* This version of the vertical fetch loop body is used near the edges of the
   * source image, where one or both of the accesses may start with a clamped
   * value, and the row addresses only begin to change after some number of
   * iterations before the end.
   * If the loop body changes size then the code which calculates the address
   * of the initial iteration must be updated accordingly.
   */
  .macro vertfetch_clamped i, dreg
    .if 0 < \i && \i <= \max_r
            ld1         {v10.16b}, [x10], x2
            cmp         x6, #\i
            ld1         {v11.16b}, [x11], x13
            /* While the tap index exceeds rup/rdn, re-clamp the row pointer
             * back to the top/bottom source row. */
            csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
            cmp         x7, #\i
            uaddl2      v11.8h, v10.16b, v11.16b
            csel        x11, x19, x11, lo
            umlal       v12.4s, v16.4h, \dreg
            umlal2      v13.4s, v16.8h, \dreg
            VERTPLD(x10, #32)
            umlal       v14.4s, v11.4h, \dreg
            VERTPLD(x11, #32)
            umlal2      v15.4s, v11.8h, \dreg
    .endif
  .endm

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelc at the end of the block.
   */
  .align 4
  vertfetch_clamped 27, v3.h[3]
  vertfetch_clamped 26, v3.h[2]
  vertfetch_clamped 25, v3.h[1]
  vertfetch_clamped 24, v3.h[0]
  vertfetch_clamped 23, v2.h[7]
  vertfetch_clamped 22, v2.h[6]
  vertfetch_clamped 21, v2.h[5]
  vertfetch_clamped 20, v2.h[4]
  vertfetch_clamped 19, v2.h[3]
  vertfetch_clamped 18, v2.h[2]
  vertfetch_clamped 17, v2.h[1]
  vertfetch_clamped 16, v2.h[0]
  vertfetch_clamped 15, v1.h[7]
  vertfetch_clamped 14, v1.h[6]
  vertfetch_clamped 13, v1.h[5]
  vertfetch_clamped 12, v1.h[4]
  vertfetch_clamped 11, v1.h[3]
  vertfetch_clamped 10, v1.h[2]
  vertfetch_clamped  9, v1.h[1]
  vertfetch_clamped  8, v1.h[0]
  vertfetch_clamped  7, v0.h[7]
  vertfetch_clamped  6, v0.h[6]
  vertfetch_clamped  5, v0.h[5]
  vertfetch_clamped  4, v0.h[4]
  vertfetch_clamped  3, v0.h[3]
  vertfetch_clamped  2, v0.h[2]
  vertfetch_clamped  1, v0.h[1]
  vertfetch_clamped  0, v0.h[0]
  1:
  \labelc : b 2f    /* done with clamped loop, skip over non-clamped loop */

  /* Entry into this unrolled loop is computed as a negative index from
   * \labelnc at the end of the block.
   */
  .align 4
  vertfetch_noclamp 27, v3.h[3]
  vertfetch_noclamp 26, v3.h[2]
  vertfetch_noclamp 25, v3.h[1]
  vertfetch_noclamp 24, v3.h[0]
  vertfetch_noclamp 23, v2.h[7]
  vertfetch_noclamp 22, v2.h[6]
  vertfetch_noclamp 21, v2.h[5]
  vertfetch_noclamp 20, v2.h[4]
  vertfetch_noclamp 19, v2.h[3]
  vertfetch_noclamp 18, v2.h[2]
  vertfetch_noclamp 17, v2.h[1]
  vertfetch_noclamp 16, v2.h[0]
  vertfetch_noclamp 15, v1.h[7]
  vertfetch_noclamp 14, v1.h[6]
  vertfetch_noclamp 13, v1.h[5]
  vertfetch_noclamp 12, v1.h[4]
  vertfetch_noclamp 11, v1.h[3]
  vertfetch_noclamp 10, v1.h[2]
  vertfetch_noclamp  9, v1.h[1]
  vertfetch_noclamp  8, v1.h[0]
  vertfetch_noclamp  7, v0.h[7]
  vertfetch_noclamp  6, v0.h[6]
  vertfetch_noclamp  5, v0.h[5]
  vertfetch_noclamp  4, v0.h[4]
  vertfetch_noclamp  3, v0.h[3]
  vertfetch_noclamp  2, v0.h[2]
  vertfetch_noclamp  1, v0.h[1]
  vertfetch_noclamp  0, v0.h[0]
  \labelnc :

  .purgem vertfetch_clamped
  .purgem vertfetch_noclamp

  /* Narrow the 32-bit sums back to 16-bit columns with rounding and
   * saturation, keeping FRACTION_BITS of fraction (the shift assumes the
   * coefficients carry 16 fractional bits -- see file header), and advance
   * the clamp-row pointers past the 16 columns just consumed. */
  2:        uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/
235
236/* Some portion of the convolution window (as much as will fit, and all of it
237 * for the uchar1 cases) is kept in the register file to avoid unnecessary
238 * memory accesses.  This forces the horizontal loops to be unrolled because
239 * there's no indexed addressing into the register file.
240 *
241 * As in the fetch macro, the operations are ordered from outside to inside, so
242 * that jumping into the middle of the block bypasses the unwanted window taps.
243 *
 * There are several variants of the macro because of the fixed offsets of the
245 * taps -- the wider the maximum radius the further the centre tap is from the
246 * most recently fetched data.  This means that pre-filling the window requires
247 * more data that won't be used and it means that rotating the window involves
248 * more mov operations.
249 *
250 * When the buffer gets too big the buffer at [x9] is used.
251 *
252 * Input:
 *      v16-v31,v4-v11 -- convolution window
254 *      x9 -- pointer to additional convolution window data
255 * Output:
256 *      x9 -- updated buffer pointer (if used)
257 *      d31 -- result to be stored
258 * Modifies:
259 *      x12 -- temp buffer pointer
260 *      v12-v13 -- temporaries for load and vext operations.
261 *      v14-v15 -- intermediate sums
262 */
#define TUNED_LIST1 8, 16
/* Horizontal convolution for uchar1 data with radius 1..\8.  The window of
 * 16-bit columns is held in v8-v11 with the centre column in v9; x5 holds
 * the radius.  A table of signed halfword offsets (relative to 100f)
 * selects the outermost tap to start at, and the unrolled chain then falls
 * through from tap 108 (r=8) down to 101 (r=1).
 */
.macro hconv1_8/*{{{*/

.rodata
    /* Branch-offset table indexed by radius.  NOTE(review): entry 0 (-4)
     * would land on the instruction before 100f; radius 0 appears never to
     * be dispatched here -- confirm against callers. */
    200:    .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .align      4
.text
            /* Centre tap. */
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            /* x12 = 100f + table[radius]; jump into the tap chain. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            /* A shift of four halfwords is exactly half a register, so use
             * the high/low halves directly and skip the ext shuffles. */
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow the sums, rounding and saturating, to the 8-bit result
             * in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one register (8 columns). */
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
342
/* Horizontal convolution for uchar1 data with radius 1..16.  The window of
 * 16-bit columns is held in v6-v11 with the centre column in v8; x5 holds
 * the radius.  Dispatch is via the signed-halfword offset table below,
 * jumping into the unrolled tap chain at entry 1(radius)f and falling
 * through to 101 (the innermost tap).
 */
.macro hconv1_16/*{{{*/
.rodata
   /* Branch-offset table indexed by radius.  NOTE(review): entry 0 (-4)
    * appears never to be dispatched -- confirm against callers. */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .align 4

.text
            /* Centre tap. */
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            /* x12 = 100f + table[radius]; jump into the tap chain. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
            //ext         v13.16b, v10.16b, v11.16b, #0*2
            /* Offset 0: whole registers, no ext needed. */
            umlal       v14.4s, v6.4h, v2.h[0]
            umlal2      v15.4s, v6.8h, v2.h[0]
            umlal       v14.4s, v10.4h, v2.h[0]
            umlal2      v15.4s, v10.8h, v2.h[0]
    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            /* Half-register offset: use high/low halves directly. */
            umlal2      v14.4s, v6.8h, v1.h[4]
            umlal       v15.4s, v7.4h, v1.h[4]
            umlal2      v14.4s, v9.8h, v1.h[4]
            umlal       v15.4s, v10.4h, v1.h[4]
    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
            //ext         v13.16b, v9.16b, v10.16b, #0*2
            /* Offset 0: whole registers, no ext needed. */
            umlal       v14.4s, v7.4h, v1.h[0]
            umlal2      v15.4s, v7.8h, v1.h[0]
            umlal       v14.4s, v9.4h, v1.h[0]
            umlal2      v15.4s, v9.8h, v1.h[0]
    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
            //ext         v13.16b, v8.16b, v9.16b, #4*2
            /* Half-register offset: use high/low halves directly. */
            umlal2      v14.4s, v7.8h, v0.h[4]
            umlal       v15.4s, v8.4h, v0.h[4]
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow the sums, rounding and saturating, to the 8-bit result
             * in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one register (8 columns). */
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
481
/* Horizontal convolution for uchar1 data with radius 1..25.  The window of
 * 16-bit columns is held in v31 and v4-v11; the centre column sits at an
 * odd offset (7 halfwords into v6/v7), hence the ext before the centre-tap
 * multiply.  x5 holds the radius; dispatch is via the signed-halfword
 * offset table below into the unrolled tap chain (125 down to 101).
 */
.macro hconv1_25/*{{{*/
.rodata
   /* Branch-offset table indexed by radius.  NOTE(review): entry 0 (-4)
    * appears never to be dispatched -- confirm against callers. */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .hword 117f-100f
            .hword 118f-100f
            .hword 119f-100f
            .hword 120f-100f
            .hword 121f-100f
            .hword 122f-100f
            .hword 123f-100f
            .hword 124f-100f
            .hword 125f-100f
            .align 4
.text
            /* Centre tap (centre column is 7 halfwords into v6:v7). */
            ext         v12.16b, v6.16b, v7.16b, #7*2
            umull       v14.4s, v12.4h, v0.h[0]
            umull2      v15.4s, v12.8h, v0.h[0]

            /* x12 = 100f + table[radius]; jump into the tap chain. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
            ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v13.4h, v3.h[1]
            umlal2      v15.4s, v13.8h, v3.h[1]
    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal2      v15.4s, v12.8h, v3.h[0]
            umlal       v14.4s, v13.4h, v3.h[0]
            umlal2      v15.4s, v13.8h, v3.h[0]
    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v13.4h, v2.h[7]
            umlal2      v15.4s, v13.8h, v2.h[7]
    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal2      v15.4s, v12.8h, v2.h[6]
            umlal       v14.4s, v13.4h, v2.h[6]
            umlal2      v15.4s, v13.8h, v2.h[6]
    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v13.4h, v2.h[5]
            umlal2      v15.4s, v13.8h, v2.h[5]
    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal2      v15.4s, v12.8h, v2.h[4]
            umlal       v14.4s, v13.4h, v2.h[4]
            umlal2      v15.4s, v13.8h, v2.h[4]
    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v13.4h, v2.h[3]
            umlal2      v15.4s, v13.8h, v2.h[3]
    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal2      v15.4s, v12.8h, v2.h[2]
            umlal       v14.4s, v13.4h, v2.h[2]
            umlal2      v15.4s, v13.8h, v2.h[2]
    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v12.4h, v2.h[1]
            umlal2      v15.4s, v12.8h, v2.h[1]
            umlal       v14.4s, v13.4h, v2.h[1]
            umlal2      v15.4s, v13.8h, v2.h[1]
    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v2.h[0]
            umlal2      v15.4s, v12.8h, v2.h[0]
            umlal       v14.4s, v13.4h, v2.h[0]
            umlal2      v15.4s, v13.8h, v2.h[0]
    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v13.4h, v1.h[4]
            umlal2      v15.4s, v13.8h, v1.h[4]
    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #0*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
            ext         v13.16b, v7.16b, v8.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v13.4h, v1.h[0]
            umlal2      v15.4s, v13.8h, v1.h[0]
    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
            ext         v13.16b, v7.16b, v8.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v7.16b, v8.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v7.16b, v8.16b, #4*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v7.16b, v8.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v13.4h, v0.h[4]
            umlal2      v15.4s, v13.8h, v0.h[4]
    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
            ext         v13.16b, v7.16b, v8.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v7.16b, v8.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v7.16b, v8.16b, #0*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow the sums, rounding and saturating, to the 8-bit result
             * in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the whole window along by one register (8 columns). */
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
686
#define TUNED_LIST4 6, 12, 20
/* Horizontal convolution for uchar4 data with radius 1..6.  Each pixel is
 * four halfword channels, so adjacent taps are four halfwords (half a
 * register) apart: whole-register multiplies alternate with high/low-half
 * (umlal2/umlal) pairs and no ext shuffles are needed.  The window is in
 * v4-v11 with the centre pixel pair in v7; x5 holds the radius.
 */
.macro hconv4_6/*{{{*/
.rodata
   /* Branch-offset table indexed by radius.  NOTE(review): entry 0 (-4)
    * appears never to be dispatched -- confirm against callers. */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .align      4
.text
            /* Centre tap. */
            umull       v14.4s, v7.4h, v0.h[0]
            umull2      v15.4s, v7.8h, v0.h[0]

            /* x12 = 100f + table[radius]; jump into the tap chain. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
    106:    umlal       v14.4s, v4.4h,  v0.h[6]
            umlal2      v15.4s, v4.8h,  v0.h[6]
            umlal       v14.4s, v10.4h, v0.h[6]
            umlal2      v15.4s, v10.8h, v0.h[6]
    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
            umlal       v15.4s, v5.4h, v0.h[5]
            umlal2      v14.4s, v9.8h, v0.h[5]
            umlal       v15.4s, v10.4h, v0.h[5]
    104:    umlal       v14.4s, v5.4h, v0.h[4]
            umlal2      v15.4s, v5.8h, v0.h[4]
            umlal       v14.4s, v9.4h, v0.h[4]
            umlal2      v15.4s, v9.8h, v0.h[4]
    103:    umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
            umlal2      v14.4s, v8.8h, v0.h[3]
            umlal       v15.4s, v9.4h, v0.h[3]
    102:    umlal       v14.4s, v6.4h, v0.h[2]
            umlal2      v15.4s, v6.8h, v0.h[2]
            umlal       v14.4s, v8.4h, v0.h[2]
            umlal2      v15.4s, v8.8h, v0.h[2]
    101:    umlal2      v14.4s, v6.8h, v0.h[1]
            umlal       v15.4s, v7.4h, v0.h[1]
            umlal2      v14.4s, v7.8h, v0.h[1]
            umlal       v15.4s, v8.4h, v0.h[1]

            /* Narrow the sums, rounding and saturating, to the 8-bit result
             * (two uchar4 pixels) in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window along by one register (two pixels). */
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
745
/* Horizontal convolution pass for 4-channel data, window radii 1..12.
 * The sliding window of vertically-convolved data is held in registers
 * v26-v31 then v4-v11; coefficients are in v0/v1 (one halfword per tap).
 * On exit the convolved, narrowed result is in v15.8b and the window has
 * been slid along by one register (16 bytes).
 * x5 holds the radius and selects the entry point via the offset table at
 * 200:, so the outermost taps are accumulated first and smaller radii
 * simply start further down the fall-through chain.
 */
.macro hconv4_12/*{{{*/
.rodata
            /* Halfword branch offsets, indexed by radius (x5), relative to
             * the `br` at 100f below.  Entry 0 is a -4 placeholder; it
             * appears unreachable (radius 0 never dispatches here) --
             * TODO(review): confirm and consider removing. */
   200:     .hword -4 // index 0 placeholder, assumed unused -- confirm
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .align 4
.text
            /* Centre tap (coefficient 0) starts the accumulators. */
            umull       v14.4s, v4.4h, v0.h[0]
            umull2      v15.4s, v4.8h, v0.h[0]

            /* Load table entry for this radius and branch into the chain.
             * The table lives in .rodata (execute-only-text friendly), so
             * it is addressed with adrp + :lo12:. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
            /* Each case 1NN accumulates the left and right taps at distance
             * NN (coefficient v0/v1 element NN), then falls through to the
             * next smaller distance. */
    112:    umlal       v14.4s, v26.4h, v1.h[4]
            umlal2      v15.4s, v26.8h, v1.h[4]
            umlal       v14.4s, v10.4h, v1.h[4]
            umlal2      v15.4s, v10.8h, v1.h[4]
    111:    umlal2      v14.4s, v26.8h, v1.h[3]
            umlal       v15.4s, v27.4h, v1.h[3]
            umlal2      v14.4s, v9.8h, v1.h[3]
            umlal       v15.4s, v10.4h, v1.h[3]
    110:    umlal       v14.4s, v27.4h, v1.h[2]
            umlal2      v15.4s, v27.8h, v1.h[2]
            umlal       v14.4s, v9.4h, v1.h[2]
            umlal2      v15.4s, v9.8h, v1.h[2]
    109:    umlal2      v14.4s, v27.8h, v1.h[1]
            umlal       v15.4s, v28.4h, v1.h[1]
            umlal2      v14.4s, v8.8h, v1.h[1]
            umlal       v15.4s, v9.4h, v1.h[1]
    108:    umlal       v14.4s, v28.4h, v1.h[0]
            umlal2      v15.4s, v28.8h, v1.h[0]
            umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
    107:    umlal2      v14.4s, v28.8h, v0.h[7]
            umlal       v15.4s, v29.4h, v0.h[7]
            umlal2      v14.4s, v7.8h, v0.h[7]
            umlal       v15.4s, v8.4h, v0.h[7]
    106:    umlal       v14.4s, v29.4h, v0.h[6]
            umlal2      v15.4s, v29.8h, v0.h[6]
            umlal       v14.4s, v7.4h, v0.h[6]
            umlal2      v15.4s, v7.8h, v0.h[6]
    105:    umlal2      v14.4s, v29.8h, v0.h[5]
            umlal       v15.4s, v30.4h, v0.h[5]
            umlal2      v14.4s, v6.8h, v0.h[5]
            umlal       v15.4s, v7.4h, v0.h[5]
    104:    umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
            umlal       v14.4s, v6.4h, v0.h[4]
            umlal2      v15.4s, v6.8h, v0.h[4]
    103:    umlal2      v14.4s, v30.8h, v0.h[3]
            umlal       v15.4s, v31.4h, v0.h[3]
            umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
    102:    umlal       v14.4s, v31.4h, v0.h[2]
            umlal2      v15.4s, v31.8h, v0.h[2]
            umlal       v14.4s, v5.4h, v0.h[2]
            umlal2      v15.4s, v5.8h, v0.h[2]
    101:    umlal2      v14.4s, v31.8h, v0.h[1]
            umlal       v15.4s, v4.4h,  v0.h[1]
            umlal2      v14.4s, v4.8h,  v0.h[1]
            umlal       v15.4s, v5.4h, v0.h[1]

            /* Saturating rounded narrow: 32-bit accumulators -> 16-bit,
             * then drop the remaining FRACTION_BITS to get 8-bit pixels. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the register-file window along by one register. */
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
839
/* Horizontal convolution pass for 4-channel data, window radii 1..20.
 * Same scheme as hconv4_12, but the window spans v18-v31 then v4-v11 and
 * coefficients span v0-v2.  Result lands in v15.8b; the window is slid
 * along by one register on exit.
 */
.macro hconv4_20/*{{{*/
.rodata
            /* Branch-offset table indexed by radius x5, relative to the
             * `br` at 100f.  Entry 0 (-4) is a placeholder; radius 0 is
             * assumed never to dispatch here -- TODO(review): confirm. */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .hword 117f-100f
            .hword 118f-100f
            .hword 119f-100f
            .hword 120f-100f
            .align 4
.text
            /* Centre tap (coefficient 0); the window centre is v28. */
            umull       v14.4s, v28.4h, v0.h[0]
            umull2      v15.4s, v28.8h, v0.h[0]

            /* Dispatch on radius via the .rodata offset table. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
            /* Case 1NN accumulates the tap pair at distance NN, then falls
             * through to the next smaller distance. */
    120:    umlal       v14.4s, v18.4h, v2.h[4]
            umlal2      v15.4s, v18.8h, v2.h[4]
            umlal       v14.4s, v10.4h, v2.h[4]
            umlal2      v15.4s, v10.8h, v2.h[4]
    119:    umlal2      v14.4s, v18.8h, v2.h[3]
            umlal       v15.4s, v19.4h, v2.h[3]
            umlal2      v14.4s, v9.8h,  v2.h[3]
            umlal       v15.4s, v10.4h, v2.h[3]
    118:    umlal       v14.4s, v19.4h, v2.h[2]
            umlal2      v15.4s, v19.8h, v2.h[2]
            umlal       v14.4s, v9.4h,  v2.h[2]
            umlal2      v15.4s, v9.8h,  v2.h[2]
    117:    umlal2      v14.4s, v19.8h, v2.h[1]
            umlal       v15.4s, v20.4h, v2.h[1]
            umlal2      v14.4s, v8.8h,  v2.h[1]
            umlal       v15.4s, v9.4h,  v2.h[1]
    116:    umlal       v14.4s, v20.4h, v2.h[0]
            umlal2      v15.4s, v20.8h, v2.h[0]
            umlal       v14.4s, v8.4h,  v2.h[0]
            umlal2      v15.4s, v8.8h,  v2.h[0]
    115:    umlal2      v14.4s, v20.8h, v1.h[7]
            umlal       v15.4s, v21.4h, v1.h[7]
            umlal2      v14.4s, v7.8h,  v1.h[7]
            umlal       v15.4s, v8.4h,  v1.h[7]
    114:    umlal       v14.4s, v21.4h, v1.h[6]
            umlal2      v15.4s, v21.8h, v1.h[6]
            umlal       v14.4s, v7.4h,  v1.h[6]
            umlal2      v15.4s, v7.8h,  v1.h[6]
    113:    umlal2      v14.4s, v21.8h, v1.h[5]
            umlal       v15.4s, v22.4h, v1.h[5]
            umlal2      v14.4s, v6.8h,  v1.h[5]
            umlal       v15.4s, v7.4h,  v1.h[5]
    112:    umlal       v14.4s, v22.4h, v1.h[4]
            umlal2      v15.4s, v22.8h, v1.h[4]
            umlal       v14.4s, v6.4h,  v1.h[4]
            umlal2      v15.4s, v6.8h,  v1.h[4]
    111:    umlal2      v14.4s, v22.8h, v1.h[3]
            umlal       v15.4s, v23.4h, v1.h[3]
            umlal2      v14.4s, v5.8h,  v1.h[3]
            umlal       v15.4s, v6.4h,  v1.h[3]
    110:    umlal       v14.4s, v23.4h, v1.h[2]
            umlal2      v15.4s, v23.8h, v1.h[2]
            umlal       v14.4s, v5.4h,  v1.h[2]
            umlal2      v15.4s, v5.8h,  v1.h[2]
    109:    umlal2      v14.4s, v23.8h, v1.h[1]
            umlal       v15.4s, v24.4h, v1.h[1]
            umlal2      v14.4s, v4.8h,  v1.h[1]
            umlal       v15.4s, v5.4h,  v1.h[1]
    108:    umlal       v14.4s, v24.4h, v1.h[0]
            umlal2      v15.4s, v24.8h, v1.h[0]
            umlal       v14.4s, v4.4h,  v1.h[0]
            umlal2      v15.4s, v4.8h,  v1.h[0]
    107:    umlal2      v14.4s, v24.8h, v0.h[7]
            umlal       v15.4s, v25.4h, v0.h[7]
            umlal2      v14.4s, v31.8h, v0.h[7]
            umlal       v15.4s, v4.4h,  v0.h[7]
    106:    umlal       v14.4s, v25.4h, v0.h[6]
            umlal2      v15.4s, v25.8h, v0.h[6]
            umlal       v14.4s, v31.4h, v0.h[6]
            umlal2      v15.4s, v31.8h, v0.h[6]
    105:    umlal2      v14.4s, v25.8h, v0.h[5]
            umlal       v15.4s, v26.4h, v0.h[5]
            umlal2      v14.4s, v30.8h, v0.h[5]
            umlal       v15.4s, v31.4h, v0.h[5]
    104:    umlal       v14.4s, v26.4h, v0.h[4]
            umlal2      v15.4s, v26.8h, v0.h[4]
            umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
    103:    umlal2      v14.4s, v26.8h, v0.h[3]
            umlal       v15.4s, v27.4h, v0.h[3]
            umlal2      v14.4s, v29.8h, v0.h[3]
            umlal       v15.4s, v30.4h, v0.h[3]
    102:    umlal       v14.4s, v27.4h, v0.h[2]
            umlal2      v15.4s, v27.8h, v0.h[2]
            umlal       v14.4s, v29.4h, v0.h[2]
            umlal2      v15.4s, v29.8h, v0.h[2]
    101:    umlal2      v14.4s, v27.8h, v0.h[1]
            umlal       v15.4s, v28.4h, v0.h[1]
            umlal2      v14.4s, v28.8h, v0.h[1]
            umlal       v15.4s, v29.4h, v0.h[1]

            /* Saturating rounded narrow back to 8-bit pixels. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the register-file window along by one register. */
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
981
/* Horizontal convolution pass for 4-channel data, radii up to the maximum
 * (MAX_R = 25).  The window no longer fits in the register file: the oldest
 * part is kept in a 64-byte circular spill buffer addressed by x9 (the
 * `bic x12, x12, #0x40` after each pointer bump implements the wrap), and
 * the rest lives in v17-v31 then v4-v11.  Coefficients span v0-v3.
 * Result lands in v15.8b; both the spill buffer and the register window are
 * slid along on exit.
 */
.macro hconv4_25/*{{{*/
.rodata
            /* Branch-offset table indexed by radius x5, relative to the
             * `br` at 100f.  Entry 0 (-4) is a placeholder; radius 0 is
             * assumed never to dispatch here -- TODO(review): confirm. */
   200:     .hword -4
            .hword 101f-100f
            .hword 102f-100f
            .hword 103f-100f
            .hword 104f-100f
            .hword 105f-100f
            .hword 106f-100f
            .hword 107f-100f
            .hword 108f-100f
            .hword 109f-100f
            .hword 110f-100f
            .hword 111f-100f
            .hword 112f-100f
            .hword 113f-100f
            .hword 114f-100f
            .hword 115f-100f
            .hword 116f-100f
            .hword 117f-100f
            .hword 118f-100f
            .hword 119f-100f
            .hword 120f-100f
            .hword 121f-100f
            .hword 122f-100f
            .hword 123f-100f
            .hword 124f-100f
            .hword 125f-100f
            .align 4
.text
            /* Centre tap; the window centre straddles v25/v26 here, hence
             * the umull2/umull split. */
            umull2      v14.4s, v25.8h, v0.h[0]
            umull       v15.4s, v26.4h, v0.h[0]

            /* Dispatch on radius via the .rodata offset table. */
            adrp        x16, 200b
            add         x16, x16, :lo12:200b
            ldrsh       x12, [x16, x5, LSL #1]
            adr         x16, 100f
            add         x12, x12, x16
    100:    br          x12
            /* Cases 118-125 fetch their left-hand taps from the circular
             * spill buffer at x9 (v12/v13 as temporaries), wrapping the
             * pointer with `bic ..., #0x40`; smaller cases use registers. */
    125:    ld1         {v12.8h}, [x9]
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v10.4h, v3.h[1]
            umlal2      v15.4s, v10.8h, v3.h[1]
    124:    add         x12, x9, #0x08
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal       v15.4s, v13.4h, v3.h[0]
            umlal2      v14.4s, v9.8h,  v3.h[0]
            umlal       v15.4s, v10.4h, v3.h[0]
    123:    add         x12, x9, #0x10
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v9.4h,  v2.h[7]
            umlal2      v15.4s, v9.8h,  v2.h[7]
    122:    add         x12, x9, #0x18
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal       v15.4s, v13.4h, v2.h[6]
            umlal2      v14.4s, v8.8h,  v2.h[6]
            umlal       v15.4s, v9.4h,  v2.h[6]
    121:    add         x12, x9, #0x20
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v8.4h,  v2.h[5]
            umlal2      v15.4s, v8.8h,  v2.h[5]
    120:    add         x12, x9, #0x28
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal       v15.4s, v13.4h, v2.h[4]
            umlal2      v14.4s, v7.8h,  v2.h[4]
            umlal       v15.4s, v8.4h,  v2.h[4]
    119:    add         x12, x9, #0x30
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v7.4h,  v2.h[3]
            umlal2      v15.4s, v7.8h,  v2.h[3]
    118:    add         x12, x9, #0x38
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12]
            /* Second half of this tap is the first window register (v17),
             * adjacent to the end of the spill buffer. */
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal       v15.4s, v17.4h, v2.h[2]
            umlal2      v14.4s, v6.8h,  v2.h[2]
            umlal       v15.4s, v7.4h,  v2.h[2]
    117:    umlal       v14.4s, v17.4h, v2.h[1]
            umlal2      v15.4s, v17.8h, v2.h[1]
            umlal       v14.4s, v6.4h,  v2.h[1]
            umlal2      v15.4s, v6.8h,  v2.h[1]
    116:    umlal2      v14.4s, v17.8h, v2.h[0]
            umlal       v15.4s, v18.4h, v2.h[0]
            umlal2      v14.4s, v5.8h,  v2.h[0]
            umlal       v15.4s, v6.4h,  v2.h[0]
    115:    umlal       v14.4s, v18.4h, v1.h[7]
            umlal2      v15.4s, v18.8h, v1.h[7]
            umlal       v14.4s, v5.4h,  v1.h[7]
            umlal2      v15.4s, v5.8h,  v1.h[7]
    114:    umlal2      v14.4s, v18.8h, v1.h[6]
            umlal       v15.4s, v19.4h, v1.h[6]
            umlal2      v14.4s, v4.8h,  v1.h[6]
            umlal       v15.4s, v5.4h,  v1.h[6]
    113:    umlal       v14.4s, v19.4h, v1.h[5]
            umlal2      v15.4s, v19.8h, v1.h[5]
            umlal       v14.4s, v4.4h,  v1.h[5]
            umlal2      v15.4s, v4.8h,  v1.h[5]
    112:    umlal2      v14.4s, v19.8h, v1.h[4]
            umlal       v15.4s, v20.4h, v1.h[4]
            umlal2      v14.4s, v31.8h, v1.h[4]
            umlal       v15.4s, v4.4h,  v1.h[4]
    111:    umlal       v14.4s, v20.4h, v1.h[3]
            umlal2      v15.4s, v20.8h, v1.h[3]
            umlal       v14.4s, v31.4h, v1.h[3]
            umlal2      v15.4s, v31.8h, v1.h[3]
    110:    umlal2      v14.4s, v20.8h, v1.h[2]
            umlal       v15.4s, v21.4h, v1.h[2]
            umlal2      v14.4s, v30.8h, v1.h[2]
            umlal       v15.4s, v31.4h, v1.h[2]
    109:    umlal       v14.4s, v21.4h, v1.h[1]
            umlal2      v15.4s, v21.8h, v1.h[1]
            umlal       v14.4s, v30.4h, v1.h[1]
            umlal2      v15.4s, v30.8h, v1.h[1]
    108:    umlal2      v14.4s, v21.8h, v1.h[0]
            umlal       v15.4s, v22.4h, v1.h[0]
            umlal2      v14.4s, v29.8h, v1.h[0]
            umlal       v15.4s, v30.4h, v1.h[0]
    107:    umlal       v14.4s, v22.4h, v0.h[7]
            umlal2      v15.4s, v22.8h, v0.h[7]
            umlal       v14.4s, v29.4h, v0.h[7]
            umlal2      v15.4s, v29.8h, v0.h[7]
    106:    umlal2      v14.4s, v22.8h, v0.h[6]
            umlal       v15.4s, v23.4h, v0.h[6]
            umlal2      v14.4s, v28.8h, v0.h[6]
            umlal       v15.4s, v29.4h, v0.h[6]
    105:    umlal       v14.4s, v23.4h, v0.h[5]
            umlal2      v15.4s, v23.8h, v0.h[5]
            umlal       v14.4s, v28.4h, v0.h[5]
            umlal2      v15.4s, v28.8h, v0.h[5]
    104:    umlal2      v14.4s, v23.8h, v0.h[4]
            umlal       v15.4s, v24.4h, v0.h[4]
            umlal2      v14.4s, v27.8h, v0.h[4]
            umlal       v15.4s, v28.4h, v0.h[4]
    103:    umlal       v14.4s, v24.4h, v0.h[3]
            umlal2      v15.4s, v24.8h, v0.h[3]
            umlal       v14.4s, v27.4h, v0.h[3]
            umlal2      v15.4s, v27.8h, v0.h[3]
    102:    umlal2      v14.4s, v24.8h, v0.h[2]
            umlal       v15.4s, v25.4h, v0.h[2]
            umlal2      v14.4s, v26.8h, v0.h[2]
            umlal       v15.4s, v27.4h, v0.h[2]
    101:    umlal       v14.4s, v25.4h, v0.h[1]
            umlal2      v15.4s, v25.8h, v0.h[1]
            umlal       v14.4s, v26.4h, v0.h[1]
            umlal2      v15.4s, v26.8h, v0.h[1]

            /* Saturating rounded narrow back to 8-bit pixels. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Slide the window: push the oldest register (v17) into the
             * circular spill buffer (wrapping x9 at 64 bytes), then shuffle
             * the register file along by one. */
            st1         {v17.16b}, [x9], #16
            bic         x9, x9, #0x40
            mov         v17.16b, v18.16b
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/
1179
1180/* Dedicated function wrapper for the fetch macro, for the cases where
1181 * performance isn't that important, to keep code size down.
1182 */
PRIVATE(fetch_generic_asm)
            /* Out-of-line wrapper around the `fetch` macro (defined earlier
             * in this file) for call sites where speed doesn't justify
             * inlining.  x10/x11 are saved and restored because the macro
             * body may modify them; other effects are those of `fetch`
             * itself (per its own documentation). */
            stp         x10, x11, [sp, #-16]!
            fetch
            ldp         x10, x11, [sp], #16
            ret
END(fetch_generic_asm)
1189
1190
1191/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory
1192 * beyond that limit, and filling the rest of the vector with the last legal
1193 * pixel.
1194 * Result is in v10 and v11.  v8 and v9 are filled with the first legal pixel.
1195 * Note: This function can read beyond the right edge of input if the image is
1196 * narrower than 16 bytes.
1197 */
PRIVATE(fetch_clampleft1)
            /* 1-channel left-edge fetch: fetch a chunk, then replicate the
             * first legal pixel (v10.h[0]) into v8/v9 as left padding.  If
             * the start isn't 16-aligned (x10 & 15 != 0), shift the data
             * right within the v10/v11 pair by staging {v8,v9,v10,v11} on
             * the stack and reloading v10/v11 from an offset address, so
             * the pad precedes the data; the source pointers (x1, and the
             * top/bottom row pointers x15/x19) and x10 are wound back to
             * match. */
            stp         x29, x30, [sp, #-16]!
            bl          fetch_generic_asm
            dup         v8.8h, v10.h[0]
            dup         v9.8h, v10.h[0]
            ands        x12, x10, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            /* x12 = sp - 2*x12 - 32: points into the staged block so that
             * x12 halfwords of padding land ahead of the fetched data. */
            sub         x12, sp, x12, LSL #1
            sub         sp, sp, #64
            sub         x12, x12, #32
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft1)
1218
PRIVATE(fetch_clampleft4)
            /* 4-channel variant of fetch_clampleft1: identical stack
             * shuffle, but the padding value is a whole uchar4 pixel, so
             * the first 4-halfword group (v10.d[0]) is replicated into
             * v8/v9 instead of a single halfword. */
            stp         x29, x30, [sp, #-16]!
            bl          fetch_generic_asm
            dup         v8.2d, v10.d[0]
            dup         v9.2d, v10.d[0]
            ands        x12, x10, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            sub         x10, x10, x12
            /* Stage {pad,pad,data,data} and reload v10/v11 shifted so the
             * pad precedes the data (see fetch_clampleft1). */
            sub         x12, sp, x12, LSL #1
            sub         sp, sp, #64
            sub         x12, x12, #32
            st1         {v8.8h, v9.8h, v10.8h,v11.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
1:          ldp         x29, x30, [sp], #16
            ret
END(fetch_clampleft4)
1239
1240/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding
1241 * reading memory beyond that limit, and filling the rest of the vector with
1242 * the last legal pixel.
1243 * Result is in v10 and v11.  v12 and v13 are filled with the last legal pixel.
1244 * Note: This function can read beyond the left edge of input if the image is
1245 * narrower than 16 bytes.
1246 */
PRIVATE(fetch_clampright1)
            /* 1-channel right-edge fetch: only (x11 & 15) columns (0 means
             * 16) are legal.  If the count isn't a full 16, wind the source
             * pointers (x1, x15, x19) back by the shortfall, fetch, then
             * replicate the last legal pixel (v11.h[7]) into v12/v13 and
             * shift v10/v11 left via a staged stack block so the data ends
             * exactly at the pair's end with padding after it.  The aligned
             * case (1:) just fetches and sets up v12/v13. */
            stp         x29, x30, [sp, #-16]!
            sub         x12, xzr, x11
            ands        x12, x12, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            /* Recompute the shortfall (x12 was clobbered by the fetch). */
            sub         x12, xzr, x11
            and         x12, x12, #15
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
1:          bl          fetch_generic_asm
            dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright1)
1273
PRIVATE(fetch_clampright4)
            /* 4-channel variant of fetch_clampright1: same pointer wind-back
             * and stack shuffle, but the padding value is the last whole
             * uchar4 pixel (v11.d[1]) replicated into v12/v13. */
            stp         x29, x30, [sp, #-16]!
            sub         x12, xzr, x11
            ands        x12, x12, #15
            beq         1f
            sub         x1, x1, x12
            sub         x15, x15, x12
            sub         x19, x19, x12
            bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            /* Recompute the shortfall (x12 was clobbered by the fetch). */
            sub         x12, xzr, x11
            and         x12, x12, #15
            sub         sp, sp, #64
            add         x12, sp, x12, LSL #1
            st1         {v10.8h,v11.8h,v12.8h,v13.8h}, [sp]
            ld1         {v10.8h,v11.8h}, [x12]
            add         sp, sp, #64
            ldp         x29, x30, [sp], #16
            ret
1:          bl          fetch_generic_asm
            dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ldp         x29, x30, [sp], #16
            ret
END(fetch_clampright4)
1300
1301/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th
1302 * value across to fill the rest of the register pair.  Used for filling the
1303 * right hand edge of the window when reading too close to the right hand edge
1304 * of the image.
1305 * Also returns a dup-ed copy of the last element in v12 for the tail-fill
1306 * case (this happens incidentally in common path, but must be done
1307 * deliberately in the fast-out path).
1308 */
PRIVATE(prefill_sweepright1)
            /* 1-channel sweep: replicate the (x11 & 15)th halfword of the
             * v10/v11 pair across the remainder of the pair, via a 64-byte
             * stack staging area, and leave that value dup-ed in v12/v13
             * for subsequent tail-fill.  If x11 is 16-aligned (beq 1f) the
             * pair is already full: just dup the final element.
             * Clobbers x12; uses 64 bytes of stack. */
            ands        x12, x11, #15
            beq         1f
            sub         x12, x12, #1
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1        // address of last valid halfword
            ld1r        {v12.8h}, [x12]
            ld1r        {v13.8h}, [x12]
            st1         {v12.8h,v13.8h}, [x12]      // sweep it across the tail
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.8h, v11.h[7]
            dup         v13.8h, v11.h[7]
            ret
END(prefill_sweepright1)
1326
PRIVATE(prefill_sweepright4)
            /* 4-channel sweep: replicate the last valid uchar4 pixel (the
             * 4-halfword group ending at index x11 & 15) across the rest of
             * the v10/v11 pair via a 64-byte stack staging area, and leave
             * that pixel dup-ed in v12 AND v13 for subsequent tail-fill
             * (prefill_rightfill consumes both).  If x11 is 16-aligned
             * (beq 1f) the pair is already full: just dup the final pixel.
             * Mirrors prefill_sweepright1, but with .2d (whole-pixel)
             * replication.  Clobbers x12; uses 64 bytes of stack.
             *
             * Fix: previously only v12 was loaded and a stale v13 was
             * stored over the staged data, corrupting the swept result and
             * leaving v13 undefined on this path. */
            ands        x12, x11, #15
            beq         1f
            sub         x12, x12, #4
            sub         sp, sp, #64
            st1         {v10.8h,v11.8h}, [sp]
            add         x12, sp, x12, LSL #1        // address of last valid pixel
            ld1r        {v12.2d}, [x12]             // replicate it across v12...
            ld1r        {v13.2d}, [x12]             // ...and v13 (was missing)
            st1         {v12.8h,v13.8h}, [x12]      // sweep it across the tail
            ld1         {v10.8h,v11.8h}, [sp]
            add         sp, sp, #64
            ret
1:          dup         v12.2d, v11.d[1]
            dup         v13.2d, v11.d[1]
            ret
END(prefill_sweepright4)
1343
1344/* The main loop keeps a sliding window of data that has already been convolved
1345 * in the vertical axis for the current line.  This usually stays in the
1346 * register file, but spills to memory for large windows.  The first thing that
1347 * needs to be done at start-up is to fill this window with image data, taking
1348 * into account the padding needed if the left or right edges of the image fall
1349 * within this window.
1350 */
1351
1352/* Because the window is in the register file writes to it cannot be indexed
1353 * by another register.  Consequently the fill loops are unrolled to address
1354 * the registers directly.  This macro distinguishes between writes to the
1355 * register file and writes to the spill buffer (indicated by a destination
1356 * register named xx).
1357 */
/* Emit the window-store step for one 32-byte chunk during prefill.
 * \sra/\srb are the source registers; \ra/\rb name the destination slots.
 * A destination named `xx` means "spill buffer" rather than a register:
 *   - both xx: append both sources to the circular buffer at x9;
 *   - only \ra is xx: wrap x9 (64-byte buffer, hence `bic #0x40`), store
 *     the first source, and move the second into its register slot;
 *   - neither: plain register moves (skipped when source == destination).
 */
.macro prefill_out ra, rb, sra, srb
  .ifc \ra,xx
    .ifc \rb,xx
            st1         {\sra,\srb}, [x9], #32
    .else
            bic         x9, x9, #0x40
            st1         {\sra}, [x9], #16
            mov         \rb, \srb
    .endif
  .else
    .ifnc \ra,\sra
            mov         \ra, \sra
    .endif
    .ifnc \rb,\srb
            mov         \rb, \srb
    .endif
  .endif
.endm
1376
1377/* This macro provides the list of registers representing the window, and the
1378 * cases where the register file is too small and a spill buffer is used
1379 * instead.
1380 * Since several specialisations of each function are generated, this also
1381 * culls superfluous iterations, and sets the variable `i` for subsequent
1382 * macros indicating the current index into the window.
1383 */
/* Unroll one prefill stage (\macro) across every 16-halfword slot of the
 * window, from the spill buffer (xx slots) down through the register pairs,
 * emitting an iteration only for slots the window actually covers
 * (`windowsize` is set by the caller -- defined outside this view).  Each
 * iteration gets a label \label\macro\line so a later stage can jump in at
 * the same position, and `i` is set to the slot's base index for use inside
 * the prefill_* macros.  Falls through to \label\()_end when done.
 */
.macro prefill_list, macro, nextmacro, max_r, step, label
  .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label
    .if windowsize >= (\line * 16)
      .set i, windowsize - (\line * 16)
\label\macro\line:
            prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step
    .endif
  .endm
            /* Slots listed outermost-first: spill buffer (xx), then the
             * register-file window pairs in sliding order. */
            ifneeded \macro \nextmacro, 13, 12, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 12, 11, xx,      xx,      \step, \label
            ifneeded \macro \nextmacro, 11, 10, xx,      v17.16b, \step, \label
            ifneeded \macro \nextmacro, 10,  9, v18.16b, v19.16b, \step, \label
            ifneeded \macro \nextmacro,  9,  8, v20.16b, v21.16b, \step, \label
            ifneeded \macro \nextmacro,  8,  7, v22.16b, v23.16b, \step, \label
            ifneeded \macro \nextmacro,  7,  6, v24.16b, v25.16b, \step, \label
            ifneeded \macro \nextmacro,  6,  5, v26.16b, v27.16b, \step, \label
            ifneeded \macro \nextmacro,  5,  4, v28.16b, v29.16b, \step, \label
            ifneeded \macro \nextmacro,  4,  3, v30.16b, v31.16b, \step, \label
            ifneeded \macro \nextmacro,  3,  2, v4.16b,  v5.16b,  \step, \label
            ifneeded \macro \nextmacro,  2,  1, v6.16b,  v7.16b,  \step, \label
            ifneeded \macro \nextmacro,  1,  0, v8.16b,  v9.16b,  \step, \label
\label\macro\()0:
            b           \label\()_end
  .purgem ifneeded
.endm
1409
1410/* These macros represent the possible stages of filling the window.
1411 * Each macro is unrolled enough times that it can fill the entire window
1412 * itself, but normally it will have to hand control to subsequent macros
1413 * part-way through and this is done using labels named \next and \after, where
1414 * \next is the next macro starting at the same window position and \after is
1415 * the next macro starting after the current window position.
1416 */
1417
1418/* leftfill: v8 and v9 contain the left padding value.  While the window
1419 * extends outside of the image on the left-hand side, and at least 16 more
1420 * padding values are needed in the window, store v8 and v9 into the window.
1421 * Otherwise skip forward to storing image data.
1422 */
/* Prefill stage: while at least 16 more left-padding values are needed
 * (x10, the fill start index, is beyond this slot), store the padding pair
 * v8/v9 into the window slot; otherwise hand over to the \next stage at the
 * same position. */
.macro prefill_leftfill, next, after, ra, rb, step
            cmp         x10, #i+16
            blo         \next
            prefill_out \ra, \rb, v8.16b, v9.16b
.endm
1428
1429/* leftedge: The very first non-fill or partial-fill chunk from the image is
1430 * already loaded (as it was used to calculate the left padding value), so
1431 * store it here, and then drop into the regular load/store cycle in the next
1432 * macro.
1433 */
/* Prefill stage: store the already-fetched first chunk (v10/v11, loaded
 * earlier to derive the padding value) into the window, then jump past this
 * position into the regular fetch stage.  The local `1:` label gives jump-in
 * points a target. */
.macro prefill_leftedge, next, after, ra, rb, step
1:          prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after
.endm
1438
1439/* dofetch: Copy chunks of the image into the window without any complications
1440 * from edge conditions.
1441 */
/* Prefill stage: plain interior copy.  While the fill stop index x11 is
 * beyond this slot, fetch the next chunk and store it; otherwise hand over
 * to the \next (right-edge) stage at the same position. */
.macro prefill_dofetch, next, after, ra, rb, step
            cmp         x11, #i+16
            bls         \next
            bl          fetch_generic_asm
            prefill_out \ra, \rb, v10.16b, v11.16b
.endm
1448
1449/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond
1450 * the right-hand edge of the image.  In that case sweep the last valid pixel
1451 * across the rest of the chunk, and in either case prepare padding data in v12
1452 * and v13 for the next macro.  This is done in fetch_clampright.
1453 * This only happens once before going on to the next macro.
1454 * Sometimes leftedge also covers the rightedge case, in which case this has
1455 * to be skipped altogether.
1456 */
/* Prefill stage: the one chunk that may straddle the image's right edge.
 * If any data remains (x11 beyond this slot), fetch with right-clamping
 * (which also prepares the padding pair v12/v13), store it, and jump past
 * this position; otherwise hand over to the \next (rightfill) stage. */
.macro prefill_rightedge, next, after, ra, rb, step
            cmp         x11, #i
            bls         \next
            bl          fetch_clampright\step
            prefill_out \ra, \rb, v10.16b, v11.16b
            b           \after
.endm
1464
1465/* rightfill: The rest of the window is simply filled with right padding from
1466 * v12 and v13.
1467 */
.macro prefill_rightfill, next, after, ra, rb, step
            // Unconditionally fill the remainder of the window with the right
            // padding prepared in v12/v13 by fetch_clampright.
            prefill_out \ra, \rb, v12.16b, v13.16b
.endm
1471
1472/* Here all of the macros above are unrolled and laid out in the proper order.
1473 */
.macro prefill_body, max_r, step, label
            // Emit the five fill phases in order.  prefill_list (defined
            // earlier in the file) unrolls each phase once per 16-byte chunk
            // of the window; each per-chunk macro above can branch forward to
            // the next phase (\next) or to \after — TODO confirm against
            // prefill_list's definition, which is outside this view.
            prefill_list leftfill,  leftedge,   \max_r, \step, \label
            prefill_list leftedge,  dofetch,    \max_r, \step, \label
            prefill_list dofetch,   rightedge,  \max_r, \step, \label
            prefill_list rightedge, rightfill,  \max_r, \step, \label
            prefill_list rightfill, oops,       \max_r, \step, \label
\label\()_end:
.endm
1482
1483
1484/* Fill the convolution window with context data.  The aim here is to load
1485 * exactly 2*r columns, and in the main loop to read as many columns as will be
1486 * written.  This is complicated by the window being divided into chunks at
1487 * register boundaries, and the need to handle cases when the input starts very
1488 * close to the left or right (or both) edges of the image and the need to fill
1489 * the spaces that leaves with left and right edge padding values.
1490 *
1491 * Input:
1492 *      x1 -- src
1493 *      x2 -- pitch
1494 *      x3 -- count
1495 *      x4 -- available image data right of src pointer
1496 *      x5 -- r
1497 *      x6 -- rup
1498 *      x7 -- rdn
1499 *      x8 -- available image data left of src pointer
1500 *      x9 -- buffer (if needed)
1501 *      x13 = -pitch
1502 *      x15 = top-row in
1503 *      x19 = bottom-row in
1504 * Output:
1505 *      x4 -= min(inlen, count + windowsize - centertap)
1506 *      x1 += min(inlen, count + windowsize - centertap)
1507 *      x15 += min(inlen, count + windowsize - centertap)
1508 *      x19 += min(inlen, count + windowsize - centertap)
1509 * Modifies:
1510 *      x10 -- fill start index in the window
1511 *      x11 -- fill stop index in the window
1512 *      x12 -- scratch
1513 */
.macro prefill step=1, max_r=25, label=xx
// windowsize: the window length in bytes (2*r columns of \step bytes),
// rounded up to a whole number of 16-byte chunks.
// centertap: the window byte offset corresponding to the column that the
// incoming src pointer refers to.
.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15)
.set centertap, (windowsize - \max_r * \step)
            mov         x10, #centertap
            subs        x10, x10, x8            // x10 = centertap - available-left...
            csel        x10, xzr, x10, lo       // ...clamped below at zero

            // x11 = min(windowsize, centertap + inlen)
            subs        x11, x4, #windowsize - centertap
            csel        x11, xzr, x11, hs
            add         x11, x11, #windowsize

            /* x10 indicates where in the window legal image data begins.
             * x11 indicates where in the window legal image data ends.
             * When starting near the centre of a large image these would be
             * zero and windowsize respectively, but when starting near the
             * edges this can change.
             * When starting on the leftmost pixel, x10 will be centertap.
             * When starting on the rightmost pixel, x11 will be centertap+1.
             */

            /* x4 indicates how much data there is between the current pointers
             * and the right edge of the image.  The pointers currently point
             * to the data needed at centertap.  The subsequent code will
             * consume (windowsize - x10) data, but only the data from
             * centertap to windowsize comes out of x4's budget.
             */
1:          subs        x4, x4, #windowsize - centertap
            csel        x4, xzr, x4, lo         // x4 = max(0, inlen - (windowsize - centertap))

            /* And the pointers need to rewind to the start of the window.
             */
            sub         x1, x1, #centertap
            sub         x15, x15, #centertap
            sub         x19, x19, #centertap

            /* Unless x8 indicated that there wasn't that much data available,
             * in which case skip forward to the first legal column (x10).
             */
            add         x1, x1, x10
            add         x15, x15, x10
            add         x19, x19, x10

            /* Get the first chunk, and add padding to align it to the window
             * if necessary.
             */
            bl          fetch_clampleft\step

            /* Sometimes the start and the end of the window are in the same
             * chunk.  In that case both ends need filler at the outset.
             * (x10 and x11-1 share a 16-byte chunk iff their xor is < 16.)
             */
            sub         x12, x11, #1
            eor         x12,  x10, x12
            cmp         x12, #16
            bhs         1f
            bl          prefill_sweepright\step

            /* Iterate through all the points in the window and fill them in
             * with padding or image data as needed.
             */
1:          prefill_body \max_r, \step, \label
.endm
1574
1575/* The main body of the convolve functions.  Having already pre-filled the
1576 * convolution window with 2*r input values, the logic settles into a regular
1577 * pattern of reading and writing at a 1:1 rate until either input or output
1578 * expires.  The input leads the output by r values, so when processing all the
1579 * way to the right-hand edge, or within r pixels of that edge, the input will
1580 * run out first.  In the case of very narrow images, or sub-windows starting
1581 * near the right edge, the input may already have run out while the
1582 * convolution window was being filled and this loop will start with a
1583 * zero-length input.
1584 *
1585 * Once the input runs out, the rest of the output must be processed by padding
1586 * the remainder of the window with pad value from the last valid pixel from
1587 * the source.
1588 *
1589 * Input:
1590 *      x0 = dst
1591 *      x1 = src
1592 *      x2 = pitch
1593 *      x3 = count
1594 *      x4 = inlen
1595 *      x5 = r
1596 *      x6 = rup
1597 *      x7 = rdn
1598 *      x9 = buffer
1599 *      x13 = -pitch
1600 *      x15 = top-row in
1601 *      x19 = bottom-row in
1602 * Modifies
1603 *      x8 = fetch code pointer
1604 */
.macro conv_body core, step=1, max_r=25, labelc="", labelnc=""

            /* If x4 >= x3 then there's no need for clipping.  The main loop
             * needs to exit when either x3 or x4 runs out, so clamp x4 to be
             * no greater than x3 and use x4 for the loop.
             * However, if x4 comes out of the loop with less than 16 bytes
             * left, a partial read would be necessary to avoid reading beyond
             * the end of the image.  To avoid this, clamp x4 to the next
             * multiple of 16, which is still sufficient to force it out of the
             * loop but doesn't imply a rewind.
             */
            add         x12, x3, #15            // x12 = count rounded up...
            bic         x12, x12, #15           // ...to a multiple of 16
            cmp         x4, x12
            csel        x4, x12, x4, hi         // x4 = min(inlen, x12)

            /* First calculate the entry-point into the internal fetch logic.
             * This is done so the same function can service several kernel
             * sizes.
             */
            adrp        x8, \labelnc
            add         x8, x8, #:lo12:\labelnc
            sub         x8, x8, x5, LSL #5
            sub         x8, x8, x5, LSL #3      // x8 = labelnc - r * 40
            cmp         x5, x6
            ccmp        x5, x7, #0, eq          // Z set iff r == rup && r == rdn
            beq         5f                      // short-cut table usable; enter loop

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version.
             */
            adrp        x8, \labelc
            add         x8, x8, #:lo12:\labelc
            sub         x8, x8, x5, LSL #6
            add         x8, x8, x5, LSL #3      // x8 = labelc - r * 56
            b           5f

            /* Main loop: ... */
            .align  4
3:          /* first perform a vertical convolution from memory to get the next
             * 16 taps of the horizontal window into the register file...
             */
            fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

            /* ...then perform a horizontal convolution on that window to
             * produce eight output bytes, and slide the window along.
             * This has to be done twice to match the 16-way vertical pass.
             * It would be preferable to have twice the work done in \core, but
             * that would demand yet another variant on those macros and would
             * perturb the register allocation severely.
             */
            \core
            st1         {v15.8b}, [x0], #8
            \core
            st1         {v15.8b}, [x0], #8

            sub         x3, x3, #16
5:          subs        x4, x4, #16             // loop entry: consume 16 input bytes
            bhi         3b
            /* Here there's 16 or fewer bytes available before the edge of the
             * source image.  x4 holds that count minus 16 (because it was
             * decremented before the first iteration ran).  The last read may
             * not be a whole chunk, and beyond that a fill value must be used.
             *
             * Of course, none of that matters if there's no more output to
             * produce...
             */
            cbz         x3, 5f                  // no output left: go to exit (second `5:`)

            /* Oh well. */
            adds        x4, x4, #16             // x4 = bytes left before the edge
            bne         1f                      // partial chunk remains: clamped fetch
            /* Exactly zero input left: pad the window by replicating the last
             * valid data already held in v9.
             */
  .if \step==1
            dup         v10.8h, v9.h[7]         // replicate last column (16-bit lanes)
            dup         v11.8h, v9.h[7]
  .else
            dup         v10.2d, v9.d[1]         // replicate last 8 bytes (one 4-channel pixel)
            dup         v11.2d, v9.d[1]
  .endif
            b           3f

            /* To avoid reading past end of input, rewind pointers by (16-x4)
             * to ensure that they're exactly 16 bytes from the edge.
             */
1:          mov         x11, x4                 // x11 = valid byte count for clamped fetch
            bl          fetch_clampright\step
            /* Now to put this padding to use, perform any remaining
             * iterations.  This is done at half the rate of the main loop,
             * because there's no longer pressure from a 16-lane window filler.
             */
3:          \core
  .if \step==1
            dup         v11.8h, v11.h[7]        // keep sliding the pad value along
  .else
            dup         v11.2d, v11.d[1]
  .endif
            subs        x3, x3, #8
            blo         4f                      // fewer than 8 outputs remain: partial store
            st1         {v15.8b}, [x0], #8
            bne         3b                      // x3 hit exactly zero: fall through to exit
            b           5f

            /* If the final iteration contained 0 < l < 8 values, then perform
             * a piecewise store of the final vector.
             * (The low three bits of x3 still hold the remaining count.)
             */
4:          tbz         x3, #2, 1f
            st1         {v15.s}[0], [x0], #4
            ext         v15.8b, v15.8b, v15.8b, #4
1:          tbz         x3, #1, 1f
            st1         {v15.h}[0], [x0], #2
            ext         v15.8b, v15.8b, v15.8b, #2
1:          tbz         x3, #0, 5f
            st1         {v15.b}[0], [x0], #1
            ext         v15.8b, v15.8b, v15.8b, #1
5:          mov         x0, #0                  // NOTE(review): x0 cleared on exit — confirm callers don't need dst
.endm
1721
1722
/* Instantiate a one-byte-per-pixel convolve worker for each tuned radius
 * (TUNED_LIST1 is a preprocessor list defined elsewhere — TODO confirm) plus
 * the generic maximum, 25.  Each worker pre-fills the window then runs the
 * main loop, using its radius-specific hconv1 core.
 */
.irp r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
            stp         x29,x30, [sp, #-16]!    // save FP/LR (bl is used inside)

            prefill     step=1, max_r=\r, label=.Lcnv1_\r

            conv_body   core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

            ldp         x29,x30, [sp], #16
            ret
END(convolve1_\r)
.endr
1735
/* Instantiate a four-byte-per-pixel convolve worker for each tuned radius in
 * TUNED_LIST4 (defined elsewhere) plus the maximum, 25.  These need a small
 * aligned scratch buffer on the stack in addition to the saved FP/LR.
 */
.irp r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
            sub         x9, sp, #0x40
            stp         x29,x30, [sp, #-(16 + 0x40 + 0x80)]!
            bic         x9, x9, #0x7f           // round buffer down to 128-byte alignment

            /* x9 now points to a 0x40 byte buffer on the stack whose address
             * has the low 7 bits clear.  This allows easy address calculation
             * in the wrap-around cases.  The extra 0x80 reserved above keeps
             * the rounded-down buffer inside this frame.
             */

            prefill     step=4, max_r=\r, label=.Lcnv4_\r

            conv_body   core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

            ldp         x29,x30, [sp], #(16 + 0x40 + 0x80)
            ret
END(convolve4_\r)
.endr
1755
1756/* void rsdIntrinsicBlurU1_K(
1757 *                  void *out,      // x0
1758 *                  void *in,       // x1
1759 *                  size_t w,       // x2
1760 *                  size_t h,       // x3
1761 *                  size_t p,       // x4
1762 *                  size_t x,       // x5
1763 *                  size_t y,       // x6
1764 *                  size_t count,   // x7
1765 *                  size_t r,       // [sp]
1766 *                  uint16_t *tab); // [sp,#8]
1767 */
ENTRY(rsdIntrinsicBlurU1_K)
            stp         x19,x30, [sp, #-16]!    // save callee-saved x19 and LR
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]  // save callee-saved d8-d15:
            st1         {v12.1d - v15.1d}, [x8] // d12-d15 land at [sp+32]
            mov         x8, x5          // x
            ldr         w5, [sp,#80]    // r  (first stack arg: 16+64 bytes pushed above it)
            sub         x9, x2, x8      // w - x
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            mov         x3, x7          // count
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88] // tab (second stack arg)

            add         x1, x1, x8      // src += x

            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)

            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1 // x15 = src - pitch * rup (top-row in)
            madd        x19, x2, x7, x1 // x19 = src + pitch * rdn (bottom-row in)

            // Load 32 16-bit convolution coefficients from tab into v0-v3.
            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* Dispatch to the smallest tuned worker that covers r.  The
             * workers are entered with plain branches, so point LR at the
             * common epilogue first; their final `ret` returns to 1:.
             */
            adr         x30, 1f
  .irp r, TUNED_LIST1
            cmp         x5, #\r
            bls         convolve1_\r
  .endr
            b           convolve1_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32     // restore d8-d15
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU1_K)
1811
1812/* void rsdIntrinsicBlurU4_K(
1813 *                  void *out,      // x0
1814 *                  void *in,       // x1
1815 *                  size_t w,       // x2
1816 *                  size_t h,       // x3
1817 *                  size_t p,       // x4
1818 *                  size_t x,       // x5
1819 *                  size_t y,       // x6
1820 *                  size_t count,   // x7
1821 *                  size_t r,       // [sp]
1822 *                  uint16_t *tab); // [sp,#8]
1823 */
ENTRY(rsdIntrinsicBlurU4_K)
            stp         x19,x30, [sp, #-16]!    // save callee-saved x19 and LR
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]  // save callee-saved d8-d15:
            st1         {v12.1d - v15.1d}, [x8] // d12-d15 land at [sp+32]
            lsl         x8, x5, #2      // x, in bytes (4 bytes per pixel)
            lsl         x2, x2, #2      // w, in bytes
            ldr         w5, [sp,#80]    // r  (first stack arg: 16+64 bytes pushed above it)
            sub         x9, x2, x8      // w - x  (bytes)
            sub         x10, x3, x6     // h - y
            mov         x2, x4          // pitch
            lsl         x3, x7, #2      // count, in bytes
            sub         x7, x10, #1     // h - y - 1
            mov         x4, x9          // inlen = (w - x)

            ldr         x12, [sp, #88]  // tab (second stack arg)

            add         x1, x1, x8      // in += x

            cmp         x6, x5
            csel        x6, x5, x6, hs  // rup = min(r, y)
            cmp         x7, x5
            csel        x7, x5, x7, hs  // rdn = min(r, h - y - 1)


            sub         x13, xzr, x2    // -pitch
            msub        x15, x2, x6, x1 // x15 = in - pitch * rup (top-row in)
            madd        x19, x2, x7, x1 // x19 = in + pitch * rdn (bottom-row in)

            // Load 32 16-bit convolution coefficients from tab into v0-v3.
            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* Dispatch to the smallest tuned worker that covers r.  The
             * workers are entered with plain branches, so point LR at the
             * common epilogue first; their final `ret` returns to 1:.
             */
            adr         x30, 1f
  .irp r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r
  .endr
            b           convolve4_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32     // restore d8-d15
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)
1869