1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  * Copyright (C) 2016 Mopria Alliance, Inc.
4  * Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  *      http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "wprint_scaler.h"
20 #include <assert.h>
21 #include <stdio.h>
22 
// Rounds x down to a multiple of 4 by clearing its two lowest bits.
#define ROUND_4_DOWN(x) ((x) & ~3)
// Rounds x up to a multiple of 4 (adds 3, then rounds down).
#define ROUND_4_UP(x)   (ROUND_4_DOWN((x) + 3))
// Number of quotient bits _scaler_fraction_part() computes for a fraction.
#define PSCALER_FRACT_BITS_COUNT 24
26 
// Selects how _scaler_fraction_part() treats precision beyond the bits it computes.
typedef enum {
    FRACTION_ROUND_UP,  // round the computed fraction upward
    FRACTION_TRUNCATE   // return the computed bits unchanged
} pscaler_fraction_t;
31 
// Forward declarations of file-local helpers that are called before they are defined.

// Returns the fractional part of iNum / iDen; *overflow reports a carry out of a round-up.
static uint32
        _scaler_fraction_part(uint32 iNum, uint32 iDen, pscaler_fraction_t mode, bool_t *overflow);

// Runs one scaling pass using the buffers and factors held in pscaler_config.
// NOTE(review): the definition is not in the portion of the file reviewed here.
static void _hw_scale_image_plane(scaler_config_t *pscaler_config, scaler_mode_t scaleMode);

// Recomputes the X/Y scale factors and their inverses stored in pscaler_config.
static void _calculate_factors(scaler_config_t *pscaler_config, scaler_mode_t scaleMode);
38 
/*
 * Prepares a scaler configuration for scaling an entire image.
 *
 * Stores the source and output dimensions, picks the scale mode from how those dimensions
 * compare, computes the scale factors, and then resets the row window, buffer widths and
 * buffer pointers so that the configuration describes the whole image.
 *
 * image_input_width, image_input_height    dimensions of the source image
 * image_output_width, image_output_height  dimensions of the scaled image
 * image_input_buf_width                    stored as the source buffer width
 * image_output_buf_width                   stored as the output buffer width
 * pscaler_config                           configuration to initialize
 */
void scaler_make_image_scaler_tables(uint16 image_input_width, uint16 image_input_buf_width,
        uint16 image_output_width, uint16 image_output_buf_width, uint16 image_input_height,
        uint16 image_output_height, scaler_config_t *pscaler_config) {
    scaler_config_t *cfg = pscaler_config;
    const int width_grows = image_input_width < image_output_width;
    const int width_shrinks = image_input_width > image_output_width;
    const int height_grows = image_input_height < image_output_height;
    const int height_shrinks = image_input_height > image_output_height;

    cfg->iSrcWidth = image_input_width;
    cfg->iSrcHeight = image_input_height;
    cfg->iOutWidth = image_output_width;
    cfg->iOutHeight = image_output_height;

    if (!width_grows && !height_grows) {
        // neither axis gets larger: scale DOWN (also covers identical sizes)
        cfg->scaleMode = PSCALER_SCALE_DOWN;
    } else if (!width_shrinks && !height_shrinks) {
        // neither axis gets smaller: scale UP
        cfg->scaleMode = PSCALER_SCALE_UP;
    } else if (width_shrinks) {
        // width shrinks while height grows: mixed scale, Y-axis first
        cfg->scaleMode = PSCALER_SCALE_MIXED_YUP;
    } else {
        // width grows while height shrinks: mixed scale, X-axis first
        cfg->scaleMode = PSCALER_SCALE_MIXED_XUP;
    }

    // The factors depend on the dimensions and mode stored above
    _calculate_factors(cfg, cfg->scaleMode);

    // Initial window covers the whole image:
    //  source rows 0 .. source height, output rows 0 .. output height
    cfg->fSrcStartRow.decimal = 0;
    cfg->fSrcStartRow.fraction = 0;
    cfg->iSrcStartRow = 0;
    cfg->iSrcEndRow = cfg->iSrcHeight;
    cfg->iSrcBufWidth = image_input_buf_width;

    cfg->iOutStartRow = 0;
    cfg->iOutEndRow = cfg->iOutHeight;
    cfg->iOutBufWidth = image_output_buf_width;

    // No buffers are attached until data is actually scaled
    cfg->pSrcBuf = NULL;
    cfg->pOutBuf = NULL;
    cfg->pTmpBuf = NULL;
}
78 
/*
 * Works out which source rows are needed to produce a range of output rows.
 *
 * The requested output range is stored in the configuration (widened to two rows when a
 * single row is requested, and clamped to the output height), the matching source start/end
 * rows are derived from the height ratio, and the results are reported through the out
 * parameters.
 *
 * start_output_row_number, end_output_row_number  requested output row range
 * tables_ptr                                      scaler_config_t to update
 * start_input_row_number, end_input_row_number    receive the source row range
 * num_output_rows_generated                       receives (out end - out start + 1) + 1
 * num_rows_offset_to_start_output_row             receives 1 when the range was extended by one
 *                                                 row before the requested row, otherwise 0
 * mixed_axis_temp_buffer_size_needed              receives the temp buffer size for the mixed
 *                                                 scale modes, or 0 for the other modes
 */
void scaler_calculate_scaling_rows(uint16 start_output_row_number, uint16 end_output_row_number,
        void *tables_ptr, uint16 *start_input_row_number, uint16 *end_input_row_number,
        uint16 *num_output_rows_generated, uint16 *num_rows_offset_to_start_output_row,
        uint32 *mixed_axis_temp_buffer_size_needed) {
    scaler_config_t *cfg = (scaler_config_t *) tables_ptr;
    float64_t src_end;
    bool_t overflow;
    uint32 numerator;
    uint16 rows_to_skip = 0;
    int y_upscale;

    assert (start_output_row_number < cfg->iOutHeight);

    // Record the requested range; a single-row request is never passed on as-is
    cfg->iOutStartRow = start_output_row_number;
    cfg->iOutEndRow = end_output_row_number;
    if (end_output_row_number == start_output_row_number) {
        if (start_output_row_number == 0) {
            // first row: extend the range downward
            cfg->iOutEndRow = end_output_row_number + 1;
        } else {
            // any other row: extend upward and tell the caller to skip the extra row
            cfg->iOutStartRow = start_output_row_number - 1;
            rows_to_skip = 1;
        }
    }
    *num_rows_offset_to_start_output_row = rows_to_skip;

    // last stripe: keep the end row inside the output image
    if (cfg->iOutEndRow >= cfg->iOutHeight) {
        cfg->iOutEndRow = cfg->iOutHeight - 1;
    }

    y_upscale = (cfg->scaleMode == PSCALER_SCALE_UP) ||
            (cfg->scaleMode == PSCALER_SCALE_MIXED_YUP);
    if (y_upscale) {
        // when scaling up vertically the ratio used is (dim-1)/(dim-1)
        cfg->iSrcHeight--;
        cfg->iOutHeight--;
    }

    // Source start row = out start row * src height / out height, fraction rounded up
    numerator = (uint32) cfg->iOutStartRow * (uint32) cfg->iSrcHeight;
    cfg->fSrcStartRow.decimal = numerator / (uint32) cfg->iOutHeight;
    cfg->fSrcStartRow.fraction = _scaler_fraction_part(numerator, (uint32) cfg->iOutHeight,
            FRACTION_ROUND_UP, &overflow);
    if (overflow) {
        cfg->fSrcStartRow.decimal++;
    }
    cfg->iSrcStartRow = cfg->fSrcStartRow.decimal;

    // Source end row: based on the end row when scaling up, on the row after it otherwise
    if (y_upscale) {
        numerator = (uint32) cfg->iOutEndRow * (uint32) cfg->iSrcHeight;
    } else {
        numerator = (uint32) (cfg->iOutEndRow + 1) * (uint32) cfg->iSrcHeight;
    }
    src_end.decimal = numerator / (uint32) cfg->iOutHeight;
    src_end.fraction = _scaler_fraction_part(numerator, (uint32) cfg->iOutHeight,
            FRACTION_TRUNCATE, &overflow);
    cfg->iSrcEndRow = (uint16) src_end.decimal;

    if (y_upscale) {
        if (src_end.fraction != 0) {
            // a partial source row is needed, which also yields one more output row
            cfg->iSrcEndRow++;
            cfg->iOutEndRow++;
        }

        // undo the (dim-1) adjustment made above
        cfg->iSrcHeight++;
        cfg->iOutHeight++;
    } else if (src_end.fraction == 0) {
        // the position landed exactly on a row boundary: the previous row is the last one used
        cfg->iSrcEndRow--;
    }

    // last stripe: never read past the source image
    if (cfg->iSrcEndRow >= cfg->iSrcHeight) {
        cfg->iSrcEndRow = cfg->iSrcHeight - 1;
    }

    *start_input_row_number = cfg->iSrcStartRow;
    *end_input_row_number = cfg->iSrcEndRow;
    *num_output_rows_generated = (cfg->iOutEndRow - cfg->iOutStartRow + 1);

    // Size of the intermediate buffer used between the two passes of a mixed scale
    switch (cfg->scaleMode) {
        case PSCALER_SCALE_MIXED_XUP:
            *mixed_axis_temp_buffer_size_needed =
                    ROUND_4_UP(cfg->iOutWidth + 1) *
                            (*end_input_row_number - *start_input_row_number + 1);
            break;
        case PSCALER_SCALE_MIXED_YUP:
            *mixed_axis_temp_buffer_size_needed =
                    ROUND_4_UP(cfg->iSrcWidth) * (*num_output_rows_generated + 1);
            break;
        default:
            *mixed_axis_temp_buffer_size_needed = 0;
            break;
    }

    (*num_output_rows_generated)++;
}
190 
/*
 * Scales one plane of image data using a previously prepared configuration.
 *
 * For the plain up/down modes the scaler is run once from input_plane to
 * scaled_output_plane. For the mixed modes two passes are run: the first writes into the
 * temp buffer with one axis held at 1::1, the second reads the temp buffer and writes the
 * real output with the other axis held at 1::1. Every configuration field that is altered
 * for a pass is put back afterwards.
 *
 * input_plane                         source pixel data
 * tables_ptr                          scaler_config_t describing the scale
 * scaled_output_plane                 destination for the scaled data
 * temp_buffer_for_mixed_axis_scaling  intermediate buffer, used only in the mixed modes
 */
void scaler_scale_image_data(uint8 *input_plane, void *tables_ptr, uint8 *scaled_output_plane,
        uint8 *temp_buffer_for_mixed_axis_scaling) {
    scaler_config_t *cfg = (scaler_config_t *) tables_ptr;
    uint8 *parked_buf;

    cfg->pSrcBuf = input_plane;
    cfg->pOutBuf = scaled_output_plane;

    if ((cfg->scaleMode != PSCALER_SCALE_MIXED_XUP) &&
            (cfg->scaleMode != PSCALER_SCALE_MIXED_YUP)) {
        // Single pass: run the photo scaler hardware directly
        _hw_scale_image_plane(cfg, cfg->scaleMode);
        return;
    }

    // ---- pass 1: scale up along one axis into the temp buffer ----
    cfg->pTmpBuf = temp_buffer_for_mixed_axis_scaling;
    parked_buf = cfg->pOutBuf;      // remember the real output buffer
    cfg->pOutBuf = cfg->pTmpBuf;

    if (cfg->scaleMode == PSCALER_SCALE_MIXED_YUP) {
        // Y-axis first: keep the width at 1::1 for this pass
        uint16 saved_out_width = cfg->iOutWidth;
        uint16 saved_out_buf_width = cfg->iOutBufWidth;

        cfg->iOutWidth = cfg->iSrcWidth;
        cfg->iOutBufWidth = cfg->iSrcBufWidth;

        _calculate_factors(cfg, PSCALER_SCALE_UP);
        _hw_scale_image_plane(cfg, PSCALER_SCALE_UP);

        cfg->iOutWidth = saved_out_width;
        cfg->iOutBufWidth = saved_out_buf_width;
    } else {
        // X-axis first: keep the height and row window at 1::1 for this pass
        uint16 saved_out_height = cfg->iOutHeight;
        uint16 saved_out_start = cfg->iOutStartRow;
        uint16 saved_out_end = cfg->iOutEndRow;
        float64_t saved_start;

        saved_start.fraction = cfg->fSrcStartRow.fraction;

        cfg->iOutHeight = cfg->iSrcHeight;
        cfg->iOutStartRow = cfg->iSrcStartRow;
        cfg->iOutEndRow = cfg->iSrcEndRow;
        cfg->fSrcStartRow.fraction = 0;

        _calculate_factors(cfg, PSCALER_SCALE_UP);
        _hw_scale_image_plane(cfg, PSCALER_SCALE_UP);

        cfg->iOutHeight = saved_out_height;
        cfg->iOutStartRow = saved_out_start;
        cfg->iOutEndRow = saved_out_end;
        cfg->fSrcStartRow.fraction = saved_start.fraction;
    }

    // ---- pass 2: scale down along the other axis from the temp buffer ----
    cfg->pOutBuf = parked_buf;      // write to the real output again
    parked_buf = cfg->pSrcBuf;      // remember the real input buffer
    cfg->pSrcBuf = cfg->pTmpBuf;    // pass 1 result becomes the input

    if (cfg->scaleMode == PSCALER_SCALE_MIXED_YUP) {
        // height is already final: hold it and the row window at 1::1
        uint16 saved_src_height = cfg->iSrcHeight;
        uint16 saved_src_start = cfg->iSrcStartRow;
        uint16 saved_src_end = cfg->iSrcEndRow;
        float64_t saved_start;

        saved_start.decimal = cfg->fSrcStartRow.decimal;
        saved_start.fraction = cfg->fSrcStartRow.fraction;

        cfg->iSrcHeight = cfg->iOutHeight;
        cfg->iSrcStartRow = cfg->iOutStartRow;
        cfg->iSrcEndRow = cfg->iOutEndRow;
        cfg->fSrcStartRow.decimal = cfg->iOutStartRow;
        cfg->fSrcStartRow.fraction = 0;

        _calculate_factors(cfg, PSCALER_SCALE_DOWN);
        _hw_scale_image_plane(cfg, PSCALER_SCALE_DOWN);

        cfg->iSrcHeight = saved_src_height;
        cfg->iSrcStartRow = saved_src_start;
        cfg->iSrcEndRow = saved_src_end;
        cfg->fSrcStartRow.decimal = saved_start.decimal;
        cfg->fSrcStartRow.fraction = saved_start.fraction;
    } else {
        // width is already final: hold it at 1::1
        uint16 saved_src_width = cfg->iSrcWidth;
        uint16 saved_src_buf_width = cfg->iSrcBufWidth;

        cfg->iSrcWidth = cfg->iOutWidth;
        cfg->iSrcBufWidth = cfg->iOutBufWidth;

        _calculate_factors(cfg, PSCALER_SCALE_DOWN);
        _hw_scale_image_plane(cfg, PSCALER_SCALE_DOWN);

        cfg->iSrcWidth = saved_src_width;
        cfg->iSrcBufWidth = saved_src_buf_width;
    }

    // put the real input buffer back and let go of the temp buffer
    cfg->pTmpBuf = cfg->pSrcBuf;
    cfg->pSrcBuf = parked_buf;
    cfg->pTmpBuf = NULL;
}
323 
/*
 * Computes the horizontal and vertical scale factors, and their inverses, from the
 * dimensions currently stored in the configuration.
 *
 * Forward factors (out/src) have their fraction truncated; inverse factors (src/out) have
 * their fraction rounded up, with a carry added to the integer part when the round-up
 * overflows. For the scale-up modes the affected dimensions are temporarily reduced by one
 * so the ratio is (dim-1)/(dim-1); they are put back before returning.
 *
 * pscaler_config  configuration whose fXfactor, fXfactorInv, fYfactor and fYfactorInv are
 *                 written
 * scaleMode       NOTE(review): not consulted here - the mode stored in pscaler_config
 *                 decides which dimensions are adjusted; confirm this is intended
 */
static void _calculate_factors(scaler_config_t *pscaler_config, scaler_mode_t scaleMode) {
    scaler_config_t *cfg = pscaler_config;
    const int adjust_heights = (cfg->scaleMode == PSCALER_SCALE_UP) ||
            (cfg->scaleMode == PSCALER_SCALE_MIXED_YUP);
    const int adjust_widths = (cfg->scaleMode == PSCALER_SCALE_UP) ||
            (cfg->scaleMode == PSCALER_SCALE_MIXED_XUP);
    bool_t overflow;
    uint32 src_w, out_w, src_h, out_h;

    // scale up factors are computed as (dim-1)/(dim-1)
    if (adjust_heights) {
        cfg->iSrcHeight--;
        cfg->iOutHeight--;
    }
    if (adjust_widths) {
        cfg->iSrcWidth--;
        cfg->iOutWidth--;
    }

    src_w = (uint32) cfg->iSrcWidth;
    out_w = (uint32) cfg->iOutWidth;
    src_h = (uint32) cfg->iSrcHeight;
    out_h = (uint32) cfg->iOutHeight;

    // X factor: out / src, truncated
    cfg->fXfactor.decimal = out_w / src_w;
    cfg->fXfactor.fraction = _scaler_fraction_part(out_w, src_w, FRACTION_TRUNCATE, &overflow);

    // inverse X factor: src / out, rounded up
    cfg->fXfactorInv.decimal = src_w / out_w;
    cfg->fXfactorInv.fraction = _scaler_fraction_part(src_w, out_w, FRACTION_ROUND_UP,
            &overflow);
    if (overflow) {
        cfg->fXfactorInv.decimal++;
    }

    // Y factor: out / src, truncated
    cfg->fYfactor.decimal = out_h / src_h;
    cfg->fYfactor.fraction = _scaler_fraction_part(out_h, src_h, FRACTION_TRUNCATE, &overflow);

    // inverse Y factor: src / out, rounded up
    cfg->fYfactorInv.decimal = src_h / out_h;
    cfg->fYfactorInv.fraction = _scaler_fraction_part(src_h, out_h, FRACTION_ROUND_UP,
            &overflow);
    if (overflow) {
        cfg->fYfactorInv.decimal++;
    }

    // restore the original dimensions
    if (adjust_heights) {
        cfg->iSrcHeight++;
        cfg->iOutHeight++;
    }
    if (adjust_widths) {
        cfg->iSrcWidth++;
        cfg->iOutWidth++;
    }
}
384 
_scaler_fraction_part(uint32 iNum,uint32 iDen,pscaler_fraction_t mode,bool_t * overflow)385 static uint32 _scaler_fraction_part(uint32 iNum, uint32 iDen, pscaler_fraction_t mode,
386         bool_t *overflow) {
387     uint32 iFract;     // fractional part
388     uint32 iRem;       // remainder part
389     int i;          // loop counter
390 
391     *overflow = 0;
392     iFract = 0;
393     iRem = iNum % iDen;
394 
395     if (iRem == 0) {
396         return (0);
397     }
398 
399     for (i = PSCALER_FRACT_BITS_COUNT - 1; i >= 0; i--) {
400         iRem <<= 1;
401 
402         if (iRem == iDen) {
403             iFract |= (1 << i);
404             break;
405         } else if (iRem > iDen) {
406             iFract |= (1 << i);
407             iRem -= iDen;
408         }
409     }
410 
411     if (mode == FRACTION_TRUNCATE) {
412         return (iFract << 8);
413     } else {
414         if (iRem == 0) {
415             return (iFract << 8);
416         } else {
417             if (iFract < 0x00ffffff) {
418                 iFract++;
419                 return (iFract << 8);
420             } else {
421                 *overflow = 1;
422                 return (0);
423             }
424         }
425     }
426 }
427 
428 #define _RESTRICT_ __restrict__
429 
_scale_row_down_9in(uint8 * _RESTRICT_ in0,uint8 * _RESTRICT_ in1,uint8 * _RESTRICT_ in2,uint8 * _RESTRICT_ in3,uint8 * _RESTRICT_ in4,uint8 * _RESTRICT_ in5,uint8 * _RESTRICT_ in6,uint8 * _RESTRICT_ in7,uint8 * _RESTRICT_ in8,uint8 * _RESTRICT_ out,uint64 position_x,uint64 x_factor_inv,uint32 top_weight,uint32 bot_weight,uint32 weight_reciprocal,int out_width)430 static inline void _scale_row_down_9in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
431         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
432         uint8 *_RESTRICT_ in6, uint8 *_RESTRICT_ in7, uint8 *_RESTRICT_ in8, uint8 *_RESTRICT_ out,
433         uint64 position_x, uint64 x_factor_inv, uint32 top_weight, uint32 bot_weight,
434         uint32 weight_reciprocal, int out_width) {
435     int x;
436     uint32 in_col;
437     sint32 total_weight;
438 
439     for (x = 0; x < out_width; x++) {
440         uint32 acc_r = 0;
441         uint32 acc_g = 0;
442         uint32 acc_b = 0;
443         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
444         total_weight = x_factor_inv >> 24;
445 
446         in_col = position_x >> 32;
447 
448         while (total_weight > 0) {
449             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
450             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
451             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
452             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
453             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight << 8;
454             acc_r += (uint32) in5[(in_col * 3) + 0] * curr_weight << 8;
455             acc_r += (uint32) in6[(in_col * 3) + 0] * curr_weight << 8;
456             acc_r += (uint32) in7[(in_col * 3) + 0] * curr_weight << 8;
457             acc_r += (uint32) in8[(in_col * 3) + 0] * curr_weight * bot_weight;
458 
459             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
460             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
461             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
462             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
463             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight << 8;
464             acc_g += (uint32) in5[(in_col * 3) + 1] * curr_weight << 8;
465             acc_g += (uint32) in6[(in_col * 3) + 1] * curr_weight << 8;
466             acc_g += (uint32) in7[(in_col * 3) + 1] * curr_weight << 8;
467             acc_g += (uint32) in8[(in_col * 3) + 1] * curr_weight * bot_weight;
468 
469             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
470             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
471             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
472             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
473             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight << 8;
474             acc_b += (uint32) in5[(in_col * 3) + 2] * curr_weight << 8;
475             acc_b += (uint32) in6[(in_col * 3) + 2] * curr_weight << 8;
476             acc_b += (uint32) in7[(in_col * 3) + 2] * curr_weight << 8;
477             acc_b += (uint32) in8[(in_col * 3) + 2] * curr_weight * bot_weight;
478 
479             in_col++;
480 
481             total_weight -= curr_weight;
482             curr_weight = total_weight > 256 ? 256 : total_weight;
483         }
484 
485         position_x += x_factor_inv;
486 
487         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
488         out[(x * 3) + 0] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
489         out[(x * 3) + 0] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
490     }
491 }
492 
_scale_row_down_8in(uint8 * _RESTRICT_ in0,uint8 * _RESTRICT_ in1,uint8 * _RESTRICT_ in2,uint8 * _RESTRICT_ in3,uint8 * _RESTRICT_ in4,uint8 * _RESTRICT_ in5,uint8 * _RESTRICT_ in6,uint8 * _RESTRICT_ in7,uint8 * _RESTRICT_ out,uint64 position_x,uint64 x_factor_inv,uint32 top_weight,uint32 bot_weight,uint32 weight_reciprocal,int out_width)493 static inline void _scale_row_down_8in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
494         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
495         uint8 *_RESTRICT_ in6, uint8 *_RESTRICT_ in7, uint8 *_RESTRICT_ out, uint64 position_x,
496         uint64 x_factor_inv, uint32 top_weight,
497         uint32 bot_weight, uint32 weight_reciprocal,
498         int out_width) {
499     int x;
500     uint32 in_col;
501     sint32 total_weight;
502 
503     for (x = 0; x < out_width; x++) {
504         uint32 acc_r = 0;
505         uint32 acc_g = 0;
506         uint32 acc_b = 0;
507         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
508         total_weight = x_factor_inv >> 24;
509 
510         in_col = position_x >> 32;
511 
512         while (total_weight > 0) {
513             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
514             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
515             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
516             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
517             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight << 8;
518             acc_r += (uint32) in5[(in_col * 3) + 0] * curr_weight << 8;
519             acc_r += (uint32) in6[(in_col * 3) + 0] * curr_weight << 8;
520             acc_r += (uint32) in7[(in_col * 3) + 0] * curr_weight * bot_weight;
521 
522             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
523             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
524             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
525             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
526             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight << 8;
527             acc_g += (uint32) in5[(in_col * 3) + 1] * curr_weight << 8;
528             acc_g += (uint32) in6[(in_col * 3) + 1] * curr_weight << 8;
529             acc_g += (uint32) in7[(in_col * 3) + 1] * curr_weight * bot_weight;
530 
531             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
532             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
533             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
534             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
535             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight << 8;
536             acc_b += (uint32) in5[(in_col * 3) + 2] * curr_weight << 8;
537             acc_b += (uint32) in6[(in_col * 3) + 2] * curr_weight << 8;
538             acc_b += (uint32) in7[(in_col * 3) + 2] * curr_weight * bot_weight;
539 
540             in_col++;
541 
542             total_weight -= curr_weight;
543             curr_weight = total_weight > 256 ? 256 : total_weight;
544         }
545 
546         position_x += x_factor_inv;
547 
548         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
549         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
550         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
551     }
552 }
553 
/*
 * Produces one scaled-down output row from seven input rows of 3-byte pixels.
 *
 * For each output pixel the source columns starting at (position_x >> 32) are accumulated
 * per channel. The first column is weighted by 256 minus bits 24..31 of position_x, later
 * columns by up to 256, until the total horizontal weight (x_factor_inv >> 24) is used up.
 * Vertically, in0 is weighted by top_weight, in6 by bot_weight and the rows in between by
 * 256. Each sum is multiplied by weight_reciprocal, rounded, and shifted right by 32.
 *
 * in0..in6           the seven input rows, top to bottom
 * out                output row; out_width * 3 bytes are written
 * position_x         starting horizontal source position (column index in the upper 32 bits)
 * x_factor_inv       amount position_x advances per output pixel
 * top_weight         weight applied to in0
 * bot_weight         weight applied to in6
 * weight_reciprocal  multiplier that normalizes the accumulated sums
 * out_width          number of output pixels to generate
 */
static inline void _scale_row_down_7in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
        uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
        uint8 *_RESTRICT_ in6, uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv,
        uint32 top_weight, uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    int out_col;

    for (out_col = 0; out_col < out_width; out_col++) {
        uint32 sum[3] = {0, 0, 0};  // one accumulator per channel
        uint32 weight = 256 - ((position_x >> 24) & 0xff);
        sint32 remaining = x_factor_inv >> 24;
        uint32 src_col = position_x >> 32;
        int ch;

        for (; remaining > 0; src_col++) {
            for (ch = 0; ch < 3; ch++) {
                const uint32 idx = (src_col * 3) + ch;
                // interior rows all carry the full vertical weight of 256 (the << 8 below)
                const uint32 middle = (uint32) in1[idx] + in2[idx] + in3[idx] + in4[idx] +
                        in5[idx];

                sum[ch] += (uint32) in0[idx] * weight * top_weight;
                sum[ch] += (middle * weight) << 8;
                sum[ch] += (uint32) in6[idx] * weight * bot_weight;
            }

            // leftover weight moves on to the next column, capped at a full column
            remaining -= weight;
            weight = remaining > 256 ? 256 : remaining;
        }

        position_x += x_factor_inv;

        for (ch = 0; ch < 3; ch++) {
            out[(out_col * 3) + ch] =
                    ((uint64) sum[ch] * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
        }
    }
}
609 
/*
 * Produces one scaled-down output row from six input rows of 3-byte pixels.
 *
 * For each output pixel the source columns starting at (position_x >> 32) are accumulated
 * per channel. The first column is weighted by 256 minus bits 24..31 of position_x, later
 * columns by up to 256, until the total horizontal weight (x_factor_inv >> 24) is used up.
 * Vertically, in0 is weighted by top_weight, in5 by bot_weight and the rows in between by
 * 256. Each sum is multiplied by weight_reciprocal, rounded, and shifted right by 32.
 *
 * in0..in5           the six input rows, top to bottom
 * out                output row; out_width * 3 bytes are written
 * position_x         starting horizontal source position (column index in the upper 32 bits)
 * x_factor_inv       amount position_x advances per output pixel
 * top_weight         weight applied to in0
 * bot_weight         weight applied to in5
 * weight_reciprocal  multiplier that normalizes the accumulated sums
 * out_width          number of output pixels to generate
 */
static inline void _scale_row_down_6in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
        uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
        uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv, uint32 top_weight,
        uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    int out_col;

    for (out_col = 0; out_col < out_width; out_col++) {
        uint32 sum[3] = {0, 0, 0};  // one accumulator per channel
        uint32 weight = 256 - ((position_x >> 24) & 0xff);
        sint32 remaining = x_factor_inv >> 24;
        uint32 src_col = position_x >> 32;
        int ch;

        for (; remaining > 0; src_col++) {
            for (ch = 0; ch < 3; ch++) {
                const uint32 idx = (src_col * 3) + ch;
                // interior rows all carry the full vertical weight of 256 (the << 8 below)
                const uint32 middle = (uint32) in1[idx] + in2[idx] + in3[idx] + in4[idx];

                sum[ch] += (uint32) in0[idx] * weight * top_weight;
                sum[ch] += (middle * weight) << 8;
                sum[ch] += (uint32) in5[idx] * weight * bot_weight;
            }

            // leftover weight moves on to the next column, capped at a full column
            remaining -= weight;
            weight = remaining > 256 ? 256 : remaining;
        }

        position_x += x_factor_inv;

        for (ch = 0; ch < 3; ch++) {
            out[(out_col * 3) + ch] =
                    ((uint64) sum[ch] * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
        }
    }
}
662 
/*
 * Area-averages five vertically adjacent input rows into one output row.
 *
 * Pixels are three interleaved bytes (accumulated here in r, g, b order). position_x is
 * the source x coordinate of the first output pixel and x_factor_inv the source distance
 * between consecutive output pixels; in both, bits 32 and up select the source column and
 * bits 24..31 are used as an 8-bit fraction. Every output pixel consumes
 * x_factor_inv >> 24 units of horizontal weight, where 256 units equal one full column.
 * Vertically, in0 is weighted by top_weight, in4 by bot_weight and the rows in between by
 * a full 256. The 32-bit sum is multiplied by weight_reciprocal, rounded, and shifted
 * right by 32 to produce the output byte.
 */
static inline void _scale_row_down_5in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
        uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ out,
        uint64 position_x, uint64 x_factor_inv, uint32 top_weight, uint32 bot_weight,
        uint32 weight_reciprocal, int out_width) {
    const uint64 rounding = (uint64) 1 << 31;
    int out_px;

    for (out_px = 0; out_px < out_width; out_px++) {
        uint32 sum[3] = {0, 0, 0};
        // The first column only contributes the part to the right of position_x.
        uint32 col_weight = 256 - ((position_x >> 24) & 0xff);
        sint32 remaining = x_factor_inv >> 24;
        uint32 src_col = position_x >> 32;
        uint32 ch;

        for (; remaining > 0; src_col++) {
            uint32 base = src_col * 3;

            for (ch = 0; ch < 3; ch++) {
                uint32 idx = base + ch;
                // Rows 1..3 all carry the full weight of 256, so add them up first.
                uint32 interior = (uint32) in1[idx] + in2[idx] + in3[idx];

                sum[ch] += (uint32) in0[idx] * col_weight * top_weight;
                sum[ch] += (interior * col_weight) << 8;
                sum[ch] += (uint32) in4[idx] * col_weight * bot_weight;
            }

            // Later columns take a full 256 units, or whatever is left for the last one.
            remaining -= col_weight;
            col_weight = remaining > 256 ? 256 : remaining;
        }

        for (ch = 0; ch < 3; ch++) {
            out[(out_px * 3) + ch] = ((uint64) sum[ch] * weight_reciprocal + rounding) >> 32;
        }

        position_x += x_factor_inv;
    }
}
712 
/*
 * Area-averages four vertically adjacent input rows into one output row.
 *
 * Pixels are three interleaved bytes (accumulated here in r, g, b order). position_x is
 * the source x coordinate of the first output pixel and x_factor_inv the source distance
 * between consecutive output pixels; in both, bits 32 and up select the source column and
 * bits 24..31 are used as an 8-bit fraction. Every output pixel consumes
 * x_factor_inv >> 24 units of horizontal weight, where 256 units equal one full column.
 * Vertically, in0 is weighted by top_weight, in3 by bot_weight and the two rows in
 * between by a full 256. The 32-bit sum is multiplied by weight_reciprocal, rounded, and
 * shifted right by 32 to produce the output byte.
 */
static inline void _scale_row_down_4in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
        uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ out, uint64 position_x,
        uint64 x_factor_inv, uint32 top_weight, uint32 bot_weight, uint32 weight_reciprocal,
        int out_width) {
    const uint64 rounding = (uint64) 1 << 31;
    int out_px;

    for (out_px = 0; out_px < out_width; out_px++) {
        uint32 sum[3] = {0, 0, 0};
        // The first column only contributes the part to the right of position_x.
        uint32 col_weight = 256 - ((position_x >> 24) & 0xff);
        sint32 remaining = x_factor_inv >> 24;
        uint32 src_col = position_x >> 32;
        uint32 ch;

        for (; remaining > 0; src_col++) {
            uint32 base = src_col * 3;

            for (ch = 0; ch < 3; ch++) {
                uint32 idx = base + ch;
                // Rows 1 and 2 both carry the full weight of 256, so add them up first.
                uint32 interior = (uint32) in1[idx] + in2[idx];

                sum[ch] += (uint32) in0[idx] * col_weight * top_weight;
                sum[ch] += (interior * col_weight) << 8;
                sum[ch] += (uint32) in3[idx] * col_weight * bot_weight;
            }

            // Later columns take a full 256 units, or whatever is left for the last one.
            remaining -= col_weight;
            col_weight = remaining > 256 ? 256 : remaining;
        }

        for (ch = 0; ch < 3; ch++) {
            out[(out_px * 3) + ch] = ((uint64) sum[ch] * weight_reciprocal + rounding) >> 32;
        }

        position_x += x_factor_inv;
    }
}
759 
/*
 * Area-averages three vertically adjacent input rows into one output row.
 *
 * Pixels are three interleaved bytes (accumulated here in r, g, b order). position_x is
 * the source x coordinate of the first output pixel and x_factor_inv the source distance
 * between consecutive output pixels; in both, bits 32 and up select the source column and
 * bits 24..31 are used as an 8-bit fraction. Every output pixel consumes
 * x_factor_inv >> 24 units of horizontal weight, where 256 units equal one full column.
 * Vertically, in0 is weighted by top_weight, in2 by bot_weight and in1 by a full 256.
 * The 32-bit sum is multiplied by weight_reciprocal, rounded, and shifted right by 32 to
 * produce the output byte.
 */
static inline void _scale_row_down_3in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
        uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv,
        uint32 top_weight, uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    const uint64 rounding = (uint64) 1 << 31;
    int out_px;

    for (out_px = 0; out_px < out_width; out_px++) {
        uint32 sum[3] = {0, 0, 0};
        // The first column only contributes the part to the right of position_x.
        uint32 col_weight = 256 - ((position_x >> 24) & 0xff);
        sint32 remaining = x_factor_inv >> 24;
        uint32 src_col = position_x >> 32;
        uint32 ch;

        for (; remaining > 0; src_col++) {
            uint32 base = src_col * 3;

            for (ch = 0; ch < 3; ch++) {
                uint32 idx = base + ch;

                sum[ch] += (uint32) in0[idx] * col_weight * top_weight;
                sum[ch] += ((uint32) in1[idx] * col_weight) << 8;
                sum[ch] += (uint32) in2[idx] * col_weight * bot_weight;
            }

            // Later columns take a full 256 units, or whatever is left for the last one.
            remaining -= col_weight;
            col_weight = remaining > 256 ? 256 : remaining;
        }

        for (ch = 0; ch < 3; ch++) {
            out[(out_px * 3) + ch] = ((uint64) sum[ch] * weight_reciprocal + rounding) >> 32;
        }

        position_x += x_factor_inv;
    }
}
802 
/*
 * Area-averages two vertically adjacent input rows into one output row.
 *
 * Pixels are three interleaved bytes (accumulated here in r, g, b order). position_x is
 * the source x coordinate of the first output pixel and x_factor_inv the source distance
 * between consecutive output pixels; in both, bits 32 and up select the source column and
 * bits 24..31 are used as an 8-bit fraction. Every output pixel consumes
 * x_factor_inv >> 24 units of horizontal weight, where 256 units equal one full column.
 * Vertically, in0 is weighted by top_weight and in1 by bot_weight. The 32-bit sum is
 * multiplied by weight_reciprocal, rounded, and shifted right by 32 to produce the
 * output byte.
 */
static inline void _scale_row_down_2in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
        uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv, uint32 top_weight,
        uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    const uint64 rounding = (uint64) 1 << 31;
    int out_px;

    for (out_px = 0; out_px < out_width; out_px++) {
        uint32 sum[3] = {0, 0, 0};
        // The first column only contributes the part to the right of position_x.
        uint32 col_weight = 256 - ((position_x >> 24) & 0xff);
        sint32 remaining = x_factor_inv >> 24;
        uint32 src_col = position_x >> 32;
        uint32 ch;

        for (; remaining > 0; src_col++) {
            uint32 base = src_col * 3;

            for (ch = 0; ch < 3; ch++) {
                uint32 idx = base + ch;

                sum[ch] += (uint32) in0[idx] * col_weight * top_weight;
                sum[ch] += (uint32) in1[idx] * col_weight * bot_weight;
            }

            // Later columns take a full 256 units, or whatever is left for the last one.
            remaining -= col_weight;
            col_weight = remaining > 256 ? 256 : remaining;
        }

        for (ch = 0; ch < 3; ch++) {
            out[(out_px * 3) + ch] = ((uint64) sum[ch] * weight_reciprocal + rounding) >> 32;
        }

        position_x += x_factor_inv;
    }
}
842 
/*
 * Produces one downscaled output row from the block of input rows starting at in.
 *
 * The vertical footprint of the output row is y_factor_inv >> 24 weight units (256 units
 * equal one full input row). It is split into a top row weighted by top_weight (derived
 * from bits 24..31 of position_y and capped at the total), zero or more full-weight rows,
 * and a bottom row weighted by bot_weight. Footprints of two to nine rows are handed to
 * the matching _scale_row_down_Nin kernel, with consecutive rows in_row_ofs bytes apart;
 * every other row count is processed by the generic loop at the end.
 */
static inline void _scale_row_down(uint8 *in, uint8 *_RESTRICT_ out, uint32 in_row_ofs,
        uint64 position_x, uint64 position_y, uint64 x_factor_inv, uint64 y_factor_inv,
        uint32 weight_reciprocal, int out_width) {
    uint32 in_rows, top_weight, bot_weight;
    sint32 remaining_y = y_factor_inv >> 24;

    // Share of the first row that lies below position_y, limited to the whole footprint.
    top_weight = (uint32) 256 - ((position_y >> 24) & 0xff);
    if ((sint32) top_weight > remaining_y) {
        top_weight = remaining_y;
    }
    remaining_y -= top_weight;

    // The last row gets the fractional remainder, or a full 256 when the remainder is a
    // non-zero multiple of 256, or nothing when no weight is left.
    bot_weight = remaining_y & 0xff;
    if (bot_weight == 0 && remaining_y > 255) {
        bot_weight = 256;
    }
    remaining_y -= bot_weight;

    // What is left must be a whole number of full-weight rows.
    assert(remaining_y >= 0);
    assert((remaining_y & 0xff) == 0);

    in_rows = 2 + (remaining_y >> 8);

    switch (in_rows) {
        case 2:
            _scale_row_down_2in(in, in + in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 3:
            _scale_row_down_3in(in, in + in_row_ofs, in + 2 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 4:
            _scale_row_down_4in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 5:
            _scale_row_down_5in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
                    in + 4 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 6:
            _scale_row_down_6in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
                    in + 4 * in_row_ofs, in + 5 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 7:
            _scale_row_down_7in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
                    in + 4 * in_row_ofs, in + 5 * in_row_ofs, in + 6 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 8:
            _scale_row_down_8in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
                    in + 4 * in_row_ofs, in + 5 * in_row_ofs, in + 6 * in_row_ofs,
                    in + 7 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        case 9:
            _scale_row_down_9in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
                    in + 4 * in_row_ofs, in + 5 * in_row_ofs, in + 6 * in_row_ofs,
                    in + 7 * in_row_ofs, in + 8 * in_row_ofs,
                    out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
                    out_width);
            break;
        default: {
            // Generic path: same weighting as the unrolled kernels, with a row loop.
            const uint64 rounding = (uint64) 1 << 31;
            int out_px;

            for (out_px = 0; out_px < out_width; out_px++) {
                uint32 sum[3] = {0, 0, 0};
                uint32 col_weight = 256 - ((position_x >> 24) & 0xff);
                sint32 remaining_x = x_factor_inv >> 24;
                uint32 src_col = position_x >> 32;
                uint32 ch;

                for (; remaining_x > 0; src_col++) {
                    uint32 base = src_col * 3;
                    uint32 row;

                    // Top row.
                    for (ch = 0; ch < 3; ch++) {
                        sum[ch] += (uint32) in[base + ch] * col_weight * top_weight;
                    }

                    // Full-weight rows between top and bottom.
                    for (row = 1; row < in_rows - 1; row++) {
                        for (ch = 0; ch < 3; ch++) {
                            sum[ch] += (uint32) in[row * in_row_ofs + (base + ch)] *
                                    col_weight * 256;
                        }
                    }

                    // Bottom row: row now indexes the first row after the full-weight ones.
                    for (ch = 0; ch < 3; ch++) {
                        sum[ch] += (uint32) in[row * in_row_ofs + (base + ch)] *
                                col_weight * bot_weight;
                    }

                    remaining_x -= col_weight;
                    col_weight = remaining_x > 256 ? 256 : remaining_x;
                }

                for (ch = 0; ch < 3; ch++) {
                    out[(out_px * 3) + ch] =
                            ((uint64) sum[ch] * weight_reciprocal + rounding) >> 32;
                }

                position_x += x_factor_inv;
            }
            break;
        }
    }
}
955 
/*
 * Produces one upscaled output row by bilinear interpolation between two input rows.
 *
 * in0 and in1 are the upper and lower source rows and weight_y is the blend factor
 * applied towards in1. position_x is the source x coordinate of the first output pixel
 * and increment_x the step per output pixel; bits 32 and up select the left source pixel
 * and the top 10 bits of the low 32 form the horizontal blend factor. Pixels are three
 * interleaved bytes. The pixel to the right of the selected one is always read as well.
 */
static void _scale_row_up(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1, uint8 *_RESTRICT_ out,
        sint32 weight_y, uint64 position_x, uint64 increment_x, int out_width) {
    int out_px;

    for (out_px = 0; out_px < out_width; out_px++, position_x += increment_x) {
        // The position carries 32 fractional bits, but only the top 10 steer the blend.
        uint32 src_px = position_x >> 32;
        sint32 weight_x = (position_x & 0xffffffff) >> 22;
        uint32 left = src_px * 3;
        uint32 right = (src_px + 1) * 3;
        uint32 ch;

        for (ch = 0; ch < 3; ch++) {
            // Horizontal blend within each row: 8-bit samples become 18-bit values.
            sint32 top = (in0[left + ch] << 10) +
                    weight_x * ((sint32) in0[right + ch] - in0[left + ch]);
            sint32 bot = (in1[left + ch] << 10) +
                    weight_x * ((sint32) in1[right + ch] - in1[left + ch]);

            // Vertical blend, then drop the 20 fractional bits. No clamp is applied; the
            // result is expected to fit in a byte (assumes weight_y is also a 10-bit
            // fraction, as weight_x is).
            out[(out_px * 3) + ch] = ((top << 10) + weight_y * (bot - top)) >> 20;
        }
    }
}
996 
/*
 * Scales the block of rows described by pscaler_config from pSrcBuf into pOutBuf.
 *
 * One output row is produced per iteration for rows iOutStartRow..iOutEndRow, each
 * iOutWidth pixels wide. The inverse scale factors and the source start row are assembled
 * into 32.32 fixed-point values (decimal part in the upper word, fraction in the lower).
 * With scaleMode == PSCALER_SCALE_UP each row is interpolated by _scale_row_up; any other
 * mode area-averages through _scale_row_down.
 */
static void _hw_scale_image_plane(scaler_config_t *pscaler_config, scaler_mode_t scaleMode) {
    // Values that mirror the hardware scaler's configuration registers
    uint64 x_factor_inv, y_factor_inv;
    uint32 x_output_width, y_output_width;
    uint32 input_pixel_ptr_offset, output_pixel_ptr_offset;
    uint32 first_xi;
    uint64 first_y_src, first_x_src;
    uint64 weight_reciprocal = 0;

    // Internal state
    uint32 r;
    uint8 *outp;

    x_output_width = pscaler_config->iOutWidth;
    y_output_width = pscaler_config->iOutEndRow -
            pscaler_config->iOutStartRow + 1;

    input_pixel_ptr_offset = pscaler_config->iSrcBufWidth;
    output_pixel_ptr_offset = pscaler_config->iOutBufWidth;

    x_factor_inv = (uint64) pscaler_config->fXfactorInv.decimal << 32;
    x_factor_inv |= pscaler_config->fXfactorInv.fraction;

    y_factor_inv = (uint64) pscaler_config->fYfactorInv.decimal << 32;
    y_factor_inv |= pscaler_config->fYfactorInv.fraction;

    first_y_src = (uint64) pscaler_config->fSrcStartRow.decimal << 32;
    first_y_src |= pscaler_config->fSrcStartRow.fraction;

    // PC REVISIT - The configuration also carries iOutStartColumn and fSrcStartColumn,
    // but they are deliberately not used here, so scaling always starts at column zero
    // of both buffers. Presumably the scaler is always run from the left edge of the
    // source, which would make them implicitly zero.
    first_xi = 0;
    first_x_src = 0;

    // The normalisation factor is only consumed by the downscale kernels. Computing it
    // for upscaling as well would divide by zero once an inverse factor drops below
    // 2^24 (more than 256x enlargement), so it is derived only when it is needed.
    if (scaleMode != PSCALER_SCALE_UP) {
        weight_reciprocal = ((uint64) 1 << 32);
        weight_reciprocal /= (x_factor_inv >> 24) * (y_factor_inv >> 24);
    }

    outp = (pscaler_config->pOutBuf) + (first_xi * 3);

    // PC - Assume pSrcBuf is already aligned to "true" base of input,
    // so ignore whole-number part of first_y_src.
    first_y_src = first_y_src & 0xffffffff;

    for (r = 0; r < y_output_width; r++) {
        // Whole part of the running source position selects the input row.
        uint8 *inp = (pscaler_config->pSrcBuf) +
                (first_y_src >> 32) * input_pixel_ptr_offset;

        if (scaleMode == PSCALER_SCALE_UP) {
            // Top 10 fractional bits of the source row position give the vertical blend.
            _scale_row_up(inp, inp + input_pixel_ptr_offset, outp,
                    (first_y_src & 0xffffffff) >> 22, first_x_src,
                    x_factor_inv, x_output_width);
        } else {
            _scale_row_down(inp, outp, input_pixel_ptr_offset,
                    first_x_src, first_y_src, x_factor_inv, y_factor_inv,
                    weight_reciprocal, x_output_width);
        }

        first_y_src += y_factor_inv;
        outp += output_pixel_ptr_offset;
    }
}