1 /******************************************************************************
2 *
3 * Copyright 2014 The Android Open Source Project
4 * Copyright 2003 - 2004 Open Interface North America, Inc. All rights
5 * reserved.
6 *
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at:
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 ******************************************************************************/
20
21 /*******************************************************************************
22 $Revision: #1 $
23 ******************************************************************************/
24
25 /** @file
26
27 This file, along with synthesis-generated.c, contains the synthesis
28 filterbank routines. The operations performed correspond to the
29 operations described in A2DP Appendix B, Figure 12.3. Several
30 mathematical optimizations are performed, particularly for the
31 8-subband case.
32
33 One important optimization is to note that the "matrixing" operation
34 can be decomposed into the product of a type II discrete cosine kernel
35 and another, sparse matrix.
36
37 According to Fig 12.3, in the 8-subband case,
38 @code
39 N[k][i] = cos((i+0.5)*(k+4)*pi/8), k = 0..15 and i = 0..7
40 @endcode
41
42 N can be factored as R * C2, where C2 is an 8-point type II discrete
43 cosine kernel given by
44 @code
45 C2[k][i] = cos((i+0.5)*k*pi/8)), k = 0..7 and i = 0..7
46 @endcode
47
48 R turns out to be a sparse 16x8 matrix with the following non-zero
49 entries:
50 @code
51 R[k][k+4] = 1, k = 0..3
52 R[k][abs(12-k)] = -1, k = 5..15
53 @endcode
54
55 The spec describes computing V[0..15] as N * R.
56 @code
57 V[0..15] = N * R = (R * C2) * R = R * (C2 * R)
58 @endcode
59
60 C2 * R corresponds to computing the discrete cosine transform of R, so
61 V[0..15] can be computed by taking the DCT of R followed by assignment
62 and selective negation of the DCT result into V.
63
64 Although this was derived empirically using GNU Octave, it is
65 formally demonstrated in, e.g., Liu, Chi-Min and Lee,
66 Wen-Chieh. "A Unified Fast Algorithm for Cosine Modulated
67 Filter Banks in Current Audio Coding Standards." Journal of
68 the AES 47 (December 1999): 1061.
69
70 Given the shift operation performed prior to computing V[0..15], it is
71 clear that V[0..159] represents a rolling history of the 10 most
72 recent groups of blocks input to the synthesis operation. Interpreting
73 the matrix N in light of its factorization into C2 and R, R's
74 sparseness has implications for interpreting the values in V. In
75 particular, there is considerable redundancy in the values stored in
76 V. Furthermore, since R[4][0..7] are all zeros, one out of every 16
77 values in V will be zero regardless of the input data. Within each
78 block of 16 values in V, fully half of them are redundant or
79 irrelevant:
80
81 @code
82 V[ 0] = DCT[4]
83 V[ 1] = DCT[5]
84 V[ 2] = DCT[6]
85 V[ 3] = DCT[7]
86 V[ 4] = 0
87 V[ 5] = -DCT[7] = -V[3] (redundant)
88 V[ 6] = -DCT[6] = -V[2] (redundant)
89 V[ 7] = -DCT[5] = -V[1] (redundant)
90 V[ 8] = -DCT[4] = -V[0] (redundant)
91 V[ 9] = -DCT[3]
92 V[10] = -DCT[2]
93 V[11] = -DCT[1]
94 V[12] = -DCT[0]
95 V[13] = -DCT[1] = V[11] (redundant)
96 V[14] = -DCT[2] = V[10] (redundant)
97 V[15] = -DCT[3] = V[ 9] (redundant)
98 @endcode
99
100 Since the elements of V beyond 15 were originally computed the same
101 way during a previous run, what holds true for V[x] also holds true
102 for V[x+16]. Thus, so long as care is taken to maintain the mapping,
103 we need only actually store the unique values, which correspond to the
104 output of the DCT, in some cases inverted. In fact, instead of storing
105 V[0..159], we could store DCT[0..79] which would contain a history of
106 DCT results. More on this in a bit.
107
108 Going back to figure 12.3 in the spec, it should be clear that the
109 vector U need not actually be explicitly constructed, but that with
110 suitable indexing into V during the window operation, the same end can
111 be accomplished. In the same spirit of the pseudocode shown in the
112 figure, the following is the construction of W without using U:
113
114 @code
115 for i=0 to 79 do
116 W[i] = D[i]*VSIGN(i)*V[remap_V(i)] where remap_V(i) = 32*(int(i/16)) +
117 (i % 16) + (i % 16 >= 8 ? 16 : 0)
118 and VSIGN(i) maps i%16 into {1, 1,
119 1, 1, 0, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1 }
120 These values correspond to the
121 signs of the redundant values as
122 shown in the explanation three
123 paragraphs above.
124 @endcode
125
126 We saw above how V[4..8,13..15] (and by extension
127 V[(4..8,13..15)+16*n]) can be defined in terms of other elements
128 within the subblock of V. V[0..3,9..12] correspond to DCT elements.
129
130 @code
131 for i=0 to 79 do
132 W[i] = D[i]*DSIGN(i)*DCT[remap_DCT(i)]
133 @endcode
134
135 The DCT is calculated using the Arai-Agui-Nakajima factorization,
136 which saves some computation by producing output that needs to be
137 multiplied by scaling factors before being used.
138
139 @code
140 for i=0 to 79 do
141 W[i] = D[i]*SCALE[i%8]*AAN_DCT[remap_DCT(i)]
142 @endcode
143
144 D can be premultiplied with the DCT scaling factors to yield
145
146 @code
147 for i=0 to 79 do
148 W[i] = DSCALED[i]*AAN_DCT[remap_DCT(i)] where DSCALED[i] =
149 D[i]*SCALE[i%8]
150 @endcode
151
152 The output samples X[0..7] are defined as sums of W:
153
154 @code
155 X[j] = sum{i=0..9}(W[j+8*i])
156 @endcode
157
158 @ingroup codec_internal
159 */
160
161 /**
162 @addtogroup codec_internal
163 @{
164 */
165
166 #include "oi_codec_sbc_private.h"
167
168 const int32_t dec_window_4[21] = {
169 0, /* +0.00000000E+00 */
170 97, /* +5.36548976E-04 */
171 270, /* +1.49188357E-03 */
172 495, /* +2.73370904E-03 */
173 694, /* +3.83720193E-03 */
174 704, /* +3.89205149E-03 */
175 338, /* +1.86581691E-03 */
176 -554, /* -3.06012286E-03 */
177 1974, /* +1.09137620E-02 */
178 3697, /* +2.04385087E-02 */
179 5224, /* +2.88757392E-02 */
180 5824, /* +3.21939290E-02 */
181 4681, /* +2.58767811E-02 */
182 1109, /* +6.13245186E-03 */
183 -5214, /* -2.88217274E-02 */
184 -14047, /* -7.76463494E-02 */
185 24529, /* +1.35593274E-01 */
186 35274, /* +1.94987841E-01 */
187 44618, /* +2.46636662E-01 */
188 50984, /* +2.81828203E-01 */
189 53243, /* +2.94315332E-01 */
190 };
191
192 #define DCTII_4_K06_FIX (11585) /* S1.14 11585 0.707107*/
193
194 #define DCTII_4_K08_FIX (21407) /* S1.14 21407 1.306563*/
195
196 #define DCTII_4_K09_FIX (-15137) /* S1.14 -15137 -0.923880*/
197
198 #define DCTII_4_K10_FIX (-8867) /* S1.14 -8867 -0.541196*/
199
200 /** Scales x by y bits to the right, adding a rounding factor.
201 */
202 #ifndef SCALE
203 #define SCALE(x, y) (((x) + (1 << ((y)-1))) >> (y))
204 #endif
205
206 #ifndef CLIP_INT16
207 #define CLIP_INT16(x) \
208 do { \
209 if ((x) > OI_INT16_MAX) { \
210 (x) = OI_INT16_MAX; \
211 } else if ((x) < OI_INT16_MIN) { \
212 (x) = OI_INT16_MIN; \
213 } \
214 } while (0)
215 #endif
216
217 /**
218 * Default C language implementation of a 16x32->32 multiply. This function may
219 * be replaced by a platform-specific version for speed.
220 *
221 * @param u A signed 16-bit multiplicand
222 * @param v A signed 32-bit multiplier
223
224 * @return A signed 32-bit value corresponding to the 32 most significant bits
225 * of the 48-bit product of u and v.
226 */
default_mul_16s_32s_hi(int16_t u,int32_t v)227 INLINE int32_t default_mul_16s_32s_hi(int16_t u, int32_t v) {
228 uint16_t v0;
229 int16_t v1;
230
231 int32_t w, x;
232
233 v0 = (uint16_t)(v & 0xffff);
234 v1 = (int16_t)(v >> 16);
235
236 w = v1 * u;
237 x = u * v0;
238
239 return w + (x >> 16);
240 }
241
242 #define MUL_16S_32S_HI(_x, _y) default_mul_16s_32s_hi(_x, _y)
243
244 #define LONG_MULT_DCT(K, sample) (MUL_16S_32S_HI(K, sample) << 2)
245
246 PRIVATE void SynthWindow80_generated(int16_t* pcm,
247 SBC_BUFFER_T const* RESTRICT buffer,
248 OI_UINT strideShift);
249 PRIVATE void SynthWindow112_generated(int16_t* pcm,
250 SBC_BUFFER_T const* RESTRICT buffer,
251 OI_UINT strideShift);
252 PRIVATE void dct2_8(SBC_BUFFER_T* RESTRICT out, int32_t const* RESTRICT x);
253
254 typedef void (*SYNTH_FRAME)(OI_CODEC_SBC_DECODER_CONTEXT* context, int16_t* pcm,
255 OI_UINT blkstart, OI_UINT blkcount);
256
257 #ifndef COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS
258 #define COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(dest, src) \
259 do { \
260 shift_buffer(dest, src, 72); \
261 } while (0)
262 #endif
263
264 #ifndef DCT2_8
265 #define DCT2_8(dst, src) dct2_8(dst, src)
266 #endif
267
268 #ifndef SYNTH80
269 #define SYNTH80 SynthWindow80_generated
270 #endif
271
272 #ifndef SYNTH112
273 #define SYNTH112 SynthWindow112_generated
274 #endif
275
OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)276 PRIVATE void OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT* context,
277 int16_t* pcm, OI_UINT blkstart,
278 OI_UINT blkcount) {
279 OI_UINT blk;
280 OI_UINT ch;
281 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
282 OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
283 OI_UINT offset = context->common.filterBufferOffset;
284 int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
285 OI_UINT blkstop = blkstart + blkcount;
286
287 for (blk = blkstart; blk < blkstop; blk++) {
288 if (offset == 0) {
289 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
290 context->common.filterBuffer[0] + context->common.filterBufferLen -
291 72,
292 context->common.filterBuffer[0]);
293 if (nrof_channels == 2) {
294 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
295 context->common.filterBuffer[1] + context->common.filterBufferLen -
296 72,
297 context->common.filterBuffer[1]);
298 }
299 offset = context->common.filterBufferLen - 80;
300 } else {
301 offset -= 1 * 8;
302 }
303
304 for (ch = 0; ch < nrof_channels; ch++) {
305 DCT2_8(context->common.filterBuffer[ch] + offset, s);
306 SYNTH80(pcm + ch, context->common.filterBuffer[ch] + offset,
307 pcmStrideShift);
308 s += 8;
309 }
310 pcm += (8 << pcmStrideShift);
311 }
312 context->common.filterBufferOffset = offset;
313 }
314
OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)315 PRIVATE void OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT* context,
316 int16_t* pcm, OI_UINT blkstart,
317 OI_UINT blkcount) {
318 OI_UINT blk;
319 OI_UINT ch;
320 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
321 OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
322 OI_UINT offset = context->common.filterBufferOffset;
323 int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
324 OI_UINT blkstop = blkstart + blkcount;
325
326 for (blk = blkstart; blk < blkstop; blk++) {
327 if (offset == 0) {
328 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
329 context->common.filterBuffer[0] + context->common.filterBufferLen -
330 72,
331 context->common.filterBuffer[0]);
332 if (nrof_channels == 2) {
333 COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(
334 context->common.filterBuffer[1] + context->common.filterBufferLen -
335 72,
336 context->common.filterBuffer[1]);
337 }
338 offset = context->common.filterBufferLen - 80;
339 } else {
340 offset -= 8;
341 }
342 for (ch = 0; ch < nrof_channels; ch++) {
343 cosineModulateSynth4(context->common.filterBuffer[ch] + offset, s);
344 SynthWindow40_int32_int32_symmetry_with_sum(
345 pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
346 s += 4;
347 }
348 pcm += (4 << pcmStrideShift);
349 }
350 context->common.filterBufferOffset = offset;
351 }
352
353 #ifdef SBC_ENHANCED
354
OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT blkstart,OI_UINT blkcount)355 PRIVATE void OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT* context,
356 int16_t* pcm, OI_UINT blkstart,
357 OI_UINT blkcount) {
358 OI_UINT blk;
359 OI_UINT ch;
360 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
361 OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
362 OI_UINT offset = context->common.filterBufferOffset;
363 int32_t* s = context->common.subdata + 8 * nrof_channels * blkstart;
364 OI_UINT blkstop = blkstart + blkcount;
365
366 for (blk = blkstart; blk < blkstop; blk++) {
367 if (offset == 0) {
368 COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(
369 context->common.filterBuffer[0] + context->common.filterBufferLen -
370 104,
371 context->common.filterBuffer[0]);
372 if (nrof_channels == 2) {
373 COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(
374 context->common.filterBuffer[1] + context->common.filterBufferLen -
375 104,
376 context->common.filterBuffer[1]);
377 }
378 offset = context->common.filterBufferLen - 112;
379 } else {
380 offset -= 8;
381 }
382 for (ch = 0; ch < nrof_channels; ++ch) {
383 DCT2_8(context->common.filterBuffer[ch] + offset, s);
384 SYNTH112(pcm + ch, context->common.filterBuffer[ch] + offset,
385 pcmStrideShift);
386 s += 8;
387 }
388 pcm += (8 << pcmStrideShift);
389 }
390 context->common.filterBufferOffset = offset;
391 }
392
393 static const SYNTH_FRAME SynthFrameEnhanced[] = {
394 NULL, /* invalid */
395 OI_SBC_SynthFrame_Enhanced, /* mono */
396 OI_SBC_SynthFrame_Enhanced /* stereo */
397 };
398
399 #endif
400
401 static const SYNTH_FRAME SynthFrame8SB[] = {
402 NULL, /* invalid */
403 OI_SBC_SynthFrame_80, /* mono */
404 OI_SBC_SynthFrame_80 /* stereo */
405 };
406
407 static const SYNTH_FRAME SynthFrame4SB[] = {
408 NULL, /* invalid */
409 OI_SBC_SynthFrame_4SB, /* mono */
410 OI_SBC_SynthFrame_4SB /* stereo */
411 };
412
OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT * context,int16_t * pcm,OI_UINT start_block,OI_UINT nrof_blocks)413 PRIVATE void OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT* context,
414 int16_t* pcm, OI_UINT start_block,
415 OI_UINT nrof_blocks) {
416 OI_UINT nrof_subbands = context->common.frameInfo.nrof_subbands;
417 OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
418
419 OI_ASSERT(nrof_subbands == 4 || nrof_subbands == 8);
420 if (nrof_subbands == 4) {
421 SynthFrame4SB[nrof_channels](context, pcm, start_block, nrof_blocks);
422 #ifdef SBC_ENHANCED
423 } else if (context->common.frameInfo.enhanced) {
424 SynthFrameEnhanced[nrof_channels](context, pcm, start_block, nrof_blocks);
425 #endif /* SBC_ENHANCED */
426 } else {
427 SynthFrame8SB[nrof_channels](context, pcm, start_block, nrof_blocks);
428 }
429 }
430
SynthWindow40_int32_int32_symmetry_with_sum(int16_t * pcm,SBC_BUFFER_T buffer[80],OI_UINT strideShift)431 void SynthWindow40_int32_int32_symmetry_with_sum(int16_t* pcm,
432 SBC_BUFFER_T buffer[80],
433 OI_UINT strideShift) {
434 int32_t pa;
435 int32_t pb;
436
437 /* These values should be zero, since out[2] of the 4-band cosine modulation
438 * is always zero. */
439 OI_ASSERT(buffer[2] == 0);
440 OI_ASSERT(buffer[10] == 0);
441 OI_ASSERT(buffer[18] == 0);
442 OI_ASSERT(buffer[26] == 0);
443 OI_ASSERT(buffer[34] == 0);
444 OI_ASSERT(buffer[42] == 0);
445 OI_ASSERT(buffer[50] == 0);
446 OI_ASSERT(buffer[58] == 0);
447 OI_ASSERT(buffer[66] == 0);
448 OI_ASSERT(buffer[74] == 0);
449
450 pa = dec_window_4[4] * (buffer[12] + buffer[76]);
451 pa += dec_window_4[8] * (buffer[16] - buffer[64]);
452 pa += dec_window_4[12] * (buffer[28] + buffer[60]);
453 pa += dec_window_4[16] * (buffer[32] - buffer[48]);
454 pa += dec_window_4[20] * buffer[44];
455 pa = SCALE(-pa, 15);
456 CLIP_INT16(pa);
457 pcm[0 << strideShift] = (int16_t)pa;
458
459 pa = dec_window_4[1] * buffer[1];
460 pb = dec_window_4[1] * buffer[79];
461 pb += dec_window_4[3] * buffer[3];
462 pa += dec_window_4[3] * buffer[77];
463 pa += dec_window_4[5] * buffer[13];
464 pb += dec_window_4[5] * buffer[67];
465 pb += dec_window_4[7] * buffer[15];
466 pa += dec_window_4[7] * buffer[65];
467 pa += dec_window_4[9] * buffer[17];
468 pb += dec_window_4[9] * buffer[63];
469 pb += dec_window_4[11] * buffer[19];
470 pa += dec_window_4[11] * buffer[61];
471 pa += dec_window_4[13] * buffer[29];
472 pb += dec_window_4[13] * buffer[51];
473 pb += dec_window_4[15] * buffer[31];
474 pa += dec_window_4[15] * buffer[49];
475 pa += dec_window_4[17] * buffer[33];
476 pb += dec_window_4[17] * buffer[47];
477 pb += dec_window_4[19] * buffer[35];
478 pa += dec_window_4[19] * buffer[45];
479 pa = SCALE(-pa, 15);
480 CLIP_INT16(pa);
481 pcm[1 << strideShift] = (int16_t)(pa);
482 pb = SCALE(-pb, 15);
483 CLIP_INT16(pb);
484 pcm[3 << strideShift] = (int16_t)(pb);
485
486 pa = dec_window_4[2] *
487 (/*buffer[ 2] + */ buffer[78]); /* buffer[ 2] is always zero */
488 pa += dec_window_4[6] *
489 (buffer[14] /* + buffer[66]*/); /* buffer[66] is always zero */
490 pa += dec_window_4[10] *
491 (/*buffer[18] + */ buffer[62]); /* buffer[18] is always zero */
492 pa += dec_window_4[14] *
493 (buffer[30] /* + buffer[50]*/); /* buffer[50] is always zero */
494 pa += dec_window_4[18] *
495 (/*buffer[34] + */ buffer[46]); /* buffer[34] is always zero */
496 pa = SCALE(-pa, 15);
497 CLIP_INT16(pa);
498 pcm[2 << strideShift] = (int16_t)(pa);
499 }
500
501 /**
502 This routine implements the cosine modulation matrix for 4-subband
503 synthesis. This is called "matrixing" in the SBC specification. This
504 matrix, M4, can be factored into an 8-point Type II Discrete Cosine
505 Transform, DCTII_4 and a matrix S4, given here:
506
507 @code
508 __ __
509 | 0 0 1 0 |
510 | 0 0 0 1 |
511 | 0 0 0 0 |
512 | 0 0 0 -1 |
513 S4 = | 0 0 -1 0 |
514 | 0 -1 0 0 |
515 | -1 0 0 0 |
516 |__ 0 -1 0 0 __|
517
518 M4 * in = S4 * (DCTII_4 * in)
519 @endcode
520
521 (DCTII_4 * in) is computed using a Fast Cosine Transform. The algorithm
522 here is based on an implementation computed by the SPIRAL computer
523 algebra system, manually converted to fixed-point arithmetic. S4 can be
524 implemented using only assignment and negation.
525 */
cosineModulateSynth4(SBC_BUFFER_T * RESTRICT out,int32_t const * RESTRICT in)526 PRIVATE void cosineModulateSynth4(SBC_BUFFER_T* RESTRICT out,
527 int32_t const* RESTRICT in) {
528 int32_t f0, f1, f2, f3, f4, f7, f8, f9, f10;
529 int32_t y0, y1, y2, y3;
530
531 f0 = (in[0] - in[3]);
532 f1 = (in[0] + in[3]);
533 f2 = (in[1] - in[2]);
534 f3 = (in[1] + in[2]);
535
536 f4 = f1 - f3;
537
538 y0 = -SCALE(f1 + f3, DCT_SHIFT);
539 y2 = -SCALE(LONG_MULT_DCT(DCTII_4_K06_FIX, f4), DCT_SHIFT);
540 f7 = f0 + f2;
541 f8 = LONG_MULT_DCT(DCTII_4_K08_FIX, f0);
542 f9 = LONG_MULT_DCT(DCTII_4_K09_FIX, f7);
543 f10 = LONG_MULT_DCT(DCTII_4_K10_FIX, f2);
544 y3 = -SCALE(f8 + f9, DCT_SHIFT);
545 y1 = -SCALE(f10 - f9, DCT_SHIFT);
546
547 out[0] = (int16_t)-y2;
548 out[1] = (int16_t)-y3;
549 out[2] = (int16_t)0;
550 out[3] = (int16_t)y3;
551 out[4] = (int16_t)y2;
552 out[5] = (int16_t)y1;
553 out[6] = (int16_t)y0;
554 out[7] = (int16_t)y1;
555 }
556
557 /**
558 @}
559 */
560