1 /*
2  ** Copyright 2003-2010, VisualOn, Inc.
3  **
4  ** Licensed under the Apache License, Version 2.0 (the "License");
5  ** you may not use this file except in compliance with the License.
6  ** You may obtain a copy of the License at
7  **
8  **     http://www.apache.org/licenses/LICENSE-2.0
9  **
10  ** Unless required by applicable law or agreed to in writing, software
11  ** distributed under the License is distributed on an "AS IS" BASIS,
12  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  ** See the License for the specific language governing permissions and
14  ** limitations under the License.
15  */
16 
17 /***********************************************************************
18 *      File: wb_vad.c                                                  *
19 *                                                                      *
20 *      Description: Voice Activity Detection                           *
21 *                                                                      *
22 ************************************************************************/
23 
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include "cnst.h"
27 #include "wb_vad.h"
28 #include "typedef.h"
29 #include "basic_op.h"
30 #include "math_op.h"
31 #include "wb_vad_c.h"
32 #include "mem_align.h"
33 
34 /******************************************************************************
35 *  Calculate Log2 and scale the signal:
36 *
37 *    ilog2(Word32 in) = -1024*log10(in * 2^-31)/log10(2), where in = [1, 2^31-1]
38 *
39 *  input   output
40 *  32768   16384
41 *  1       31744
42 *
43 * When input is in the range of [1,2^16], max error is 0.0380%.
44 *********************************************************************************/
45 
/*
 * ilog2 - fixed-point logarithm helper used by the VAD threshold logic.
 *
 * mant : value to be converted; any value <= 0 is replaced by 1 first.
 *
 * Returns a Word16 built from three pieces:
 *   - (norm_s shift of the input + 16) << 10,
 *   - the norm_l shift found after the repeated self-multiplication << 6,
 *   - 127 minus the upper byte of the renormalised mantissa.
 */
static Word16 ilog2(                       /* return: output value of the log2 */
        Word16 mant                        /* i: value to be converted */
        )
{
    Word16 shift_in, shift_pow, result;
    Word32 round, power;

    /* non-positive input is clamped to the smallest valid value */
    if (mant <= 0)
    {
        mant = 1;
    }

    /* normalise the input and remember how far it was shifted */
    shift_in = norm_s(mant);
    mant = mant << shift_in;

    /* three rounds of vo_mult(mant, mant), then one vo_L_mult(mant, mant) */
    for (round = 3; round > 0; round--)
    {
        mant = vo_mult(mant, mant);
    }
    power = vo_L_mult(mant, mant);

    /* renormalise the 32-bit result and keep its upper half */
    shift_pow = norm_l(power);
    mant = extract_h(power << shift_pow);

    /* assemble the result from both shift counts and the mantissa */
    result = (shift_in + 16) << 10;
    result = add1(result, (shift_pow << 6));
    result = vo_sub(add1(result, 127), (mant >> 8));

    return result;
}
72 
73 /******************************************************************************
74 *
75 *     Function     : filter5
76 *     Purpose      : Fifth-order half-band lowpass/highpass filter pair with
77 *                    decimation.
78 *
79 *******************************************************************************/
80 
/*
 * filter5 - one step of the fifth-order half-band low/high-pass pair.
 *
 * in0  : in  - first input sample;  out - low-pass part
 * in1  : in  - second input sample; out - high-pass part
 * data : two-element filter memory, updated in place
 *
 * Each input runs through its own first-order recursion (COEFF5_1 with
 * data[0], COEFF5_2 with data[1]). The sum and the difference of the two
 * branch outputs, shifted left by 15 and reduced with extract_h, become
 * the new *in0 and *in1.
 */
static void filter5(
        Word16 * in0,                         /* i/o : input values; output low-pass part  */
        Word16 * in1,                         /* i/o : input values; output high-pass part */
        Word16 data[]                         /* i/o : filter memory                       */
        )
{
    Word16 mem0, mem1;        /* new memory value of each branch */
    Word16 path0, path1;      /* output of each branch           */

    /* branch fed by *in0, memory in data[0] */
    mem0 = vo_sub(*in0, vo_mult(COEFF5_1, data[0]));
    path0 = add1(data[0], vo_mult(COEFF5_1, mem0));
    data[0] = mem0;

    /* branch fed by *in1, memory in data[1] */
    mem1 = vo_sub(*in1, vo_mult(COEFF5_2, data[1]));
    path1 = add1(data[1], vo_mult(COEFF5_2, mem1));
    data[1] = mem1;

    /* sum -> low-pass output, difference -> high-pass output */
    *in0 = extract_h((vo_L_add(path0, path1) << 15));
    *in1 = extract_h((vo_L_sub(path0, path1) << 15));
}
100 
101 /******************************************************************************
102 *
103 *     Function     : filter3
104 *     Purpose      : Third-order half-band lowpass/highpass filter pair with
105 *                    decimation.
106 *
107 *******************************************************************************/
108 
/*
 * filter3 - one step of the third-order half-band low/high-pass pair.
 *
 * in0  : in  - first input sample;  out - low-pass part
 * in1  : in  - second input sample; out - high-pass part
 * data : single-element filter memory, updated in place
 *
 * Only *in1 goes through the COEFF3 recursion; *in0 is combined with the
 * branch output directly. *in1 is written before *in0 so that both outputs
 * are derived from the original *in0.
 */
static void filter3(
        Word16 * in0,                         /* i/o : input values; output low-pass part  */
        Word16 * in1,                         /* i/o : input values; output high-pass part */
        Word16 * data                         /* i/o : filter memory                       */
        )
{
    Word16 mem, path;

    /* recursion on the second input, memory in data[0] */
    mem = vo_sub(*in1, vo_mult(COEFF3, data[0]));
    path = add1(data[0], vo_mult(COEFF3, mem));
    data[0] = mem;

    /* difference -> high-pass output, sum -> low-pass output */
    *in1 = extract_h((vo_L_sub(*in0, path) << 15));
    *in0 = extract_h((vo_L_add(*in0, path) << 15));
}
124 
/******************************************************************************
*
*     Function   : level_calculation
*     Purpose    : Calculate signal level in a sub-band. Level is calculated
*                  by summing absolute values of the input data.
*
*                  Signal level calculated from the end of the frame
*                  (samples count1 .. count2 - 1) is stored to (*sub_level)
*                  and added to the level of the next frame.
*
******************************************************************************/
136 
/*
 * level_calculation - signal level of one sub-band.
 *
 * The sub-band samples are data[ind_m * k + ind_a]. Twice the absolute
 * value of every sample is accumulated:
 *   - samples count1 .. count2-1 form the "tail" sum, which is scaled and
 *     stored in *sub_level for the next call;
 *   - the returned level combines the previous *sub_level (rescaled by
 *     16 - scale), the tail sum and samples 0 .. count1-1.
 */
static Word16 level_calculation(                      /* return: signal level */
        Word16 data[],                        /* i   : signal buffer                                    */
        Word16 * sub_level,                   /* i   : level calculated at the end of the previous frame*/
                                              /* o   : level of signal calculated from the last         */
                                              /*       (count2 - count1) samples                        */
        Word16 count1,                        /* i   : number of samples to be counted                  */
        Word16 count2,                        /* i   : number of samples to be counted                  */
        Word16 ind_m,                         /* i   : step size for the index of the data buffer       */
        Word16 ind_a,                         /* i   : starting index of the data buffer                */
        Word16 scale                          /* i   : scaling for the level calculation                */
        )
{
    Word32 n;
    Word32 tail_sum;      /* samples count1 .. count2-1                    */
    Word32 frame_sum;     /* carried level + tail + samples 0 .. count1-1  */

    /* tail of the frame */
    tail_sum = 0L;
    n = count1;
    while (n < count2)
    {
        tail_sum += (abs_s(data[ind_m * n + ind_a])<<1);
        n++;
    }

    /* add the level carried over from the previous call, then replace it */
    frame_sum = vo_L_add(tail_sum, L_shl(*sub_level, 16 - scale));
    *sub_level = extract_h(L_shl(tail_sum, scale));

    /* head of the frame */
    n = 0;
    while (n < count1)
    {
        frame_sum += (abs_s(data[ind_m * n + ind_a])<<1);
        n++;
    }

    return extract_h(L_shl2(frame_sum, scale));
}
169 
170 /******************************************************************************
171 *
172 *     Function     : filter_bank
173 *     Purpose      : Divide input signal into bands and calculate level of
174 *                    the signal in each band
175 *
176 *******************************************************************************/
177 
/*
 * filter_bank - split the input frame into 12 bands and measure each level.
 *
 * st    : state; filter memories a_data5[] / a_data3[] and sub_level[] are
 *         updated
 * in    : input frame of FRAME_LEN samples (not modified)
 * level : receives the 12 band levels, index 0 = lowest band
 *
 * The frame is halved in amplitude into a work buffer, then filtered in
 * place by five stages of filter5/filter3 calls with doubling sample
 * stride. Finally level_calculation() is run once per band.
 */
static void filter_bank(
        VadVars * st,                         /* i/o : State struct               */
        Word16 in[],                          /* i   : input frame                */
        Word16 level[]                        /* o   : signal levels at each band */
        )
{
    /* level_calculation() arguments per band, indexed by band number:
     * { count1, count2, ind_m, ind_a, scale } */
    static const Word16 band_args[12][5] = {
        {  2,  8, 32,  0, 17 },               /*    0 -  200 Hz */
        {  2,  8, 32, 16, 17 },               /*  200 -  400 Hz */
        {  2,  8, 32, 24, 17 },               /*  400 -  600 Hz */
        {  2,  8, 32,  8, 17 },               /*  600 -  800 Hz */
        {  4, 16, 16, 12, 16 },               /*  800 - 1200 Hz */
        {  4, 16, 16,  4, 16 },               /* 1200 - 1600 Hz */
        {  4, 16, 16,  6, 16 },               /* 1600 - 2000 Hz */
        {  4, 16, 16, 14, 16 },               /* 2000 - 2400 Hz */
        {  8, 32,  8,  2, 15 },               /* 2400 - 3200 Hz */
        {  8, 32,  8,  3, 15 },               /* 3200 - 4000 Hz */
        {  8, 32,  8,  7, 15 },               /* 4000 - 4800 Hz */
        { 16, 64,  4,  1, 14 }                /* 4800 - 6400 Hz */
    };
    Word32 k;
    Word16 *p;
    Word16 work[FRAME_LEN];

    /* shift input 1 bit down for safe scaling */
    for (k = 0; k < FRAME_LEN; k++)
    {
        work[k] = in[k] >> 1;
    }

    /* stage 1: sample pairs, stride 2 */
    for (k = 0; k < 128; k++)
    {
        p = &work[2 * k];
        filter5(p, p + 1, st->a_data5[0]);
    }

    /* stage 2: groups of 4 samples */
    for (k = 0; k < 64; k++)
    {
        p = &work[4 * k];
        filter5(p, p + 2, st->a_data5[1]);
        filter5(p + 1, p + 3, st->a_data5[2]);
    }

    /* stage 3: groups of 8 samples */
    for (k = 0; k < 32; k++)
    {
        p = &work[8 * k];
        filter5(p, p + 4, st->a_data5[3]);
        filter5(p + 2, p + 6, st->a_data5[4]);
        filter3(p + 3, p + 7, &st->a_data3[0]);
    }

    /* stage 4: groups of 16 samples */
    for (k = 0; k < 16; k++)
    {
        p = &work[16 * k];
        filter3(p, p + 8, &st->a_data3[1]);
        filter3(p + 4, p + 12, &st->a_data3[2]);
        filter3(p + 6, p + 14, &st->a_data3[3]);
    }

    /* stage 5: groups of 32 samples */
    for (k = 0; k < 8; k++)
    {
        p = &work[32 * k];
        filter3(p, p + 16, &st->a_data3[4]);
        filter3(p + 8, p + 24, &st->a_data3[5]);
    }

    /* calculate levels in each frequency band, highest band first */
    for (k = 12; k-- > 0; )
    {
        const Word16 *args = band_args[k];

        level[k] = level_calculation(work, &st->sub_level[k],
                                     args[0], args[1], args[2], args[3], args[4]);
    }
}
249 
250 /******************************************************************************
251 *
252 *     Function   : update_cntrl
253 *     Purpose    : Control update of the background noise estimate.
254 *
255 *******************************************************************************/
256 
/*
 * update_cntrl - control the update of the background noise estimate.
 *
 * st    : state; stat_count and ave_level[] are updated
 * level : sub-band levels of the current frame
 *
 * stat_count is reloaded with STAT_COUNT when the tone-flag bits 0x7c00 are
 * all set, when the vadreg bits 0x7f80 are all clear, or when the summed
 * per-band level ratio exceeds STAT_THR. Otherwise it is counted down
 * towards zero, but only while vadreg bit 0x4000 is set. Afterwards
 * ave_level[] is moved towards level[] with a smoothing factor chosen
 * from stat_count and vadreg.
 */
static void update_cntrl(
        VadVars * st,                         /* i/o : State structure                    */
        Word16 level[]                        /* i   : sub-band levels of the input frame */
        )
{
    Word32 band;
    Word16 num, denom, quot, shift, ratio_sum;
    Word16 alpha;

    /* a tone detected for a while, or 8 last vad-decisions all "0":
     * (re)initialize stat_count */
    if ((sub((Word16) (st->tone_flag & 0x7c00), 0x7c00) == 0) ||
        ((st->vadreg & 0x7f80) == 0))
    {
        st->stat_count = STAT_COUNT;
    }
    else
    {
        ratio_sum = 0;
        for (band = 0; band < COMPLEN; band++)
        {
            /* larger of current/average level on top, smaller below */
            const int rising = (level[band] > st->ave_level[band]);

            num = rising ? level[band] : st->ave_level[band];
            denom = rising ? st->ave_level[band] : level[band];

            /* limit minimum value of num and denom to STAT_THR_LEVEL */
            if (num < STAT_THR_LEVEL)
            {
                num = STAT_THR_LEVEL;
            }
            if (denom < STAT_THR_LEVEL)
            {
                denom = STAT_THR_LEVEL;
            }

            shift = norm_s(denom);
            denom = denom << shift;

            /* ratio_sum += num/denom * 64 */
            quot = div_s(num >> 1, denom);
            ratio_sum = add1(ratio_sum, shr(quot, (8 - shift)));
        }

        /* compare the summed ratio with a threshold and update stat_count */
        if (ratio_sum > STAT_THR)
        {
            st->stat_count = STAT_COUNT;
        }
        else if (((st->vadreg & 0x4000) != 0) && (st->stat_count != 0))
        {
            st->stat_count = st->stat_count - 1;
        }
    }

    /* update average amplitude estimate for stationarity estimation */
    if (st->stat_count == STAT_COUNT)
    {
        alpha = 32767;
    }
    else if ((st->vadreg & 0x4000) == 0)
    {
        alpha = ALPHA5;
    }
    else
    {
        alpha = ALPHA4;
    }

    for (band = 0; band < COMPLEN; band++)
    {
        st->ave_level[band] = add1(st->ave_level[band], vo_mult_r(alpha, vo_sub(level[band], st->ave_level[band])));
    }
}
339 
340 /******************************************************************************
341 *
342 *     Function     : hangover_addition
343 *     Purpose      : Add hangover after speech bursts
344 *
345 *******************************************************************************/
346 
/*
 * hangover_addition - add hangover after speech bursts.
 *
 * st        : state; burst_count and hang_count are updated
 * low_power : non-zero when the frame power is below the low-power threshold
 * hang_len  : hangover length loaded once a burst is long enough
 * burst_len : minimum burst length before hangover is armed
 *
 * Returns 1 when the frame is flagged as speech (either vadreg bit 0x4000
 * is set or hangover is still running), 0 otherwise.
 */
static Word16 hangover_addition(                      /* return: VAD_flag indicating final VAD decision */
        VadVars * st,                         /* i/o : State structure                     */
        Word16 low_power,                     /* i   : flag power of the input frame    */
        Word16 hang_len,                      /* i   : hangover length */
        Word16 burst_len                      /* i   : minimum burst length for hangover addition */
        )
{
    /* very low input power: clear both counters and report "no speech" */
    if (low_power != 0)
    {
        st->burst_count = 0;
        st->hang_count = 0;
        return 0;
    }

    /* current intermediate decision is "no speech" */
    if ((st->vadreg & 0x4000) == 0)
    {
        st->burst_count = 0;
        if (st->hang_count > 0)
        {
            /* still inside the hangover period */
            st->hang_count -= 1;
            return 1;
        }
        return 0;
    }

    /* speech: extend the burst and arm the hangover once it is long enough */
    st->burst_count += 1;
    if (st->burst_count >= burst_len)
    {
        st->hang_count = hang_len;
    }
    return 1;
}
381 
382 /******************************************************************************
383 *
384 *     Function   : noise_estimate_update
385 *     Purpose    : Update of background noise estimate
386 *
387 *******************************************************************************/
388 
/*
 * noise_estimate_update - update the background noise estimate.
 *
 * st    : state; bckr_est[] and old_level[] are updated (stat_count and
 *         ave_level[] are updated through update_cntrl())
 * level : sub-band levels of the current frame
 *
 * bckr_est[] is moved towards old_level[] (the levels of the previous
 * frame). The step sizes depend on vadreg bits 0x7800 and on stat_count.
 * The result is clamped to [NOISE_MIN, NOISE_MAX] on the side it moved
 * towards. Finally level[] is saved as old_level[].
 */
static void noise_estimate_update(
        VadVars * st,                         /* i/o : State structure                       */
        Word16 level[]                        /* i   : sub-band levels of the input frame */
        )
{
    Word32 band;
    Word16 alpha_up, alpha_down, bckr_add;
    Word16 delta;

    /* control update of bckr_est[] */
    update_cntrl(st, level);

    /* choose update speed */
    if ((0x7800 & st->vadreg) == 0)
    {
        alpha_up = ALPHA_UP1;
        alpha_down = ALPHA_DOWN1;
        bckr_add = 2;
    }
    else if (st->stat_count == 0)
    {
        alpha_up = ALPHA_UP2;
        alpha_down = ALPHA_DOWN2;
        bckr_add = 2;
    }
    else
    {
        alpha_up = 0;
        alpha_down = ALPHA3;
        bckr_add = 0;
    }

    /* update noise estimate (bckr_est) */
    for (band = 0; band < COMPLEN; band++)
    {
        delta = (st->old_level[band] - st->bckr_est[band]);

        if (delta >= 0)
        {
            /* update upwards */
            st->bckr_est[band] = add1(bckr_add, add1(st->bckr_est[band],vo_mult_r(alpha_up, delta)));

            /* limit maximum value of the noise estimate to NOISE_MAX */
            if (st->bckr_est[band] > NOISE_MAX)
            {
                st->bckr_est[band] = NOISE_MAX;
            }
        }
        else
        {
            /* update downwards; the inner sum uses add() while the upward
             * branch uses add1() -- kept exactly as found */
            st->bckr_est[band] = add1(-2, add(st->bckr_est[band],vo_mult_r(alpha_down, delta)));

            /* limit minimum value of the noise estimate to NOISE_MIN */
            if (st->bckr_est[band] < NOISE_MIN)
            {
                st->bckr_est[band] = NOISE_MIN;
            }
        }
    }

    /* remember the levels of this frame for the next update */
    for (band = 0; band < COMPLEN; band++)
    {
        st->old_level[band] = level[band];
    }
}
451 
452 /******************************************************************************
453 *
454 *     Function     : vad_decision
455 *     Purpose      : Calculates VAD_flag
456 *
457 *******************************************************************************/
458 
/*
 * vad_decision - calculate the VAD flag for one frame.
 *
 * st      : state; speech_level, vadreg and (through the helpers) the noise
 *           estimate and hangover counters are updated
 * level   : sub-band levels of the current frame
 * pow_sum : power of the input frame
 *
 * Steps: accumulate the squared level/noise ratio over all bands, derive an
 * adaptive threshold from the noise level and the speech level, shift the
 * intermediate decision into vadreg, update the noise estimate, and let
 * hangover_addition() produce the final flag.
 */
static Word16 vad_decision(                           /* return value : VAD_flag */
        VadVars * st,                         /* i/o : State structure                       */
        Word16 level[COMPLEN],                /* i   : sub-band levels of the input frame */
        Word32 pow_sum                        /* i   : power of the input frame           */
        )
{
    Word32 band;
    Word32 snr_acc;
    Word32 noise_acc;
    Word16 ratio, level_floor, noise_thr, speech_adj;
    Word16 vad_thr, noise_level;
    Word16 low_power_flag;
    Word16 hang_len, burst_len;
    Word16 ilog2_speech_level, ilog2_noise_level;

    /* Squared sum of the input levels (level) divided by the background
     * noise components (bckr_est). */
    snr_acc = 0;
    for (band = 0; band < COMPLEN; band++)
    {
        Word16 shift;

        shift = norm_s(st->bckr_est[band]);
        ratio = (st->bckr_est[band] << shift);
        ratio = div_s((level[band] >> 1), ratio);
        ratio = shl(ratio, (shift - (UNIRSHFT - 1)));
        snr_acc = L_mac(snr_acc, ratio, ratio);
    }

    /* Average level of the estimated background noise (lowest band ignored) */
    noise_acc = 0;
    for (band = 1; band < COMPLEN; band++)
    {
        noise_acc = vo_L_add(noise_acc, st->bckr_est[band]);
    }
    noise_level = extract_h((noise_acc << 12));

    /* If the SNR is lower than a threshold (MIN_SPEECH_SNR), raise
     * speech_level to that floor. */
    level_floor = vo_mult(noise_level, MIN_SPEECH_SNR) << 3;
    if (st->speech_level < level_floor)
    {
        st->speech_level = level_floor;
    }
    ilog2_noise_level = ilog2(noise_level);

    /* If SNR is very poor, speech_level is probably corrupted by noise
     * level. This is corrected by subtracting MIN_SPEECH_SNR*noise_level
     * from speech level. */
    ilog2_speech_level = ilog2(st->speech_level - level_floor);

    /* noise-dependent part of the threshold */
    noise_thr = add1(vo_mult(NO_SLOPE, (ilog2_noise_level - NO_P1)), THR_HIGH);

    /* speech-dependent part, limited to [SP_CH_MIN, SP_CH_MAX] */
    speech_adj = add1(SP_CH_MIN, vo_mult(SP_SLOPE, (ilog2_speech_level - SP_P1)));
    if (speech_adj < SP_CH_MIN)
    {
        speech_adj = SP_CH_MIN;
    }
    if (speech_adj > SP_CH_MAX)
    {
        speech_adj = SP_CH_MAX;
    }

    vad_thr = noise_thr + speech_adj;
    if (vad_thr < THR_MIN)
    {
        vad_thr = THR_MIN;
    }

    /* Shift VAD decision register */
    st->vadreg = (st->vadreg >> 1);

    /* Make intermediate VAD decision */
    if (snr_acc > vo_L_mult(vad_thr, (512 * COMPLEN)))
    {
        st->vadreg = (Word16) (st->vadreg | 0x4000);
    }

    /* check if the input power (pow_sum) is lower than a threshold */
    low_power_flag = (pow_sum < VAD_POW_LOW) ? 1 : 0;

    /* Update background noise estimates */
    noise_estimate_update(st, level);

    /* Calculate values for hang_len and burst_len based on vad_thr */
    hang_len = add1(vo_mult(HANG_SLOPE, (vad_thr - HANG_P1)), HANG_HIGH);
    if (hang_len < HANG_LOW)
    {
        hang_len = HANG_LOW;
    }
    burst_len = add1(vo_mult(BURST_SLOPE, (vad_thr - BURST_P1)), BURST_HIGH);

    return hangover_addition(st, low_power_flag, hang_len, burst_len);
}
555 
/******************************************************************************
*
*     Function : Estimate_Speech()
*     Purpose  : Estimate speech level
*
* Maximum signal level is searched and stored to the variable sp_max.
* The speech frames must be located within SP_EST_COUNT number of frames.
* Thus, noisy frames having occasional VAD = "1" decisions will not
* affect the estimated speech_level.
*
*******************************************************************************/
567 
/*
 * Estimate_Speech - track the speech level.
 *
 * st       : state; sp_est_cnt, sp_max, sp_max_cnt and speech_level are
 *            updated
 * in_level : level of the current input frame
 *
 * A frame counts as active when it is above MIN_SPEECH_LEVEL1 and either
 * vadreg bit 0x4000 is set or it exceeds the current speech_level. Once
 * SP_ACTIVITY_COUNT active frames have been collected, speech_level is
 * moved towards half of the largest level seen and the counters restart.
 */
static void Estimate_Speech(
        VadVars * st,                         /* i/o : State structure    */
        Word16 in_level                       /* level of the input frame */
        )
{
    Word16 alpha;
    Word16 avg;

    /* if the required activity count cannot be achieved, reset counters */
    if ((st->sp_est_cnt - st->sp_max_cnt) > (SP_EST_COUNT - SP_ACTIVITY_COUNT))
    {
        st->sp_est_cnt = 0;
        st->sp_max = 0;
        st->sp_max_cnt = 0;
    }
    st->sp_est_cnt += 1;

    /* frame too quiet to be counted */
    if (in_level <= MIN_SPEECH_LEVEL1)
    {
        return;
    }
    /* neither flagged as speech nor above the current speech level */
    if (((st->vadreg & 0x4000) == 0) && (in_level <= st->speech_level))
    {
        return;
    }

    /* update sp_max */
    if (in_level > st->sp_max)
    {
        st->sp_max = in_level;
    }
    st->sp_max_cnt += 1;

    /* not enough active frames collected yet */
    if (st->sp_max_cnt < SP_ACTIVITY_COUNT)
    {
        return;
    }

    /* scale to get "average" speech level */
    avg = (st->sp_max >> 1);

    /* select update speed */
    alpha = (avg > st->speech_level) ? ALPHA_SP_UP : ALPHA_SP_DOWN;

    /* update speech estimate */
    if (avg > MIN_SPEECH_LEVEL2)
    {
        st->speech_level = add1(st->speech_level, vo_mult_r(alpha, vo_sub(avg, st->speech_level)));
    }

    /* clear all counters used for speech estimation */
    st->sp_max = 0;
    st->sp_max_cnt = 0;
    st->sp_est_cnt = 0;
}
618 
619 /******************************************************************************
620 *
621 *  Function:   wb_vad_init
622 *  Purpose:    Allocates state memory and initializes state memory
623 *
624 *******************************************************************************/
625 
wb_vad_init(VadVars ** state,VO_MEM_OPERATOR * pMemOP)626 Word16 wb_vad_init(                        /* return: non-zero with error, zero for ok. */
627         VadVars ** state,                     /* i/o : State structure    */
628         VO_MEM_OPERATOR *pMemOP
629         )
630 {
631     VadVars *s;
632 
633     if (state == (VadVars **) NULL)
634     {
635         fprintf(stderr, "vad_init: invalid parameter\n");
636         return -1;
637     }
638     *state = NULL;
639 
640     /* allocate memory */
641     if ((s = (VadVars *) mem_malloc(pMemOP, sizeof(VadVars), 32, VO_INDEX_ENC_AMRWB)) == NULL)
642     {
643         fprintf(stderr, "vad_init: can not malloc state structure\n");
644         return -1;
645     }
646     wb_vad_reset(s);
647 
648     *state = s;
649 
650     return 0;
651 }
652 
653 /******************************************************************************
654 *
655 *  Function:   wb_vad_reset
656 *  Purpose:    Initializes state memory
657 *
658 *******************************************************************************/
659 
wb_vad_reset(VadVars * state)660 Word16 wb_vad_reset(                       /* return: non-zero with error, zero for ok. */
661         VadVars * state                       /* i/o : State structure    */
662         )
663 {
664     Word32 i, j;
665 
666     if (state == (VadVars *) NULL)
667     {
668         fprintf(stderr, "vad_reset: invalid parameter\n");
669         return -1;
670     }
671     state->tone_flag = 0;
672     state->vadreg = 0;
673     state->hang_count = 0;
674     state->burst_count = 0;
675     state->hang_count = 0;
676 
677     /* initialize memory used by the filter bank */
678     for (i = 0; i < F_5TH_CNT; i++)
679     {
680         for (j = 0; j < 2; j++)
681         {
682             state->a_data5[i][j] = 0;
683         }
684     }
685 
686     for (i = 0; i < F_3TH_CNT; i++)
687     {
688         state->a_data3[i] = 0;
689     }
690 
691     /* initialize the rest of the memory */
692     for (i = 0; i < COMPLEN; i++)
693     {
694         state->bckr_est[i] = NOISE_INIT;
695         state->old_level[i] = NOISE_INIT;
696         state->ave_level[i] = NOISE_INIT;
697         state->sub_level[i] = 0;
698     }
699 
700     state->sp_est_cnt = 0;
701     state->sp_max = 0;
702     state->sp_max_cnt = 0;
703     state->speech_level = SPEECH_LEVEL_INIT;
704     state->prev_pow_sum = 0;
705     return 0;
706 }
707 
708 /******************************************************************************
709 *
710 *  Function:   wb_vad_exit
711 *  Purpose:    The memory used for state memory is freed
712 *
713 *******************************************************************************/
714 
/*
 * wb_vad_exit - release the VAD state.
 *
 * state  : address of the state pointer; the pointer is set to NULL after
 *          the memory has been released
 * pMemOP : memory operator handed to mem_free()
 *
 * Does nothing when state or *state is NULL.
 */
void wb_vad_exit(
        VadVars ** state,                      /* i/o : State structure    */
        VO_MEM_OPERATOR *pMemOP
        )
{
    if (state != NULL && *state != NULL)
    {
        /* deallocate memory and clear the caller's pointer */
        mem_free(pMemOP, *state, VO_INDEX_ENC_AMRWB);
        *state = NULL;
    }
}
727 
728 /******************************************************************************
729 *
730 *     Function     : wb_vad_tone_detection
731 *     Purpose      : Search maximum pitch gain from a frame. Set tone flag if
732 *                    pitch gain is high. This is used to detect
733 *                    signaling tones and other signals with high pitch gain.
734 *
735 *******************************************************************************/
736 
/*
 * wb_vad_tone_detection - update the tone-flag history from the pitch gain.
 *
 * st     : state; tone_flag is shifted right by one bit and bit 0x4000 is
 *          set when the gain is above the threshold
 * p_gain : pitch gain of the current frame, compared against TONE_THR
 */
void wb_vad_tone_detection(
        VadVars * st,                         /* i/o : State struct            */
        Word16 p_gain                         /* pitch gain      */
        )
{
    /* age the history by one frame */
    st->tone_flag >>= 1;

    /* high pitch gain: mark the newest slot */
    if (p_gain > TONE_THR)
    {
        st->tone_flag = (Word16) (st->tone_flag | 0x4000);
    }
}
751 
752 /******************************************************************************
753 *
754 *     Function     : wb_vad
755 *     Purpose      : Main program for Voice Activity Detection (VAD) for AMR
756 *
757 *******************************************************************************/
758 
/*
 * wb_vad - main entry point of the voice activity detector.
 *
 * st     : state structure, updated by every processing step
 * in_buf : FRAME_LEN samples of the input frame
 *
 * Returns the VAD decision produced by vad_decision() (1 = speech,
 * 0 = noise).
 *
 * Steps: measure the frame power, clear recent tone flags when the combined
 * power of this and the previous frame is very low, run the filter bank,
 * take the VAD decision and feed the summed band level to the speech-level
 * estimator.
 */
Word16 wb_vad(                                /* Return value : VAD Decision, 1 = speech, 0 = noise */
        VadVars * st,                         /* i/o : State structure                 */
        Word16 in_buf[]                       /* i   : samples of the input frame   */
         )
{
    Word16 level[COMPLEN];
    Word32 n;
    Word16 VAD_flag, in_level;
    Word32 frame_pow, pow_sum, level_sum;

    /* power of the input frame */
    frame_pow = 0L;
    for (n = 0; n < FRAME_LEN; n++)
    {
        frame_pow = L_mac(frame_pow, in_buf[n], in_buf[n]);
    }

    /* pow_sum = power of current frame and previous frame */
    pow_sum = L_add(frame_pow, st->prev_pow_sum);

    /* save power of current frame for next call */
    st->prev_pow_sum = frame_pow;

    /* if input power is very low, clear tone flag */
    if (pow_sum < POW_TONE_THR)
    {
        st->tone_flag = (Word16) (st->tone_flag & 0x1fff);
    }

    /* run the filter bank and calculate signal levels at each band */
    filter_bank(st, in_buf, level);

    /* compute VAD decision */
    VAD_flag = vad_decision(st, level, pow_sum);

    /* input level = sum of the band levels, ignoring the lowest band */
    level_sum = 0;
    for (n = 1; n < COMPLEN; n++)
    {
        level_sum = vo_L_add(level_sum, level[n]);
    }
    in_level = extract_h(level_sum << 12);

    /* estimate speech level */
    Estimate_Speech(st, in_level);

    return VAD_flag;
}
805 
806 
807 
808 
809