/src/libxaac/encoder/ixheaace_signal_classifier.c

Source
/******************************************************************************
 *                                                                            *
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
#include <string.h>
#include <math.h>
#include "iusace_type_def.h"
#include "iusace_cnst.h"

#include "iusace_fd_quant.h"
#include "iusace_bitbuffer.h"
#include "impd_drc_common_enc.h"
#include "impd_drc_uni_drc.h"
#include "impd_drc_api.h"
#include "impd_drc_uni_drc_eq.h"
#include "impd_drc_uni_drc_filter_bank.h"
#include "impd_drc_gain_enc.h"
#include "impd_drc_struct_def.h"

#include "ixheaace_memory_standards.h"
#include "iusace_tns_usac.h"
#include "iusace_psy_mod.h"
#include "iusace_config.h"
#include "iusace_signal_classifier.h"
#include "iusace_fft.h"
#include "iusace_block_switch_const.h"
#include "iusace_block_switch_struct_def.h"
#include "iusace_cnst.h"
#include "iusace_ms.h"
#include "ixheaace_adjust_threshold_data.h"
#include "iusace_fd_qc_util.h"
#include "ixheaace_sbr_header.h"
#include "ixheaace_config.h"
#include "ixheaace_asc_write.h"
#include "iusace_main.h"

/**
 * Converts an interleaved complex spectrum into a normalised power density
 * spectrum, in place.
 *
 * For every bin i in [0, ccfl / 2) the slot ptr_input[2 * i] is overwritten with
 *   MAX(10 * (log10(re * re + im * im) - log_ccfl_base_10) + 10e-15, MIN_POW)
 * and the whole spectrum is then shifted so that its largest bin sits at 96.
 * The odd slots ptr_input[2 * i + 1] are only read, never written.
 *
 * @param ptr_input  interleaved buffer; even index = real part on entry and
 *                   power value on exit, odd index = imaginary part
 * @param ccfl       frame length; 1024 selects LOG_1024_BASE_10, any other
 *                   value selects LOG_768_BASE_10
 */
static VOID iusace_calc_pds(FLOAT32 *ptr_input, WORD32 ccfl) {
  WORD32 i;
  FLOAT64 max_pow, delta;
  FLOAT64 log_ccfl_base_10 = (ccfl == 1024) ? LOG_1024_BASE_10 : LOG_768_BASE_10;

  /* Bin 0 is handled ahead of the loop so that it can seed the running maximum. */
  max_pow = MAX(
      10 * (log10(ptr_input[0] * ptr_input[0] + ptr_input[1] * ptr_input[1]) - log_ccfl_base_10) +
          10e-15,
      MIN_POW);

  /* Store the bin-0 power as well: the normalisation loop below (and the tonal
   * search that consumes this buffer) treat ptr_input[0] as a power value like
   * every other bin, not as the raw real part of the FFT output. */
  ptr_input[0] = (FLOAT32)max_pow;

  for (i = 1; i < (ccfl >> 1); i++) {
    /* removed the sqrt along with clubbing the for loops */
    ptr_input[2 * i] = (FLOAT32)MAX(10 * (log10(ptr_input[2 * i] * ptr_input[2 * i] +
                                                ptr_input[2 * i + 1] * ptr_input[2 * i + 1]) -
                                          log_ccfl_base_10) +
                                        10e-15,
                                    MIN_POW);

    max_pow = MAX(max_pow, ptr_input[2 * i]);
  }

  /* Normalized to reference sound pressure level 96 dB */
  delta = 96 - max_pow;

  for (i = 0; i < (ccfl >> 1); i++) {
    ptr_input[2 * i] = ptr_input[2 * i] + (FLOAT32)delta;
  }
  return;
}

/**
 * Returns 1 when the bin at idx exceeds each neighbour at distance 2..reach,
 * on both sides, by at least 7; returns 0 otherwise.
 */
static WORD32 iusace_peak_dominates_neighbours(const FLOAT32 *ptr_spec, WORD32 idx,
                                               WORD32 reach) {
  WORD32 j;

  for (j = 2; j <= reach; j++) {
    if (!(ptr_spec[idx] - ptr_spec[idx - j] >= 7)) {
      return 0;
    }
    if (!(ptr_spec[idx] - ptr_spec[idx + j] >= 7)) {
      return 0;
    }
  }
  return 1;
}

/**
 * Flags the tonal bins of a power spectrum.
 *
 * Steps:
 *  1. copy the even slots of ptr_input (ccfl / 2 values) into ptr_scratch;
 *  2. clear ptr_tonal_flag (512 entries for FRAME_LEN_LONG, 384 otherwise);
 *  3. flag every local maximum that dominates its neighbourhood, where the
 *     neighbourhood width grows with the bin index (four bands);
 *  4. un-flag bins whose three-bin combined level is below the absolute
 *     threshold table entry (raised by 20 from bin 324 / 243 upwards).
 *
 * @param ptr_input       interleaved buffer; only ptr_input[2 * i] is read
 * @param ptr_tonal_flag  output, 1 for tonal bins and 0 elsewhere
 * @param ptr_scratch     receives the de-interleaved spectrum
 * @param ccfl            frame length; FRAME_LEN_LONG uses the 1024 tables,
 *                        every other value is handled as 768
 */
static VOID iusace_find_tonal(FLOAT32 *ptr_input, WORD32 *ptr_tonal_flag, FLOAT32 *ptr_scratch,
                              WORD32 ccfl) {
  /* Exclusive upper bin of each search band and the neighbourhood reach used
   * inside it. The 768 limits are the 1024 limits scaled by 0.75; the third
   * limit is 191 so that it meets the start of the fourth band. */
  static const WORD32 band_end_long[4] = {62, 126, 254, 500};
  static const WORD32 reach_long[4] = {2, 3, 6, 12};
  static const WORD32 band_end_short[4] = {47, 95, 191, 375};
  static const WORD32 reach_short[4] = {2, 3, 5, 9};

  const WORD32 is_long = (ccfl == FRAME_LEN_LONG);
  const WORD32 num_bins = is_long ? 512 : 384;
  const WORD32 *ptr_band_end = is_long ? band_end_long : band_end_short;
  const WORD32 *ptr_reach = is_long ? reach_long : reach_short;
  WORD32 i, band;
  FLOAT64 tonal_spl;
  FLOAT64 absolute_threshold_xm;

  for (i = 0; i < (ccfl >> 1); i++) {
    ptr_scratch[i] = ptr_input[2 * i];
  }

  for (i = 0; i < num_bins; i++) {
    ptr_tonal_flag[i] = 0;
  }

  /* The search stops at the last band limit so that i + reach stays inside
   * the num_bins values copied above. */
  for (i = 2; i < ptr_band_end[3]; i++) {
    if (ptr_scratch[i] > ptr_scratch[i - 1] && ptr_scratch[i] >= ptr_scratch[i + 1]) {
      /* locate the band containing i; terminates because i < ptr_band_end[3] */
      band = 0;
      while (i >= ptr_band_end[band]) {
        band++;
      }

      /* Verify it meets the condition: ptr_scratch[i]-ptr_scratch[i+j]>=7 */
      if (iusace_peak_dominates_neighbours(ptr_scratch, i, ptr_reach[band])) {
        ptr_tonal_flag[i] = 1;
      }
    }
  }

  for (i = 0; i < num_bins; i++) {
    if (ptr_tonal_flag[i] == 1) {
      /* compute the SPL of tonal */
      tonal_spl =
          10 * log10(pow(10, (ptr_scratch[i - 1] / 10)) + pow(10, (ptr_scratch[i] / 10)) +
                     pow(10, (ptr_scratch[i + 1] / 10)));

      if (is_long) {
        if (i >= 324) {
          absolute_threshold_xm = iusace_classify_arrays.absolute_threshold_1024[i] + 20;
        } else {
          absolute_threshold_xm = iusace_classify_arrays.absolute_threshold_1024[i];
        }
      } else {
        if (i >= 243) {
          absolute_threshold_xm = iusace_classify_arrays.absolute_threshold_768[i] + 20;
        } else {
          absolute_threshold_xm = iusace_classify_arrays.absolute_threshold_768[i];
        }
      }

      if (tonal_spl < absolute_threshold_xm) {
        ptr_tonal_flag[i] = 0;
      }
    }
  }
  return;
}

/**
 * Tonal analysis of one frame.
 *
 * Windows the time signal, runs the complex FFT, converts the result to a
 * power density spectrum and detects tonal bins. The per-frame tonal counts
 * are pushed into two 100-entry histories (all bins / low-frequency bins),
 * from which the long-term average, the low-frequency ratio and the short-term
 * average are derived and written through the pointers held in
 * pstr_ton_params.
 *
 * NOTE(review): both averages divide by min(framecnt_xm, window length); this
 * assumes the caller never passes framecnt_xm == 0 - confirm against caller.
 *
 * @param pstr_ton_params  input signal, frame counter, histories and outputs
 * @param pstr_scratch     scratch buffers for the FFT, flags and spectrum
 * @param ccfl             frame length (1024 selects the 1024-point window)
 */
static VOID iusace_tonal_analysis(ia_tonal_params_struct *pstr_ton_params,
                                  iusace_scratch_mem *pstr_scratch, WORD32 ccfl) {
  FLOAT32 *ptr_fft_buf = pstr_scratch->p_complex_fft;
  WORD32 *ptr_flags = pstr_scratch->p_tonal_flag;
  FLOAT32 *ptr_samples = pstr_ton_params->time_signal;
  WORD32 *ptr_count_hist = pstr_ton_params->n_tonal;
  WORD32 *ptr_count_hist_lf = pstr_ton_params->n_tonal_low_frequency;
  FLOAT32 *ptr_lf_ratio_out = pstr_ton_params->n_tonal_low_frequency_ratio;
  FLOAT32 *ptr_ave_long_out = pstr_ton_params->ave_n_tonal;
  FLOAT32 *ptr_ave_short_out = pstr_ton_params->ave_n_tonal_short;
  WORD32 frames_seen = pstr_ton_params->framecnt_xm;
  WORD32 half_len = ccfl >> 1;
  WORD32 k, window_len;
  WORD32 sum_all, sum_lf;

  /* windowed real input -> interleaved complex buffer with zero imaginary part */
  for (k = 0; k < ccfl; k++) {
    ptr_fft_buf[2 * k] = (FLOAT32)(
        ptr_samples[k] * ((ccfl == 1024) ? iusace_classify_arrays.hanning_window_1024[k]
                                         : iusace_classify_arrays.hanning_window_768[k]));
    ptr_fft_buf[2 * k + 1] = 0;
  }

  iusace_complex_fft(ptr_fft_buf, ccfl, pstr_scratch);

  /* power density spectrum, computed in place in the FFT buffer */
  iusace_calc_pds(ptr_fft_buf, ccfl);

  /* tonal bin detection */
  iusace_find_tonal(ptr_fft_buf, ptr_flags, pstr_scratch->p_pow_spec, ccfl);

  /* drop the oldest history entry; slot 99 becomes free for this frame */
  for (k = 1; k < 100; k++) {
    ptr_count_hist[k - 1] = ptr_count_hist[k];
    ptr_count_hist_lf[k - 1] = ptr_count_hist_lf[k];
  }

  sum_all = 0;
  for (k = 0; k < half_len; k++) {
    sum_all += ptr_flags[k];
  }
  ptr_count_hist[99] = sum_all;

  sum_lf = 0;
  for (k = 0; k < INDEXOFLOWFREQUENCY; k++) {
    sum_lf += ptr_flags[k];
  }
  ptr_count_hist_lf[99] = sum_lf;

  /* long-term average and low-frequency share over the newest window_len frames */
  window_len = (frames_seen < AVE_TONAL_LENGTH) ? frames_seen : AVE_TONAL_LENGTH;

  sum_all = 0;
  sum_lf = 0;
  for (k = 99; k > 99 - window_len; k--) {
    sum_all += ptr_count_hist[k];
    sum_lf += ptr_count_hist_lf[k];
  }

  *ptr_ave_long_out = (FLOAT32)sum_all / window_len;

  if (sum_all != 0) {
    *ptr_lf_ratio_out = (FLOAT32)sum_lf / sum_all;
  } else {
    *ptr_lf_ratio_out = 1;
  }

  /* short-term average */
  window_len = (frames_seen < AVE_TONAL_LENGTH_SHORT) ? frames_seen : AVE_TONAL_LENGTH_SHORT;

  sum_all = 0;
  for (k = 99; k > 99 - window_len; k--) {
    sum_all += ptr_count_hist[k];
  }

  *ptr_ave_short_out = (FLOAT32)sum_all / window_len;
  return;
}

static VOID iusace_spectral_tilt_analysis(ia_spec_tilt_params_struct *ptr_spec_params,
                                          WORD32 ccfl) {
  FLOAT32 *ptr_time_signal = ptr_spec_params->time_signal;
  WORD32 framecnt_xm = ptr_spec_params->framecnt_xm;
  FLOAT32 *ptr_spec_tilt_buf = ptr_spec_params->spec_tilt_buf;
  FLOAT32 *ptr_msd_spec_tilt = ptr_spec_params->msd_spec_tilt;
  FLOAT32 *ptr_msd_spec_tilt_short = ptr_spec_params->msd_spec_tilt_short;
  WORD32 i;
  WORD32 frame_length;

  FLOAT32 r0, r1;
  FLOAT32 spec_tilt;
  FLOAT32 ave_spec_tilt;

  /* compute spectral tilt */
  r0 = 0;
  r1 = 0;
  for (i = 0; i < ccfl - 1; i++) {
    r0 += ptr_time_signal[i] * ptr_time_signal[i];
    r1 += ptr_time_signal[i] * ptr_time_signal[i + 1];
  }
  r0 += ptr_time_signal[i] * ptr_time_signal[i];

  if (r0 == 0) {
    spec_tilt = 1.0f;
  } else {
    spec_tilt = r1 / r0;
  }

  /* update spec_tilt_buf */
  for (i = 0; i < 100 - 1; i++) {
    ptr_spec_tilt_buf[i] = ptr_spec_tilt_buf[i + 1];
  }
  ptr_spec_tilt_buf[99] = spec_tilt;

  /* compute the long-term mean square deviation of the spectral tilt */
  if (framecnt_xm < SPECTRAL_TILT_LENGTH) {
    frame_length = framecnt_xm;
  } else {
    frame_length = SPECTRAL_TILT_LENGTH;
  }

  ave_spec_tilt = 0;
  for (i = 0; i < frame_length; i++) {
    ave_spec_tilt += ptr_spec_tilt_buf[99 - i];
  }
  ave_spec_tilt /= frame_length;

  *ptr_msd_spec_tilt = 0;
  for (i = 0; i < frame_length; i++) {
    *ptr_msd_spec_tilt +=
        (ptr_spec_tilt_buf[99 - i] - ave_spec_tilt) * (ptr_spec_tilt_buf[99 - i] - ave_spec_tilt);
  }
  *ptr_msd_spec_tilt /= frame_length;

  /* compute the short-term mean square deviation of the spectral tilt */
  if (framecnt_xm < SPECTRAL_TILT_LENGTH_SHORT) {
    frame_length = framecnt_xm;
  } else {
    frame_length = SPECTRAL_TILT_LENGTH_SHORT;
  }

  ave_spec_tilt = 0;
  for (i = 0; i < frame_length; i++) {
    ave_spec_tilt += ptr_spec_tilt_buf[99 - i];
  }
  ave_spec_tilt /= frame_length;

  *ptr_msd_spec_tilt_short = 0;
  for (i = 0; i < frame_length; i++) {
    *ptr_msd_spec_tilt_short +=
        (ptr_spec_tilt_buf[99 - i] - ave_spec_tilt) * (ptr_spec_tilt_buf[99 - i] - ave_spec_tilt);
  }
  *ptr_msd_spec_tilt_short /= frame_length;

  /* compute the energy of current frame */
  if (r0 <= 1) {
    ptr_spec_params->frame_energy = 0;
  } else {
    ptr_spec_params->frame_energy = (FLOAT32)(10 * log(r0) / log(10));
  }
  return;
}

/**
 * Initial (unsmoothed) speech/music decision for one frame.
 *
 * The function first pushes the current spectral-tilt and tonal statistics
 * into their 5-entry histories and derives a border flag (*flag_border) from
 * them, then walks a cascade of threshold rules. Every rule in the cascade
 * only fires while the result is still TBD, so the first matching rule wins;
 * a frame that matches no rule is classified as SPEECH.
 *
 * Side effects: updates the four 5-entry history buffers, writes *flag_border
 * and resets *framecnt_xm to 10 whenever a border is signalled.
 *
 * NOTE(review): the count_* counters, flag_ave_music_speech and border_state
 * are automatic variables that start at 0 on every call, so within this
 * function the ">= 15", ">= 100" and ">= 400" conditions cannot become true and
 * "count_quiet_mode <= 5" always holds. Confirm whether they were meant to
 * persist across frames.
 *
 * @param pstr_mode_params  per-frame statistics, history buffers and outputs
 * @return SPEECH, MUSIC, SPEECH_DEFINITE or MUSIC_DEFINITE (never TBD)
 */
static WORD32 iusace_init_mode_decision(ia_mode_params_struct *pstr_mode_params) {
  WORD32 i;
  WORD32 framecnt = pstr_mode_params->framecnt;
  WORD32 *framecnt_xm = pstr_mode_params->framecnt_xm;
  WORD32 *flag_border = pstr_mode_params->flag_border;
  FLOAT32 ave_n_tonal_short = pstr_mode_params->ave_n_tonal_short;
  FLOAT32 ave_n_tonal = pstr_mode_params->ave_n_tonal;
  FLOAT32 *ave_n_tonal_short_buf = pstr_mode_params->ave_n_tonal_short_buf;
  FLOAT32 *ave_n_tonal_buf = pstr_mode_params->ave_n_tonal_buf;
  FLOAT32 msd_spec_tilt = pstr_mode_params->msd_spec_tilt;
  FLOAT32 msd_spec_tilt_short = pstr_mode_params->msd_spec_tilt_short;
  FLOAT32 *msd_spec_tilt_buf = pstr_mode_params->msd_spec_tilt_buf;
  FLOAT32 *msd_spec_tilt_short_buf = pstr_mode_params->msd_spec_tilt_short_buf;
  FLOAT32 n_tonal_low_frequency_ratio = pstr_mode_params->n_tonal_low_frequency_ratio;
  FLOAT32 frame_energy = pstr_mode_params->frame_energy;
  WORD32 init_mode_decision_result = TBD;
  WORD32 count_msd_st_monchhichi = 0;
  WORD32 count_msd_st_speech_music = 0, count_msd_st_music_speech = 0;
  WORD32 flag_ave_music_speech = 0;
  WORD32 count_msd_st_music = 0;
  WORD32 border_state = 0;
  WORD32 count_quiet_mode = 0;

  *flag_border = NO_BORDER;

  /* border decision according to spectral tilt */

  /* update msd_spec_tilt_buf, msd_spec_tilt_short_buf */
  for (i = 0; i < 5 - 1; i++) {
    msd_spec_tilt_buf[i] = msd_spec_tilt_buf[i + 1];
    msd_spec_tilt_short_buf[i] = msd_spec_tilt_short_buf[i + 1];
  }
  msd_spec_tilt_buf[4] = msd_spec_tilt;
  msd_spec_tilt_short_buf[4] = msd_spec_tilt_short;

  /* speech->music find strict border of speech->music */
  if ((msd_spec_tilt >= 0.014) && (msd_spec_tilt_short <= 0.000005)) {
    count_msd_st_monchhichi++;
  } else {
    count_msd_st_monchhichi = 0;
  }
  if (((*flag_border != BORDER_SPEECH_MUSIC_DEFINITE) &&
       (*flag_border != BORDER_MUSIC_SPEECH_DEFINITE)) &&
      (border_state != BORDER_SPEECH_MUSIC_DEFINITE) && (count_msd_st_monchhichi >= 15) &&
      (*framecnt_xm >= 300)) {
    *framecnt_xm = 10;
    *flag_border = BORDER_SPEECH_MUSIC;
  }

  /* find the relative loose border of speech->music */
  if ((msd_spec_tilt >= 0.0025) && (msd_spec_tilt_short <= 0.000003)) {
    count_msd_st_speech_music++;
  } else {
    count_msd_st_speech_music = 0;
  }
  if (((*flag_border != BORDER_SPEECH_MUSIC_DEFINITE) &&
       (*flag_border != BORDER_MUSIC_SPEECH_DEFINITE)) &&
      (border_state != BORDER_SPEECH_MUSIC_DEFINITE) && (count_msd_st_speech_music >= 15) &&
      (*framecnt_xm >= 300)) {
    *framecnt_xm = 10;
    *flag_border = BORDER_SPEECH_MUSIC;
  }

  /* music->speech */
  if ((msd_spec_tilt_buf[0] <= 0.0003) && (msd_spec_tilt_short_buf[0] <= 0.0002)) {
    count_msd_st_music_speech++;
  }
  if (((*flag_border != BORDER_SPEECH_MUSIC_DEFINITE) &&
       (*flag_border != BORDER_MUSIC_SPEECH_DEFINITE)) &&
      (border_state != BORDER_MUSIC_SPEECH_DEFINITE) && (count_msd_st_music_speech >= 100) &&
      (msd_spec_tilt >= 0.0008) && (msd_spec_tilt_short >= 0.0025) && (*framecnt_xm >= 20)) {
    *framecnt_xm = 10;
    *flag_border = BORDER_MUSIC_SPEECH;
  }

  /* border decision according to tonal
   *  update ave_n_tonal_short_buf, ave_n_tonal_buf */
  for (i = 0; i < 5 - 1; i++) {
    ave_n_tonal_short_buf[i] = ave_n_tonal_short_buf[i + 1];
    ave_n_tonal_buf[i] = ave_n_tonal_buf[i + 1];
  }
  ave_n_tonal_short_buf[4] = ave_n_tonal_short;
  ave_n_tonal_buf[4] = ave_n_tonal;

  /* music->speech */
  if ((ave_n_tonal_buf[0] >= 12) && (ave_n_tonal_buf[0] < 15) &&
      (ave_n_tonal_buf[0] - ave_n_tonal_short_buf[0] >= 5) && (*framecnt_xm >= 20) &&
      (ave_n_tonal_short - ave_n_tonal_short_buf[0] < 5)) {
    *framecnt_xm = 10;
    flag_ave_music_speech = 1;
    *flag_border = BORDER_MUSIC_SPEECH_DEFINITE;
  }

  /* update border decision according to energy */
  if (frame_energy <= 60) {
    count_quiet_mode = 0;
  } else {
    count_quiet_mode++;
  }

  if ((*flag_border == BORDER_MUSIC_SPEECH) && (count_quiet_mode <= 5)) {
    *flag_border = BORDER_MUSIC_SPEECH_DEFINITE;
    *framecnt_xm = 10;
  }

  /* MUSIC_DEFINITE and SPEECH_DEFINITE mode decision according to short-term characters */

  /* ave_n_tonal_short */
  if ((init_mode_decision_result == TBD) && (ave_n_tonal_short >= 19)) {
    init_mode_decision_result = MUSIC_DEFINITE;
  }
  if ((init_mode_decision_result == TBD) && (ave_n_tonal_short <= 1.5)) {
    init_mode_decision_result = SPEECH_DEFINITE;
  }

  /* msd_spec_tilt_short
   * Guarded by TBD like every other rule of the cascade, so that it cannot
   * overwrite a decision already taken by the tonal rules above. */
  if ((init_mode_decision_result == TBD) && (msd_spec_tilt_short >= 0.02)) {
    init_mode_decision_result = SPEECH_DEFINITE;
  }
  if ((init_mode_decision_result == TBD) && (msd_spec_tilt_short <= 0.00000025) &&
      (framecnt >= 10)) {
    init_mode_decision_result = MUSIC_DEFINITE;
  }

  /* SPEECH mode decision */

  /* flag_ave_music_speech, ave_n_tonal_short */
  if ((init_mode_decision_result == TBD) && (flag_ave_music_speech == 1)) {
    if ((ave_n_tonal_short <= 12) && (*framecnt_xm <= 150)) {
      init_mode_decision_result = SPEECH;
    }
  }

  /* MUSIC_DEFINITE and SPEECH_DEFINITE mode decision */

  /* ave_n_tonal */
  if ((init_mode_decision_result == TBD) && (ave_n_tonal <= 3)) {
    init_mode_decision_result = SPEECH_DEFINITE;
  }
  if ((init_mode_decision_result == TBD) && (ave_n_tonal >= 15)) {
    init_mode_decision_result = MUSIC_DEFINITE;
  }

  /** ave_n_tonal_short
   */
  if ((init_mode_decision_result == TBD) && (ave_n_tonal_short >= 17)) {
    init_mode_decision_result = MUSIC_DEFINITE;
  }

  /** msd_spec_tilt
   */
  if ((init_mode_decision_result == TBD) && (msd_spec_tilt >= 0.01)) {
    init_mode_decision_result = SPEECH_DEFINITE;
  }
  if ((init_mode_decision_result == TBD) && (framecnt >= 10) && (msd_spec_tilt <= 0.00004)) {
    init_mode_decision_result = MUSIC_DEFINITE;
  }

  /** n_tonal_low_frequency_ratio
   */
  if ((init_mode_decision_result == TBD) && (n_tonal_low_frequency_ratio <= 0.91)) {
    init_mode_decision_result = MUSIC_DEFINITE;
  }

  /** MUSIC and SPEECH mode decision
   */

  /** msd_spec_tilt
   */
  if ((init_mode_decision_result == TBD) && (msd_spec_tilt <= 0.0002) && (*framecnt_xm >= 15)) {
    init_mode_decision_result = MUSIC;
  }

  /** n_tonal_low_frequency_ratio
   */
  if ((init_mode_decision_result == TBD) && (n_tonal_low_frequency_ratio >= 0.95)) {
    init_mode_decision_result = SPEECH;
  }
  if ((init_mode_decision_result == TBD) && (n_tonal_low_frequency_ratio <= 0.935)) {
    init_mode_decision_result = MUSIC;
  }

  /** the rest of the frame to SPEECH
   */
  if (init_mode_decision_result == TBD) {
    init_mode_decision_result = SPEECH;
  }

  /** MUSIC mode decision according to changes of the MSD of the spectral tilt
   */

  /** compute the changes of the MSD of the spectral tilt
   */
  if ((msd_spec_tilt <= 0.007) && (init_mode_decision_result != SPEECH_DEFINITE)) {
    if (init_mode_decision_result != SPEECH) {
      count_msd_st_music++;
    }
  } else {
    count_msd_st_music = 0;
  }

  if ((init_mode_decision_result != SPEECH_DEFINITE) && (count_msd_st_music >= 400) &&
      (border_state != BORDER_MUSIC_SPEECH_DEFINITE)) {
    init_mode_decision_result = MUSIC;
  }

  /** update border flag
   */

  if (*flag_border != NO_BORDER) {
    border_state = *flag_border;
  }

  /** update BORDER_SPEECH_MUSIC_DEFINITE
   */
  if (((border_state == BORDER_MUSIC_SPEECH) || (border_state == BORDER_MUSIC_SPEECH_DEFINITE)) &&
      (init_mode_decision_result == MUSIC_DEFINITE) && (*framecnt_xm >= 20)) {
    *flag_border = BORDER_SPEECH_MUSIC_DEFINITE;
    *framecnt_xm = 10;
    border_state = *flag_border;
  }

  /** update BORDER_MUSIC_SPEECH_DEFINITE
   */
  if (((border_state == BORDER_SPEECH_MUSIC) || (border_state == BORDER_SPEECH_MUSIC_DEFINITE)) &&
      (init_mode_decision_result == SPEECH_DEFINITE) && (*framecnt_xm >= 20)) {
    *flag_border = BORDER_MUSIC_SPEECH_DEFINITE;
    *framecnt_xm = 10;
  }

  return init_mode_decision_result;
}

/**
 * Smooths the initial speech/music decision using past results, border flags
 * and frame energies.
 *
 * The function pushes the current frame's data into the look-ahead buffers and
 * moves the oldest look-ahead entry into the history ("behind") buffers. The
 * decision returned is the one for the entry that just moved into the history:
 *  - majority vote over the newest SMOOTHING_LENGTH smoothed results, once that
 *    many frames have passed since the last border;
 *  - MUSIC is turned into SPEECH when the frame energy is at most 60 and the
 *    look-ahead contains a speech decision;
 *  - SPEECH is turned into MUSIC shortly after a speech->music border.
 *
 * Side effects: updates all buffers reachable through pstr_smooth_param as
 * well as num_smoothing, flag_speech_definite, flag_music_definite and
 * count_small_energy.
 *
 * NOTE(review): the look-ahead buffers are read at index 0 and written at
 * index NFRAMEAHEAD - 1 without any shift in between; that is only coherent
 * if NFRAMEAHEAD is 1 - confirm against its definition.
 *
 * @param pstr_smooth_param  buffers, state and current-frame inputs
 * @return smoothed mode decision
 */
static WORD32 iusace_smoothing_mode_decision(ia_smooth_params_struct *pstr_smooth_param) {
  WORD32 *ptr_init_result_ahead = pstr_smooth_param->init_result_ahead;
  WORD32 flag_border = pstr_smooth_param->flag_border;
  WORD32 *ptr_flag_border_buf_behind = pstr_smooth_param->flag_border_buf_behind;
  WORD32 *ptr_flag_border_buf_ahead = pstr_smooth_param->flag_border_buf_ahead;
  FLOAT32 frame_energy = pstr_smooth_param->frame_energy;
  FLOAT32 *ptr_frame_energy_buf_behind = pstr_smooth_param->frame_energy_buf_behind;
  FLOAT32 *ptr_frame_energy_buf_ahead = pstr_smooth_param->frame_energy_buf_ahead;
  WORD32 *ptr_smoothing_result_buf = pstr_smooth_param->smoothing_result_buf;
  WORD32 *ptr_init_result_behind = pstr_smooth_param->init_result_behind;
  WORD32 init_mode_decision_result = pstr_smooth_param->init_mode_decision_result;
  WORD32 i;

  WORD32 mode_decision_result;

  WORD32 num_music, num_speech;

  /** update data array
   */

  /** update init_result_behind, init_result_ahead
   */
  for (i = 0; i < 99; i++) {
    ptr_init_result_behind[i] = ptr_init_result_behind[i + 1];
  }
  ptr_init_result_behind[99] = ptr_init_result_ahead[0];

  ptr_init_result_ahead[NFRAMEAHEAD - 1] = init_mode_decision_result;

  /** update flag_border_buf_behind, flag_border_buf_ahead
   * update frame_energy_buf_behind, frame_energy_buf_ahead
   */

  for (i = 0; i < 9; i++) {
    ptr_flag_border_buf_behind[i] = ptr_flag_border_buf_behind[i + 1];
    ptr_frame_energy_buf_behind[i] = ptr_frame_energy_buf_behind[i + 1];
  }
  ptr_flag_border_buf_behind[9] = ptr_flag_border_buf_ahead[0];
  ptr_frame_energy_buf_behind[9] = ptr_frame_energy_buf_ahead[0];

  ptr_flag_border_buf_ahead[NFRAMEAHEAD - 1] = flag_border;

  ptr_frame_energy_buf_ahead[NFRAMEAHEAD - 1] = frame_energy;

  /** smoothing according to past results
   */

  mode_decision_result = ptr_init_result_behind[99];

  /** update smoothing_result_buf
   * A border restarts the smoothing window: older results are discarded.
   */
  if (ptr_flag_border_buf_behind[9] == NO_BORDER) {
    for (i = 0; i < 99; i++) {
      ptr_smoothing_result_buf[i] = ptr_smoothing_result_buf[i + 1];
    }
    pstr_smooth_param->num_smoothing++;
  } else {
    for (i = 0; i < 99; i++) {
      ptr_smoothing_result_buf[i] = TBD;
    }
    pstr_smooth_param->num_smoothing = 1;
  }
  ptr_smoothing_result_buf[99] = ptr_init_result_behind[99];

  if (pstr_smooth_param->num_smoothing >= SMOOTHING_LENGTH) {
    num_music = 0;
    num_speech = 0;

    /** smoothed result count
     * The newest entry lives at index 99, so the window is 99, 98, ...;
     * starting at 100 would read past the last slot written above.
     */
    for (i = 0; i < SMOOTHING_LENGTH; i++) {
      if ((ptr_smoothing_result_buf[99 - i] == SPEECH) ||
          (ptr_smoothing_result_buf[99 - i] == SPEECH_DEFINITE)) {
        num_speech++;
      } else {
        num_music++;
      }
    }

    /** smoothing
     */
    if ((num_speech > num_music) && (init_mode_decision_result != MUSIC_DEFINITE)) {
      mode_decision_result = SPEECH;
    }
    if ((num_music > num_speech) && (init_mode_decision_result != SPEECH_DEFINITE)) {
      mode_decision_result = MUSIC;
    }
  }

  /** correct according to energies and ahead mode decision results
   */

  if ((mode_decision_result == MUSIC) && (ptr_frame_energy_buf_behind[9] <= 60)) {
    for (i = 0; i < NFRAMEAHEAD; i++) {
      if ((ptr_init_result_ahead[i] == SPEECH_DEFINITE) || (ptr_init_result_ahead[i] == SPEECH)) {
        pstr_smooth_param->flag_speech_definite = 1;
      }
    }
  }
  /* the flag stays set only while it keeps converting MUSIC into SPEECH */
  if ((pstr_smooth_param->flag_speech_definite == 1) && (mode_decision_result == MUSIC)) {
    mode_decision_result = SPEECH;
  } else {
    pstr_smooth_param->flag_speech_definite = 0;
  }

  /** correct MUSIC mode
   */

  if (ptr_frame_energy_buf_behind[9] <= 65) {
    pstr_smooth_param->count_small_energy = 0;
  } else {
    pstr_smooth_param->count_small_energy++;
  }
  if (((ptr_flag_border_buf_ahead[NFRAMEAHEAD - 1] == BORDER_SPEECH_MUSIC) ||
       (ptr_flag_border_buf_ahead[NFRAMEAHEAD - 1] == BORDER_SPEECH_MUSIC_DEFINITE)) &&
      (pstr_smooth_param->count_small_energy <= 30)) {
    pstr_smooth_param->flag_music_definite = 1;
  }
  /* the flag stays set only while it keeps converting SPEECH into MUSIC */
  if ((pstr_smooth_param->flag_music_definite == 1) &&
      ((mode_decision_result == SPEECH) || (mode_decision_result == SPEECH_DEFINITE))) {
    mode_decision_result = MUSIC;
  } else {
    pstr_smooth_param->flag_music_definite = 0;
  }

  return mode_decision_result;
}

/**
 * Classifies one frame of time-domain samples.
 *
 * On the first call for a given classifier state (init_flag == 0) the analysis
 * histories are reset. Every call then bumps the frame counters and runs, in
 * order: tonal analysis, spectral tilt analysis, the initial mode decision
 * (which also reports a border flag) and the smoothing of that decision.
 *
 * @param pstr_sig_class  classifier state holding the histories and counters
 * @param ptr_time_signal samples of the frame to analyse
 * @param pstr_scratch    scratch memory handed on to the tonal analysis
 * @param ccfl            frame length, handed on to both analyses
 *
 * @return the value returned by iusace_smoothing_mode_decision()
 */
static WORD32 iusace_classification_ccfl(ia_classification_struct *pstr_sig_class,
                                         FLOAT32 *ptr_time_signal,
                                         iusace_scratch_mem *pstr_scratch, WORD32 ccfl) {
  ia_classification_buf_struct *pstr_hist = &pstr_sig_class->buffers;

  ia_tonal_params_struct tonal_params;
  ia_spec_tilt_params_struct tilt_params;
  ia_mode_params_struct mode_params;
  ia_smooth_params_struct smooth_params;

  /* per-frame analysis outputs, filled in through the parameter structs */
  FLOAT32 tonal_lf_ratio;
  FLOAT32 tonal_avg, tonal_avg_short;
  FLOAT32 tilt_msd, tilt_msd_short;

  WORD32 border_flag = NO_BORDER;
  WORD32 initial_decision;
  WORD32 idx;

  if (pstr_sig_class->init_flag == 0) {
    pstr_sig_class->init_flag = 1;

    /* One pass over the histories: the first 100 entries of the long ones, */
    /* the first 10 of the "behind" energy/border ones, the first 5 of the  */
    /* averaged tonal / spectral tilt ones.                                 */
    for (idx = 0; idx < 100; idx++) {
      pstr_sig_class->n_tonal[idx] = 0;
      pstr_sig_class->n_tonal_low_frequency[idx] = 0;
      pstr_sig_class->spec_tilt_buf[idx] = 0;
      pstr_hist->init_result_behind[idx] = TBD;
      pstr_hist->smoothing_result_buf[idx] = TBD;

      if (idx < 10) {
        pstr_hist->frame_energy_buf_behind[idx] = 0;
        pstr_hist->flag_border_buf_behind[idx] = NO_BORDER;
      }

      if (idx < 5) {
        pstr_sig_class->ave_n_tonal_short_buf[idx] = 0;
        pstr_sig_class->ave_n_tonal_buf[idx] = 0;
        pstr_sig_class->msd_spec_tilt_buf[idx] = 0;
        pstr_sig_class->msd_spec_tilt_short_buf[idx] = 0;
      }
    }

    for (idx = 0; idx < NFRAMEAHEAD; idx++) {
      pstr_hist->frame_energy_buf_ahead[idx] = 0;
      pstr_hist->flag_border_buf_ahead[idx] = NO_BORDER;
      pstr_hist->init_result_ahead[idx] = TBD;
    }
  }

  pstr_sig_class->framecnt += 1;
  pstr_sig_class->framecnt_xm += 1;

  /* tonal analysis */
  tonal_params.time_signal = ptr_time_signal;
  tonal_params.framecnt_xm = pstr_sig_class->framecnt_xm;
  tonal_params.n_tonal = pstr_sig_class->n_tonal;
  tonal_params.n_tonal_low_frequency = pstr_sig_class->n_tonal_low_frequency;
  tonal_params.n_tonal_low_frequency_ratio = &tonal_lf_ratio;
  tonal_params.ave_n_tonal = &tonal_avg;
  tonal_params.ave_n_tonal_short = &tonal_avg_short;
  iusace_tonal_analysis(&tonal_params, pstr_scratch, ccfl);

  /* spectral tilt analysis */
  tilt_params.time_signal = ptr_time_signal;
  tilt_params.framecnt_xm = pstr_sig_class->framecnt_xm;
  tilt_params.spec_tilt_buf = pstr_sig_class->spec_tilt_buf;
  tilt_params.msd_spec_tilt = &tilt_msd;
  tilt_params.msd_spec_tilt_short = &tilt_msd_short;
  iusace_spectral_tilt_analysis(&tilt_params, ccfl);

  /* initial mode decision and boundary decision */
  mode_params.framecnt = pstr_sig_class->framecnt;
  mode_params.framecnt_xm = &pstr_sig_class->framecnt_xm;
  mode_params.flag_border = &border_flag;
  mode_params.ave_n_tonal_short = tonal_avg_short;
  mode_params.ave_n_tonal = tonal_avg;
  mode_params.ave_n_tonal_short_buf = pstr_sig_class->ave_n_tonal_short_buf;
  mode_params.ave_n_tonal_buf = pstr_sig_class->ave_n_tonal_buf;
  mode_params.msd_spec_tilt = tilt_msd;
  mode_params.msd_spec_tilt_short = tilt_msd_short;
  mode_params.msd_spec_tilt_buf = pstr_sig_class->msd_spec_tilt_buf;
  mode_params.msd_spec_tilt_short_buf = pstr_sig_class->msd_spec_tilt_short_buf;
  mode_params.n_tonal_low_frequency_ratio = tonal_lf_ratio;
  mode_params.frame_energy = tilt_params.frame_energy;
  initial_decision = iusace_init_mode_decision(&mode_params);

  /* smoothing of the initial decision */
  smooth_params.flag_border_buf_behind = pstr_hist->flag_border_buf_behind;
  smooth_params.flag_border_buf_ahead = pstr_hist->flag_border_buf_ahead;
  smooth_params.frame_energy = tilt_params.frame_energy;
  smooth_params.frame_energy_buf_behind = pstr_hist->frame_energy_buf_behind;
  smooth_params.frame_energy_buf_ahead = pstr_hist->frame_energy_buf_ahead;
  smooth_params.smoothing_result_buf = pstr_hist->smoothing_result_buf;
  smooth_params.init_result_ahead = pstr_hist->init_result_ahead;
  smooth_params.flag_border = border_flag;
  smooth_params.init_result_behind = pstr_hist->init_result_behind;
  smooth_params.init_mode_decision_result = initial_decision;
  /* NOTE(review): the four fields below live in a local struct and are set */
  /* to zero on every call, so they carry nothing over from the previous    */
  /* frame -- confirm that this is intended.                                */
  smooth_params.flag_speech_definite = 0;
  smooth_params.count_small_energy = 0;
  smooth_params.flag_music_definite = 0;
  smooth_params.num_smoothing = 0;

  return iusace_smoothing_mode_decision(&smooth_params);
}

/**
 * Classifies every complete frame buffered in the classifier state and
 * derives the coding mode from the per-frame results.
 *
 * Steps:
 *  1. Each complete block of ccfl samples in input_samples is copied to the
 *     scratch time-signal buffer and classified. A MUSIC / MUSIC_DEFINITE
 *     result selects FD_MODE, a SPEECH / SPEECH_DEFINITE result selects
 *     TD_MODE, any other result keeps the current coding_mode. The mode is
 *     appended to class_buf.
 *  2. The first n_class = MIN(n_class_frames, n_buf_class) entries of
 *     class_buf are merged: coding_mode becomes the largest value among them
 *     if it occurs more often than the smallest one, otherwise the smallest.
 *     pre_mode becomes the last merged entry.
 *  3. The merged entries and the consumed samples are removed by shifting
 *     the remaining contents of class_buf and input_samples to the front.
 *
 * When there is nothing to merge (n_class == 0), coding_mode and pre_mode
 * are left untouched; class_buf[0] holds no valid result for this call and
 * is therefore not read.
 *
 * @param pstr_sig_class classifier state (sample buffer, class buffer, modes)
 * @param pstr_scratch   scratch memory; p_time_signal receives the frame
 * @param ccfl           frame length in samples; must be non-zero because it
 *                       is used as a divisor
 */
VOID iusace_classification(ia_classification_struct *pstr_sig_class,
                           iusace_scratch_mem *pstr_scratch, WORD32 ccfl) {
  WORD32 n_frames, n_class, n_consumed, nf;
  WORD32 i;
  FLOAT32 *ptr_time_signal = pstr_scratch->p_time_signal;
  WORD32 mode_decision_result;

  n_frames = pstr_sig_class->n_buffer_samples / ccfl;
  n_consumed = ccfl * n_frames;

  for (nf = 0; nf < n_frames; nf++) {
    for (i = 0; i < ccfl; i++) {
      ptr_time_signal[i] = pstr_sig_class->input_samples[ccfl * nf + i];
    }

    /* classification of ccfl-frame */
    mode_decision_result =
        iusace_classification_ccfl(pstr_sig_class, ptr_time_signal, pstr_scratch, ccfl);

    /* coding mode decision of the frame; an undecided result keeps the mode */
    if ((mode_decision_result == MUSIC) || (mode_decision_result == MUSIC_DEFINITE)) {
      pstr_sig_class->coding_mode = FD_MODE;
    } else if ((mode_decision_result == SPEECH) || (mode_decision_result == SPEECH_DEFINITE)) {
      pstr_sig_class->coding_mode = TD_MODE;
    }

    pstr_sig_class->class_buf[pstr_sig_class->n_buf_class + nf] = pstr_sig_class->coding_mode;
    pstr_sig_class->pre_mode = pstr_sig_class->coding_mode;
  }

  /* merge ccfl-frame results */
  pstr_sig_class->n_buf_class += n_frames;
  n_class = MIN(pstr_sig_class->n_class_frames, pstr_sig_class->n_buf_class);

  if (n_class > 0) {
    WORD32 min_cls, max_cls;
    WORD32 avg_cls = 0;

    min_cls = max_cls = pstr_sig_class->class_buf[0];
    for (i = 1; i < n_class; i++) {
      if (pstr_sig_class->class_buf[i] > max_cls) {
        max_cls = pstr_sig_class->class_buf[i];
      } else if (pstr_sig_class->class_buf[i] < min_cls) {
        min_cls = pstr_sig_class->class_buf[i];
      }
    }

    /* vote: +1 per entry equal to the maximum, -1 per entry equal to the */
    /* minimum (an entry equal to both contributes 0)                     */
    for (i = 0; i < n_class; i++) {
      if (pstr_sig_class->class_buf[i] == max_cls) {
        avg_cls += 1;
      }
      if (pstr_sig_class->class_buf[i] == min_cls) {
        avg_cls -= 1;
      }
    }

    pstr_sig_class->coding_mode = (avg_cls > 0) ? max_cls : min_cls;

    /* save pre_mode */
    pstr_sig_class->pre_mode = pstr_sig_class->class_buf[n_class - 1];
  }

  /* drop the merged classes and the consumed samples */
  pstr_sig_class->n_buf_class -= n_class;
  pstr_sig_class->n_buffer_samples -= n_consumed;

  /* shift, save unused classes (source and destination overlap) */
  if ((n_class > 0) && (pstr_sig_class->n_buf_class > 0)) {
    memmove(&pstr_sig_class->class_buf[0], &pstr_sig_class->class_buf[n_class],
            (size_t)pstr_sig_class->n_buf_class * sizeof(pstr_sig_class->class_buf[0]));
  }

  /* shift, save unused samples (source and destination overlap) */
  if ((n_consumed > 0) && (pstr_sig_class->n_buffer_samples > 0)) {
    memmove(&pstr_sig_class->input_samples[0], &pstr_sig_class->input_samples[n_consumed],
            (size_t)pstr_sig_class->n_buffer_samples *
                sizeof(pstr_sig_class->input_samples[0]));
  }
}

/**
 * Resets the signal classifier state.
 *
 * Sets pre_mode to FD_MODE, n_class_frames to 2 and is_switch_mode to 1,
 * clears the counters and the init flag, and zero-fills the sample buffer,
 * the result buffers and the tonal / spectral tilt histories.
 *
 * @param pstr_sig_class classifier state to reset
 */
VOID iusace_init_classification(ia_classification_struct *pstr_sig_class) {
  /* modes and merge configuration */
  pstr_sig_class->pre_mode = FD_MODE;
  pstr_sig_class->is_switch_mode = 1;
  pstr_sig_class->n_class_frames = 2;

  /* counters and flags */
  pstr_sig_class->n_buffer_samples = 0;
  pstr_sig_class->n_buf_class = 0;
  pstr_sig_class->framecnt = 0;
  pstr_sig_class->framecnt_xm = 0;
  pstr_sig_class->init_flag = 0;

  /* buffered input samples */
  memset(pstr_sig_class->input_samples, 0, 3840 * 2 * sizeof(FLOAT32));

  /* decision, energy and border buffers */
  memset(&pstr_sig_class->buffers, 0, sizeof(pstr_sig_class->buffers));

  /* histories with 100 entries */
  memset(pstr_sig_class->spec_tilt_buf, 0, 100 * sizeof(FLOAT32));
  memset(pstr_sig_class->n_tonal, 0, 100 * sizeof(WORD32));
  memset(pstr_sig_class->n_tonal_low_frequency, 0, 100 * sizeof(WORD32));

  /* histories with 5 entries */
  memset(pstr_sig_class->ave_n_tonal_buf, 0, 5 * sizeof(FLOAT32));
  memset(pstr_sig_class->ave_n_tonal_short_buf, 0, 5 * sizeof(FLOAT32));
  memset(pstr_sig_class->msd_spec_tilt_buf, 0, 5 * sizeof(FLOAT32));
  memset(pstr_sig_class->msd_spec_tilt_short_buf, 0, 5 * sizeof(FLOAT32));
}

Coverage Report

Created: 2026-01-09 06:51