clean DTX and RC

2025-06-05 21:59:32 +02:00 · 2021-06-28 23:00:00 +02:00
parent b11042a0f0
commit 057bb87e64
7 changed files with 50 additions and 37 deletions
--- a/src/app/exhaleApp.rc
+++ b/src/app/exhaleApp.rc
@ -13,7 +13,7 @@
 0 ICON "exhaleApp.ico"
 VS_VERSION_INFO VERSIONINFO
-FILEVERSION 1,1,6
+FILEVERSION 1,1,6,1
 BEGIN
  BLOCK "StringFileInfo"
  BEGIN
--- a/src/lib/bitAllocation.cpp
+++ b/src/lib/bitAllocation.cpp
@ -101,6 +101,15 @@ void BitAllocator::getChAverageTempFlat (uint8_t meanTempFlatInCh[USAC_MAX_NUM_C
  memcpy (meanTempFlatInCh, m_avgTempFlat, nChannels * sizeof (uint8_t));
 }
 // Derives a 16-bit rate-control factor from a bit-rate ratio and a spectral flatness value.
 //   rateRatio    - signed ratio, weighted below by (36 - 9 * m_rateIndex) and then clamped
 //   samplingRate - only compared against 27713 to enable or disable the squared-flatness term
 //   specFlatness - flatness value; NOTE(review): callers appear to pass 8-bit values, since the
 //                  square is stored in a uint16_t and would be truncated for inputs above 255
 // Returns (brRatio * mSfmFac) / 256 with rounding, truncated to uint16_t.
 uint16_t BitAllocator::getRateCtrlFac (const int32_t rateRatio, const unsigned samplingRate, const uint32_t specFlatness)
 {
  // weighted ratio limited to the range [1 << 15, USHRT_MAX]; a rateRatio of 0 yields the lower bound 1 << 15
  const uint32_t brRatio = __max (1 << 15, __min (USHRT_MAX, rateRatio * (36 - 9 * m_rateIndex)));
  // squared flatness, right-shifted by m_rateIndex; used only if m_rateIndex < 2 and samplingRate >= 27713, else zero
  const uint16_t mSfmSqr = (m_rateIndex < 2 && samplingRate >= 27713 ? (specFlatness * specFlatness) >> m_rateIndex : 0);
  // 256 minus the flatness-dependent term; the (1 << 9) offset rounds the subsequent right-shift by 10
  const uint16_t mSfmFac = 256 - (((32 + m_rateIndex) * (specFlatness << 4) - mSfmSqr + (1 << 9)) >> 10);
  // product scaled down by 2^8 with rounding offset (1 << 7), so mSfmFac == 256 returns brRatio unchanged
  return uint16_t ((brRatio * mSfmFac + (1 << 7)) >> 8);
 }
 uint8_t BitAllocator::getScaleFac (const uint32_t sfbStepSize, const int32_t* const sfbSignal, const uint8_t sfbWidth,
                                   const uint32_t sfbRmsValue)
 {
--- a/src/lib/bitAllocation.h
+++ b/src/lib/bitAllocation.h
@ -39,6 +39,7 @@ public:
  // public functions
  void getChAverageSpecFlat (uint8_t meanSpecFlatInCh[USAC_MAX_NUM_CHANNELS], const unsigned nChannels);
  void getChAverageTempFlat (uint8_t meanTempFlatInCh[USAC_MAX_NUM_CHANNELS], const unsigned nChannels);
  uint16_t   getRateCtrlFac (const int32_t rateRatio, const unsigned samplingRate, const uint32_t specFlatness);
  uint8_t       getScaleFac (const uint32_t sfbStepSize, const int32_t* const sfbSignal, const uint8_t sfbWidth,
                             const uint32_t sfbRmsValue);
  unsigned initAllocMemory  (LinearPredictor* const linPredictor, const uint8_t numSwb, const uint8_t bitRateMode);
--- a/src/lib/bitStreamWriter.cpp
+++ b/src/lib/bitStreamWriter.cpp
@ -936,7 +936,7 @@ unsigned BitStreamWriter::createAudioFrame (CoreCoderData** const elementData,
                                            const uint8_t numSwbShort,          uint8_t* const tempBuffer,
 #if !RESTRICT_TO_AAC
                                            const bool* const tw_mdct /*N/A*/,  const bool* const noiseFilling,
-                                            const uint32_t frameCount,          const uint32_t indepPeriod,
+                                            const uint32_t frameCount,          const uint32_t indepPeriod,  uint32_t* rate,
 #endif
                                            const uint8_t sbrRatioShiftValue,   int32_t** const sbrInfoAndData,
                                            unsigned char* const accessUnit,    const unsigned nSamplesInFrame)
@ -971,7 +971,6 @@ unsigned BitStreamWriter::createAudioFrame (CoreCoderData** const elementData,
  }
 #endif
  m_auBitStream.reset ();
  m_frameLength = nSamplesInFrame;
  m_numSwbShort = numSwbShort;
  m_uCharBuffer = tempBuffer;
  m_auBitStream.write (usacIndependencyFlag ? 1 : 0, 1);
@ -1112,6 +1111,18 @@ unsigned BitStreamWriter::createAudioFrame (CoreCoderData** const elementData,
 #if RESTRICT_TO_AAC || defined (NO_PREROLL_DATA)
  memcpy (accessUnit, &m_auBitStream.stream.front (), __min (768 * ci, bitCount >> 3));
 #else
  m_auByteCount += bitCount >> 3;
  if (rate != nullptr)  // sampling rate
  {
    const double framesPerSec = (double) *rate / nSamplesInFrame;
    const unsigned targetRate = (4 - (sbrRatioShiftValue & 1)) * ci; // frame average for preset 1
    if (framesPerSec > 0.0 && targetRate > 0 && frameCount < UINT_MAX) // running overcoding ratio
    {
      *rate = uint32_t (0.5 + (m_auByteCount * framesPerSec) / (__max (20.0 * framesPerSec, (double) frameCount) * targetRate));
    }
    else *rate = 0; // insufficient data
  }
  memcpy (accessUnit, &m_auBitStream.stream.front (), __min (ci * (ipf ? 1152 : 768), bitCount >> 3));
 #endif
  return (bitCount >> 3);  // byte count
--- a/src/lib/bitStreamWriter.h
+++ b/src/lib/bitStreamWriter.h
@ -25,7 +25,7 @@ private:
  // member variables
  OutputStream m_auBitStream; // access unit bit-stream to write
-  uint32_t     m_frameLength;
+  uint64_t     m_auByteCount;
  uint8_t      m_numSwbShort; // max. SFB count in short windows
  uint8_t*     m_uCharBuffer; // temporary buffer for ungrouping
 #ifndef NO_PREROLL_DATA
@ -37,7 +37,7 @@ private:
 #endif
  // helper functions
  void     writeByteAlignment (); // write 0s for byte alignment
-  unsigned writeChannelWiseIcsInfo (const IcsInfo& icsInfo); // ics_info()
+  unsigned writeChannelWiseIcsInfo (const IcsInfo& icsInfo);
  unsigned writeChannelWiseSbrData (const int32_t* const sbrDataCh0, const int32_t* const sbrDataCh1,
                                    const bool indepFlag = false);
  unsigned writeChannelWiseTnsData (const TnsData& tnsData, const bool eightShorts);
@ -56,7 +56,7 @@ private:
 public:
  // constructor
-  BitStreamWriter () { m_auBitStream.reset (); m_frameLength = 0; m_numSwbShort = 0; m_uCharBuffer = nullptr;
+  BitStreamWriter () { m_auBitStream.reset (); m_auByteCount = m_numSwbShort = 0; m_uCharBuffer = nullptr;
 #ifndef NO_PREROLL_DATA
                       memset (m_usacConfig, 0, 20); m_usacConfigLen = 0; memset (m_usacIpfState, 0, 4);
 #endif
@ -77,7 +77,7 @@ public:
                              const uint8_t numSwbShort,          uint8_t* const tempBuffer,
 #if !RESTRICT_TO_AAC
                              const bool* const tw_mdct /*N/A*/,  const bool* const noiseFilling,
-                              const uint32_t frameCount,          const uint32_t indepPeriod,
+                              const uint32_t frameCount,          const uint32_t indepPeriod,  uint32_t* rate,
 #endif
                              const uint8_t sbrRatioShiftValue,   int32_t** const sbrInfoAndData,
                              unsigned char* const accessUnit,    const unsigned nSamplesInFrame);
--- a/src/lib/exhaleEnc.cpp
+++ b/src/lib/exhaleEnc.cpp
@ -445,13 +445,13 @@ static const uint8_t numSwbOffsetS[USAC_NUM_FREQ_TABLES] = {13, 13, 15, 16, 16,
 // ISO/IEC 23003-3, Table 79
 static const uint8_t freqIdxToSwbTableIdxAAC[USAC_NUM_SAMPLE_RATES + 2] = {
-  /*96000*/ 0, 0, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, // AAC
+  /*96000*/ 0, 0, 1, 2, 2, 2,/*24000*/ 3, 3, 4, 4, 4, 5, 5, // AAC
-  255, 255, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4 // USAC
+  255, 255, 1, 2, 2, 2, 2, 2,/*25600*/ 3, 3, 3, 4, 4, 4, 4 // USAC
 };
 #if !RESTRICT_TO_AAC
 static const uint8_t freqIdxToSwbTableIdx768[USAC_NUM_SAMPLE_RATES + 2] = {
-  /*96000*/ 0, 0, 0, 1, 1, 2, 2, 2, 3, 4, 4, 4, 4, // AAC
+  /*96000*/ 0, 0, 0, 1, 1, 2,/*24000*/ 2, 2, 3, 4, 4, 4, 4, // AAC
-  255, 255, 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4 // USAC
+  255, 255, 0, 1, 2, 2, 2, 2,/*25600*/ 2, 3, 3, 3, 3, 4, 4 // USAC
 };
 #endif
@ -745,7 +745,6 @@ unsigned ExhaleEncoder::getOptParCorCoeffs (const SfbGroupData& grpData, const u
  return (predGainMax >> 24) & UCHAR_MAX; // max pred gain of all orders and length-1 groups
 }
 #ifndef NO_DTX_MODE
 uint32_t ExhaleEncoder::getThr (const unsigned channelIndex, const unsigned sfbIndex)
 {
  const uint16_t* const sfbLoudMem = m_sfbLoudMem[channelIndex][sfbIndex];
@ -756,7 +755,6 @@ uint32_t ExhaleEncoder::getThr (const unsigned channelIndex, const unsigned sfbI
  return sumSfbLoud * (sumSfbLoud >> (toSamplingRate (m_frequencyIdx) >> 13)); // scaled SMR
 }
 #endif
 unsigned ExhaleEncoder::psychBitAllocation () // perceptual bit-allocation via scale factors
 {
@ -933,8 +931,7 @@ unsigned ExhaleEncoder::psychBitAllocation () // perceptual bit-allocation via s
 #if !RESTRICT_TO_AAC
        const uint8_t numSwbCh = (eightShorts ? m_numSwbShort : m_numSwbLong);
 #endif
-        const uint16_t mSfmSqr = (m_bitRateMode < 2 && samplingRate >= 27713 ? ((uint16_t) meanSpecFlat[ci] * meanSpecFlat[ci]) >> m_bitRateMode : 0);
+        const uint16_t rateFac = m_bitAllocator.getRateCtrlFac (m_priLength ? m_rateFactor : 0, samplingRate, meanSpecFlat[ci]); // RC factor
        const uint16_t mSfmFac = 256u - (((32u + m_bitRateMode) * ((uint32_t) meanSpecFlat[ci] << 4) - mSfmSqr + (1u << 9)) >> 10);
        uint32_t*    stepSizes = &sfbStepSizes[ci * m_numSwbShort * NUM_WINDOW_GROUPS];
        memset (grpData.scaleFactors, 0, (MAX_NUM_SWB_SHORT * NUM_WINDOW_GROUPS) * sizeof (uint8_t));
@ -951,10 +948,10 @@ unsigned ExhaleEncoder::psychBitAllocation () // perceptual bit-allocation via s
            const unsigned lfConst = (samplingRate < 27713 && !eightShorts ? 1 : 2); // lfAtten: LF SNR boost, as in my M.Sc. thesis
            const unsigned lfAtten = (b <= 5 ? (eightShorts ? 1 : 4) + b * lfConst : 5 * lfConst - 1 + b + ((b + 5) >> 4));
            const uint8_t sfbWidth = grpOff[b + 1] - grpOff[b];
-            const uint64_t   scale = scaleBr * mSfmFac * __min (32, lfAtten * grpData.numWindowGroups); // rate control part 1 (SFB)
+            const uint64_t   scale = scaleBr * rateFac * __min (32, lfAtten * grpData.numWindowGroups); // rate control part 1 (SFB)
            // scale step-sizes according to VBR mode & derive scale factors from step-sizes
-            grpStepSizes[b] = uint32_t (__max (BA_EPS, ((1u << 17) + grpStepSizes[b] * scale) >> 18));
+            grpStepSizes[b] = uint32_t (__max (BA_EPS, ((1u << 24) + grpStepSizes[b] * scale) >> 25));
 #if !RESTRICT_TO_AAC
            if (!m_noiseFilling[el] || (m_bitRateMode > 0) || (m_shiftValSBR == 0) || (samplingRate < 23004) ||
                (b + 3 - (meanSpecFlat[ci] >> 6) < m_numSwbLong)) // HF
@ -971,19 +968,15 @@ unsigned ExhaleEncoder::psychBitAllocation () // perceptual bit-allocation via s
          const bool keepMaxSfbCurr = ((samplingRate < 37566) || (samplingRate >= 46009 && samplingRate < 55426 && eightShorts));
          const uint8_t numSwbFrame = __min ((numSwbCh * ((maxSfbCh == maxSfbCurr) || (m_bitRateMode <= 2) || (m_shiftValSBR > 0) ? 4u : 3u)) >> 2,
                                      (eightShorts ? maxSfbCh : maxSfbLong) + (m_bitRateMode < 2 || m_bitRateMode > 3 || keepMaxSfbCurr ? 0u : 1u));
-#ifndef NO_DTX_MODE
+
          if ((m_bitRateMode == 0) && (m_numElements == 1) && (samplingRate < 27713) && eightShorts)
          {
            for (s = 0; s < 26; s++) m_sfbLoudMem[ch][s][m_frameCount & 31] = uint16_t (sqrt (double (getThr (ch, s) << (samplingRate >> 13))));
          }
          if ((maxSfbCh < numSwbFrame) || (m_bitRateMode <= 2)) // increase coding bandwidth
 #else
          if (maxSfbCh < numSwbFrame) // increase coding bandwidth
 #endif
          {
            for (uint16_t gr = 0; gr < grpData.numWindowGroups; gr++)
            {
 #ifndef NO_DTX_MODE
              const uint32_t*  grpRms = &grpData.sfbRmsValues[m_numSwbShort * gr];
              if ((m_bitRateMode == 0) && (m_numElements == 1) && (samplingRate < 27713))
@ -1009,7 +1002,7 @@ unsigned ExhaleEncoder::psychBitAllocation () // perceptual bit-allocation via s
                  if (grpRms[s] < ((3 * TA_EPS) >> 1)) grpData.scaleFactors[s + m_numSwbShort * gr] = 0;
                }
              }
-#endif
+
              memset (&grpData.scaleFactors[maxSfbCh + m_numSwbShort * gr], 0, (numSwbFrame - maxSfbCh) * sizeof (uint8_t));
            }
            grpData.sfbsPerGroup = coreConfig.icsInfoCurr[ch].maxSfb = numSwbFrame;
@ -1026,12 +1019,10 @@ unsigned ExhaleEncoder::psychBitAllocation () // perceptual bit-allocation via s
            coreConfig.commonMaxSfb = (maxSfb0 == maxSfb1);
          }
        }
 #ifndef NO_DTX_MODE
        else if (m_noiseFilling[el] && (m_bitRateMode == 0) && (m_numElements == 1) && (samplingRate < 27713))
        {
          for (s = 0; s < 26; s++) m_sfbLoudMem[ch][s][m_frameCount & 31] = BA_EPS;
        }
 #endif
 #endif // !RESTRICT_TO_AAC
        ci++;
      } // for ch
@ -1294,10 +1285,11 @@ unsigned ExhaleEncoder::quantizationCoding ()  // apply MDCT quantization and en
      entrCoder.setIsShortWindow (shortWinPrev);
 #if !RESTRICT_TO_AAC
      s = 22050 + 7350 * m_bitRateMode; // compute channel-wise noise_level and noise_offset
      sfIdxPred = ((m_bitRateMode == 0) && (m_priLength) && (m_shiftValSBR) && ((m_tempAnaCurr[ci] >> 24) || (m_tempAnaNext[ci] >> 24)) && (meanSpecFlat[ci] +
                    __min ((m_tempAnaCurr[ci] >> 16) & UCHAR_MAX, (m_tempAnaNext[ci] >> 16) & UCHAR_MAX) >= 192) ? UCHAR_MAX : meanSpecFlat[ci]);
      coreConfig.specFillData[ch] = (!m_noiseFilling[el] ? 0 : m_specGapFiller.getSpecGapFillParams (m_sfbQuantizer, m_mdctQuantMag[ci], m_numSwbShort,
                                                                                                     grpData, nSamplesInFrame, samplingRate, s,
-                                                                                                     shortWinCurr ? 0 : meanSpecFlat[ci]));
+                                                                                                     shortWinCurr ? 0 : sfIdxPred));
      // NOTE: gap-filling SFB bit count might be inaccurate now since scale factors changed
      if (coreConfig.specFillData[ch] == 1) errorValue |= 1;
 #endif
      s = ci + nrChannels - 1 - 2 * ch; // other channel in stereo
@ -1332,11 +1324,13 @@ unsigned ExhaleEncoder::quantizationCoding ()  // apply MDCT quantization and en
      ci++;
    }
  } // for el
-
+#if !RESTRICT_TO_AAC
  if (m_priLength) m_rateFactor = samplingRate;
 #endif
  return (errorValue > 0 ? 0 : m_outStream.createAudioFrame (m_elementData, m_entropyCoder, m_mdctSignals, m_mdctQuantMag, m_indepFlag,
                                                             m_numElements, m_numSwbShort, (uint8_t* const) m_tempIntBuf,
 #if !RESTRICT_TO_AAC
-                                                             m_timeWarping, m_noiseFilling, m_frameCount - 1u, m_indepPeriod,
+                                                             m_timeWarping, m_noiseFilling, m_frameCount - 1u, m_indepPeriod, &m_rateFactor,
 #endif
                                                             m_shiftValSBR, m_coreSignals, m_outAuData, nSamplesInFrame)); // returns AU size
 }
@ -1829,7 +1823,7 @@ ExhaleEncoder::ExhaleEncoder (int32_t* const inputPcmData,           unsigned ch
  if (m_channelConf == CCI_CONF) m_channelConf = CCI_2_CHM; // passing numChannels = 0 means 2-ch dual-mono
  m_numElements  = elementCountConfig[m_channelConf % USAC_MAX_NUM_ELCONFIGS]; // used in UsacDecoderConfig
  m_shiftValSBR  = (frameLength >= 1536 ? 1 : 0);
-  m_frameCount   = m_priLength = 0;
+  m_frameCount   = m_rateFactor = m_priLength = 0;
  m_frameLength  = USAC_CCFL (frameLength >> m_shiftValSBR); // ccfl signaled using coreSbrFrameLengthIndex
  m_frequencyIdx = toSamplingFrequencyIndex (sampleRate >> m_shiftValSBR); // as usacSamplingFrequencyIndex
  m_indepFlag    = true; // usacIndependencyFlag in UsacFrame(), will be set per frame, true in first frame
@ -2085,9 +2079,8 @@ unsigned ExhaleEncoder::initEncoder (unsigned char* const audioConfigBuffer, uin
      m_elementData[el]->elementType = elementTypeConfig[chConf][el]; // usacElementType[el]
    }
  }
 #ifndef NO_DTX_MODE
  memset (m_sfbLoudMem, 1, 2 * 26 * 32 * sizeof (uint16_t));
-#endif
+
  // allocate all signal buffers
  if (m_shiftValSBR > 0)
  {
--- a/src/lib/exhaleEnc.h
+++ b/src/lib/exhaleEnc.h
@ -50,7 +50,9 @@ typedef enum USAC_CCI : signed char
 // Frame-length enumeration stored as a short: each enumerator's value equals the length it names,
 // and CCFL_UNDEF (-1) marks an undefined length.
 // CCFL_768 is compiled in only when RESTRICT_TO_AAC is undefined or evaluates to 0.
 typedef enum USAC_CCFL : short
 {
  CCFL_UNDEF = -1,
 #if !RESTRICT_TO_AAC
  CCFL_768   = 768, // LD
 #endif
  CCFL_1024  = 1024 // LC
 } USAC_CCFL;
@ -92,10 +94,9 @@ private:
  uint8_t         m_perCorrHCurr[USAC_MAX_NUM_ELEMENTS];
  uint8_t         m_perCorrLCurr[USAC_MAX_NUM_ELEMENTS];
  uint8_t         m_priLength;
  uint32_t        m_rateFactor; // RC
  SfbGroupData*   m_scaleFacData[USAC_MAX_NUM_CHANNELS];
 #ifndef NO_DTX_MODE
  uint16_t        m_sfbLoudMem[2][26][32]; // loudness mem
 #endif
  SfbQuantizer    m_sfbQuantizer; // powerlaw quantization
  uint8_t         m_shiftValSBR; // SBR ratio for shifting
  SpecAnalyzer    m_specAnalyzer; // for spectral analysis
@ -128,9 +129,7 @@ private:
                               int32_t* const mdctSignal, int32_t* const mdstSignal);
  unsigned getOptParCorCoeffs (const SfbGroupData& grpData, const uint8_t maxSfb, TnsData& tnsData,
                               const unsigned channelIndex, const uint8_t firstGroupIndexToTest = 0);
 #ifndef NO_DTX_MODE
  uint32_t getThr             (const unsigned channelIndex, const unsigned sfbIndex);
 #endif
  unsigned psychBitAllocation ();
  unsigned quantizationCoding ();
  unsigned spectralProcessing ();
@ -146,7 +145,7 @@ public:
 #if !RESTRICT_TO_AAC
               , const bool useNoiseFilling = true, const bool useEcodisExt = false
 #endif
-                );
+    );
  // destructor
  virtual ~ExhaleEncoder ();
  // public functions