finish 1.2.0 release

2022-10-23 19:00:00 +02:00 · 2022-10-23 19:00:00 +02:00 · 9202dbcc19
parent 444a006269
commit 9202dbcc19
11 changed files with 197 additions and 51 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,5 @@
 ## CMakeLists.txt - Main CMake file that defines how cmake should process and generate the necessary build files
- # written by C. D. Degawa, last modified in 2021 - see License.htm for legal notices
+ # written by C. D. Degawa, last modified in 2022 - see License.htm for legal notices
 #
 # The copyright in this software is being made available under the exhale Copyright License
 # and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
@ -16,7 +16,7 @@ if("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_BINARY_DIR}")
 endif()


-project(exhale VERSION 1.1.9 LANGUAGES CXX)
+project(exhale VERSION 1.2.0 LANGUAGES CXX)

 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
    set(CMAKE_BUILD_TYPE Release
--- a/README.md
+++ b/README.md
@ -31,7 +31,7 @@ ____________________________________________________________________
 Copyright
 ---------

-(c) 2021 Christian R. Helmrich, project ecodis. All rights reserved.
+(c) 2022 Christian R. Helmrich, project ecodis. All rights reserved.


 License
@ -41,7 +41,7 @@ exhale is being made available under an open-source license which is
 based on the 3-clause BSD license but modified to address particular
 aspects dictated by the nature and the output of this application.

-The license text and release notes for the current version 1.1.9 can
+The license text and release notes for the current version 1.2.0 can
 be found in the `include` subdirectory of the exhale distribution.


--- a/include/Release.htm
+++ b/include/Release.htm
@ -25,9 +25,15 @@
 <td valign="top">

 <h1><br><span class="pink">exhale</span> - <span class="pink">e</span>codis e<span class="pink">x</span>tended <span class="pink">h</span>igh-efficiency <span class="pink">a</span>nd <span class="pink">l</span>ow-complexity <span class="pink">e</span>ncoder<br><span class="gray"><sup><br>Software Release Notes, Version History, Known Issues, Upcoming Feature Roadmap</sup></span><br><br></h1>
-<h3>&nbsp; &nbsp;The version of this distribution of the &laquo;exhale&raquo; software release is <b>1.1.9</b> (official pub&shy;lic minor release) from January 2022. Please check <a href="http://www.ecodis.de/audio.htm#mpeg">www.ecodis.de</a> regularly for new versions of this software. A summary of each version up to this release, a list of known issues with this release, and a roadmap of additional functionality are provided below.</h3>
+<h3>&nbsp; &nbsp;The version of this distribution of the &laquo;exhale&raquo; software release is <b>1.2.0</b> (official pub&shy;lic major release) from December 2022. Please check <a href="http://www.ecodis.de/audio.htm#mpeg">www.ecodis.de</a> regularly for new versions of this software. A summary of each version up to this release, a list of known issues with this release, and a roadmap of additional functionality are provided below.</h3>
 <h3><br><b>Chronological Version History</b></h3>
-<h3>&nbsp; &nbsp;Version <b>1.1.9 <span class="gray">&nbsp;Dec. 2021, this release</span></b></h3>
+<h3>&nbsp; &nbsp;Version <b>1.2.0 <span class="gray">&nbsp;Dec. 2022, this release</span></b></h3>
+<ul>
+ <li><h3>C API correction, some code sanitizing (issue 24, merge requests 8&#x2013;11, J. Regan)</h3></li>
+ <li><h3>exhaleLib: code cleanup, very minor quality improvements in CVBR modes f and 5</h3></li>
+ <li><h3>exhaleLib: 5&#37; speedup of all modes, better target rate matching in CVBR mode g</h3></li>
+</ul>
+<h3>&nbsp; &nbsp;Version <b>1.1.9 <span class="gray">&nbsp;Dec. 2021</span></b></h3>
 <ul>
 <li><h3>exhaleApp: write encoder name and version as &laquo;udta&raquo; tool string into MP4 header</h3></li>
 <li><h3>exhaleApp: optimize leading and trailing PCM read for gapless playback (issue 21)</h3></li>
@ -167,7 +173,7 @@
 <li><h3>exhaleLib: speed-ups and further quality tuning for difficult signals, as necessary.</h3></li>
 </ul>
 <h3><br></h3>
-<h4><span class="gray">Written by C. R. Helmrich for exhale 1.1.9, Dec. 2021. Available at www.ecodis.de/exhale/release.htm.</span><br><br></h4>
+<h4><span class="gray">Written by C. R. Helmrich for exhale 1.2.0, Dec. 2022. Available at www.ecodis.de/exhale/release.htm.</span><br><br></h4>

 </td>
 <td valign="top" colspan="2">
--- a/include/version.h
+++ b/include/version.h
@ -1,5 +1,5 @@
 /* version.h - header file with major and minor library version numbers as characters
- * written by C. R. Helmrich, last modified in 2021 - see License.htm for legal notices
+ * written by C. R. Helmrich, last modified in 2022 - see License.htm for legal notices
 *
 * The copyright in this software is being made available under the exhale Copyright License
 * and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
@ -12,8 +12,8 @@
 # define EXHALELIB_VERSION_MAJOR "1"
 #endif
 #ifndef EXHALELIB_VERSION_MINOR
-# define EXHALELIB_VERSION_MINOR "1"
+# define EXHALELIB_VERSION_MINOR "2"
 #endif
 #ifndef EXHALELIB_VERSION_BUGFIX
-# define EXHALELIB_VERSION_BUGFIX ".9" // "RC" or ".0", ".1", ...
+# define EXHALELIB_VERSION_BUGFIX "RC" // "RC" or ".0", ".1", ...
 #endif
--- a/src/app/exhaleApp.cpp
+++ b/src/app/exhaleApp.cpp
@ -477,7 +477,7 @@ int main (const int argc, char* argv[])
  // check arg. list, print usage if needed
  if ((argc < 3) || (argc > 6) || (argc > 1 && argv[1][1] != 0))
  {
-    fprintf_s (stdout, " Copyright 2018-2021 C.R.Helmrich, project ecodis. See License.htm for details.\n\n");
+    fprintf_s (stdout, " Copyright 2018-2022 C.R.Helmrich, project ecodis. See License.htm for details.\n\n");

    fprintf_s (stdout, " This software is made available under the exhale Copyright License and comes\n");
    fprintf_s (stdout, " with ABSOLUTELY NO WARRANTY. This software may be subject to other third-party\n");
--- a/src/app/exhaleApp.rc
+++ b/src/app/exhaleApp.rc
@ -13,7 +13,7 @@

 0 ICON "exhaleApp.ico"
 VS_VERSION_INFO VERSIONINFO
-FILEVERSION 1,1,9,2
+FILEVERSION 1,2,0,0
 BEGIN
  BLOCK "StringFileInfo"
  BEGIN
@ -22,7 +22,7 @@ BEGIN
      VALUE "CompanyName", "ecodis"
      VALUE "FileDescription", "exhale - ecodis extended high-efficiency and low-complexity encoder"
      VALUE "InternalName", "exhaleApp.exe"
-      VALUE "LegalCopyright", "<EFBFBD> 2018-2021 C. R. Helmrich, ecodis"
+      VALUE "LegalCopyright", "© 2018-2022 C. R. Helmrich, ecodis"
      VALUE "OriginalFilename", "exhale.exe"
      VALUE "ProductName", "exhaleApp"
      VALUE "ProductVersion", EXHALELIB_VERSION_MAJOR "." EXHALELIB_VERSION_MINOR EXHALELIB_VERSION_BUGFIX
--- a/src/lib/entropyCoding.cpp
+++ b/src/lib/entropyCoding.cpp
@ -1,5 +1,5 @@
 /* entropyCoding.cpp - source file for class with lossless entropy coding capability
- * written by C. R. Helmrich, last modified in 2020 - see License.htm for legal notices
+ * written by C. R. Helmrich, last modified in 2022 - see License.htm for legal notices
 *
 * The copyright in this software is being made available under the exhale Copyright License
 * and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
@ -254,13 +254,18 @@ static const uint16_t arithCumFreqR[3][4] = { // arith_cf_r
 };

 static const uint8_t arithFastPkIndex[32] = {
-  1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 58, 3, 0, 49, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62
+  1, 4, 0, 49, 0, 0, 0, 0, 0, 0, 0, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 3, 0, 62
 };

 // static helper functions
-static inline unsigned arithGetPkIndex (const unsigned ctx) // cumul. frequency table index pki = arith_get_pk(c)
+static inline uint8_t arithGetPkIndex (const unsigned ctx) // cumul. frequency table index pki = arith_get_pk(c)
 {
-  if ((ctx & 0xEEEEE) == 0) return arithFastPkIndex[((ctx >> 12) & 16) | ((ctx >> 9) & 8) | ((ctx >> 6) & 4) | ((ctx >> 3) & 2) | (ctx & 1)];
+  if ((ctx & 0xEEEEE) == 0)
+  {
+    const unsigned tmp = ctx | (ctx >> 6);
+
+    return arithFastPkIndex[(tmp | (tmp >> 9)) & 31];
+  }

  int32_t iMax = ARITH_SIZE - 1;
  int32_t iMin = -1;
@ -534,11 +539,114 @@ unsigned EntropyCoder::arithCodeSigMagn (const uint8_t* const magn, const uint16
    if (sigEnd > 1) m_csCurr |= m_qcCurr[sigEnd - 2] << 26;
    if (sigEnd > 2) m_csCurr |= __min (3, m_qcCurr[sigEnd - 3]) << 30;
  }
-  m_csCurr |= ((unsigned) m_acBits << 17) | c;
+  m_csCurr |= ((unsigned) __min (31, m_acBits) << 17) | c;

  return bitCount;
 }

+#if EC_TRELLIS_OPT_CODING
+unsigned EntropyCoder::arithCodeSigTest (const uint8_t* const magn, const uint16_t sigOffset, const uint16_t sigLength) // runs the escape/MSB/LSB symbol sequence over sigLength/2 consecutive 2-tuples of magn starting at sigOffset; returns summed arithCodeSymbol bit counts plus exit m_acBits minus entry m_acBits (never below zero)
+{ // NOTE(review): takes no stream argument, so presumably a rate-estimation counterpart of arithCodeSigMagn - confirm
+  const unsigned inAcBits = m_acBits; // m_acBits on entry, subtracted again in the return statement
+  const uint8_t* a = &magn[sigOffset    ]; // first magnitude of the current 2-tuple
+  const uint8_t* b = &magn[sigOffset + 1]; // second magnitude of the current 2-tuple
+  unsigned c = m_csCurr & 0x1FFFF; // low 17 bits of the stored context state
+  unsigned bitCount = 0; // running sum of arithCodeSymbol return values
+  uint16_t r[7]; // stack of LSB pairs; 8-bit inputs need at most 6 right-shifts to drop to <= 3
+  int16_t s = sigOffset >> 1; // tuple index (two magnitudes per tuple)
+
+  for (uint16_t sigEnd = (uint16_t) s + (sigLength >> 1); s < sigEnd; s++)
+  {
+    uint32_t lev = 0; // number of escape symbols (LSB planes) emitted for this tuple
+    uint16_t a1 = *a;
+    uint16_t b1 = *b;
+
+    a += 2; b += 2; // advance both pointers to the next 2-tuple
+
+    // arith_get_context, cf Scl. 7.4
+    c = arithGetContext (c, (unsigned) s);
+    // arith_update_context, Scl. 7.4
+    m_qcCurr[s] = __min (0xF, a1 + b1 + 1); // stored value is limited to 15
+
+    // MSB encoding as in Scl. B.25.3
+    while ((a1 > 3) || (b1 > 3)) // loop until both magnitudes fit into 2 bits
+    {
+      // write escaped codeword value
+      bitCount += arithCodeSymbol (ARITH_ESCAPE, arithCumFreqM[arithGetPkIndex (c | (lev << 17))]);
+      // store LSBs in r, right-shift
+      r[lev++] = (a1 & 1) | ((b1 & 1) << 1); // bit 0 holds the LSB of a1, bit 1 the LSB of b1
+      a1 >>= 1; b1 >>= 1;
+    }
+    // write the m MSB codeword value
+    bitCount += arithCodeSymbol (a1 | (b1 << 2), arithCumFreqM[arithGetPkIndex (c | (lev << 17))]);
+
+    // LSB encoding, Table 38, B.25.3
+    while (lev--) // pop the stored LSB pairs in reverse order
+    {
+      const uint16_t rLev = r[lev];
+
+      bitCount += arithCodeSymbol (rLev, arithCumFreqR[a1 == 0 ? 1 : (b1 == 0 ? 0 : 2)]); // table row: 1 if a1 is 0, else 0 if b1 is 0, else 2
+      a1 = (a1 << 1) | (rLev & 1); // re-attach the LSB of a1
+      b1 = (b1 << 1) | ((rLev >> 1) & 1); // re-attach the LSB of b1
+    }
+  } // for s
+
+  m_csCurr = m_qcCurr[--s] << 22; // rebuild context state from the last tuple processed; NOTE(review): with sigLength < 2 the loop is skipped and s becomes (sigOffset >> 1) - 1 - confirm callers never do this
+  if ((s--) > 0) m_csCurr |= m_qcCurr[s] << 26; // add the tuple before that, if one exists
+  if ((s--) > 0) m_csCurr |= __min (3, m_qcCurr[s]) << 30; // and the one before, limited to 3
+  m_csCurr |= ((unsigned) __min (31, m_acBits) << 17) | c; // m_acBits limited to 31 is placed at bit 17, context c in the low bits
+  bitCount += m_acBits; // include m_acBits as it stands on exit
+
+  return bitCount - __min (inAcBits, bitCount); // remove the entry m_acBits without letting the result underflow
+}
+
+unsigned EntropyCoder::arithCodeTupTest (const uint8_t* const magn, const uint16_t sigOffset) // single 2-tuple form of the symbol sequence: processes magn[sigOffset] and magn[sigOffset + 1] only; returns summed arithCodeSymbol bit counts plus exit m_acBits minus entry m_acBits (never below zero)
+{ // NOTE(review): takes no stream argument, so presumably used for rate estimation only - confirm
+  const unsigned inAcBits = m_acBits; // m_acBits on entry, subtracted again in the return statement
+  uint16_t a1 = magn[sigOffset    ]; // first magnitude of the tuple
+  uint16_t b1 = magn[sigOffset + 1]; // second magnitude of the tuple
+  unsigned bitCount = 0; // running sum of arithCodeSymbol return values
+  uint32_t lev = 0; // number of escape symbols (LSB planes) emitted
+  uint16_t r[7]; // stack of LSB pairs; 8-bit inputs need at most 6 right-shifts to drop to <= 3
+  int16_t  s = sigOffset >> 1; // tuple index (two magnitudes per tuple)
+
+  // arith_get_context, cf Scl. 7.4
+  const unsigned c = arithGetContext (m_csCurr & 0x1FFFF, (unsigned) s); // input is the low 17 bits of the stored context state
+  // arith_update_context, Scl. 7.4
+  m_qcCurr[s] = __min (0xF, a1 + b1 + 1); // stored value is limited to 15
+
+  // MSB encoding as in Scl. B.25.3
+  while ((a1 > 3) || (b1 > 3)) // loop until both magnitudes fit into 2 bits
+  {
+    // write escaped codeword value
+    bitCount += arithCodeSymbol (ARITH_ESCAPE, arithCumFreqM[arithGetPkIndex (c | (lev << 17))]);
+    // store LSBs in r, right-shift
+    r[lev++] = (a1 & 1) | ((b1 & 1) << 1); // bit 0 holds the LSB of a1, bit 1 the LSB of b1
+    a1 >>= 1; b1 >>= 1;
+  }
+  // write the m MSB codeword value
+  bitCount += arithCodeSymbol (a1 | (b1 << 2), arithCumFreqM[arithGetPkIndex (c | (lev << 17))]);
+
+  // LSB encoding, Table 38, B.25.3
+  while (lev--) // pop the stored LSB pairs in reverse order
+  {
+    const uint16_t rLev = r[lev];
+
+    bitCount += arithCodeSymbol (rLev, arithCumFreqR[a1 == 0 ? 1 : (b1 == 0 ? 0 : 2)]); // table row: 1 if a1 is 0, else 0 if b1 is 0, else 2
+    a1 = (a1 << 1) | (rLev & 1); // re-attach the LSB of a1
+    b1 = (b1 << 1) | ((rLev >> 1) & 1); // re-attach the LSB of b1
+  }
+
+  m_csCurr = m_qcCurr[s] << 22; // rebuild context state starting from this tuple's stored value
+  if ((s--) > 0) m_csCurr |= m_qcCurr[s] << 26; // add the preceding tuple, if one exists
+  if ((s--) > 0) m_csCurr |= __min (3, m_qcCurr[s]) << 30; // and the one before, limited to 3
+  m_csCurr |= ((unsigned) __min (31, m_acBits) << 17) | c; // m_acBits limited to 31 is placed at bit 17, context c in the low bits
+  bitCount += m_acBits; // include m_acBits as it stands on exit
+
+  return bitCount - __min (inAcBits, bitCount); // remove the entry m_acBits without letting the result underflow
+}
+#endif // EC_TRELLIS_OPT_CODING
+
 unsigned EntropyCoder::arithGetResetBit (const uint8_t* const magn, const uint16_t sigOffset, const uint16_t sigLength)
 {
  const uint16_t sigEnd = (sigOffset >> 1) + (sigLength >> 1);
--- a/src/lib/entropyCoding.h
+++ b/src/lib/entropyCoding.h
@ -1,5 +1,5 @@
 /* entropyCoding.h - header file for class with lossless entropy coding capability
- * written by C. R. Helmrich, last modified in 2020 - see License.htm for legal notices
+ * written by C. R. Helmrich, last modified in 2022 - see License.htm for legal notices
 *
 * The copyright in this software is being made available under the exhale Copyright License
 * and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
@ -55,6 +55,10 @@ public:
  // public functions
  unsigned arithCodeSigMagn (const uint8_t* const magn, const uint16_t sigOffset, const uint16_t sigLength,
                             const bool arithFinish = false, OutputStream* const stream = nullptr);
+#if EC_TRELLIS_OPT_CODING
+  unsigned arithCodeSigTest (const uint8_t* const magn, const uint16_t sigOffset, const uint16_t sigLength); // +-m_acBits
+  unsigned arithCodeTupTest (const uint8_t* const magn, const uint16_t sigOffset); // for sigLength of 2 - also +-m_acBits
+#endif
  unsigned arithGetCodState () const                     { return ((unsigned) m_acHigh << 16) | (unsigned) m_acLow; }
  unsigned arithGetCtxState () const                     { return m_csCurr; }
  unsigned arithGetResetBit (const uint8_t* const magn, const uint16_t sigOffset, const uint16_t sigLength);
@ -70,7 +74,7 @@ public:
  unsigned indexGetHuffCode (const int scaleFactorDelta) const;

  unsigned initCodingMemory (const unsigned maxTransfLength);
-  unsigned initWindowCoding (const bool     forceArithReset, const bool shortWin = false);
+  unsigned initWindowCoding (const bool forceArithReset, const bool shortWin = false);

  bool     getIsShortWindow () const                     { return m_shortTrafoCurr; }
  void     setIsShortWindow (const bool shortWin)        { m_shortTrafoCurr = shortWin; }
--- a/src/lib/exhaleEnc.cpp
+++ b/src/lib/exhaleEnc.cpp
@ -1404,7 +1404,7 @@ unsigned ExhaleEncoder::spectralProcessing ()  // complete ics_info(), calc TNS

        if ((int) s == steAnaStats * -1) coreConfig.stereoConfig = 2;  // 2: S>M, pred_dir=1
        if (s > (UCHAR_MAX * (6u + m_shiftValSBR)) / 8) coreConfig.stereoMode = 2; // 2: all
-        if (s >= UCHAR_MAX - 2u + (meanSpecFlat >> 6)) coreConfig.stereoConfig |= 8; // mono
+        if (s >= UCHAR_MAX - 2u + (m_bitRateMode / 5) + (meanSpecFlat >> 6)) coreConfig.stereoConfig |= 8; // tuning for mono-in-stereo audio
      }
      else if (nrChannels > 1) m_perCorrHCurr[el] = m_perCorrLCurr[el] = 128; // "mid" value

--- a/src/lib/quantization.cpp
+++ b/src/lib/quantization.cpp
@ -1,5 +1,5 @@
 /* quantization.cpp - source file for class with nonuniform quantization functionality
- * written by C. R. Helmrich, last modified in 2020 - see License.htm for legal notices
+ * written by C. R. Helmrich, last modified in 2022 - see License.htm for legal notices
 *
 * The copyright in this software is being made available under the exhale Copyright License
 * and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
@ -10,6 +10,9 @@

 #include "exhaleLibPch.h"
 #include "quantization.h"
+#if SFB_QUANT_SSE
+# include <xmmintrin.h>
+#endif

 #define EC_TRAIN (0 && EC_TRELLIS_OPT_CODING) // for RDOC testing

@ -23,12 +26,9 @@ static inline short getBitCount (EntropyCoder& entrCoder, const int sfIndex, con
  if (groupLength == 1) // include arithmetic coding in bit count
  {
 #if EC_TRELLIS_OPT_CODING
-    const unsigned bitsStart = (entrCoder.arithGetCtxState () >> 17) & 31;
-#endif
+    bitCount += entrCoder.arithCodeSigTest (coeffQuant, coeffOffset, numCoeffs);
+#else
    bitCount += entrCoder.arithCodeSigMagn (coeffQuant, coeffOffset, numCoeffs);
-#if EC_TRELLIS_OPT_CODING
-    bitCount += (entrCoder.arithGetCtxState () >> 17) & 31;
-    bitCount -= __min (bitsStart, bitCount); // +new-old m_acBits
 #endif
  }

@ -46,6 +46,26 @@ static inline double getLagrangeValue (const uint16_t rateIndex) // RD optimizat
 double SfbQuantizer::getQuantDist (const unsigned* const coeffMagn, const uint8_t scaleFactor,
                                   const uint8_t* const coeffQuant, const uint16_t numCoeffs)
 {
+#if SFB_QUANT_SSE
+  const __m128 stepSizeDiv = _mm_set_ps1 ((float) m_lutSfNorm[scaleFactor]); // or _mm_set1_ps ()
+  __m128 sumsSquares = _mm_setzero_ps ();
+  float dist[4];
+
+  for (int i = numCoeffs - 4; i >= 0; i -= 4)
+  {
+    __m128 orig = _mm_set_ps ((float) coeffMagn[i + 0], (float) coeffMagn[i + 1],
+                              (float) coeffMagn[i + 2], (float) coeffMagn[i + 3]);
+    __m128 reco = _mm_set_ps ((float) m_lutXExp43[coeffQuant[i + 0]], (float) m_lutXExp43[coeffQuant[i + 1]],
+                              (float) m_lutXExp43[coeffQuant[i + 2]], (float) m_lutXExp43[coeffQuant[i + 3]]);
+    __m128 diff = _mm_sub_ps (reco, _mm_mul_ps (orig, stepSizeDiv));
+
+    sumsSquares = _mm_add_ps (sumsSquares, _mm_mul_ps (diff, diff));
+  }
+  _mm_storeu_ps (dist, sumsSquares);
+
+  // consider quantization step-size in calculation of distortion
+  return ((double) dist[0] + dist[1] + dist[2] + dist[3]) * m_lut2ExpX4[scaleFactor] * m_lut2ExpX4[scaleFactor];
+#else
  const double stepSizeDiv = m_lutSfNorm[scaleFactor];
  double dDist = 0.0;

@ -58,6 +78,7 @@ double SfbQuantizer::getQuantDist (const unsigned* const coeffMagn, const uint8_

  // consider quantization step-size in calculation of distortion
  return dDist * m_lut2ExpX4[scaleFactor] * m_lut2ExpX4[scaleFactor];
+#endif
 }

 uint8_t SfbQuantizer::quantizeMagnSfb (const unsigned* const coeffMagn, const uint8_t scaleFactor,
@ -150,11 +171,7 @@ uint8_t SfbQuantizer::quantizeMagnSfb (const unsigned* const coeffMagn, const ui
 #if EC_TRAIN
    const uint32_t codStart = entrCoder.arithGetCodState ();
    const uint32_t ctxStart = entrCoder.arithGetCtxState ();
-    uint32_t bitCount = entrCoder.arithCodeSigMagn (&coeffQuant[-((int) coeffOffset)], coeffOffset, numCoeffs);
-
-    bitCount += (entrCoder.arithGetCtxState () >> 17) & 31; // refinement: +new-old m_acBits
-    bitCount -= __min ((ctxStart >> 17) & 31, bitCount);
-    bitCount += (uint32_t) numQ;  // add sign bits for completion
+    uint32_t bitCount = entrCoder.arithCodeSigTest (&coeffQuant[-((int) coeffOffset)], coeffOffset, numCoeffs) + (uint32_t) numQ;

    entrCoder.arithSetCodState (codStart);  // back to last state
    entrCoder.arithSetCtxState (ctxStart);
@ -180,21 +197,21 @@ uint8_t SfbQuantizer::quantizeMagnSfb (const unsigned* const coeffMagn, const ui
            dNum += m_lutXExp43[q] * normalizedMagn;
            dDen += m_lutXExp43[q] * m_lutXExp43[q];
          }
-#if SFB_QUANT_PERCEPT_OPT
+# if SFB_QUANT_PERCEPT_OPT
          else   // assume perceptual transparency for code below
          {
            dNum += normalizedMagn * normalizedMagn;
            dDen += normalizedMagn * normalizedMagn;
          }
-#endif
+# endif
        }

        // re-compute least-squares optimal scale factor modifier
        if (dNum > SF_THRESH_POS * dDen) sf++;
-#if !SFB_QUANT_PERCEPT_OPT
+# if !SFB_QUANT_PERCEPT_OPT
        else
        if (dNum < SF_THRESH_NEG * dDen) sf--; // reduces SFB RMS
-#endif
+# endif
      } // if nonzero

      if (sigMaxQ) *sigMaxQ = (numQ > 0 ? maxQ : 0); // a new max
@ -206,6 +223,23 @@ uint8_t SfbQuantizer::quantizeMagnSfb (const unsigned* const coeffMagn, const ui
 #if SFB_QUANT_PERCEPT_OPT
  if ((numQ > 0) && (sf > 0 && sf <= scaleFactor)) // recover RMS
  {
+# if SFB_QUANT_SSE
+    const __m128 magnNormDiv = _mm_set_ps1 ((float) m_lutSfNorm[sf]); // or _mm_set1_ps ()
+    __m128 sumsSquares = _mm_setzero_ps ();
+    float fl[4]; // dDen has normalized energy after quantization
+
+    for (int i = numCoeffs - 4; i >= 0; i -= 4)
+    {
+      __m128 orig = _mm_set_ps ((float) coeffMagn[i + 0], (float) coeffMagn[i + 1],
+                                (float) coeffMagn[i + 2], (float) coeffMagn[i + 3]);
+      __m128 norm = _mm_mul_ps (orig, magnNormDiv);
+
+      sumsSquares = _mm_add_ps (sumsSquares, _mm_mul_ps (norm, norm));
+    }
+    _mm_storeu_ps (fl, sumsSquares);
+
+    if ((double) fl[0] + fl[1] + fl[2] + fl[3] > SF_THRESH_POS * SF_THRESH_POS * dDen) sf++;
+# else
    const double magnNormDiv = m_lutSfNorm[sf];

    dNum = 0.0;  // dDen has normalized energy after quantization
@ -217,6 +251,7 @@ uint8_t SfbQuantizer::quantizeMagnSfb (const unsigned* const coeffMagn, const ui
    }

    if (dNum > SF_THRESH_POS * SF_THRESH_POS * dDen) sf++;
+# endif
  }
 #endif
  return (uint8_t) __max (0, sf); // optimized scale factor index
@ -273,7 +308,7 @@ uint32_t SfbQuantizer::quantizeMagnRDOC (EntropyCoder& entropyCoder, const uint8

    for (is = 0; is < numStates; is++)  // populate tuple trellis
    {
-      uint8_t* const mag = (is != 0 ? tempQuant : quantCoeffs) - (int) tupleOffset; // see arithCodeSigMagn()
+      uint8_t* const mag = (is != 0 ? tempQuant : quantCoeffs) - (int) tupleOffset; // see arithCodeTupTest()
      uint8_t*  currRate = &quantRate[(is + tuple * numStates) * numStates];
      double diffA, diffB;

@ -302,13 +337,9 @@ uint32_t SfbQuantizer::quantizeMagnRDOC (EntropyCoder& entropyCoder, const uint8
      if (tuple == 0) // first tuple, with tupleStart == sfbStart
      {
        entropyCoder.arithSetCodState (codStart); // start of SFB
-        entropyCoder.arithSetCtxState (ctxStart);
-        tempBitCount = entropyCoder.arithCodeSigMagn (mag, tupleOffset, 2);
+        entropyCoder.arithSetCtxState (ctxStart, 0);

-        tempBitCount += (entropyCoder.arithGetCtxState () >> 17) & 31;  // +new-old m_acBits
-        tempBitCount -= __min ((ctxStart >> 17) & 31, tempBitCount);
-
-        memset (currRate, tempBitCount + numQ, numStates);
+        memset (currRate, entropyCoder.arithCodeTupTest (mag, tupleOffset) + numQ, numStates); // +- m_acBits
      }
      else // tuple > 0, rate depends on decisions for last tuple
      {
@ -323,12 +354,8 @@ uint32_t SfbQuantizer::quantizeMagnRDOC (EntropyCoder& entropyCoder, const uint8

          entropyCoder.arithSetCodState (prevCodState[ds]);
          entropyCoder.arithSetCtxState (prevCtxState[ds], tupleOffset);
-          tempBitCount = entropyCoder.arithCodeSigMagn (mag, tupleOffset, 2);

-          tempBitCount += (entropyCoder.arithGetCtxState () >> 17) & 31;// +new-old m_acBits
-          tempBitCount -= __min ((prevCtxState[ds] >> 17) & 31, tempBitCount);
-
-          currRate[ds] = uint8_t (tempBitCount + numQ);
+          currRate[ds] = uint8_t (entropyCoder.arithCodeTupTest (mag, tupleOffset) + numQ); // incl. m_acBits
        }
      }
      // statistically best place to save states is after ds == 0
@ -877,7 +904,7 @@ unsigned SfbQuantizer::quantizeSpecRDOC (EntropyCoder& entropyCoder, uint8_t* co
      if (sfb == 0) // first SFB, having sfbStart - grpStart == 0
      {
        entropyCoder.arithSetCodState (codStart);  // group start
-        entropyCoder.arithSetCtxState (ctxStart, 0);
+        entropyCoder.arithSetCtxState (ctxStart);
        tempBitCount = (maxSnrReached ? USHRT_MAX : numQCurr + getBitCount (entropyCoder, sfBest, UCHAR_MAX, 1, mag, 0, sfbWidth));

        for (ds = m_numCStates - 1; ds >= 0; ds--)
@ -1005,7 +1032,7 @@ unsigned SfbQuantizer::quantizeSpecRDOC (EntropyCoder& entropyCoder, uint8_t* co
    if (grpStats)
    {
      entropyCoder.arithSetCodState (codStart);// set group start
-      entropyCoder.arithSetCtxState (ctxStart, 0);
+      entropyCoder.arithSetCtxState (ctxStart);

      tempBitCount = 0;
    }
--- a/src/lib/quantization.h
+++ b/src/lib/quantization.h
@ -1,5 +1,5 @@
 /* quantization.h - header file for class with nonuniform quantization functionality
- * written by C. R. Helmrich, last modified in 2020 - see License.htm for legal notices
+ * written by C. R. Helmrich, last modified in 2022 - see License.htm for legal notices
 *
 * The copyright in this software is being made available under the exhale Copyright License
 * and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
@ -26,6 +26,7 @@
 #else
 #define SFB_QUANT_OFFSET 0.405396 // 1 - 0.5^(3/4)
 #endif
+#define SFB_QUANT_SSE (0 && defined (_MSC_VER))

 // class for BL USAC quantization
 class SfbQuantizer