Bump, fix and configure ovos-stt-plugin-whispercpp

2025-06-05 22:19:21 +02:00 · 2022-11-16 10:26:53 +01:00
parent 16da771266
commit fbb9788b03
4 changed files with 9 additions and 342 deletions
--- a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch
+++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch
@@ -1,340 +0,0 @@
 From 23942ff3134d3897a40a62ed3a69e7a5c78392df Mon Sep 17 00:00:00 2001
 From: jarbasai <jarbasai@mailfence.com>
 Date: Tue, 25 Oct 2022 00:45:13 +0100
 Subject: [PATCH 1/2] feat/python_bindings
 ---
 README.md                              |  20 ++--
 build_lib.sh                           |   8 ++
 ovos_stt_plugin_whispercpp/__init__.py | 130 ++++++++++++++++++++-----
 3 files changed, 127 insertions(+), 31 deletions(-)
 create mode 100644 build_lib.sh
 diff --git a/README.md b/README.md
 index 0eea31d..f41d8de 100644
 --- a/README.md
 +++ b/README.md
@@ -19,16 +19,19 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 ### WhisperCPP
 -First let's install whisper.cpp from source and move the binary to the plugin expected default path
 +First let's build whisper.cpp from source and move the shared library to the plugin expected default path
 ```bash
 -git clone https://github.com/ggerganov/whisper.cpp
 -cd whisper.cpp
 -make
 -cp main ~/.local/bin/whispercpp
 +# build shared libwhisper.so
 +git clone https://github.com/ggerganov/whisper.cpp /tmp/whispercpp
 +cd /tmp/whispercpp
 +# last commit before a breaking change
 +git checkout d6b84b2a23220dd8b8792872a3ab6802cd24b424
 +gcc -O3 -std=c11   -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c
 +g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so
 +cp libwhisper.so ~/.local/bin/libwhisper.so
 ```
 -Models will be autodownloaded to `/home/user/.local/share/whispercpp/{model_name}` when plugin is loaded
 ## Configuration
@@ -42,7 +45,7 @@ available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small",
   "stt": {
     "module": "ovos-stt-plugin-whispercpp",
     "ovos-stt-plugin-whispercpp": {
 -        "bin": "/home/user/.local/bin/whispercpp",
 +        "lib": "~/.local/bin/libwhisper.so",
         "model": "tiny"
     }
   }
@@ -51,6 +54,9 @@ available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small",
 ## Models
 +Models will be autodownloaded to `/home/user/.local/share/whispercpp/{model_name}` when plugin is loaded
 +
 +
 Memory usage
 | Model  | Disk   | Mem     |
 diff --git a/build_lib.sh b/build_lib.sh
 new file mode 100644
 index 0000000..2b69c5e
 --- /dev/null
 +++ b/build_lib.sh
@@ -0,0 +1,8 @@
 +# build shared libwhisper.so
 +git clone https://github.com/ggerganov/whisper.cpp /tmp/whispercpp
 +cd /tmp/whispercpp
 +# last commit before a breaking change
 +git checkout d6b84b2a23220dd8b8792872a3ab6802cd24b424
 +gcc -O3 -std=c11   -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c
 +g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so
 +cp libwhisper.so $HOME/.local/bin/libwhisper.so
 \ No newline at end of file
 diff --git a/ovos_stt_plugin_whispercpp/__init__.py b/ovos_stt_plugin_whispercpp/__init__.py
 index d347898..91982eb 100644
 --- a/ovos_stt_plugin_whispercpp/__init__.py
 +++ b/ovos_stt_plugin_whispercpp/__init__.py
@@ -1,11 +1,101 @@
 +import ctypes
 import os
 -import subprocess
 -from tempfile import NamedTemporaryFile
 +import pathlib
 +# this is needed to read the WAV file properly
 +import numpy
 import requests
 from ovos_plugin_manager.templates.stt import STT
 from ovos_utils.log import LOG
 from ovos_utils.xdg_utils import xdg_data_home
 +from speech_recognition import AudioData
 +
 +
 +# this needs to match the C struct in whisper.h
 +class WhisperFullParams(ctypes.Structure):
 +    _fields_ = [
 +        ("strategy", ctypes.c_int),
 +        ("n_threads", ctypes.c_int),
 +        ("offset_ms", ctypes.c_int),
 +        ("translate", ctypes.c_bool),
 +        ("no_context", ctypes.c_bool),
 +        ("print_special_tokens", ctypes.c_bool),
 +        ("print_progress", ctypes.c_bool),
 +        ("print_realtime", ctypes.c_bool),
 +        ("print_timestamps", ctypes.c_bool),
 +        ("language", ctypes.c_char_p),
 +        ("greedy", ctypes.c_int * 1),
 +    ]
 +
 +
 +class WhisperEngine:
 +    def __init__(self, libname, model_path):
 +        # load library and model
 +        self.libname = pathlib.Path().absolute() / libname
 +        self.whisper = ctypes.CDLL(libname)
 +
 +        # tell Python what are the return types of the functions
 +        self.whisper.whisper_init.restype = ctypes.c_void_p
 +        self.whisper.whisper_full_default_params.restype = WhisperFullParams
 +        self.whisper.whisper_full_get_segment_text.restype = ctypes.c_char_p
 +
 +        # initialize whisper.cpp context
 +        self.ctx = self.whisper.whisper_init(model_path.encode("utf-8"))
 +
 +        # get default whisper parameters and adjust as needed
 +        self.params = self.whisper.whisper_full_default_params(0)
 +        self.params.print_realtime = True
 +        self.params.print_progress = False
 +        self.params.print_timestamps = False
 +        self.params.n_threads = os.cpu_count() - 1
 +        self.params.translate = False
 +
 +    def audiodata2array(self, audio_data):
 +        assert isinstance(audio_data, AudioData)
 +        # Convert buffer to float32 using NumPy
 +        audio_as_np_int16 = numpy.frombuffer(audio_data.get_wav_data(), dtype=numpy.int16)
 +        audio_as_np_float32 = audio_as_np_int16.astype(numpy.float32)
 +
 +        # Normalise float32 array so that values are between -1.0 and +1.0
 +        max_int16 = 2 ** 15
 +        data = audio_as_np_float32 / max_int16
 +        return data
 +
 +    def transcribe_wav(self, wav, lang="en"):
 +        self.params.language = lang.encode()
 +
 +        if isinstance(wav, str):
 +            with AudioFile(wav) as source:
 +                audio = Recognizer().record(source)
 +        elif isinstance(wav, AudioData):
 +            audio = wav
 +        else:
 +            raise ValueError(f"invalid audio: {wav}")
 +
 +        return self.transcribe_audio(audio, lang)
 +
 +    def transcribe_audio(self, audio, lang="en"):
 +        self.params.language = lang.encode()
 +
 +        data = self.audiodata2array(audio)
 +
 +        # run the inference
 +        result = self.whisper.whisper_full(ctypes.c_void_p(self.ctx), self.params,
 +                                           data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
 +                                           len(data))
 +        if result != 0:
 +            raise RuntimeError(f"Error: {result}")
 +
 +        # print results from Python
 +        n_segments = self.whisper.whisper_full_n_segments(ctypes.c_void_p(self.ctx))
 +        txt = b""
 +        for i in range(n_segments):
 +            txt += self.whisper.whisper_full_get_segment_text(ctypes.c_void_p(self.ctx), i)
 +        return txt.decode("utf-8")
 +
 +    def shutdown(self):
 +        # free the memory
 +        self.whisper.whisper_free(ctypes.c_void_p(self.ctx))
 class WhispercppSTT(STT):
@@ -115,11 +205,12 @@ class WhispercppSTT(STT):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 -        self.bin = self.config.get("binary") or os.path.expanduser("~/.local/bin/whispercpp")
 +        lib = self.config.get("lib") or "~/.local/bin/libwhisper.so"
 +        self.lib = os.path.expanduser(lib)
         # self.bin = os.path.expanduser("~/whisper.cpp/main")
 -        if not self.bin:
 -            LOG.error(
 -                "you need to manually install whisper.cpp, please provide full path to ./main binary generated by compiling https://github.com/ggerganov/whisper.cpp")
 +        if not os.path.isfile(self.lib):
 +            LOG.error("you need to provicde libwhisper.so, please follow the README.md instructions")
 +            raise ImportError("libwhisper.so not found")
         self.model_folder = self.config.get("model_folder") or f"{xdg_data_home()}/whispercpp"
         model = self.config.get("model")
@@ -129,7 +220,8 @@ def __init__(self, *args, **kwargs):
             else:
                 model = "tiny"
         os.makedirs(self.model_folder, exist_ok=True)
 -        self.model_path = self.get_model(model)
 +        model_path = self.get_model(model)
 +        self.engine = WhisperEngine(self.lib, model_path)
     def get_model(self, model_name):
         if os.path.isfile(model_name):
@@ -145,27 +237,18 @@ def get_model(self, model_name):
                 f.write(data)
         return model_path
 -    def execute(self, audio, language=None, model=None):
 +    def execute(self, audio, language=None):
         lang = language or self.lang
 -        if model:
 -            if not os.path.isfile(model):
 -                model = self.get_model(model)
 -
 -        model = model or self.model_path
 -
 -        with NamedTemporaryFile() as f:
 -            f.write(audio.get_wav_data())
 -            lang = lang.split('-')[0].lower()
 -            cmd = f"{self.bin} -m {model} -l {lang} -f {f.name}"
 -            t = subprocess.check_output(cmd, shell=True, stderr=subprocess.PIPE).decode("utf-8")
 -            transcript = t.split("] ")[-1].strip()
 -
 -        return transcript
 +        return self.engine.transcribe_audio(audio, lang)
     @property
     def available_languages(self) -> set:
         return set(self.LANGUAGES.keys())
 +    def __del__(self):
 +        if self.engine:
 +            self.engine.shutdown()
 +
 WhispercppSTTConfig = {
     lang: [{"model": "tiny",
@@ -195,12 +278,11 @@ def available_languages(self) -> set:
 if __name__ == "__main__":
     b = WhispercppSTT()
 -
     from speech_recognition import Recognizer, AudioFile
     with AudioFile("/home/user/PycharmProjects/selene_api/test/test.wav") as source:
         audio = Recognizer().record(source)
 -    a = b.execute(audio, model="base")
 +    a = b.execute(audio)
     print(a)
 From ed15bfab9643d68a39f91c621e21ab450a03a628 Mon Sep 17 00:00:00 2001
 From: jarbasai <jarbasai@mailfence.com>
 Date: Tue, 25 Oct 2022 01:20:07 +0100
 Subject: [PATCH 2/2] remove print
 ---
 ovos_stt_plugin_whispercpp/__init__.py | 29 +++++++++-----------------
 1 file changed, 10 insertions(+), 19 deletions(-)
 diff --git a/ovos_stt_plugin_whispercpp/__init__.py b/ovos_stt_plugin_whispercpp/__init__.py
 index 91982eb..7daf309 100644
 --- a/ovos_stt_plugin_whispercpp/__init__.py
 +++ b/ovos_stt_plugin_whispercpp/__init__.py
@@ -44,7 +44,7 @@ def __init__(self, libname, model_path):
         # get default whisper parameters and adjust as needed
         self.params = self.whisper.whisper_full_default_params(0)
 -        self.params.print_realtime = True
 +        self.params.print_realtime = False
         self.params.print_progress = False
         self.params.print_timestamps = False
         self.params.n_threads = os.cpu_count() - 1
@@ -62,19 +62,12 @@ def audiodata2array(self, audio_data):
         return data
     def transcribe_wav(self, wav, lang="en"):
 -        self.params.language = lang.encode()
 -
 -        if isinstance(wav, str):
 -            with AudioFile(wav) as source:
 -                audio = Recognizer().record(source)
 -        elif isinstance(wav, AudioData):
 -            audio = wav
 -        else:
 -            raise ValueError(f"invalid audio: {wav}")
 -
 +        with AudioFile(wav) as source:
 +            audio = Recognizer().record(source)
         return self.transcribe_audio(audio, lang)
     def transcribe_audio(self, audio, lang="en"):
 +        lang = lang.lower().split("-")[0]
         self.params.language = lang.encode()
         data = self.audiodata2array(audio)
@@ -91,7 +84,7 @@ def transcribe_audio(self, audio, lang="en"):
         txt = b""
         for i in range(n_segments):
             txt += self.whisper.whisper_full_get_segment_text(ctypes.c_void_p(self.ctx), i)
 -        return txt.decode("utf-8")
 +        return txt.decode("utf-8").strip()
     def shutdown(self):
         # free the memory
@@ -215,10 +208,7 @@ def __init__(self, *args, **kwargs):
         self.model_folder = self.config.get("model_folder") or f"{xdg_data_home()}/whispercpp"
         model = self.config.get("model")
         if not model:
 -            if self.lang.startswith("en"):
 -                model = "tiny.en"
 -            else:
 -                model = "tiny"
 +            model = "tiny"
         os.makedirs(self.model_folder, exist_ok=True)
         model_path = self.get_model(model)
         self.engine = WhisperEngine(self.lib, model_path)
@@ -280,9 +270,10 @@ def __del__(self):
     b = WhispercppSTT()
     from speech_recognition import Recognizer, AudioFile
 -    with AudioFile("/home/user/PycharmProjects/selene_api/test/test.wav") as source:
 +    jfk = "/home/user/whisper.cpp/samples/jfk.wav"
 +    with AudioFile(jfk) as source:
         audio = Recognizer().record(source)
 -    a = b.execute(audio)
 -
 +    a = b.execute(audio, language="en")
     print(a)
 +
--- a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash
+++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash
@@ -1,2 +1,2 @@
 # sha256 locally computed
-sha256  efe2ec892f6a0e9f1f9b24ae804d6975f7fbd8aaaf0b264e0d5bf4917b0e7a71  python-ovos-stt-plugin-whispercpp-630db8ff8b2ea8f56c30f39254d880b0f572c921.tar.gz
+sha256  375f983e2783f7ff247a813de88333685e76b334bba0d888a5239fe77a63fe9a  python-ovos-stt-plugin-whispercpp-2eac10f51d6cc80c5d75894babbdc4f902d6273f.tar.gz
--- a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk
+++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk
@@ -4,7 +4,7 @@
 #
 ################################################################################
-PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION = 630db8ff8b2ea8f56c30f39254d880b0f572c921
+PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION = 2eac10f51d6cc80c5d75894babbdc4f902d6273f
 PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_SITE = $(call github,OpenVoiceOS,ovos-stt-plugin-whispercpp,$(PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION))
 PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_SETUP_TYPE = setuptools
 PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_ENV = MYCROFT_LOOSE_REQUIREMENTS=true
--- a/buildroot-external/rootfs-overlay/etc/mycroft/mycroft.conf
+++ b/buildroot-external/rootfs-overlay/etc/mycroft/mycroft.conf
@@ -37,6 +37,13 @@
      "wakeup": true
    }
  },
  "stt": {
    "module": "ovos-stt-plugin-whispercpp",
    "ovos-stt-plugin-whispercpp": {
        "lib": "/usr/lib/libwhisper.so",
        "model": "tiny"
    }
  },
  "tts": {
    "module": "ovos-tts-plugin-mimic3-server",
    "fallback_module": "ovos-tts-plugin-mimic",
`@@ -1,2 +1,2 @@`

| Before | After |
| --- | --- |
| `# sha256 locally computed` | `# sha256 locally computed` |
| `sha256 efe2ec892f6a0e9f1f9b24ae804d6975f7fbd8aaaf0b264e0d5bf4917b0e7a71 python-ovos-stt-plugin-whispercpp-630db8ff8b2ea8f56c30f39254d880b0f572c921.tar.gz` | `sha256 375f983e2783f7ff247a813de88333685e76b334bba0d888a5239fe77a63fe9a python-ovos-stt-plugin-whispercpp-2eac10f51d6cc80c5d75894babbdc4f902d6273f.tar.gz` |