diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch deleted file mode 100644 index c5c3e54e..00000000 --- a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/0001-python-bindings.patch +++ /dev/null @@ -1,340 +0,0 @@ -From 23942ff3134d3897a40a62ed3a69e7a5c78392df Mon Sep 17 00:00:00 2001 -From: jarbasai -Date: Tue, 25 Oct 2022 00:45:13 +0100 -Subject: [PATCH 1/2] feat/python_bindings - ---- - README.md | 20 ++-- - build_lib.sh | 8 ++ - ovos_stt_plugin_whispercpp/__init__.py | 130 ++++++++++++++++++++----- - 3 files changed, 127 insertions(+), 31 deletions(-) - create mode 100644 build_lib.sh - -diff --git a/README.md b/README.md -index 0eea31d..f41d8de 100644 ---- a/README.md -+++ b/README.md -@@ -19,16 +19,19 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp - - ### WhisperCPP - --First let's install whisper.cpp from source and move the binary to the plugin expected default path -+First let's build whisper.cpp from source and move the shared library to the plugin expected default path - - ```bash --git clone https://github.com/ggerganov/whisper.cpp --cd whisper.cpp --make --cp main ~/.local/bin/whispercpp -+# build shared libwhisper.so -+git clone https://github.com/ggerganov/whisper.cpp /tmp/whispercpp -+cd /tmp/whispercpp -+# last commit before a breaking change -+git checkout d6b84b2a23220dd8b8792872a3ab6802cd24b424 -+gcc -O3 -std=c11 -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c -+g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so -+cp libwhisper.so ~/.local/bin/libwhisper.so - ``` - --Models will be autodownloaded to `/home/user/.local/share/whispercpp/{model_name}` when plugin is loaded - - ## Configuration - -@@ -42,7 +45,7 @@ available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small", - "stt": { - "module": "ovos-stt-plugin-whispercpp", - "ovos-stt-plugin-whispercpp": { -- "bin": "/home/user/.local/bin/whispercpp", -+ "lib": "~/.local/bin/libwhisper.so", - "model": "tiny" - } - } -@@ -51,6 +54,9 @@ available models are `"tiny.en", "tiny", "base.en", "base", "small.en", "small", - - ## Models - -+Models will be autodownloaded to `/home/user/.local/share/whispercpp/{model_name}` when plugin is loaded -+ -+ - Memory usage - - | Model | Disk | Mem | -diff --git a/build_lib.sh b/build_lib.sh -new file mode 100644 -index 0000000..2b69c5e ---- /dev/null -+++ b/build_lib.sh -@@ -0,0 +1,8 @@ -+# build shared libwhisper.so -+git clone https://github.com/ggerganov/whisper.cpp /tmp/whispercpp -+cd /tmp/whispercpp -+# last commit before a breaking change -+git checkout d6b84b2a23220dd8b8792872a3ab6802cd24b424 -+gcc -O3 -std=c11 -pthread -mavx -mavx2 -mfma -mf16c -fPIC -c ggml.c -+g++ -O3 -std=c++11 -pthread --shared -fPIC -static-libstdc++ whisper.cpp ggml.o -o libwhisper.so -+cp libwhisper.so $HOME/.local/bin/libwhisper.so -\ No newline at end of file -diff --git a/ovos_stt_plugin_whispercpp/__init__.py b/ovos_stt_plugin_whispercpp/__init__.py -index d347898..91982eb 100644 ---- a/ovos_stt_plugin_whispercpp/__init__.py -+++ b/ovos_stt_plugin_whispercpp/__init__.py -@@ -1,11 +1,101 @@ -+import ctypes - import os --import subprocess --from tempfile import NamedTemporaryFile -+import pathlib - -+# this is needed to read the WAV file properly -+import numpy - import requests - from ovos_plugin_manager.templates.stt import STT - from ovos_utils.log import LOG - from ovos_utils.xdg_utils import xdg_data_home -+from speech_recognition import AudioData -+ -+ -+# this needs to match the C struct in whisper.h -+class WhisperFullParams(ctypes.Structure): -+ _fields_ = [ -+ ("strategy", ctypes.c_int), -+ ("n_threads", ctypes.c_int), -+ ("offset_ms", ctypes.c_int), -+ ("translate", ctypes.c_bool), -+ ("no_context", ctypes.c_bool), -+ ("print_special_tokens", ctypes.c_bool), -+ ("print_progress", ctypes.c_bool), -+ ("print_realtime", ctypes.c_bool), -+ ("print_timestamps", ctypes.c_bool), -+ ("language", ctypes.c_char_p), -+ ("greedy", ctypes.c_int * 1), -+ ] -+ -+ -+class WhisperEngine: -+ def __init__(self, libname, model_path): -+ # load library and model -+ self.libname = pathlib.Path().absolute() / libname -+ self.whisper = ctypes.CDLL(libname) -+ -+ # tell Python what are the return types of the functions -+ self.whisper.whisper_init.restype = ctypes.c_void_p -+ self.whisper.whisper_full_default_params.restype = WhisperFullParams -+ self.whisper.whisper_full_get_segment_text.restype = ctypes.c_char_p -+ -+ # initialize whisper.cpp context -+ self.ctx = self.whisper.whisper_init(model_path.encode("utf-8")) -+ -+ # get default whisper parameters and adjust as needed -+ self.params = self.whisper.whisper_full_default_params(0) -+ self.params.print_realtime = True -+ self.params.print_progress = False -+ self.params.print_timestamps = False -+ self.params.n_threads = os.cpu_count() - 1 -+ self.params.translate = False -+ -+ def audiodata2array(self, audio_data): -+ assert isinstance(audio_data, AudioData) -+ # Convert buffer to float32 using NumPy -+ audio_as_np_int16 = numpy.frombuffer(audio_data.get_wav_data(), dtype=numpy.int16) -+ audio_as_np_float32 = audio_as_np_int16.astype(numpy.float32) -+ -+ # Normalise float32 array so that values are between -1.0 and +1.0 -+ max_int16 = 2 ** 15 -+ data = audio_as_np_float32 / max_int16 -+ return data -+ -+ def transcribe_wav(self, wav, lang="en"): -+ self.params.language = lang.encode() -+ -+ if isinstance(wav, str): -+ with AudioFile(wav) as source: -+ audio = Recognizer().record(source) -+ elif isinstance(wav, AudioData): -+ audio = wav -+ else: -+ raise ValueError(f"invalid audio: {wav}") -+ -+ return self.transcribe_audio(audio, lang) -+ -+ def transcribe_audio(self, audio, lang="en"): -+ self.params.language = lang.encode() -+ -+ data = self.audiodata2array(audio) -+ -+ # run the inference -+ result = self.whisper.whisper_full(ctypes.c_void_p(self.ctx), self.params, -+ data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), -+ len(data)) -+ if result != 0: -+ raise RuntimeError(f"Error: {result}") -+ -+ # print results from Python -+ n_segments = self.whisper.whisper_full_n_segments(ctypes.c_void_p(self.ctx)) -+ txt = b"" -+ for i in range(n_segments): -+ txt += self.whisper.whisper_full_get_segment_text(ctypes.c_void_p(self.ctx), i) -+ return txt.decode("utf-8") -+ -+ def shutdown(self): -+ # free the memory -+ self.whisper.whisper_free(ctypes.c_void_p(self.ctx)) - - - class WhispercppSTT(STT): -@@ -115,11 +205,12 @@ class WhispercppSTT(STT): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) -- self.bin = self.config.get("binary") or os.path.expanduser("~/.local/bin/whispercpp") -+ lib = self.config.get("lib") or "~/.local/bin/libwhisper.so" -+ self.lib = os.path.expanduser(lib) - # self.bin = os.path.expanduser("~/whisper.cpp/main") -- if not self.bin: -- LOG.error( -- "you need to manually install whisper.cpp, please provide full path to ./main binary generated by compiling https://github.com/ggerganov/whisper.cpp") -+ if not os.path.isfile(self.lib): -+ LOG.error("you need to provicde libwhisper.so, please follow the README.md instructions") -+ raise ImportError("libwhisper.so not found") - - self.model_folder = self.config.get("model_folder") or f"{xdg_data_home()}/whispercpp" - model = self.config.get("model") -@@ -129,7 +220,8 @@ def __init__(self, *args, **kwargs): - else: - model = "tiny" - os.makedirs(self.model_folder, exist_ok=True) -- self.model_path = self.get_model(model) -+ model_path = self.get_model(model) -+ self.engine = WhisperEngine(self.lib, model_path) - - def get_model(self, model_name): - if os.path.isfile(model_name): -@@ -145,27 +237,18 @@ def get_model(self, model_name): - f.write(data) - return model_path - -- def execute(self, audio, language=None, model=None): -+ def execute(self, audio, language=None): - lang = language or self.lang -- if model: -- if not os.path.isfile(model): -- model = self.get_model(model) -- -- model = model or self.model_path -- -- with NamedTemporaryFile() as f: -- f.write(audio.get_wav_data()) -- lang = lang.split('-')[0].lower() -- cmd = f"{self.bin} -m {model} -l {lang} -f {f.name}" -- t = subprocess.check_output(cmd, shell=True, stderr=subprocess.PIPE).decode("utf-8") -- transcript = t.split("] ")[-1].strip() -- -- return transcript -+ return self.engine.transcribe_audio(audio, lang) - - @property - def available_languages(self) -> set: - return set(self.LANGUAGES.keys()) - -+ def __del__(self): -+ if self.engine: -+ self.engine.shutdown() -+ - - WhispercppSTTConfig = { - lang: [{"model": "tiny", -@@ -195,12 +278,11 @@ def available_languages(self) -> set: - - if __name__ == "__main__": - b = WhispercppSTT() -- - from speech_recognition import Recognizer, AudioFile - - with AudioFile("/home/user/PycharmProjects/selene_api/test/test.wav") as source: - audio = Recognizer().record(source) - -- a = b.execute(audio, model="base") -+ a = b.execute(audio) - - print(a) - -From ed15bfab9643d68a39f91c621e21ab450a03a628 Mon Sep 17 00:00:00 2001 -From: jarbasai -Date: Tue, 25 Oct 2022 01:20:07 +0100 -Subject: [PATCH 2/2] remove print - ---- - ovos_stt_plugin_whispercpp/__init__.py | 29 +++++++++----------------- - 1 file changed, 10 insertions(+), 19 deletions(-) - -diff --git a/ovos_stt_plugin_whispercpp/__init__.py b/ovos_stt_plugin_whispercpp/__init__.py -index 91982eb..7daf309 100644 ---- a/ovos_stt_plugin_whispercpp/__init__.py -+++ b/ovos_stt_plugin_whispercpp/__init__.py -@@ -44,7 +44,7 @@ def __init__(self, libname, model_path): - - # get default whisper parameters and adjust as needed - self.params = self.whisper.whisper_full_default_params(0) -- self.params.print_realtime = True -+ self.params.print_realtime = False - self.params.print_progress = False - self.params.print_timestamps = False - self.params.n_threads = os.cpu_count() - 1 -@@ -62,19 +62,12 @@ def audiodata2array(self, audio_data): - return data - - def transcribe_wav(self, wav, lang="en"): -- self.params.language = lang.encode() -- -- if isinstance(wav, str): -- with AudioFile(wav) as source: -- audio = Recognizer().record(source) -- elif isinstance(wav, AudioData): -- audio = wav -- else: -- raise ValueError(f"invalid audio: {wav}") -- -+ with AudioFile(wav) as source: -+ audio = Recognizer().record(source) - return self.transcribe_audio(audio, lang) - - def transcribe_audio(self, audio, lang="en"): -+ lang = lang.lower().split("-")[0] - self.params.language = lang.encode() - - data = self.audiodata2array(audio) -@@ -91,7 +84,7 @@ def transcribe_audio(self, audio, lang="en"): - txt = b"" - for i in range(n_segments): - txt += self.whisper.whisper_full_get_segment_text(ctypes.c_void_p(self.ctx), i) -- return txt.decode("utf-8") -+ return txt.decode("utf-8").strip() - - def shutdown(self): - # free the memory -@@ -215,10 +208,7 @@ def __init__(self, *args, **kwargs): - self.model_folder = self.config.get("model_folder") or f"{xdg_data_home()}/whispercpp" - model = self.config.get("model") - if not model: -- if self.lang.startswith("en"): -- model = "tiny.en" -- else: -- model = "tiny" -+ model = "tiny" - os.makedirs(self.model_folder, exist_ok=True) - model_path = self.get_model(model) - self.engine = WhisperEngine(self.lib, model_path) -@@ -280,9 +270,10 @@ def __del__(self): - b = WhispercppSTT() - from speech_recognition import Recognizer, AudioFile - -- with AudioFile("/home/user/PycharmProjects/selene_api/test/test.wav") as source: -+ jfk = "/home/user/whisper.cpp/samples/jfk.wav" -+ with AudioFile(jfk) as source: - audio = Recognizer().record(source) - -- a = b.execute(audio) -- -+ a = b.execute(audio, language="en") - print(a) -+ diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash index 08b11772..8d526833 100644 --- a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash +++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.hash @@ -1,2 +1,2 @@ # sha256 locally computed -sha256 efe2ec892f6a0e9f1f9b24ae804d6975f7fbd8aaaf0b264e0d5bf4917b0e7a71 python-ovos-stt-plugin-whispercpp-630db8ff8b2ea8f56c30f39254d880b0f572c921.tar.gz +sha256 375f983e2783f7ff247a813de88333685e76b334bba0d888a5239fe77a63fe9a python-ovos-stt-plugin-whispercpp-2eac10f51d6cc80c5d75894babbdc4f902d6273f.tar.gz diff --git a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk index 0f2dbb31..1fdefb2a 100644 --- a/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk +++ b/buildroot-external/package/python-ovos-stt-plugin-whispercpp/python-ovos-stt-plugin-whispercpp.mk @@ -4,7 +4,7 @@ # ################################################################################ -PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION = 630db8ff8b2ea8f56c30f39254d880b0f572c921 +PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION = 2eac10f51d6cc80c5d75894babbdc4f902d6273f PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_SITE = $(call github,OpenVoiceOS,ovos-stt-plugin-whispercpp,$(PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_VERSION)) PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_SETUP_TYPE = setuptools PYTHON_OVOS_STT_PLUGIN_WHISPERCPP_ENV = MYCROFT_LOOSE_REQUIREMENTS=true diff --git a/buildroot-external/rootfs-overlay/etc/mycroft/mycroft.conf b/buildroot-external/rootfs-overlay/etc/mycroft/mycroft.conf index 915a5455..2cc0c341 100644 --- a/buildroot-external/rootfs-overlay/etc/mycroft/mycroft.conf +++ b/buildroot-external/rootfs-overlay/etc/mycroft/mycroft.conf @@ -37,6 +37,13 @@ "wakeup": true } }, + "stt": { + "module": "ovos-stt-plugin-whispercpp", + "ovos-stt-plugin-whispercpp": { + "lib": "/usr/lib/libwhisper.so", + "model": "tiny" + } + }, "tts": { "module": "ovos-tts-plugin-mimic3-server", "fallback_module": "ovos-tts-plugin-mimic",